summaryrefslogtreecommitdiff
path: root/mini_gap_math_api.py
diff options
context:
space:
mode:
authorYuren Hao <yurenh2@illinois.edu>2026-04-08 22:06:05 -0500
committerYuren Hao <yurenh2@illinois.edu>2026-04-08 22:06:05 -0500
commit05704d0eb2fa59fe727652465b07db40bcb06c38 (patch)
tree8904aca836cf552fd1a5ae8c2174e9f91e70bbbc /mini_gap_math_api.py
Initial release: GAP framework
- Full pipeline: variant generation, multi-judge verification, evaluation
- Loaders for OpenAI / Anthropic / Google / xAI / OpenRouter / vLLM
- Framework-level mechanism analyses: paired structural overlap, repairability rescue, self-correction probe, cross-model agreement, topic x problem-type interaction
- Unicode -> bare-LaTeX cleaner + audit + spot-check
- Mirrors https://huggingface.co/datasets/blackhao0426/PutnamGAP
Diffstat (limited to 'mini_gap_math_api.py')
-rw-r--r--mini_gap_math_api.py192
1 files changed, 192 insertions, 0 deletions
diff --git a/mini_gap_math_api.py b/mini_gap_math_api.py
new file mode 100644
index 0000000..254db9c
--- /dev/null
+++ b/mini_gap_math_api.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+"""
+Mini-GAP-MATH: Evaluate MATH variants using OpenAI API.
+"""
+
+import json
+import re
+import os
+import sys
+import asyncio
+import time
+import argparse
+from pathlib import Path
+from openai import AsyncOpenAI
+
# Module-wide async OpenAI client, built with the SDK's default settings.
client = AsyncOpenAI()

# Gate entered by every solve_problem() call to bound in-flight requests.
# main() rebinds this name from the --concurrency flag before issuing requests.
SEMAPHORE = asyncio.Semaphore(50)  # max concurrent requests
+
+# ============================================================
+# Answer extraction and checking
+# ============================================================
+
def extract_boxed_answer(text):
    """Return the contents of the last balanced ``\\boxed{...}`` in *text*.

    Braces nested inside the box (for example ``\\boxed{\\frac{1}{2}}``) are
    paired by depth counting, so the whole inner expression is returned.

    Args:
        text: String to scan; may be empty or ``None``.

    Returns:
        The contents of the last well-formed box with surrounding whitespace
        stripped, or ``None`` when *text* is empty/``None`` or contains no
        balanced ``\\boxed{...}``.
    """
    if not text:
        return None

    marker = '\\boxed{'
    last_match = None
    pos = 0
    while True:
        start = text.find(marker, pos)
        if start == -1:
            break

        # Walk forward from the first character inside the box until the
        # opening brace of the marker has been closed.
        body_start = start + len(marker)
        depth = 1
        cursor = body_start
        while cursor < len(text) and depth > 0:
            ch = text[cursor]
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
            cursor += 1

        if depth == 0:
            # cursor sits one past the closing brace; exclude that brace.
            last_match = text[body_start:cursor - 1].strip()
            pos = cursor
        else:
            # This box never closes. Resume right after its marker instead of
            # at end-of-text so a later, well-formed box is still found.
            pos = body_start

    return last_match
+
# Matches the ``\left`` / ``\right`` delimiter-sizing prefixes only. The
# lookahead keeps longer commands such as ``\leftarrow`` or ``\rightarrow``
# intact, which a plain substring replace would turn into ``arrow``.
_DELIMITER_SIZING = re.compile(r'\\(?:left|right)(?![A-Za-z])')


def normalize_answer(ans):
    """Canonicalize a LaTeX answer string for exact-match comparison.

    The following purely presentational differences are removed:
    ``$`` delimiters, all whitespace, ``\\dfrac``/``\\tfrac`` (rewritten to
    ``\\frac``), ``\\left``/``\\right`` sizing prefixes, and the spacing
    commands ``\\,`` ``\\;`` ``\\!`` ``\\:``.

    Args:
        ans: Answer string, or ``None``.

    Returns:
        The normalized string, or ``None`` if *ans* is ``None``.
    """
    if ans is None:
        return None

    ans = ans.replace('$', '')
    ans = ans.replace('\\dfrac', '\\frac').replace('\\tfrac', '\\frac')
    ans = _DELIMITER_SIZING.sub('', ans)
    # str.split() with no argument splits on any whitespace run, so newlines
    # and tabs inside an answer are dropped along with ordinary spaces.
    ans = ''.join(ans.split())
    for spacing in ('\\,', '\\;', '\\!', '\\:'):
        ans = ans.replace(spacing, '')
    return ans
+
def check_answer(generated, reference_solution):
    """Report whether the boxed answer in *generated* equals the reference's.

    The last boxed answer is pulled from each text via
    ``extract_boxed_answer`` and the two are compared after
    ``normalize_answer``.

    Args:
        generated: Model output text.
        reference_solution: Reference solution text.

    Returns:
        ``True`` when both texts yield a boxed answer and the normalized
        answers are equal; ``False`` otherwise, including when either text
        has no boxed answer.
    """
    expected = extract_boxed_answer(reference_solution)
    produced = extract_boxed_answer(generated)
    if expected is not None and produced is not None:
        return normalize_answer(expected) == normalize_answer(produced)
    return False
+
+# ============================================================
+# API calls
+# ============================================================
+
async def solve_problem(problem_text, model="gpt-4o-mini"):
    """Ask the chat-completions endpoint to solve one problem.

    The request is made while holding the module-level ``SEMAPHORE``, with a
    fixed system prompt that asks for the final answer in a boxed expression.

    Args:
        problem_text: Problem statement sent as the user message.
        model: Model identifier passed through to the API.

    Returns:
        The content of the first returned choice, or ``None`` if anything in
        the request/response handling raised (the error is printed).
    """
    system_prompt = (
        "You are an expert mathematician. Solve the problem step by step "
        "and put your final answer in \\boxed{}."
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": problem_text},
    ]

    async with SEMAPHORE:
        try:
            completion = await client.chat.completions.create(
                model=model,
                messages=conversation,
                max_tokens=2048,
                temperature=0,
            )
            # Kept inside the try so a malformed response is also reported
            # and mapped to None rather than propagating.
            answer_text = completion.choices[0].message.content
        except Exception as e:
            print(f" API error: {e}")
            answer_text = None
    return answer_text
+
async def evaluate_variant(variant_data, variant_type, model):
    """Evaluate every problem of one variant type and score the answers.

    All problems are submitted concurrently through ``solve_problem``; each
    response is scored with ``check_answer`` against that variant's solution.

    Args:
        variant_data: Sequence of items; each is indexed as
            ``item[variant_type]['problem']``, ``item[variant_type]['solution']``
            and ``item['index']``.
        variant_type: Key selecting which variant of each item to evaluate.
        model: Model identifier forwarded to ``solve_problem``.

    Returns:
        A dict with ``accuracy`` (percentage, 0 when there are no problems),
        ``correct``, ``total``, ``no_response`` (how many requests produced no
        text) and ``per_item`` (one record per problem holding ``index``,
        ``correct``, ``generated_answer``, ``reference_answer`` and
        ``no_response``).
    """
    problems = [item[variant_type]['problem'] for item in variant_data]
    solutions = [item[variant_type]['solution'] for item in variant_data]

    print(f"\n--- Evaluating {variant_type} ({len(problems)} problems) ---")

    # Launch all requests concurrently; gather preserves input order.
    generated = await asyncio.gather(
        *(solve_problem(p, model) for p in problems)
    )

    correct = 0
    no_response = 0
    per_item = []
    for item, gen, sol in zip(variant_data, generated, solutions):
        # solve_problem returns None when the request failed. Such items are
        # still scored as incorrect, but are flagged so that an accuracy drop
        # caused by failed requests can be told apart from wrong answers.
        missing = gen is None
        no_response += int(missing)

        is_correct = check_answer(gen or "", sol)
        correct += int(is_correct)
        per_item.append({
            'index': item['index'],
            'correct': is_correct,
            'generated_answer': extract_boxed_answer(gen or ""),
            'reference_answer': extract_boxed_answer(sol),
            'no_response': missing,
        })

    total = len(problems)
    acc = correct / total * 100 if total > 0 else 0
    print(f" {variant_type}: {correct}/{total} = {acc:.1f}%")
    if no_response:
        print(f" {variant_type}: {no_response} request(s) returned no response and were scored as incorrect")

    return {
        'accuracy': acc,
        'correct': correct,
        'total': total,
        'no_response': no_response,
        'per_item': per_item,
    }
+
async def evaluate_model(model, variant_data, output_dir):
    """Run one model over all three variant types and persist the results.

    Variants are evaluated one after another with ``evaluate_variant``. Each
    non-original variant then gets a ``delta`` entry: its accuracy minus the
    accuracy on ``original``. The result is written as JSON to
    ``<output_dir>/<model with "/" replaced by "_">_results.json``.

    Args:
        model: Model identifier.
        variant_data: Loaded variant items, passed through to
            ``evaluate_variant``.
        output_dir: Directory the per-model JSON file is written to.

    Returns:
        A dict ``{'model': model, 'variants': {variant_type: result}}``.
    """
    baseline = 'original'
    perturbed = ('garbled_string', 'descriptive_long_confusing')

    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Evaluating model: {model}")
    print(rule)

    per_variant = {}
    for name in (baseline, *perturbed):
        per_variant[name] = await evaluate_variant(variant_data, name, model)
    results = {'model': model, 'variants': per_variant}

    # Accuracy change of each perturbed variant relative to the original.
    baseline_acc = per_variant[baseline]['accuracy']
    for name in perturbed:
        entry = per_variant[name]
        entry['delta'] = entry['accuracy'] - baseline_acc

    # Slashes in model names would otherwise be read as path separators.
    safe_name = model.replace("/", "_")
    out_file = os.path.join(output_dir, f'{safe_name}_results.json')
    with open(out_file, 'w') as f:
        json.dump(results, f, indent=2)
    print(f" Saved to {out_file}")

    return results
+
async def main():
    """Parse CLI arguments, evaluate each model, and print/save a summary.

    Command-line options:
        --models: One or more model identifiers (default: ``gpt-4o-mini``).
        --variants-file: JSON file holding the variant items.
        --output-dir: Directory for result files (created if missing).
        --concurrency: Maximum number of concurrent API requests; must be
            at least 1.

    Models are evaluated sequentially with ``evaluate_model``. A summary table
    is printed and all results are written to ``all_api_results.json`` in the
    output directory.
    """
    global SEMAPHORE

    parser = argparse.ArgumentParser()
    parser.add_argument('--models', nargs='+', default=['gpt-4o-mini'])
    parser.add_argument('--variants-file', default='/home/yurenh2/gap/mini_gap_math_results/math_variants.json')
    parser.add_argument('--output-dir', default='/home/yurenh2/gap/mini_gap_math_results')
    parser.add_argument('--concurrency', type=int, default=50)
    args = parser.parse_args()

    # A semaphore created with 0 permits would block every request forever,
    # and a negative value raises ValueError; reject both with a usage error.
    if args.concurrency < 1:
        parser.error('--concurrency must be at least 1')

    # Rebind the module-level gate inside the running event loop so that
    # solve_problem() uses the requested limit.
    SEMAPHORE = asyncio.Semaphore(args.concurrency)

    os.makedirs(args.output_dir, exist_ok=True)

    # Read as UTF-8 explicitly instead of relying on the locale default.
    with open(args.variants_file, encoding='utf-8') as f:
        variant_data = json.load(f)

    print(f"Loaded {len(variant_data)} problems with variants")

    all_results = []
    for model in args.models:
        result = await evaluate_model(model, variant_data, args.output_dir)
        all_results.append(result)

    # Print summary
    print("\n" + "="*80)
    print("MINI-GAP-MATH RESULTS SUMMARY")
    print("="*80)
    print(f"{'Model':<25} {'Original':>10} {'GS':>10} {'GS Δ':>8} {'DLC':>10} {'DLC Δ':>8}")
    print("-"*75)
    for r in all_results:
        variants = r['variants']
        m = r['model']
        orig = variants['original']['accuracy']
        gs = variants['garbled_string']['accuracy']
        gs_d = variants['garbled_string']['delta']
        dlc = variants['descriptive_long_confusing']['accuracy']
        dlc_d = variants['descriptive_long_confusing']['delta']
        print(f"{m:<25} {orig:>9.1f}% {gs:>9.1f}% {gs_d:>+7.1f} {dlc:>9.1f}% {dlc_d:>+7.1f}")

    # Save combined
    combined_file = os.path.join(args.output_dir, 'all_api_results.json')
    with open(combined_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2)
    print(f"\nAll results saved to {combined_file}")


if __name__ == '__main__':
    asyncio.run(main())