diff options
| author | Yuren Hao <yurenh2@illinois.edu> | 2026-04-08 22:06:05 -0500 |
|---|---|---|
| committer | Yuren Hao <yurenh2@illinois.edu> | 2026-04-08 22:06:05 -0500 |
| commit | 05704d0eb2fa59fe727652465b07db40bcb06c38 (patch) | |
| tree | 8904aca836cf552fd1a5ae8c2174e9f91e70bbbc /mini_gap_math_api.py | |
Initial release: GAP framework
- Full pipeline: variant generation, multi-judge verification, evaluation
- Loaders for OpenAI / Anthropic / Google / xAI / OpenRouter / vLLM
- Framework-level mechanism analyses: paired structural overlap, repairability rescue, self-correction probe, cross-model agreement, topic × problem-type interaction
- Unicode -> bare-LaTeX cleaner + audit + spot-check
- Mirrors https://huggingface.co/datasets/blackhao0426/PutnamGAP
Diffstat (limited to 'mini_gap_math_api.py')
| -rw-r--r-- | mini_gap_math_api.py | 192 |
1 files changed, 192 insertions, 0 deletions
#!/usr/bin/env python3
"""
Mini-GAP-MATH: Evaluate MATH variants using OpenAI API.
"""

import json
import re
import os
import sys
import asyncio
import time
import argparse
from pathlib import Path
from openai import AsyncOpenAI

client = AsyncOpenAI()
SEMAPHORE = asyncio.Semaphore(50)  # max concurrent requests

# ============================================================
# Answer extraction and checking
# ============================================================

# LaTeX command that wraps a final answer. Hoisted so its length is
# computed once instead of being hard-coded as the magic offset 7.
_BOXED_PREFIX = '\\boxed{'

def extract_boxed_answer(text):
    """Return the contents of the LAST ``\\boxed{...}`` in *text*.

    Boxed contents may themselves contain braces (e.g. ``\\frac{1}{2}``),
    so a simple regex is insufficient; we scan forward tracking brace
    depth to find the matching closing brace.

    Returns None for empty/None input, or when no complete (terminated)
    boxed group is found. Unterminated ``\\boxed{`` groups are skipped.
    """
    if not text:
        return None
    matches = []
    i = 0
    n = len(text)
    plen = len(_BOXED_PREFIX)
    while i < n:
        idx = text.find(_BOXED_PREFIX, i)
        if idx == -1:
            break
        # Walk forward until the brace opened by \boxed{ is closed.
        depth = 1
        j = idx + plen
        while j < n and depth > 0:
            if text[j] == '{':
                depth += 1
            elif text[j] == '}':
                depth -= 1
            j += 1
        if depth == 0:
            # j is one past the closing brace; slice excludes it.
            matches.append(text[idx + plen:j - 1].strip())
        i = j
    return matches[-1] if matches else None

def normalize_answer(ans):
    """Normalize a LaTeX answer string for loose equality comparison.

    Strips dollar signs and spaces and collapses cosmetic LaTeX
    variants (``\\dfrac``/``\\tfrac`` -> ``\\frac``; ``\\left``,
    ``\\right`` and thin-space commands removed). Returns None when
    *ans* is None so missing answers never compare equal by accident.
    """
    if ans is None:
        return None
    ans = ans.strip()
    ans = ans.replace('$', '').replace(' ', '')
    ans = ans.replace('\\dfrac', '\\frac').replace('\\tfrac', '\\frac')
    ans = ans.replace('\\left', '').replace('\\right', '')
    ans = ans.replace('\\,', '').replace('\\;', '')
    return ans

def check_answer(generated, reference_solution):
    """Return True iff *generated*'s boxed answer matches the reference's.

    Both sides must contain a boxed answer; if either is missing the
    answer counts as incorrect (False). Comparison is done on the
    normalized forms.
    """
    ref_answer = extract_boxed_answer(reference_solution)
    gen_answer = extract_boxed_answer(generated)
    if ref_answer is None or gen_answer is None:
        return False
    return normalize_answer(ref_answer) == normalize_answer(gen_answer)

# ============================================================
# API calls
# ============================================================
async def solve_problem(problem_text, model="gpt-4o-mini"):
    """Solve a single problem using OpenAI API."""
    # The module-level semaphore bounds how many requests are in flight.
    async with SEMAPHORE:
        try:
            response = await client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are an expert mathematician. Solve the problem step by step and put your final answer in \\boxed{}."},
                    {"role": "user", "content": problem_text}
                ],
                max_tokens=2048,
                temperature=0,
            )
            return response.choices[0].message.content
        except Exception as err:
            # Best-effort: report the failure and let the caller grade
            # the missing completion as incorrect.
            print(f" API error: {err}")
            return None

async def evaluate_variant(variant_data, variant_type, model):
    """Run the model on every problem of one variant type and grade it."""
    problems = []
    solutions = []
    for item in variant_data:
        entry = item[variant_type]
        problems.append(entry['problem'])
        solutions.append(entry['solution'])

    print(f"\n--- Evaluating {variant_type} ({len(problems)} problems) ---")

    # Fan out every request at once; the semaphore throttles them.
    generated = await asyncio.gather(*(solve_problem(p, model) for p in problems))

    total = len(problems)
    correct = 0
    per_item = []
    for j in range(total):
        gen = generated[j]
        sol = solutions[j]
        ok = check_answer(gen or "", sol)
        if ok:
            correct += 1
        per_item.append({
            'index': variant_data[j]['index'],
            'correct': ok,
            'generated_answer': extract_boxed_answer(gen or ""),
            'reference_answer': extract_boxed_answer(sol),
        })

    acc = correct / total * 100 if total > 0 else 0
    print(f" {variant_type}: {correct}/{total} = {acc:.1f}%")

    return {
        'accuracy': acc,
        'correct': correct,
        'total': total,
        'per_item': per_item,
    }

async def evaluate_model(model, variant_data, output_dir):
    """Evaluate a model on all variants."""
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Evaluating model: {model}")
    print(f"{banner}")

    variant_types = ['original', 'garbled_string', 'descriptive_long_confusing']
    results = {'model': model, 'variants': {}}

    for vt in variant_types:
        results['variants'][vt] = await evaluate_variant(variant_data, vt, model)

    # Accuracy deltas of each perturbation relative to the originals.
    baseline = results['variants']['original']['accuracy']
    for vt in variant_types[1:]:
        results['variants'][vt]['delta'] = results['variants'][vt]['accuracy'] - baseline

    # Persist per-model results; "/" in model names would break the path.
    out_file = os.path.join(output_dir, f'{model.replace("/", "_")}_results.json')
    with open(out_file, 'w') as f:
        json.dump(results, f, indent=2)
    print(f" Saved to {out_file}")

    return results

async def main():
    """CLI entry point: evaluate each requested model, then summarize."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--models', nargs='+', default=['gpt-4o-mini'])
    parser.add_argument('--variants-file', default='/home/yurenh2/gap/mini_gap_math_results/math_variants.json')
    parser.add_argument('--output-dir', default='/home/yurenh2/gap/mini_gap_math_results')
    parser.add_argument('--concurrency', type=int, default=50)
    args = parser.parse_args()

    # Rebind the module-level semaphore so --concurrency takes effect
    # before any request is issued.
    global SEMAPHORE
    SEMAPHORE = asyncio.Semaphore(args.concurrency)

    os.makedirs(args.output_dir, exist_ok=True)

    with open(args.variants_file) as f:
        variant_data = json.load(f)

    print(f"Loaded {len(variant_data)} problems with variants")

    all_results = []
    for model in args.models:
        all_results.append(await evaluate_model(model, variant_data, args.output_dir))

    # Fixed-width summary table across all evaluated models.
    print("\n" + "=" * 80)
    print("MINI-GAP-MATH RESULTS SUMMARY")
    print("=" * 80)
    print(f"{'Model':<25} {'Original':>10} {'GS':>10} {'GS Δ':>8} {'DLC':>10} {'DLC Δ':>8}")
    print("-" * 75)
    for r in all_results:
        variants = r['variants']
        name = r['model']
        orig = variants['original']['accuracy']
        gs = variants['garbled_string']['accuracy']
        gs_d = variants['garbled_string']['delta']
        dlc = variants['descriptive_long_confusing']['accuracy']
        dlc_d = variants['descriptive_long_confusing']['delta']
        print(f"{name:<25} {orig:>9.1f}% {gs:>9.1f}% {gs_d:>+7.1f} {dlc:>9.1f}% {dlc_d:>+7.1f}")

    # Save combined
    combined_file = os.path.join(args.output_dir, 'all_api_results.json')
    with open(combined_file, 'w') as f:
        json.dump(all_results, f, indent=2)
    print(f"\nAll results saved to {combined_file}")

if __name__ == '__main__':
    asyncio.run(main())
