diff options
| author | Yuren Hao <yurenh2@illinois.edu> | 2026-04-08 22:06:05 -0500 |
|---|---|---|
| committer | Yuren Hao <yurenh2@illinois.edu> | 2026-04-08 22:06:05 -0500 |
| commit | 05704d0eb2fa59fe727652465b07db40bcb06c38 (patch) | |
| tree | 8904aca836cf552fd1a5ae8c2174e9f91e70bbbc /mini_gap_math_api.py | |
Initial release: GAP framework
- Full pipeline: variant generation, multi-judge verification, evaluation
- Loaders for OpenAI / Anthropic / Google / xAI / OpenRouter / vLLM
- Framework-level mechanism analyses: paired structural overlap, repairability rescue, self-correction probe, cross-model agreement, topic × problem-type interaction
- Unicode -> bare-LaTeX cleaner + audit + spot-check
- Mirrors https://huggingface.co/datasets/blackhao0426/PutnamGAP
Diffstat (limited to 'mini_gap_math_api.py')
| -rw-r--r-- | mini_gap_math_api.py | 192 |
1 files changed, 192 insertions, 0 deletions
#!/usr/bin/env python3
"""
Mini-GAP-MATH: Evaluate MATH variants using OpenAI API.
"""

import json
import re
import os
import sys
import asyncio
import time
import argparse
from pathlib import Path
from openai import AsyncOpenAI

client = AsyncOpenAI()
SEMAPHORE = asyncio.Semaphore(50)  # max concurrent requests

# ============================================================
# Answer extraction and checking
# ============================================================

# LaTeX command that wraps a final answer. Hoisted so its length is
# computed once instead of being hard-coded as the magic offset 7.
_BOXED_PREFIX = '\\boxed{'

def extract_boxed_answer(text):
    """Return the contents of the LAST ``\\boxed{...}`` in *text*.

    Boxed contents may themselves contain braces (e.g. ``\\frac{1}{2}``),
    so a simple regex is insufficient; we scan forward tracking brace
    depth to find the matching closing brace.

    Returns None for empty/None input, or when no complete (terminated)
    boxed group is found. Unterminated ``\\boxed{`` groups are skipped.
    """
    if not text:
        return None
    matches = []
    i = 0
    n = len(text)
    plen = len(_BOXED_PREFIX)
    while i < n:
        idx = text.find(_BOXED_PREFIX, i)
        if idx == -1:
            break
        # Walk forward until the brace opened by \boxed{ is closed.
        depth = 1
        j = idx + plen
        while j < n and depth > 0:
            if text[j] == '{':
                depth += 1
            elif text[j] == '}':
                depth -= 1
            j += 1
        if depth == 0:
            # j is one past the closing brace; slice excludes it.
            matches.append(text[idx + plen:j - 1].strip())
        i = j
    return matches[-1] if matches else None

def normalize_answer(ans):
    """Normalize a LaTeX answer string for loose equality comparison.

    Strips dollar signs and spaces and collapses cosmetic LaTeX
    variants (``\\dfrac``/``\\tfrac`` -> ``\\frac``; ``\\left``,
    ``\\right`` and thin-space commands removed). Returns None when
    *ans* is None so missing answers never compare equal by accident.
    """
    if ans is None:
        return None
    ans = ans.strip()
    ans = ans.replace('$', '').replace(' ', '')
    ans = ans.replace('\\dfrac', '\\frac').replace('\\tfrac', '\\frac')
    ans = ans.replace('\\left', '').replace('\\right', '')
    ans = ans.replace('\\,', '').replace('\\;', '')
    return ans

def check_answer(generated, reference_solution):
    """Return True iff *generated*'s boxed answer matches the reference's.

    Both sides must contain a boxed answer; if either is missing the
    answer counts as incorrect (False). Comparison is done on the
    normalized forms.
    """
    ref_answer = extract_boxed_answer(reference_solution)
    gen_answer = extract_boxed_answer(generated)
    if ref_answer is None or gen_answer is None:
        return False
    return normalize_answer(ref_answer) == normalize_answer(gen_answer)

# ============================================================
# API calls
# ============================================================
async def solve_problem(problem_text, model="gpt-4o-mini"):
    """Solve a single problem using OpenAI API."""
    # The module-level semaphore bounds how many requests are in flight.
    async with SEMAPHORE:
        try:
            response = await client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are an expert mathematician. Solve the problem step by step and put your final answer in \\boxed{}."},
                    {"role": "user", "content": problem_text}
                ],
                max_tokens=2048,
                temperature=0,
            )
            return response.choices[0].message.content
        except Exception as err:
            # Best-effort: report the failure and let the caller grade
            # the missing completion as incorrect.
            print(f" API error: {err}")
            return None

async def evaluate_variant(variant_data, variant_type, model):
    """Run the model on every problem of one variant type and grade it."""
    problems = []
    solutions = []
    for item in variant_data:
        entry = item[variant_type]
        problems.append(entry['problem'])
        solutions.append(entry['solution'])

    print(f"\n--- Evaluating {variant_type} ({len(problems)} problems) ---")

    # Fan out every request at once; the semaphore throttles them.
    generated = await asyncio.gather(*(solve_problem(p, model) for p in problems))

    total = len(problems)
    correct = 0
    per_item = []
    for j in range(total):
        gen = generated[j]
        sol = solutions[j]
        ok = check_answer(gen or "", sol)
        if ok:
            correct += 1
        per_item.append({
            'index': variant_data[j]['index'],
            'correct': ok,
            'generated_answer': extract_boxed_answer(gen or ""),
            'reference_answer': extract_boxed_answer(sol),
        })

    acc = correct / total * 100 if total > 0 else 0
    print(f" {variant_type}: {correct}/{total} = {acc:.1f}%")

    return {
        'accuracy': acc,
        'correct': correct,
        'total': total,
        'per_item': per_item,
    }

async def evaluate_model(model, variant_data, output_dir):
    """Evaluate a model on all variants."""
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Evaluating model: {model}")
    print(f"{banner}")

    variant_types = ['original', 'garbled_string', 'descriptive_long_confusing']
    results = {'model': model, 'variants': {}}

    for vt in variant_types:
        results['variants'][vt] = await evaluate_variant(variant_data, vt, model)

    # Accuracy deltas of each perturbation relative to the originals.
    baseline = results['variants']['original']['accuracy']
    for vt in variant_types[1:]:
        results['variants'][vt]['delta'] = results['variants'][vt]['accuracy'] - baseline

    # Persist per-model results; "/" in model names would break the path.
    out_file = os.path.join(output_dir, f'{model.replace("/", "_")}_results.json')
    with open(out_file, 'w') as f:
        json.dump(results, f, indent=2)
    print(f" Saved to {out_file}")

    return results

async def main():
    """CLI entry point: evaluate each requested model, then summarize."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--models', nargs='+', default=['gpt-4o-mini'])
    parser.add_argument('--variants-file', default='/home/yurenh2/gap/mini_gap_math_results/math_variants.json')
    parser.add_argument('--output-dir', default='/home/yurenh2/gap/mini_gap_math_results')
    parser.add_argument('--concurrency', type=int, default=50)
    args = parser.parse_args()

    # Rebind the module-level semaphore so --concurrency takes effect
    # before any request is issued.
    global SEMAPHORE
    SEMAPHORE = asyncio.Semaphore(args.concurrency)

    os.makedirs(args.output_dir, exist_ok=True)

    with open(args.variants_file) as f:
        variant_data = json.load(f)

    print(f"Loaded {len(variant_data)} problems with variants")

    all_results = []
    for model in args.models:
        all_results.append(await evaluate_model(model, variant_data, args.output_dir))

    # Fixed-width summary table across all evaluated models.
    print("\n" + "=" * 80)
    print("MINI-GAP-MATH RESULTS SUMMARY")
    print("=" * 80)
    print(f"{'Model':<25} {'Original':>10} {'GS':>10} {'GS Δ':>8} {'DLC':>10} {'DLC Δ':>8}")
    print("-" * 75)
    for r in all_results:
        variants = r['variants']
        name = r['model']
        orig = variants['original']['accuracy']
        gs = variants['garbled_string']['accuracy']
        gs_d = variants['garbled_string']['delta']
        dlc = variants['descriptive_long_confusing']['accuracy']
        dlc_d = variants['descriptive_long_confusing']['delta']
        print(f"{name:<25} {orig:>9.1f}% {gs:>9.1f}% {gs_d:>+7.1f} {dlc:>9.1f}% {dlc_d:>+7.1f}")

    # Save combined
    combined_file = os.path.join(args.output_dir, 'all_api_results.json')
    with open(combined_file, 'w') as f:
        json.dump(all_results, f, indent=2)
    print(f"\nAll results saved to {combined_file}")

if __name__ == '__main__':
    asyncio.run(main())
