#!/usr/bin/env python3
"""
Mini-GAP-MATH: Evaluate MATH variants using OpenAI API.
"""

import argparse
import asyncio
import json
import os
import re
import sys
import time
from pathlib import Path

# NOTE: ``openai`` is imported inside _get_client() rather than here so that
# importing this module (or running ``--help``) does not require the package
# or any client configuration; only an actual API call does.

DEFAULT_CONCURRENCY = 50  # max concurrent requests

# Both globals are created lazily on first use (see _get_client and
# _get_semaphore). Callers may still assign their own objects to either name
# before the first request is made.
client = None
SEMAPHORE = None

SYSTEM_PROMPT = (
    "You are an expert mathematician. Solve the problem step by step "
    "and put your final answer in \\boxed{}."
)

# Variant keys read from every item of the variants file. The first entry is
# the baseline that the deltas of the other entries are measured against.
BASELINE_VARIANT = 'original'
PERTURBED_VARIANTS = ['garbled_string', 'descriptive_long_confusing']

_BOXED_PREFIX = '\\boxed{'


def _get_client():
    """Return the shared AsyncOpenAI client, creating it on first use."""
    global client
    if client is None:
        from openai import AsyncOpenAI
        client = AsyncOpenAI()
    return client


def _get_semaphore():
    """Return the shared request semaphore, creating it on first use.

    Creating it lazily means it is constructed while an event loop is
    running instead of at import time.
    """
    global SEMAPHORE
    if SEMAPHORE is None:
        SEMAPHORE = asyncio.Semaphore(DEFAULT_CONCURRENCY)
    return SEMAPHORE


def _positive_int(value):
    """argparse ``type=`` callable accepting only integers >= 1."""
    number = int(value)
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be >= 1, got {number}")
    return number


# ============================================================
# Answer extraction and checking
# ============================================================

def extract_boxed_answer(text):
    """Extract answer from \\boxed{...}.

    Scans ``text`` for every ``\\boxed{`` occurrence, matches its closing
    brace while tracking nested ``{``/``}`` pairs, and returns the stripped
    contents of the LAST balanced occurrence.

    Returns None when ``text`` is empty/None or when no occurrence has a
    matching closing brace.
    """
    if not text:
        return None

    matches = []
    i = 0
    while i < len(text):
        idx = text.find(_BOXED_PREFIX, i)
        if idx == -1:
            break
        # Walk forward from just after the opening brace until the depth
        # returns to zero (balanced) or the text runs out (unbalanced).
        start = idx + len(_BOXED_PREFIX)
        depth = 1
        j = start
        while j < len(text) and depth > 0:
            if text[j] == '{':
                depth += 1
            elif text[j] == '}':
                depth -= 1
            j += 1
        if depth == 0:
            # j is one past the closing brace, so the contents end at j - 1.
            matches.append(text[start:j - 1].strip())
        # Resume after this occurrence; a \boxed nested inside it is part of
        # the captured contents and is not reported separately.
        i = j
    return matches[-1] if matches else None


def normalize_answer(ans):
    """Normalize answer for comparison.

    Removes ``$`` and spaces, maps ``\\dfrac``/``\\tfrac`` to ``\\frac``, and
    drops ``\\left``, ``\\right``, ``\\,`` and ``\\;``. Returns None for None.
    """
    if ans is None:
        return None
    ans = ans.strip()
    ans = ans.replace('$', '').replace(' ', '')
    ans = ans.replace('\\dfrac', '\\frac').replace('\\tfrac', '\\frac')
    ans = ans.replace('\\left', '').replace('\\right', '')
    ans = ans.replace('\\,', '').replace('\\;', '')
    return ans


def check_answer(generated, reference_solution):
    """Check if generated answer matches reference.

    Both arguments are full solution texts; the boxed answer is extracted
    from each and the normalized forms are compared for exact equality.
    Returns False when either text has no extractable boxed answer.
    """
    ref_answer = extract_boxed_answer(reference_solution)
    gen_answer = extract_boxed_answer(generated)
    if ref_answer is None or gen_answer is None:
        return False
    return normalize_answer(ref_answer) == normalize_answer(gen_answer)


# ============================================================
# API calls
# ============================================================

async def solve_problem(problem_text, model="gpt-4o-mini"):
    """Solve a single problem using OpenAI API.

    Returns the message content of the first choice, or None when the
    request raised an exception (the error is printed, not re-raised).
    """
    # Obtained outside the try block so that a missing package or a client
    # construction failure propagates instead of being reported as a
    # per-problem "API error".
    api = _get_client()
    async with _get_semaphore():
        try:
            resp = await api.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": problem_text},
                ],
                max_tokens=2048,
                temperature=0,
            )
            return resp.choices[0].message.content
        except Exception as e:
            # Deliberate best-effort boundary: one failed request must not
            # abort the whole batch. The caller counts None results.
            print(f" API error: {e}")
            return None


async def evaluate_variant(variant_data, variant_type, model):
    """Evaluate all problems for one variant type.

    Each item of ``variant_data`` must provide ``item[variant_type]`` with
    ``'problem'`` and ``'solution'`` keys. ``item['index']`` is used as the
    per-item identifier, falling back to the list position when absent.

    Returns a dict with ``accuracy`` (percent over ALL problems),
    ``correct``, ``total``, ``api_errors`` (requests that produced no
    response and were therefore scored as incorrect) and ``per_item``.
    """
    problems = [item[variant_type]['problem'] for item in variant_data]
    solutions = [item[variant_type]['solution'] for item in variant_data]

    print(f"\n--- Evaluating {variant_type} ({len(problems)} problems) ---")

    # Launch all requests concurrently; the semaphore inside solve_problem
    # bounds how many are in flight at once.
    generated = await asyncio.gather(
        *(solve_problem(p, model) for p in problems)
    )

    total = len(problems)
    correct = 0
    api_errors = 0
    per_item = []
    for j, (item, gen, sol) in enumerate(
        zip(variant_data, generated, solutions)
    ):
        no_response = gen is None
        api_errors += int(no_response)
        is_correct = check_answer(gen or "", sol)
        correct += int(is_correct)
        per_item.append({
            'index': item.get('index', j),
            'correct': is_correct,
            'api_error': no_response,
            'generated_answer': extract_boxed_answer(gen or ""),
            'reference_answer': extract_boxed_answer(sol),
        })

    acc = correct / total * 100 if total > 0 else 0.0
    print(f" {variant_type}: {correct}/{total} = {acc:.1f}%")
    if api_errors:
        print(
            f" WARNING: {api_errors}/{total} requests returned no response "
            f"and were scored as incorrect"
        )

    return {
        'accuracy': acc,
        'correct': correct,
        'total': total,
        'api_errors': api_errors,
        'per_item': per_item,
    }


async def evaluate_model(model, variant_data, output_dir):
    """Evaluate a model on all variants.

    Runs the baseline and each perturbed variant in sequence, stores each
    perturbed variant's accuracy delta relative to the baseline, writes the
    result to ``<output_dir>/<model with '/' replaced by '_'>_results.json``
    and returns the result dict.
    """
    print(f"\n{'='*60}")
    print(f"Evaluating model: {model}")
    print(f"{'='*60}")

    results = {'model': model, 'variants': {}}

    for vt in [BASELINE_VARIANT, *PERTURBED_VARIANTS]:
        results['variants'][vt] = await evaluate_variant(
            variant_data, vt, model
        )

    # Compute deltas
    orig_acc = results['variants'][BASELINE_VARIANT]['accuracy']
    for vt in PERTURBED_VARIANTS:
        variant_result = results['variants'][vt]
        variant_result['delta'] = variant_result['accuracy'] - orig_acc

    # Save
    out_file = os.path.join(
        output_dir, f'{model.replace("/", "_")}_results.json'
    )
    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)
    print(f" Saved to {out_file}")

    return results


async def main():
    """Parse CLI arguments, evaluate every model, print and save a summary."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--models', nargs='+', default=['gpt-4o-mini'])
    parser.add_argument(
        '--variants-file',
        default='/home/yurenh2/gap/mini_gap_math_results/math_variants.json',
    )
    parser.add_argument(
        '--output-dir',
        default='/home/yurenh2/gap/mini_gap_math_results',
    )
    # A semaphore of size 0 would block every request forever, so reject
    # non-positive values at parse time.
    parser.add_argument(
        '--concurrency', type=_positive_int, default=DEFAULT_CONCURRENCY
    )
    args = parser.parse_args()

    global SEMAPHORE
    SEMAPHORE = asyncio.Semaphore(args.concurrency)

    os.makedirs(args.output_dir, exist_ok=True)

    with open(args.variants_file, encoding='utf-8') as f:
        variant_data = json.load(f)

    print(f"Loaded {len(variant_data)} problems with variants")

    all_results = []
    for model in args.models:
        result = await evaluate_model(model, variant_data, args.output_dir)
        all_results.append(result)

    # Print summary
    print("\n" + "="*80)
    print("MINI-GAP-MATH RESULTS SUMMARY")
    print("="*80)
    print(f"{'Model':<25} {'Original':>10} {'GS':>10} {'GS Δ':>8} {'DLC':>10} {'DLC Δ':>8}")
    print("-"*75)
    for r in all_results:
        m = r['model']
        orig = r['variants']['original']['accuracy']
        gs = r['variants']['garbled_string']['accuracy']
        gs_d = r['variants']['garbled_string']['delta']
        dlc = r['variants']['descriptive_long_confusing']['accuracy']
        dlc_d = r['variants']['descriptive_long_confusing']['delta']
        print(f"{m:<25} {orig:>9.1f}% {gs:>9.1f}% {gs_d:>+7.1f} {dlc:>9.1f}% {dlc_d:>+7.1f}")

    # Save combined
    combined_file = os.path.join(args.output_dir, 'all_api_results.json')
    with open(combined_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2)
    print(f"\nAll results saved to {combined_file}")


if __name__ == '__main__':
    asyncio.run(main())