#!/usr/bin/env python3 """ Mini-GAP-MATH: Apply GAP surface renaming to MATH dataset and evaluate. Proves GAP framework generalizes beyond Putnam. """ import json import re import random import os import sys import time import argparse from pathlib import Path random.seed(42) # ============================================================ # Step 1: Extract variables from MATH problems # ============================================================ def extract_latex_variables(problem_text): """Extract single-letter and short math variables from LaTeX.""" # Find variables inside $...$ math mode math_segments = re.findall(r'\$([^$]+)\$', problem_text) all_text = ' '.join(math_segments) # Common math variables: single letters, subscripted versions vars_found = set() # Single-letter variables (a-z, A-Z) used standalone in math for m in re.finditer(r'(? 0 else 0 results['variants'][variant_type] = { 'accuracy': acc, 'correct': correct, 'total': total, 'per_item': per_item, } print(f" {variant_type}: {correct}/{total} = {acc:.1f}%") # Compute deltas orig_acc = results['variants']['original']['accuracy'] for vt in ['garbled_string', 'descriptive_long_confusing']: var_acc = results['variants'][vt]['accuracy'] results['variants'][vt]['delta'] = var_acc - orig_acc # Cleanup del model del tokenizer torch.cuda.empty_cache() return results def main(): parser = argparse.ArgumentParser(description='Mini-GAP-MATH experiment') parser.add_argument('--step', choices=['prepare', 'evaluate', 'all'], default='all') parser.add_argument('--models', nargs='+', default=['Qwen/Qwen2.5-7B-Instruct']) parser.add_argument('--device', default='cuda:2') parser.add_argument('--batch-size', type=int, default=4) parser.add_argument('--max-problems', type=int, default=200) parser.add_argument('--input', default='/home/yurenh2/gap/math_sample_200.json') parser.add_argument('--output-dir', default='/home/yurenh2/gap/mini_gap_math_results') args = parser.parse_args() os.makedirs(args.output_dir, exist_ok=True) variants_file = os.path.join(args.output_dir, 'math_variants.json') if args.step in ['prepare', 'all']: print("="*60) print("Step 1: Loading MATH problems and creating variants") print("="*60) with open(args.input) as f: problems = json.load(f) problems = problems[:args.max_problems] print(f"Loaded {len(problems)} problems") variants = create_variants(problems) print(f"Created variants for {len(variants)} problems") with open(variants_file, 'w') as f: json.dump(variants, f, indent=2) print(f"Saved to {variants_file}") # Show a sample if variants: v = variants[0] print(f"\nSample problem (original):") print(f" {v['original']['problem'][:200]}...") print(f" Variables: {v['variables']}") print(f"\nGS variant:") print(f" {v['garbled_string']['problem'][:200]}...") print(f" Map: {v['garbled_string']['map']}") if args.step in ['evaluate', 'all']: print("\n" + "="*60) print("Step 2: Evaluating models") print("="*60) with open(variants_file) as f: variants_data = json.load(f) all_results = [] for model_name in args.models: try: result = evaluate_model( model_name, variants_data, device=args.device, batch_size=args.batch_size ) all_results.append(result) # Save incrementally out_file = os.path.join(args.output_dir, 'evaluation_results.json') with open(out_file, 'w') as f: json.dump(all_results, f, indent=2) except Exception as e: print(f"ERROR with {model_name}: {e}") import traceback traceback.print_exc() # Print summary table print("\n" + "="*60) print("RESULTS SUMMARY") print("="*60) print(f"{'Model':<35} {'Original':>10} {'GS':>10} {'GS Δ':>8} {'DLC':>10} {'DLC Δ':>8}") print("-"*85) for r in all_results: m = r['model'].split('/')[-1] orig = r['variants']['original']['accuracy'] gs = r['variants']['garbled_string']['accuracy'] gs_d = r['variants']['garbled_string']['delta'] dlc = r['variants']['descriptive_long_confusing']['accuracy'] dlc_d = r['variants']['descriptive_long_confusing']['delta'] print(f"{m:<35} {orig:>9.1f}% {gs:>9.1f}% {gs_d:>+7.1f} {dlc:>9.1f}% {dlc_d:>+7.1f}") if __name__ == '__main__': main()