#!/usr/bin/env python3
# Recovered from a flattened git patch (commit 05704d0, "Initial release:
# GAP framework", Yuren Hao, 2026-04-08, mini_gap_math.py). The raw
# email/diff headers of the dump are summarized in this comment instead of
# being kept as patch text, so the file is valid Python.
"""
Mini-GAP-MATH: Apply GAP surface renaming to MATH dataset and evaluate.
Proves GAP framework generalizes beyond Putnam.
"""

import json
import re
import random
import os
import sys
import time
import argparse
from pathlib import Path

# Fixed seed so variant generation (variable renaming) is reproducible.
random.seed(42)

# ============================================================
# Step 1: Extract variables from MATH problems
# ============================================================

# NOTE(review): extract_latex_variables() is truncated in this dump — the
# regex lookbehind "(?<" was interpreted as an HTML tag and everything up
# to the middle of evaluate_model() (including create_variants and the
# garbling helpers) was swallowed. The surviving head is preserved below,
# commented out, so the information is not discarded; do NOT uncomment it
# without recovering the missing body from the upstream repository.
#
# def extract_latex_variables(problem_text):
#     """Extract single-letter and short math variables from LaTeX."""
#     # Find variables inside $...$ math mode
#     math_segments = re.findall(r'\$([^$]+)\$', problem_text)
#     all_text = ' '.join(math_segments)
#
#     # Common math variables: single letters, subscripted versions
#     vars_found = set()
#
#     # Single-letter variables (a-z, A-Z) used standalone in math
#     for m in re.finditer(r'(?<   # <-- dump cuts off mid-pattern here
# NOTE(review): the head of evaluate_model() — model/tokenizer loading and
# the per-variant generation loop — was lost when this dump was flattened
# (see the note above extract_latex_variables). Its surviving tail is
# preserved verbatim below, commented out, so the information is kept.
# Leading whitespace inside f-strings is best-effort: the dump collapsed
# whitespace, so exact indent inside printed strings could not be recovered.
#
#                 acc = correct / total * 100 if total > 0 else 0
#             results['variants'][variant_type] = {
#                 'accuracy': acc,
#                 'correct': correct,
#                 'total': total,
#                 'per_item': per_item,
#             }
#             print(f"  {variant_type}: {correct}/{total} = {acc:.1f}%")
#
#         # Compute deltas
#         orig_acc = results['variants']['original']['accuracy']
#         for vt in ['garbled_string', 'descriptive_long_confusing']:
#             var_acc = results['variants'][vt]['accuracy']
#             results['variants'][vt]['delta'] = var_acc - orig_acc
#
#         # Cleanup
#         del model
#         del tokenizer
#         torch.cuda.empty_cache()
#
#         return results


def main():
    """CLI driver for the Mini-GAP-MATH experiment.

    Steps (selected via --step, default 'all'):
      prepare  -- load MATH problems from --input, build surface-renamed
                  variants via create_variants(), and write them to
                  <output-dir>/math_variants.json.
      evaluate -- run every model in --models over the saved variants via
                  evaluate_model(), writing results incrementally to
                  <output-dir>/evaluation_results.json, then print a
                  per-model accuracy/delta summary table.
    """
    parser = argparse.ArgumentParser(description='Mini-GAP-MATH experiment')
    parser.add_argument('--step', choices=['prepare', 'evaluate', 'all'], default='all')
    parser.add_argument('--models', nargs='+', default=['Qwen/Qwen2.5-7B-Instruct'])
    parser.add_argument('--device', default='cuda:2')
    parser.add_argument('--batch-size', type=int, default=4)
    parser.add_argument('--max-problems', type=int, default=200)
    parser.add_argument('--input', default='/home/yurenh2/gap/math_sample_200.json')
    parser.add_argument('--output-dir', default='/home/yurenh2/gap/mini_gap_math_results')
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)
    # Variants are written by the 'prepare' step and read by 'evaluate'.
    variants_file = os.path.join(args.output_dir, 'math_variants.json')

    if args.step in ['prepare', 'all']:
        print("=" * 60)
        print("Step 1: Loading MATH problems and creating variants")
        print("=" * 60)

        with open(args.input) as f:
            problems = json.load(f)

        # Cap the workload for quick experiments.
        problems = problems[:args.max_problems]
        print(f"Loaded {len(problems)} problems")

        variants = create_variants(problems)
        print(f"Created variants for {len(variants)} problems")

        with open(variants_file, 'w') as f:
            json.dump(variants, f, indent=2)
        print(f"Saved to {variants_file}")

        # Show a sample so the operator can eyeball the renaming.
        if variants:
            v = variants[0]
            print(f"\nSample problem (original):")
            print(f"  {v['original']['problem'][:200]}...")
            print(f"  Variables: {v['variables']}")
            print(f"\nGS variant:")
            print(f"  {v['garbled_string']['problem'][:200]}...")
            print(f"  Map: {v['garbled_string']['map']}")

    if args.step in ['evaluate', 'all']:
        print("\n" + "=" * 60)
        print("Step 2: Evaluating models")
        print("=" * 60)

        with open(variants_file) as f:
            variants_data = json.load(f)

        all_results = []
        for model_name in args.models:
            try:
                result = evaluate_model(
                    model_name, variants_data,
                    device=args.device, batch_size=args.batch_size
                )
                all_results.append(result)

                # Save incrementally so a crash on a later model does not
                # lose results already computed.
                out_file = os.path.join(args.output_dir, 'evaluation_results.json')
                with open(out_file, 'w') as f:
                    json.dump(all_results, f, indent=2)

            except Exception as e:
                # Best-effort across models: log and continue with the next.
                print(f"ERROR with {model_name}: {e}")
                import traceback
                traceback.print_exc()

        # Print summary table
        print("\n" + "=" * 60)
        print("RESULTS SUMMARY")
        print("=" * 60)
        print(f"{'Model':<35} {'Original':>10} {'GS':>10} {'GS Δ':>8} {'DLC':>10} {'DLC Δ':>8}")
        print("-" * 85)
        for r in all_results:
            m = r['model'].split('/')[-1]
            orig = r['variants']['original']['accuracy']
            gs = r['variants']['garbled_string']['accuracy']
            gs_d = r['variants']['garbled_string']['delta']
            dlc = r['variants']['descriptive_long_confusing']['accuracy']
            dlc_d = r['variants']['descriptive_long_confusing']['delta']
            print(f"{m:<35} {orig:>9.1f}% {gs:>9.1f}% {gs_d:>+7.1f} {dlc:>9.1f}% {dlc_d:>+7.1f}")


if __name__ == '__main__':
    main()