From 05704d0eb2fa59fe727652465b07db40bcb06c38 Mon Sep 17 00:00:00 2001 From: Yuren Hao Date: Wed, 8 Apr 2026 22:06:05 -0500 Subject: Initial release: GAP framework - Full pipeline: variant generation, multi-judge verification, evaluation - Loaders for OpenAI / Anthropic / Google / xAI / OpenRouter / vLLM - Framework-level mechanism analyses: paired structural overlap, repairability rescue, self-correction probe, cross-model agreement, topic x problem-type interaction - Unicode -> bare-LaTeX cleaner + audit + spot-check - Mirrors https://huggingface.co/datasets/blackhao0426/PutnamGAP --- kv_math_50.py | 425 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 425 insertions(+) create mode 100644 kv_math_50.py (limited to 'kv_math_50.py') diff --git a/kv_math_50.py b/kv_math_50.py new file mode 100644 index 0000000..f9edfcd --- /dev/null +++ b/kv_math_50.py @@ -0,0 +1,425 @@ +#!/usr/bin/env python3 +""" +KV on MATH: Generate Kernel Variants for 50 MATH Level 5 problems. +3-stage pipeline with repair loop and 3 judges. +""" + +import json, asyncio, random, re, os, sys, time +from openai import AsyncOpenAI + +client = AsyncOpenAI() +SEM_GEN = asyncio.Semaphore(2) # generation calls (expensive o3) - low to avoid rate limits +SEM_EVAL = asyncio.Semaphore(30) # evaluation calls (cheaper) +random.seed(42) + +OUTPUT_DIR = '/home/yurenh2/gap/mini_gap_math_results/kv_50' +os.makedirs(OUTPUT_DIR, exist_ok=True) + +# ============================================================ +# Prompts +# ============================================================ + +SLOT_DISCOVERY = """You are a mathematical analysis expert. Given a math problem and its solution, identify all "mutable slots" — numerical constants, parameters, coefficients, or specific values that could be changed to create a new but structurally equivalent problem. + +For each slot: +1. The original value (as it appears in the problem) +2. What it represents mathematically +3. Constraints on alternative values (what must hold for the same solution technique to work) + +Return ONLY valid JSON: +{ + "mutable_slots": [ + {"value": "...", "role": "...", "constraints": "..."}, + ... + ] +} + +If the problem has no mutable slots (all constants are mathematically fixed), return: +{"mutable_slots": []}""" + +BACK_SYNTHESIS = """You are creating a mathematical variant problem. Given: +1. An original problem and solution +2. Mutable slots identified in the problem + +Your task: +- Choose NEW values for each mutable slot that satisfy the constraints +- Rewrite the problem with these new values +- Work out the complete new solution step by step +- The new problem MUST be solvable and the solution MUST follow the same mathematical reasoning + +Return ONLY valid JSON: +{ + "new_problem": "... (full LaTeX problem statement)", + "new_solution": "... (complete step-by-step solution)", + "new_answer": "... (final answer that would go in \\boxed{})", + "slot_changes": [{"original": "...", "new": "..."}] +}""" + +VERIFY = """You are a rigorous mathematical verifier. Given a problem and its proposed solution: + +1. Is the problem well-defined and solvable? +2. Is every step in the solution mathematically correct? +3. Does the solution correctly arrive at the stated answer? + +If ANY step is wrong or the answer doesn't follow, you MUST reject. + +Reply with EXACTLY: +VERDICT: ACCEPT +or +VERDICT: REJECT +REASON: [what is wrong]""" + +REPAIR = """The following mathematical variant was rejected by a verifier. + +Problem: {problem} +Solution: {solution} +Rejection reason: {reason} + +Please fix the solution (or the problem if needed) so that it is mathematically correct. +Keep the same problem structure and slot changes. + +Return ONLY valid JSON: +{{"new_problem": "...", "new_solution": "...", "new_answer": "..."}}""" + +# ============================================================ +# KV Generation Pipeline +# ============================================================ + +def extract_json(text): + """Extract JSON from text that may contain markdown or extra text.""" + # Try direct parse + try: + return json.loads(text) + except: + pass + # Try finding JSON block + match = re.search(r'```(?:json)?\s*(\{[\s\S]*?\})\s*```', text) + if match: + try: + return json.loads(match.group(1)) + except: + pass + # Try finding raw JSON + match = re.search(r'\{[\s\S]*\}', text) + if match: + try: + return json.loads(match.group()) + except: + pass + return None + + +async def call_llm(system_prompt, user_content, max_tokens=4000, model="gpt-4o"): + """Call LLM with rate limiting and exponential backoff.""" + sem = SEM_GEN if model == "o3" else SEM_EVAL + async with sem: + for attempt in range(4): + try: + kwargs = {"model": model, "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_content} + ]} + if model == "o3": + kwargs["max_completion_tokens"] = max_tokens + else: + kwargs["max_tokens"] = max_tokens + kwargs["temperature"] = 0 + resp = await client.chat.completions.create(**kwargs) + return resp.choices[0].message.content + except Exception as e: + wait = (2 ** attempt) * 3 + print(f" {model} error (attempt {attempt+1}): {e}, waiting {wait}s...") + await asyncio.sleep(wait) + print(f" {model} failed after 4 attempts") + return None + +async def call_o3(system_prompt, user_content, max_tokens=4000): + return await call_llm(system_prompt, user_content, max_tokens, model="o3") + + +async def verify_once(problem, solution): + """Single verification judge call.""" + async with SEM_GEN: + try: + resp = await client.chat.completions.create( + model="o3", max_completion_tokens=500, + messages=[ + {"role": "system", "content": VERIFY}, + {"role": "user", "content": f"Problem:\n{problem}\n\nSolution:\n{solution}"} + ], + ) + text = resp.choices[0].message.content + accepted = 'ACCEPT' in text.upper() and 'REJECT' not in text.upper() + reason = "" + if not accepted: + match = re.search(r'REASON:\s*(.*)', text, re.IGNORECASE) + reason = match.group(1).strip() if match else text + return accepted, reason + except Exception as e: + return False, str(e) + + +async def generate_kv_with_repair(problem, solution, idx, max_repairs=3, n_judges=3): + """Generate KV with repair loop and multiple judges.""" + print(f"\n[{idx}] Processing...") + + # Stage 1: Slot Discovery (use gpt-4o for speed) + slot_text = await call_llm(SLOT_DISCOVERY, f"Problem:\n{problem}\n\nSolution:\n{solution}", 2000, model="gpt-4o") + if not slot_text: + return {'status': 'error', 'reason': 'slot discovery failed'} + + slots_data = extract_json(slot_text) + if not slots_data or not slots_data.get('mutable_slots'): + print(f"[{idx}] No mutable slots found") + return {'status': 'no_slots', 'reason': 'no mutable slots identified'} + + n_slots = len(slots_data['mutable_slots']) + print(f"[{idx}] Found {n_slots} slots") + + # Stage 2: Back-synthesis (use o3 for quality) + synth_text = await call_llm( + BACK_SYNTHESIS, + f"Original problem:\n{problem}\n\nOriginal solution:\n{solution}\n\nMutable slots:\n{json.dumps(slots_data['mutable_slots'], indent=2)}\n\nCreate a variant with different values.", + 6000, model="o3" + ) + if not synth_text: + return {'status': 'error', 'reason': 'back-synthesis failed'} + + synth_data = extract_json(synth_text) + if not synth_data or not synth_data.get('new_problem'): + return {'status': 'error', 'reason': 'could not parse synthesized variant'} + + new_problem = synth_data['new_problem'] + new_solution = synth_data['new_solution'] + new_answer = synth_data.get('new_answer', '') + + # Stage 3: Verification with repair loop + for repair_round in range(max_repairs + 1): + # Run n_judges in parallel + judge_tasks = [verify_once(new_problem, new_solution) for _ in range(n_judges)] + verdicts = await asyncio.gather(*judge_tasks) + + accepts = sum(1 for v, _ in verdicts if v) + reasons = [r for v, r in verdicts if not v] + + if accepts == n_judges: + print(f"[{idx}] ACCEPTED (round {repair_round}, {n_judges}/{n_judges} judges)") + return { + 'status': 'accepted', + 'original_problem': problem, + 'original_solution': solution, + 'mutable_slots': slots_data['mutable_slots'], + 'kv_problem': new_problem, + 'kv_solution': new_solution, + 'kv_answer': new_answer, + 'slot_changes': synth_data.get('slot_changes', []), + 'repair_rounds': repair_round, + 'judge_count': n_judges, + } + + if repair_round < max_repairs: + print(f"[{idx}] Round {repair_round}: {accepts}/{n_judges} accepted, repairing...") + # Repair + reason_str = '; '.join(reasons[:2]) + repair_text = await call_llm( + REPAIR.format(problem=new_problem, solution=new_solution, reason=reason_str), + "Fix the variant.", 6000, model="o3" + ) + if repair_text: + repair_data = extract_json(repair_text) + if repair_data: + new_problem = repair_data.get('new_problem', new_problem) + new_solution = repair_data.get('new_solution', new_solution) + new_answer = repair_data.get('new_answer', new_answer) + + print(f"[{idx}] REJECTED after {max_repairs} repairs") + return { + 'status': 'rejected', + 'original_problem': problem, + 'reason': f'failed after {max_repairs} repair rounds', + 'last_reasons': reasons, + } + + +async def evaluate_kv(accepted_results, variants_data): + """Evaluate accepted KV variants with GPT-4o and GPT-4o-mini.""" + if not accepted_results: + return {} + + async def solve(problem, model): + async with SEM_EVAL: + resp = await client.chat.completions.create( + model=model, temperature=0, max_tokens=2048, + messages=[ + {"role": "system", "content": "Solve step by step. Put final answer in \\boxed{}."}, + {"role": "user", "content": problem} + ], + ) + return resp.choices[0].message.content + + async def grade(ref_answer, student_answer, model="gpt-4o"): + async with SEM_EVAL: + resp = await client.chat.completions.create( + model=model, temperature=0, max_tokens=10, + messages=[{"role": "user", "content": f"Are these two mathematical answers equivalent? Reference: {ref_answer}\nStudent: {student_answer}\nReply CORRECT or INCORRECT."}], + ) + return 'CORRECT' in resp.choices[0].message.content.upper() + + def extract_boxed(text): + if not text: return None + matches = [] + i = 0 + while i < len(text): + idx = text.find('\\boxed{', i) + if idx == -1: break + depth = 1; j = idx + 7 + while j < len(text) and depth > 0: + if text[j] == '{': depth += 1 + elif text[j] == '}': depth -= 1 + j += 1 + if depth == 0: matches.append(text[idx+7:j-1].strip()) + i = j + return matches[-1] if matches else None + + eval_models = ['gpt-4o', 'gpt-4o-mini'] + results = {} + + for model in eval_models: + print(f"\nEvaluating with {model}...") + + # Solve originals + orig_tasks = [solve(r['original_problem'], model) for r in accepted_results] + orig_solutions = await asyncio.gather(*orig_tasks) + + # Solve KV variants + kv_tasks = [solve(r['kv_problem'], model) for r in accepted_results] + kv_solutions = await asyncio.gather(*kv_tasks) + + # Grade originals + orig_grades = [] + for i, (sol, r) in enumerate(zip(orig_solutions, accepted_results)): + ref = extract_boxed(r['original_solution']) + stu = extract_boxed(sol) + if ref and stu: + g = await grade(ref, stu) + orig_grades.append(g) + else: + orig_grades.append(False) + + # Grade KVs + kv_grades = [] + for i, (sol, r) in enumerate(zip(kv_solutions, accepted_results)): + ref = r['kv_answer'] or extract_boxed(r['kv_solution']) + stu = extract_boxed(sol) + if ref and stu: + g = await grade(ref, stu) + kv_grades.append(g) + else: + kv_grades.append(False) + + orig_acc = sum(orig_grades) / len(orig_grades) * 100 + kv_acc = sum(kv_grades) / len(kv_grades) * 100 + delta = kv_acc - orig_acc + + results[model] = { + 'original_accuracy': orig_acc, + 'kv_accuracy': kv_acc, + 'delta': delta, + 'n': len(accepted_results), + 'orig_correct': sum(orig_grades), + 'kv_correct': sum(kv_grades), + } + print(f" {model}: orig={orig_acc:.1f}%, kv={kv_acc:.1f}%, Δ={delta:+.1f}pp") + + return results + + +async def main(): + # Load MATH Level 5 problems + with open('/home/yurenh2/gap/math_sample_200.json') as f: + all_problems = json.load(f) + + level5 = [p for p in all_problems if p['level'] == 'Level 5' and len(p['solution']) > 50] + selected = random.sample(level5, min(50, len(level5))) + print(f"Selected {len(selected)} Level 5 problems for KV generation") + + # Load any previous progress + progress_file = os.path.join(OUTPUT_DIR, 'kv_generation.json') + if os.path.exists(progress_file): + with open(progress_file) as f: + kv_results = json.load(f) + done_indices = {r['original_index'] for r in kv_results} + print(f"Resuming: {len(kv_results)} already done") + else: + kv_results = [] + done_indices = set() + + # Generate KVs sequentially to avoid rate limits + for i, p in enumerate(selected): + if i in done_indices: + continue + result = await generate_kv_with_repair( + p['problem'], p['solution'], i, + max_repairs=3, n_judges=3 + ) + result['original_index'] = i + result['subject'] = p.get('subject', 'unknown') + kv_results.append(result) + + # Save incrementally + with open(progress_file, 'w') as f: + json.dump(kv_results, f, indent=2) + + # Small delay between problems + await asyncio.sleep(2) + + # Summary + accepted = [r for r in kv_results if r['status'] == 'accepted'] + rejected = [r for r in kv_results if r['status'] == 'rejected'] + no_slots = [r for r in kv_results if r['status'] == 'no_slots'] + errors = [r for r in kv_results if r['status'] == 'error'] + + print(f"\n{'='*60}") + print(f"KV GENERATION SUMMARY") + print(f"{'='*60}") + print(f"Total attempted: {len(selected)}") + print(f"Accepted: {len(accepted)} ({len(accepted)/len(selected)*100:.0f}%)") + print(f"Rejected: {len(rejected)} ({len(rejected)/len(selected)*100:.0f}%)") + print(f"No slots: {len(no_slots)} ({len(no_slots)/len(selected)*100:.0f}%)") + print(f"Errors: {len(errors)} ({len(errors)/len(selected)*100:.0f}%)") + + if accepted: + avg_repairs = sum(r['repair_rounds'] for r in accepted) / len(accepted) + print(f"Avg repair rounds (accepted): {avg_repairs:.1f}") + + # Evaluate accepted variants + if accepted: + print(f"\n{'='*60}") + print(f"EVALUATING {len(accepted)} ACCEPTED KV VARIANTS") + print(f"{'='*60}") + eval_results = await evaluate_kv(accepted, None) + + # Save everything + final_results = { + 'generation_summary': { + 'total': len(selected), + 'accepted': len(accepted), + 'rejected': len(rejected), + 'no_slots': len(no_slots), + 'errors': len(errors), + }, + 'evaluation': eval_results, + 'accepted_variants': accepted, + } + with open(os.path.join(OUTPUT_DIR, 'kv_final_results.json'), 'w') as f: + json.dump(final_results, f, indent=2) + + print(f"\n{'='*60}") + print(f"FINAL RESULTS") + print(f"{'='*60}") + print(f"{'Model':<20} {'Orig%':>8} {'KV%':>8} {'Δ':>8} {'N':>5}") + print("-"*50) + for model, res in eval_results.items(): + print(f"{model:<20} {res['original_accuracy']:>7.1f}% {res['kv_accuracy']:>7.1f}% {res['delta']:>+7.1f} {res['n']:>5}") + +asyncio.run(main()) -- cgit v1.2.3