#!/usr/bin/env python3
"""
Multi-grader consistency analysis for Mini-GAP-MATH.
Uses multiple LLM graders to evaluate the same (problem, solution) pairs.
Computes Cohen's kappa and percent agreement.
"""

import argparse
import asyncio
import itertools
import json
import os
import re

# Maximum number of grading requests allowed in flight at once.
MAX_CONCURRENCY = 30

# Both objects are created lazily on first use (see _get_client and
# _get_semaphore). Creating them at import time would (a) require the OpenAI
# SDK and an API key just to import the pure helpers below and (b) on
# Python < 3.10 bind the semaphore to a different event loop than the one
# asyncio.run() starts. Either name may be pre-set by a caller to inject a
# custom client or concurrency limit.
client = None
SEMAPHORE = None

GRADING_PROMPT = """You are a strict math grader. You are given a math problem, its reference solution, and a student's solution.

Determine if the student's final answer is CORRECT or INCORRECT.
- For numerical answers: the answer must match exactly (after simplification).
- For expressions: must be mathematically equivalent.
- Ignore intermediate steps; focus only on the final answer.

Respond with EXACTLY one word: CORRECT or INCORRECT"""

# INCORRECT is listed first for clarity; the word boundaries are what stop
# "CORRECT" from matching inside "INCORRECT".
_VERDICT_RE = re.compile(r"\b(INCORRECT|CORRECT)\b")


def _get_client():
    """Return the shared AsyncOpenAI client, creating it on first use."""
    global client
    if client is None:
        # Imported here so the statistics helpers stay importable without
        # the SDK installed.
        from openai import AsyncOpenAI

        client = AsyncOpenAI()
    return client


def _get_semaphore():
    """Return the shared semaphore, creating it inside the running loop."""
    global SEMAPHORE
    if SEMAPHORE is None:
        SEMAPHORE = asyncio.Semaphore(MAX_CONCURRENCY)
    return SEMAPHORE


def parse_verdict(text):
    """Turn a grader reply into a boolean verdict.

    Args:
        text: Raw reply text from the grader; may be None or empty.

    Returns:
        True if the first verdict word is CORRECT, False if it is INCORRECT,
        and None if the reply is empty or contains neither word.

    A plain substring test is not enough: "INCORRECT" contains "CORRECT",
    so whole words are matched instead. Matching is case-insensitive.
    """
    if not text:
        return None
    match = _VERDICT_RE.search(text.upper())
    if match is None:
        return None
    return match.group(1) == "CORRECT"


async def grade_one(problem, reference_solution, student_solution, model):
    """Grade a single (problem, student_solution) pair.

    Returns:
        True / False for a CORRECT / INCORRECT verdict, or None when the
        request failed or the reply could not be parsed. None entries are
        ignored by the agreement statistics.
    """
    # Resolved outside the try block so a missing SDK or API key fails
    # loudly instead of being reported as one grading error per item.
    api = _get_client()
    async with _get_semaphore():
        try:
            resp = await api.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": GRADING_PROMPT},
                    {"role": "user", "content": f"Problem:\n{problem}\n\nReference Solution:\n{reference_solution}\n\nStudent Solution:\n{student_solution}"},
                ],
                max_tokens=10,
                temperature=0,
            )
            content = resp.choices[0].message.content
        except Exception as e:
            # Best effort: one failed request must not abort the whole run.
            print(f"  Grading error: {e}")
            return None

    verdict = parse_verdict(content)
    if verdict is None:
        print(f"  Unparseable grader response: {content!r}")
    return verdict


async def grade_all(problems, ref_solutions, student_solutions, model):
    """Grade all solutions with a given model.

    Returns a list of verdicts (True / False / None) in input order.
    """
    tasks = [
        grade_one(p, r, s, model)
        for p, r, s in zip(problems, ref_solutions, student_solutions)
    ]
    return await asyncio.gather(*tasks)


def _valid_pairs(labels1, labels2):
    """Return the (a, b) label pairs in which neither grade is None."""
    return [
        (a, b)
        for a, b in zip(labels1, labels2)
        if a is not None and b is not None
    ]


def percent_agreement(labels1, labels2):
    """Return the percentage (0-100) of valid pairs with equal labels.

    Pairs containing None are ignored; returns 0.0 when no valid pair exists.
    """
    pairs = _valid_pairs(labels1, labels2)
    if not pairs:
        return 0.0
    return sum(1 for a, b in pairs if a == b) / len(pairs) * 100


def cohens_kappa(labels1, labels2):
    """Compute Cohen's kappa between two sets of binary labels.

    Args:
        labels1: Sequence of True / False / None grades from the first rater.
        labels2: Same-length sequence of grades from the second rater.

    Returns:
        A (kappa, n) tuple, where n is the number of pairs in which both
        labels are not None. Returns (0.0, 0) when there is no such pair and
        (1.0, n) when expected agreement is exactly 1, i.e. both raters gave
        one and the same label to every item.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    if len(labels1) != len(labels2):
        raise ValueError(
            f"label sequences differ in length: {len(labels1)} != {len(labels2)}"
        )

    valid = _valid_pairs(labels1, labels2)
    if not valid:
        return 0.0, 0

    n = len(valid)
    p_o = sum(1 for a, b in valid if a == b) / n  # observed agreement

    # Expected agreement under independent raters
    p1_yes = sum(1 for a, _ in valid if a) / n
    p2_yes = sum(1 for _, b in valid if b) / n
    p_e = p1_yes * p2_yes + (1 - p1_yes) * (1 - p2_yes)

    # Only reachable when both proportions are exactly 0.0 or exactly 1.0,
    # so the float comparison is exact; kappa would otherwise divide by zero.
    if p_e == 1.0:
        return 1.0, n

    kappa = (p_o - p_e) / (1 - p_e)
    return kappa, n


async def main():
    """Grade every reference solution with each grader model and report
    pairwise Cohen's kappa and percent agreement between the graders."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-file', default=None,
                        help='Path to evaluation results JSON '
                             '(accepted for compatibility; currently unused)')
    parser.add_argument('--variants-file', required=True,
                        help='Path to math_variants.json')
    parser.add_argument('--grader-models', nargs='+',
                        default=['gpt-4o', 'gpt-4o-mini'])
    parser.add_argument('--variant-type', default='original')
    parser.add_argument('--output',
                        default='/home/yurenh2/gap/mini_gap_math_results/regrade_consistency.json')
    args = parser.parse_args()

    # Load variant data for problems
    with open(args.variants_file, encoding="utf-8") as f:
        variants = json.load(f)

    variant_type = args.variant_type
    problems = [v[variant_type]['problem'] for v in variants]
    ref_solutions = [v[variant_type]['solution'] for v in variants]

    # Make sure the output location is usable before spending API calls.
    out_dir = os.path.dirname(args.output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    print(f"Loaded {len(variants)} problems")
    print(f"Grader models: {args.grader_models}")
    print(f"Variant type: {variant_type}")

    # Raw student generations are not available here, so each reference
    # solution is submitted as its own "student solution". Every item is
    # therefore expected to be graded CORRECT, and the analysis measures
    # whether the graders agree on these obviously-correct answers.
    all_grades = {}
    for model in args.grader_models:
        print(f"\n--- Grading with {model} ---")
        grades = await grade_all(problems, ref_solutions, ref_solutions, model)
        all_grades[model] = grades
        correct_count = sum(1 for g in grades if g is True)
        none_count = sum(1 for g in grades if g is None)
        print(f"  {model}: {correct_count}/{len(grades)} correct, {none_count} errors")

    # Compute pairwise kappa
    models = list(all_grades)
    print("\n" + "="*60)
    print("PAIRWISE COHEN'S KAPPA")
    print("="*60)

    results = {'models': models, 'kappas': {}, 'agreement': {}}
    for m1, m2 in itertools.combinations(models, 2):
        kappa, n = cohens_kappa(all_grades[m1], all_grades[m2])
        pct = percent_agreement(all_grades[m1], all_grades[m2])
        key = f"{m1}_vs_{m2}"
        results['kappas'][key] = kappa
        results['agreement'][key] = pct
        print(f"  {m1} vs {m2}: κ={kappa:.3f}, agreement={pct:.1f}% (n={n})")

    # Save
    with open(args.output, 'w', encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nSaved to {args.output}")


if __name__ == '__main__':
    asyncio.run(main())