# mini_gap_math_regrade.py

#!/usr/bin/env python3
"""
Multi-grader consistency analysis for Mini-GAP-MATH.
Uses multiple LLM graders to evaluate the same (problem, solution) pairs.
Computes Cohen's kappa and percent agreement.
"""

import json
import os
import asyncio
import argparse
from openai import AsyncOpenAI

# Shared async OpenAI client, constructed with no explicit arguments.
# NOTE(review): presumably the API key/base URL are picked up from the
# environment (e.g. OPENAI_API_KEY) — confirm against the deployment setup.
client = AsyncOpenAI()
# Acquired by grade_one around every API call, so at most 30 grading
# requests are in flight at once.
# NOTE(review): this is created at import time, before asyncio.run() starts
# the event loop; on Python < 3.10 asyncio primitives bind to a loop when
# constructed, which can raise "attached to a different loop" — confirm the
# minimum supported Python version.
SEMAPHORE = asyncio.Semaphore(30)

# System prompt sent with every grading request. It asks the grader model to
# judge only the final answer and to reply with a single word, CORRECT or
# INCORRECT.
GRADING_PROMPT = """You are a strict math grader. You are given a math problem, its reference solution, and a student's solution.

Determine if the student's final answer is CORRECT or INCORRECT.
- For numerical answers: the answer must match exactly (after simplification).
- For expressions: must be mathematically equivalent.
- Ignore intermediate steps; focus only on the final answer.

Respond with EXACTLY one word: CORRECT or INCORRECT"""


async def grade_one(problem, reference_solution, student_solution, model):
    """Grade a single (problem, student_solution) pair with one grader model.

    Args:
        problem: Problem statement text.
        reference_solution: Reference solution shown to the grader.
        student_solution: Solution whose final answer is being judged.
        model: Name of the chat-completion model used as the grader.

    Returns:
        True if the grader replied CORRECT, False if it replied INCORRECT,
        and None if the request failed or the reply contained neither
        verdict (such items are treated as ungraded by the callers).
    """
    user_message = (
        f"Problem:\n{problem}\n\nReference Solution:\n{reference_solution}"
        f"\n\nStudent Solution:\n{student_solution}"
    )
    async with SEMAPHORE:
        try:
            resp = await client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": GRADING_PROMPT},
                    {"role": "user", "content": user_message},
                ],
                max_tokens=10,
                temperature=0,
            )
            content = resp.choices[0].message.content
        except Exception as e:
            # Best-effort: a failed request must not abort the whole batch.
            print(f"  Grading error: {e}")
            return None

    # The API may return no text at all; treat that like an empty reply.
    answer = (content or "").strip().upper()

    # "INCORRECT" contains "CORRECT" as a substring, so the negative verdict
    # has to be tested first; otherwise every reply would be graded True.
    if 'INCORRECT' in answer:
        return False
    if 'CORRECT' in answer:
        return True

    # Neither verdict word present: report it as ungraded instead of
    # silently counting it as an INCORRECT grade.
    print(f"  Grading error: unrecognized grader reply {content!r}")
    return None


async def grade_all(problems, ref_solutions, student_solutions, model):
    """Grade every (problem, reference, student) triple concurrently.

    The three sequences are walked in lockstep and one ``grade_one`` call is
    scheduled per triple, all using the same grader ``model``.

    Returns:
        A list of the ``grade_one`` results, in input order.
    """
    pending = []
    for problem, reference, attempt in zip(problems, ref_solutions, student_solutions):
        pending.append(grade_one(problem, reference, attempt, model))
    outcomes = await asyncio.gather(*pending)
    return outcomes


def cohens_kappa(labels1, labels2):
    """Return ``(kappa, n)`` for two equally long sequences of binary labels.

    Positions where either rater's label is ``None`` are dropped; ``n`` is the
    number of positions that remain. With nothing left the result is
    ``(0.0, 0)``. If the chance agreement is exactly 1 (both raters used one
    and the same label throughout) kappa is reported as ``1.0``.
    """
    assert len(labels1) == len(labels2)

    # Keep only the items that both raters actually labelled.
    pairs = []
    for first, second in zip(labels1, labels2):
        if first is None or second is None:
            continue
        pairs.append((first, second))

    total = len(pairs)
    if total == 0:
        return 0.0, 0

    # One pass for the three tallies kappa needs.
    matches = 0
    yes_first = 0
    yes_second = 0
    for first, second in pairs:
        if first == second:
            matches += 1
        if first:
            yes_first += 1
        if second:
            yes_second += 1

    observed = matches / total
    rate_first = yes_first / total
    rate_second = yes_second / total
    # Chance agreement: both say "yes" or both say "no" independently.
    expected = rate_first * rate_second + (1 - rate_first) * (1 - rate_second)

    if expected == 1.0:
        return 1.0, total
    return (observed - expected) / (1 - expected), total


async def main():
    """Run the multi-grader consistency analysis from the command line.

    Parses the CLI arguments, loads the problems and reference solutions of
    the selected variant, lets every grader model grade the same pairs,
    prints pairwise Cohen's kappa and percent agreement, and writes both to
    ``--output`` as JSON.

    Raises:
        OSError: if an input file cannot be read or the output cannot be
            created.
        json.JSONDecodeError: if an input file is not valid JSON.
        KeyError: if a variant entry lacks the requested variant type or its
            'problem' / 'solution' fields.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-file', required=True, help='Path to evaluation results JSON')
    parser.add_argument('--variants-file', required=True, help='Path to math_variants.json')
    parser.add_argument('--grader-models', nargs='+', default=['gpt-4o', 'gpt-4o-mini'])
    parser.add_argument('--variant-type', default='original')
    parser.add_argument('--output', default='/home/yurenh2/gap/mini_gap_math_results/regrade_consistency.json')
    args = parser.parse_args()

    # The evaluation results are not consumed by this analysis (they do not
    # contain the raw generated solutions). The file is still parsed so that
    # a wrong path or malformed JSON fails before any API call is made.
    with open(args.results_file, encoding='utf-8') as f:
        json.load(f)

    # Load variant data for problems
    with open(args.variants_file, encoding='utf-8') as f:
        variants = json.load(f)

    variant_type = args.variant_type
    problems = [v[variant_type]['problem'] for v in variants]
    ref_solutions = [v[variant_type]['solution'] for v in variants]

    # Make sure the output location exists *before* spending API calls, so a
    # missing directory cannot discard a finished grading run.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print(f"Loaded {len(variants)} problems")
    print(f"Grader models: {args.grader_models}")
    print(f"Variant type: {variant_type}")

    # Strategy: the reference solution is passed both as the reference and as
    # the "student" solution, so every item is an obviously correct answer.
    # Disagreement between graders on these items measures grader
    # inconsistency rather than solution quality.
    all_grades = {}
    for model in args.grader_models:
        print(f"\n--- Grading with {model} ---")
        grades = await grade_all(problems, ref_solutions, ref_solutions, model)
        all_grades[model] = grades
        correct_count = sum(1 for g in grades if g is True)
        none_count = sum(1 for g in grades if g is None)
        print(f"  {model}: {correct_count}/{len(grades)} correct, {none_count} errors")

    # Compute pairwise kappa
    models = list(all_grades.keys())
    print("\n" + "="*60)
    print("PAIRWISE COHEN'S KAPPA")
    print("="*60)

    results = {'models': models, 'kappas': {}, 'agreement': {}}

    # Every unordered pair of distinct grader models, in listing order.
    for i, m1 in enumerate(models):
        for m2 in models[i + 1:]:
            kappa, n = cohens_kappa(all_grades[m1], all_grades[m2])

            # Percent agreement over the items both graders actually graded.
            graded_by_both = [
                (a, b)
                for a, b in zip(all_grades[m1], all_grades[m2])
                if a is not None and b is not None
            ]
            agree_count = sum(1 for a, b in graded_by_both if a == b)
            pct = agree_count / len(graded_by_both) * 100 if graded_by_both else 0

            key = f"{m1}_vs_{m2}"
            results['kappas'][key] = kappa
            results['agreement'][key] = pct
            print(f"  {m1} vs {m2}: κ={kappa:.3f}, agreement={pct:.1f}% (n={n})")

    # Save
    with open(args.output, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\nSaved to {args.output}")


if __name__ == '__main__':
    asyncio.run(main())