summaryrefslogtreecommitdiff
path: root/putnamsup/evaluate_putnam_gap.py
blob: 5c9f35e3f890eda2d18c6a2dcbc6656b77e09396 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import json
import argparse
import re

def normalize_answer(text):
    r"""Normalize an answer string for fuzzy comparison.

    Lowercases and strips the input, removes LaTeX math delimiters
    (\( \) \[ \]), and collapses any whitespace left behind by the
    delimiter removal.

    Args:
        text: Raw answer string, or None.

    Returns:
        The normalized string; "" when *text* is None.
    """
    if text is None:
        return ""
    text = text.strip().lower()
    # Replace LaTeX math delimiters with spaces for a simple check.
    text = re.sub(r'\\[\(\)\[\]]', ' ', text)
    # Collapse the whitespace the substitution leaves behind, then strip:
    # previously "\(5\)" normalized to " 5 ", and the padded string failed
    # the substring check against outputs like "answer: 5.".
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def simple_evaluate(ground_truth, generated):
    """Naively evaluate a generated answer against the ground truth.

    Heuristic: when the normalized ground truth is short (likely a number
    or a closed-form expression), check whether it appears as a substring
    of the normalized generated text. Long ground truths (full proofs)
    always return False — substring matching is meaningless for them.

    Args:
        ground_truth: Reference answer string (may be None).
        generated: Model-generated answer string (may be None).

    Returns:
        True if the heuristic considers the answers a match, else False.
    """
    gt_norm = normalize_answer(ground_truth)
    gen_norm = normalize_answer(generated)

    # An empty ground truth is a substring of everything, which would be
    # counted as a guaranteed (false-positive) match — treat it as no match.
    if not gt_norm:
        return False

    # Short ground truth (likely a number or variable): containment check.
    # The 20-character threshold is an arbitrary heuristic cutoff.
    if len(gt_norm) < 20:
        return gt_norm in gen_norm

    # For longer proofs, this metric is useless.
    return False

def main():
    """CLI entry point: score a JSONL results file and print a report.

    Reads one JSON record per line, tallies heuristic matches overall and
    per problem type, and prints a breakdown. The heuristic is a crude
    substring check — see the printed caveats for proof-type problems.
    """
    parser = argparse.ArgumentParser(description="Evaluate PutnamGAP results")
    parser.add_argument("--results_file", type=str, required=True, help="Path to JSONL results file")
    args = parser.parse_args()

    total = 0
    correct_heuristic = 0
    by_type = {}

    print(f"Evaluating {args.results_file}...")

    with open(args.results_file, "r", encoding="utf-8") as fh:
        for raw_line in fh:
            record_text = raw_line.strip()
            # Skip blank lines between records.
            if not record_text:
                continue

            record = json.loads(record_text)
            category = record.get("problem_type", "unknown")

            total += 1
            stats = by_type.setdefault(category, {"count": 0, "heuristic_match": 0})
            stats["count"] += 1

            # Placeholder evaluation only — proof-style answers really
            # need an LLM judge for a trustworthy verdict.
            if simple_evaluate(record["solution"], record["generated_solution"]):
                correct_heuristic += 1
                stats["heuristic_match"] += 1

    print(f"Total processed: {total}")
    print("-" * 40)
    print("Breakdown by Problem Type:")
    for category, stats in by_type.items():
        pct = (stats["heuristic_match"] / stats["count"]) * 100 if stats["count"] > 0 else 0
        print(f"  {category}: {stats['count']} items, {stats['heuristic_match']} heuristic matches ({pct:.2f}%)")
    print("-" * 40)
    print("Note: The heuristic match is very basic (checks if short ground truth is substring of generated output).")
    print("For 'proof' problems, this metric is not reliable. Use an LLM-based judge for accurate evaluation.")

if __name__ == "__main__":
    main()