1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
|
import json
import argparse
import re
def normalize_answer(text):
    """Lower-case, trim, and blank out LaTeX delimiters for loose comparison.

    Returns "" when *text* is None so callers can compare without None checks.
    """
    if text is None:
        return ""
    cleaned = text.strip().lower()
    # Turn \( \) \[ \] delimiters into spaces, e.g. "\(x\)" -> " x ".
    return re.sub(r'\\[\(\)\[\]]', ' ', cleaned)
def simple_evaluate(ground_truth, generated):
    """
    Very naive heuristic evaluator.

    Returns True when the (normalized) ground truth is short — likely a
    number or closed-form expression — and appears verbatim inside the
    normalized generated answer. For long ground truths (proofs) this
    metric is meaningless and we always return False.

    Args:
        ground_truth: Reference answer string (may be None).
        generated: Model output string (may be None).

    Returns:
        bool: heuristic substring match result.
    """
    gt_norm = normalize_answer(ground_truth)
    gen_norm = normalize_answer(generated)
    # Bug fix: an empty/missing ground truth used to match everything,
    # since "" is a substring of every string. Treat it as no match.
    if not gt_norm:
        return False
    # Short ground truth (likely a number or variable): substring containment.
    if len(gt_norm) < 20:
        return gt_norm in gen_norm
    # For longer proofs, this metric is useless; needs an LLM judge.
    return False
def main():
    """Evaluate a PutnamGAP results JSONL file with a naive heuristic.

    Reads one JSON object per line, tallies heuristic matches overall and
    per problem type, and prints a summary report to stdout.
    """
    parser = argparse.ArgumentParser(description="Evaluate PutnamGAP results")
    parser.add_argument("--results_file", type=str, required=True, help="Path to JSONL results file")
    args = parser.parse_args()

    total = 0
    correct_heuristic = 0
    by_type = {}

    print(f"Evaluating {args.results_file}...")
    with open(args.results_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            data = json.loads(line)
            prob_type = data.get("problem_type", "unknown")
            total += 1
            stats = by_type.setdefault(prob_type, {"count": 0, "heuristic_match": 0})
            stats["count"] += 1
            # This is a placeholder evaluation.
            # Real evaluation for proofs needs an LLM judge.
            # Use .get() so one malformed record (missing either field) does
            # not abort the whole run; normalize_answer() tolerates None.
            is_match = simple_evaluate(data.get("solution"), data.get("generated_solution"))
            if is_match:
                correct_heuristic += 1
                stats["heuristic_match"] += 1

    print(f"Total processed: {total}")
    # Bug fix: correct_heuristic was accumulated but never reported.
    overall = (correct_heuristic / total) * 100 if total > 0 else 0
    print(f"Overall heuristic matches: {correct_heuristic}/{total} ({overall:.2f}%)")
    print("-" * 40)
    print("Breakdown by Problem Type:")
    for p_type, stats in by_type.items():
        acc = (stats["heuristic_match"] / stats["count"]) * 100 if stats["count"] > 0 else 0
        print(f"  {p_type}: {stats['count']} items, {stats['heuristic_match']} heuristic matches ({acc:.2f}%)")
    print("-" * 40)
    print("Note: The heuristic match is very basic (checks if short ground truth is substring of generated output).")
    print("For 'proof' problems, this metric is not reliable. Use an LLM-based judge for accurate evaluation.")


if __name__ == "__main__":
    main()
|