summaryrefslogtreecommitdiff
path: root/putnamsup/evaluate_putnam_gap.py
diff options
context:
space:
mode:
authorYuren Hao <yurenh2@illinois.edu>2026-04-08 22:06:05 -0500
committerYuren Hao <yurenh2@illinois.edu>2026-04-08 22:06:05 -0500
commit05704d0eb2fa59fe727652465b07db40bcb06c38 (patch)
tree8904aca836cf552fd1a5ae8c2174e9f91e70bbbc /putnamsup/evaluate_putnam_gap.py
Initial release: GAP framework
- Full pipeline: variant generation, multi-judge verification, evaluation - Loaders for OpenAI / Anthropic / Google / xAI / OpenRouter / vLLM - Framework-level mechanism analyses: paired structural overlap, repairability rescue, self-correction probe, cross-model agreement, topic x problem-type interaction - Unicode -> bare-LaTeX cleaner + audit + spot-check - Mirrors https://huggingface.co/datasets/blackhao0426/PutnamGAP
Diffstat (limited to 'putnamsup/evaluate_putnam_gap.py')
-rw-r--r--putnamsup/evaluate_putnam_gap.py74
1 file changed, 74 insertions, 0 deletions
diff --git a/putnamsup/evaluate_putnam_gap.py b/putnamsup/evaluate_putnam_gap.py
new file mode 100644
index 0000000..5c9f35e
--- /dev/null
+++ b/putnamsup/evaluate_putnam_gap.py
@@ -0,0 +1,74 @@
+import json
+import argparse
+import re
+
def normalize_answer(text):
    """Normalize an answer string for loose substring comparison.

    Lowercases, strips LaTeX math delimiters (``\\(``, ``\\)``, ``\\[``,
    ``\\]``, and ``$``), and collapses all runs of whitespace to a single
    space so that delimiter removal cannot leave stray spaces that break
    the substring check in ``simple_evaluate``.

    Args:
        text: Raw answer text, or None.

    Returns:
        The normalized string ("" for None input).
    """
    if text is None:
        return ""
    text = text.strip().lower()
    # Replace LaTeX math delimiters (\( \) \[ \] and $) with a space.
    text = re.sub(r'\\[\(\)\[\]]|\$', ' ', text)
    # Collapse whitespace introduced by the substitution above, then strip,
    # so e.g. "\(5\)" normalizes to "5" rather than " 5 ".
    text = re.sub(r'\s+', ' ', text).strip()
    return text
+
def simple_evaluate(ground_truth, generated):
    """Very naive heuristic evaluator.

    Returns True when the (short) normalized ground truth appears as a
    substring of the normalized generated answer. For longer ground truths
    (full proofs) this metric is meaningless and always returns False.

    Args:
        ground_truth: Reference answer text (may be None or empty).
        generated: Model-produced answer text (may be None or empty).

    Returns:
        bool: True only for a non-empty, short ground truth found inside
        the generated text.
    """
    gt_norm = normalize_answer(ground_truth)
    gen_norm = normalize_answer(generated)

    # Guard: "" is a substring of everything, so an empty/missing ground
    # truth would otherwise count every record as a match.
    if not gt_norm:
        return False

    # Short ground truths (likely a number or closed-form expression):
    # substring containment is a reasonable cheap proxy.
    if len(gt_norm) < 20:
        return gt_norm in gen_norm

    # For longer proofs, this metric is useless.
    return False
+
def main():
    """CLI entry point: score a JSONL results file with the heuristic matcher.

    Reads one JSON object per line, tallies heuristic matches overall and
    per problem type, and prints a summary report. Blank lines are skipped.
    """
    parser = argparse.ArgumentParser(description="Evaluate PutnamGAP results")
    parser.add_argument("--results_file", type=str, required=True, help="Path to JSONL results file")
    args = parser.parse_args()

    total = 0
    correct_heuristic = 0
    by_type = {}

    print(f"Evaluating {args.results_file}...")

    with open(args.results_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            data = json.loads(line)
            prob_type = data.get("problem_type", "unknown")

            total += 1
            # setdefault replaces the explicit membership check.
            stats = by_type.setdefault(prob_type, {"count": 0, "heuristic_match": 0})
            stats["count"] += 1

            # This is a placeholder evaluation.
            # Real evaluation for proofs needs an LLM judge.
            # .get(): a record missing either field counts as a non-match
            # instead of aborting the whole run with a KeyError.
            is_match = simple_evaluate(data.get("solution"), data.get("generated_solution"))

            if is_match:
                correct_heuristic += 1
                stats["heuristic_match"] += 1

    print(f"Total processed: {total}")
    # Report the overall tally (previously accumulated but never printed).
    overall_acc = (correct_heuristic / total) * 100 if total > 0 else 0
    print(f"Overall heuristic matches: {correct_heuristic} ({overall_acc:.2f}%)")
    print("-" * 40)
    print("Breakdown by Problem Type:")
    for p_type, stats in by_type.items():
        acc = (stats["heuristic_match"] / stats["count"]) * 100 if stats["count"] > 0 else 0
        print(f"  {p_type}: {stats['count']} items, {stats['heuristic_match']} heuristic matches ({acc:.2f}%)")
    print("-" * 40)
    print("Note: The heuristic match is very basic (checks if short ground truth is substring of generated output).")
    print("For 'proof' problems, this metric is not reliable. Use an LLM-based judge for accurate evaluation.")

if __name__ == "__main__":
    main()