summaryrefslogtreecommitdiff
path: root/putnamsup/evaluate_putnam_gap.py
diff options
context:
space:
mode:
authorYuren Hao <yurenh2@illinois.edu>2026-04-08 22:06:05 -0500
committerYuren Hao <yurenh2@illinois.edu>2026-04-08 22:06:05 -0500
commit05704d0eb2fa59fe727652465b07db40bcb06c38 (patch)
tree8904aca836cf552fd1a5ae8c2174e9f91e70bbbc /putnamsup/evaluate_putnam_gap.py
Initial release: GAP framework
- Full pipeline: variant generation, multi-judge verification, evaluation - Loaders for OpenAI / Anthropic / Google / xAI / OpenRouter / vLLM - Framework-level mechanism analyses: paired structural overlap, repairability rescue, self-correction probe, cross-model agreement, topic x problem-type interaction - Unicode -> bare-LaTeX cleaner + audit + spot-check - Mirrors https://huggingface.co/datasets/blackhao0426/PutnamGAP
Diffstat (limited to 'putnamsup/evaluate_putnam_gap.py')
-rw-r--r--putnamsup/evaluate_putnam_gap.py74
1 file changed, 74 insertions, 0 deletions
diff --git a/putnamsup/evaluate_putnam_gap.py b/putnamsup/evaluate_putnam_gap.py
new file mode 100644
index 0000000..5c9f35e
--- /dev/null
+++ b/putnamsup/evaluate_putnam_gap.py
@@ -0,0 +1,74 @@
+import json
+import argparse
+import re
+
def normalize_answer(text):
    """Normalize an answer string for loose substring comparison.

    Lowercases, strips LaTeX math delimiters (``\\(``, ``\\)``, ``\\[``,
    ``\\]``, and ``$``), and collapses all runs of whitespace to a single
    space so that delimiter removal cannot leave stray spaces that break
    the substring check in ``simple_evaluate``.

    Args:
        text: Raw answer text, or None.

    Returns:
        The normalized string ("" for None input).
    """
    if text is None:
        return ""
    text = text.strip().lower()
    # Replace LaTeX math delimiters (\( \) \[ \] and $) with a space.
    text = re.sub(r'\\[\(\)\[\]]|\$', ' ', text)
    # Collapse whitespace introduced by the substitution above, then strip,
    # so e.g. "\(5\)" normalizes to "5" rather than " 5 ".
    text = re.sub(r'\s+', ' ', text).strip()
    return text
+
def simple_evaluate(ground_truth, generated):
    """Very naive heuristic evaluator.

    Returns True when the (short) normalized ground truth appears as a
    substring of the normalized generated answer. For longer ground truths
    (full proofs) this metric is meaningless and always returns False.

    Args:
        ground_truth: Reference answer text (may be None or empty).
        generated: Model-produced answer text (may be None or empty).

    Returns:
        bool: True only for a non-empty, short ground truth found inside
        the generated text.
    """
    gt_norm = normalize_answer(ground_truth)
    gen_norm = normalize_answer(generated)

    # Guard: "" is a substring of everything, so an empty/missing ground
    # truth would otherwise count every record as a match.
    if not gt_norm:
        return False

    # Short ground truths (likely a number or closed-form expression):
    # substring containment is a reasonable cheap proxy.
    if len(gt_norm) < 20:
        return gt_norm in gen_norm

    # For longer proofs, this metric is useless.
    return False
+
def main():
    """CLI entry point: score a JSONL results file with the heuristic matcher.

    Reads one JSON object per line, tallies heuristic matches overall and
    per problem type, and prints a summary report. Blank lines are skipped.
    """
    parser = argparse.ArgumentParser(description="Evaluate PutnamGAP results")
    parser.add_argument("--results_file", type=str, required=True, help="Path to JSONL results file")
    args = parser.parse_args()

    total = 0
    correct_heuristic = 0
    by_type = {}

    print(f"Evaluating {args.results_file}...")

    with open(args.results_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            data = json.loads(line)
            prob_type = data.get("problem_type", "unknown")

            total += 1
            # setdefault replaces the explicit membership check.
            stats = by_type.setdefault(prob_type, {"count": 0, "heuristic_match": 0})
            stats["count"] += 1

            # This is a placeholder evaluation.
            # Real evaluation for proofs needs an LLM judge.
            # .get(): a record missing either field counts as a non-match
            # instead of aborting the whole run with a KeyError.
            is_match = simple_evaluate(data.get("solution"), data.get("generated_solution"))

            if is_match:
                correct_heuristic += 1
                stats["heuristic_match"] += 1

    print(f"Total processed: {total}")
    # Report the overall tally (previously accumulated but never printed).
    overall_acc = (correct_heuristic / total) * 100 if total > 0 else 0
    print(f"Overall heuristic matches: {correct_heuristic} ({overall_acc:.2f}%)")
    print("-" * 40)
    print("Breakdown by Problem Type:")
    for p_type, stats in by_type.items():
        acc = (stats["heuristic_match"] / stats["count"]) * 100 if stats["count"] > 0 else 0
        print(f"  {p_type}: {stats['count']} items, {stats['heuristic_match']} heuristic matches ({acc:.2f}%)")
    print("-" * 40)
    print("Note: The heuristic match is very basic (checks if short ground truth is substring of generated output).")
    print("For 'proof' problems, this metric is not reliable. Use an LLM-based judge for accurate evaluation.")

if __name__ == "__main__":
    main()