Initial release: GAP framework

- Full pipeline: variant generation, multi-judge verification, evaluation
- Loaders for OpenAI / Anthropic / Google / xAI / OpenRouter / vLLM
- Framework-level mechanism analyses: paired structural overlap, repairability rescue, self-correction probe, cross-model agreement, topic x problem-type interaction
- Unicode -> bare-LaTeX cleaner + audit + spot-check
- Mirrors https://huggingface.co/datasets/blackhao0426/PutnamGAP
author: Yuren Hao <yurenh2@illinois.edu> 2026-04-08 22:06:05 -0500
committer: Yuren Hao <yurenh2@illinois.edu> 2026-04-08 22:06:05 -0500
commit: 05704d0eb2fa59fe727652465b07db40bcb06c38 (patch)
tree: 8904aca836cf552fd1a5ae8c2174e9f91e70bbbc /analysis/topic_problemtype_interaction.py
1 files changed, 112 insertions, 0 deletions
diff --git a/analysis/topic_problemtype_interaction.py b/analysis/topic_problemtype_interaction.py
new file mode 100644
index 0000000..405b33a
--- /dev/null
+++ b/analysis/topic_problemtype_interaction.py
@@ -0,0 +1,112 @@
+"""KV fragility broken down by Topic × Problem-type (proof vs calculation)."""
+from __future__ import annotations
+import json
+import sys
+import statistics
+from pathlib import Path
+from collections import defaultdict
+
+THIS_DIR = Path(__file__).resolve().parent  # absolute directory containing this script
+sys.path.insert(0, str(THIS_DIR))  # searched first; presumably so the sibling structural_overlap module resolves - confirm
+from structural_overlap import find_variant_file, load_problems, RESULTS_DIR, SURFACE_VARIANTS
+
+DATASET_DIR = Path("/home/yurenh2/gap/putnam-bench-anon/dataset")  # NOTE(review): machine-specific absolute path; load_metadata() globs *.json here
+
+
+def load_metadata():
+    out = {}
+    for f in sorted(DATASET_DIR.glob("*.json")):
+        d = json.load(open(f))
+        idx = d.get("index")
+        if not idx: continue
+        out[idx] = {
+            "tag": d.get("tag"),
+            "problem_type": d.get("problem_type"),
+        }
+    return out
+
+
+def main():
+    metadata = load_metadata()
+    base = RESULTS_DIR
+    models = sorted([d.name for d in base.iterdir() if d.is_dir()])
+
+    # cells[(topic, ptype, model, variant)] = (n, n_correct)
+    cells = defaultdict(lambda: [0, 0])
+    for m in models:
+        mdir = base / m
+        for v in ["original"] + SURFACE_VARIANTS + ["kernel_variant"]:
+            vp = find_variant_file(mdir, v)
+            if not vp: continue
+            for p in load_problems(vp):
+                idx = p.get("index")
+                correct = p.get("correct")
+                if idx is None or correct is None: continue
+                md = metadata.get(idx, {})
+                tag = md.get("tag")
+                ptype = md.get("problem_type")
+                if not tag or not ptype: continue
+                tags = tag if isinstance(tag, list) else [tag]
+                for t in tags:
+                    if t not in ["ALG", "ANA", "NT", "COMB", "GEO"]: continue
+                    cells[(t, ptype, m, v)][0] += 1
+                    if correct: cells[(t, ptype, m, v)][1] += 1
+
+    print("=" * 80)
+    print("ACCURACY BY TOPIC × PROBLEM-TYPE × VARIANT (mean across 18 models)")
+    print("=" * 80)
+    print()
+
+    for ptype in ["proof", "calculation"]:
+        print(f"\n--- {ptype.upper()} ---\n")
+        print(f"{'Topic':<6}", end="")
+        for v in ["original", "garbled_string", "kernel_variant"]:
+            short = {"original":"orig","garbled_string":"GS","kernel_variant":"KV"}[v]
+            print(f"  {short:>6}", end="")
+        print(f"  {'Δ_GS':>7} {'Δ_KV':>7}")
+        print("-" * 50)
+        for t in ["ALG", "ANA", "NT", "COMB", "GEO"]:
+            orig_rates = []
+            gs_rates = []
+            kv_rates = []
+            for m in models:
+                no, co = cells.get((t, ptype, m, "original"), [0, 0])
+                ng, cg = cells.get((t, ptype, m, "garbled_string"), [0, 0])
+                nk, ck = cells.get((t, ptype, m, "kernel_variant"), [0, 0])
+                if no >= 5 and ng >= 5 and nk >= 5:
+                    orig_rates.append(co / no)
+                    gs_rates.append(cg / ng)
+                    kv_rates.append(ck / nk)
+            if not orig_rates: continue
+            mo = statistics.fmean(orig_rates) * 100
+            mg = statistics.fmean(gs_rates) * 100
+            mk = statistics.fmean(kv_rates) * 100
+            print(f"{t:<6}  {mo:>5.1f}% {mg:>5.1f}% {mk:>5.1f}%  {mg-mo:>+5.1f}pp {mk-mo:>+5.1f}pp")
+
+    print("\n\n=== KEY DIFFERENTIAL: Δ KV by Topic for proof vs calculation ===\n")
+    print(f"{'Topic':<6}  {'proof Δ':>10} {'calc Δ':>10} {'(calc - proof)':>16}")
+    print("-" * 50)
+    for t in ["ALG", "ANA", "NT", "COMB", "GEO"]:
+        deltas = {}
+        for ptype in ["proof", "calculation"]:
+            orig_rates = []
+            kv_rates = []
+            for m in models:
+                no, co = cells.get((t, ptype, m, "original"), [0, 0])
+                nk, ck = cells.get((t, ptype, m, "kernel_variant"), [0, 0])
+                if no >= 5 and nk >= 5:
+                    orig_rates.append(co / no)
+                    kv_rates.append(ck / nk)
+            if orig_rates:
+                deltas[ptype] = (statistics.fmean(kv_rates) - statistics.fmean(orig_rates)) * 100
+        if "proof" in deltas and "calculation" in deltas:
+            diff = deltas["calculation"] - deltas["proof"]
+            print(f"{t:<6}  {deltas['proof']:>+9.1f}pp {deltas['calculation']:>+9.1f}pp {diff:>+15.1f}pp")
+        elif "proof" in deltas:
+            print(f"{t:<6}  {deltas['proof']:>+9.1f}pp {'-':>10} {'-':>16}")
+        elif "calculation" in deltas:
+            print(f"{t:<6}  {'-':>10} {deltas['calculation']:>+9.1f}pp {'-':>16}")
+
+
+if __name__ == "__main__":  # run the report only when executed as a script, not on import
+    main()
author	Yuren Hao <yurenh2@illinois.edu>	2026-04-08 22:06:05 -0500
committer	Yuren Hao <yurenh2@illinois.edu>	2026-04-08 22:06:05 -0500
commit	05704d0eb2fa59fe727652465b07db40bcb06c38 (patch)
tree	8904aca836cf552fd1a5ae8c2174e9f91e70bbbc /analysis/topic_problemtype_interaction.py