summaryrefslogtreecommitdiff
path: root/analysis/aggregate_overlap.py
diff options
context:
space:
mode:
authorYuren Hao <yurenh2@illinois.edu>2026-04-08 22:06:05 -0500
committerYuren Hao <yurenh2@illinois.edu>2026-04-08 22:06:05 -0500
commit05704d0eb2fa59fe727652465b07db40bcb06c38 (patch)
tree8904aca836cf552fd1a5ae8c2174e9f91e70bbbc /analysis/aggregate_overlap.py
Initial release: GAP framework
- Full pipeline: variant generation, multi-judge verification, evaluation - Loaders for OpenAI / Anthropic / Google / xAI / OpenRouter / vLLM - Framework-level mechanism analyses: paired structural overlap, repairability rescue, self-correction probe, cross-model agreement, topic x problem-type interaction - Unicode -> bare-LaTeX cleaner + audit + spot-check - Mirrors https://huggingface.co/datasets/blackhao0426/PutnamGAP
Diffstat (limited to 'analysis/aggregate_overlap.py')
-rw-r--r--analysis/aggregate_overlap.py91
1 file changed, 91 insertions, 0 deletions
diff --git a/analysis/aggregate_overlap.py b/analysis/aggregate_overlap.py
new file mode 100644
index 0000000..cd6b53e
--- /dev/null
+++ b/analysis/aggregate_overlap.py
@@ -0,0 +1,91 @@
+"""Aggregate structural_overlap results by variant type and by model.
+
+Produces a clean rebuttal table.
+"""
+from __future__ import annotations
+import json
+import statistics
+from pathlib import Path
+from collections import defaultdict
+
# Default location of the per-(model, variant) structural-overlap cells
# produced by the upstream structural_overlap analysis.
RESULTS = Path("/home/yurenh2/gap/analysis/structural_overlap_results.json")
# Short row labels for the four surface-variant types in the headline table.
SHORT = {"descriptive_long":"DL","descriptive_long_confusing":"DLC",
         "descriptive_long_misleading":"DLM","garbled_string":"GS"}


def main(results_path=RESULTS):
    """Aggregate structural-overlap cells and print the rebuttal tables.

    Reads a JSON list of "cells" (one per model x variant pair) and prints
    three tables: per-variant aggregates, pooled statistics across all
    cells, and per-model aggregates sorted by mean Cohen's d.

    Parameters
    ----------
    results_path : str | Path, optional
        Path to the results JSON.  Each cell is expected to carry
        ``variant``, ``model``, ``brittle_collapse_rate`` and a
        ``metrics.token_jaccard`` mapping with ``cohens_d``,
        ``p_two_sided``, ``delta_median``, ``stable_median``,
        ``brittle_median`` and (optionally) ``noise_floor_median``.
        Defaults to ``RESULTS``.
    """
    # Close the file deterministically; json.load(open(...)) leaks the
    # handle until GC.
    with open(results_path, encoding="utf-8") as fh:
        cells = json.load(fh)
    print(f"Loaded {len(cells)} cells.\n")

    if not cells:
        # Nothing to aggregate — median/min/max below would raise.
        return

    # Per-variant aggregate
    per_variant = defaultdict(list)
    for c in cells:
        per_variant[c["variant"]].append(c)

    print("=" * 90)
    print("HEADLINE TABLE: Surface variants — stable vs brittle structural overlap")
    print("(token Jaccard on canonicalized trajectories, drift cases only)")
    print("=" * 90)
    print(f"\n{'Variant':<6} {'#cells':>7} {'#dir+':>6} {'#p<.05':>8} "
          f"{'med-d':>7} {'mean-d':>7} {'mean-dlt':>9} "
          f"{'mean-stbl':>10} {'mean-brit':>10} {'mean-noise':>11} "
          f"{'mean-collapse%':>14}")
    print("-" * 100)
    for v, cs in per_variant.items():
        ds = [c["metrics"]["token_jaccard"]["cohens_d"] for c in cs]
        ps = [c["metrics"]["token_jaccard"]["p_two_sided"] for c in cs]
        n_pos = sum(1 for d in ds if d > 0)
        n_sig = sum(1 for p in ps if p < 0.05)
        deltas = [c["metrics"]["token_jaccard"]["delta_median"] for c in cs]
        stbl = [c["metrics"]["token_jaccard"]["stable_median"] for c in cs]
        brit = [c["metrics"]["token_jaccard"]["brittle_median"] for c in cs]
        noise = [c["metrics"]["token_jaccard"]["noise_floor_median"] for c in cs
                 if c["metrics"]["token_jaccard"].get("noise_floor_median") is not None]
        collapse = [c["brittle_collapse_rate"] for c in cs]
        # fmean raises StatisticsError on an empty sequence, and every cell
        # may legitimately lack a noise floor — fall back to NaN so one
        # variant without it cannot kill the whole report.
        noise_mean = statistics.fmean(noise) if noise else float("nan")
        # .get keeps unexpected variant names printable instead of raising.
        print(f"{SHORT.get(v, v):<6} {len(cs):>7} {n_pos:>6} {n_sig:>8} "
              f"{statistics.median(ds):>+7.2f} {statistics.fmean(ds):>+7.2f} "
              f"{statistics.fmean(deltas):>+9.4f} "
              f"{statistics.fmean(stbl):>10.3f} {statistics.fmean(brit):>10.3f} "
              f"{noise_mean:>11.3f} "
              f"{statistics.fmean(collapse)*100:>13.1f}%")

    # Variant-aggregate (across all models, n-weighted)
    print("\n" + "=" * 90)
    print("ALL CELLS (18 models × 4 surface variants)")
    print("=" * 90)
    all_d = [c["metrics"]["token_jaccard"]["cohens_d"] for c in cells]
    all_p = [c["metrics"]["token_jaccard"]["p_two_sided"] for c in cells]
    print(f"  cells: {len(cells)}")
    print(f"  direction-positive: {sum(1 for d in all_d if d>0)}/{len(cells)}")
    print(f"  p<0.05: {sum(1 for p in all_p if p<0.05)}/{len(cells)}")
    print(f"  p<0.001: {sum(1 for p in all_p if p<0.001)}/{len(cells)}")
    print(f"  p<1e-6: {sum(1 for p in all_p if p<1e-6)}/{len(cells)}")
    print(f"  Cohen's d median: {statistics.median(all_d):+.3f}")
    print(f"  Cohen's d mean: {statistics.fmean(all_d):+.3f}")
    print(f"  Cohen's d range: [{min(all_d):+.2f}, {max(all_d):+.2f}]")

    # Per-model aggregate (averaged across 4 surface variants)
    per_model = defaultdict(list)
    for c in cells:
        per_model[c["model"]].append(c)
    print("\n" + "=" * 90)
    print("PER MODEL (averaged across 4 surface variants)")
    print("=" * 90)
    print(f"\n{'Model':<25} {'mean-d':>7} {'mean-stbl':>10} {'mean-brit':>10} "
          f"{'mean-coll%':>11} {'min-p':>9}")
    print("-" * 80)
    rows = []
    # Note: defaultdict grouping never yields an empty list, so the
    # original `if len(cs) == 0: continue` guard was dead code.
    for m, cs in per_model.items():
        d = statistics.fmean(c["metrics"]["token_jaccard"]["cohens_d"] for c in cs)
        s = statistics.fmean(c["metrics"]["token_jaccard"]["stable_median"] for c in cs)
        b = statistics.fmean(c["metrics"]["token_jaccard"]["brittle_median"] for c in cs)
        col = statistics.fmean(c["brittle_collapse_rate"] for c in cs) * 100
        mp = min(c["metrics"]["token_jaccard"]["p_two_sided"] for c in cs)
        rows.append((m, d, s, b, col, mp))
    # Sort models from most to least positive mean effect size.
    for r in sorted(rows, key=lambda r: -r[1]):
        print(f"{r[0]:<25} {r[1]:>+7.2f} {r[2]:>10.3f} {r[3]:>10.3f} {r[4]:>10.1f}% {r[5]:>9.1e}")
+
+
# Entry point guard: run the report only when executed as a script,
# so the module can be imported without side effects.
if __name__ == "__main__":
    main()