summaryrefslogtreecommitdiff
path: root/analysis/aggregate_overlap.py
diff options
context:
space:
mode:
authorYuren Hao <yurenh2@illinois.edu>2026-04-08 22:06:05 -0500
committerYuren Hao <yurenh2@illinois.edu>2026-04-08 22:06:05 -0500
commit05704d0eb2fa59fe727652465b07db40bcb06c38 (patch)
tree8904aca836cf552fd1a5ae8c2174e9f91e70bbbc /analysis/aggregate_overlap.py
Initial release: GAP framework
- Full pipeline: variant generation, multi-judge verification, evaluation - Loaders for OpenAI / Anthropic / Google / xAI / OpenRouter / vLLM - Framework-level mechanism analyses: paired structural overlap, repairability rescue, self-correction probe, cross-model agreement, topic x problem-type interaction - Unicode -> bare-LaTeX cleaner + audit + spot-check - Mirrors https://huggingface.co/datasets/blackhao0426/PutnamGAP
Diffstat (limited to 'analysis/aggregate_overlap.py')
-rw-r--r--analysis/aggregate_overlap.py91
1 file changed, 91 insertions, 0 deletions
diff --git a/analysis/aggregate_overlap.py b/analysis/aggregate_overlap.py
new file mode 100644
index 0000000..cd6b53e
--- /dev/null
+++ b/analysis/aggregate_overlap.py
@@ -0,0 +1,91 @@
+"""Aggregate structural_overlap results by variant type and by model.
+
+Produces a clean rebuttal table.
+"""
+from __future__ import annotations
+import json
+import statistics
+from pathlib import Path
+from collections import defaultdict
+
# Default location of the per-(model, variant) structural-overlap cells
# produced by the upstream structural_overlap analysis.
RESULTS = Path("/home/yurenh2/gap/analysis/structural_overlap_results.json")
# Short row labels for the four surface-variant types in the headline table.
SHORT = {"descriptive_long":"DL","descriptive_long_confusing":"DLC",
         "descriptive_long_misleading":"DLM","garbled_string":"GS"}


def main(results_path=RESULTS):
    """Aggregate structural-overlap cells and print the rebuttal tables.

    Reads a JSON list of "cells" (one per model x variant pair) and prints
    three tables: per-variant aggregates, pooled statistics across all
    cells, and per-model aggregates sorted by mean Cohen's d.

    Parameters
    ----------
    results_path : str | Path, optional
        Path to the results JSON.  Each cell is expected to carry
        ``variant``, ``model``, ``brittle_collapse_rate`` and a
        ``metrics.token_jaccard`` mapping with ``cohens_d``,
        ``p_two_sided``, ``delta_median``, ``stable_median``,
        ``brittle_median`` and (optionally) ``noise_floor_median``.
        Defaults to ``RESULTS``.
    """
    # Close the file deterministically; json.load(open(...)) leaks the
    # handle until GC.
    with open(results_path, encoding="utf-8") as fh:
        cells = json.load(fh)
    print(f"Loaded {len(cells)} cells.\n")

    if not cells:
        # Nothing to aggregate — median/min/max below would raise.
        return

    # Per-variant aggregate
    per_variant = defaultdict(list)
    for c in cells:
        per_variant[c["variant"]].append(c)

    print("=" * 90)
    print("HEADLINE TABLE: Surface variants — stable vs brittle structural overlap")
    print("(token Jaccard on canonicalized trajectories, drift cases only)")
    print("=" * 90)
    print(f"\n{'Variant':<6} {'#cells':>7} {'#dir+':>6} {'#p<.05':>8} "
          f"{'med-d':>7} {'mean-d':>7} {'mean-dlt':>9} "
          f"{'mean-stbl':>10} {'mean-brit':>10} {'mean-noise':>11} "
          f"{'mean-collapse%':>14}")
    print("-" * 100)
    for v, cs in per_variant.items():
        ds = [c["metrics"]["token_jaccard"]["cohens_d"] for c in cs]
        ps = [c["metrics"]["token_jaccard"]["p_two_sided"] for c in cs]
        n_pos = sum(1 for d in ds if d > 0)
        n_sig = sum(1 for p in ps if p < 0.05)
        deltas = [c["metrics"]["token_jaccard"]["delta_median"] for c in cs]
        stbl = [c["metrics"]["token_jaccard"]["stable_median"] for c in cs]
        brit = [c["metrics"]["token_jaccard"]["brittle_median"] for c in cs]
        noise = [c["metrics"]["token_jaccard"]["noise_floor_median"] for c in cs
                 if c["metrics"]["token_jaccard"].get("noise_floor_median") is not None]
        collapse = [c["brittle_collapse_rate"] for c in cs]
        # fmean raises StatisticsError on an empty sequence, and every cell
        # may legitimately lack a noise floor — fall back to NaN so one
        # variant without it cannot kill the whole report.
        noise_mean = statistics.fmean(noise) if noise else float("nan")
        # .get keeps unexpected variant names printable instead of raising.
        print(f"{SHORT.get(v, v):<6} {len(cs):>7} {n_pos:>6} {n_sig:>8} "
              f"{statistics.median(ds):>+7.2f} {statistics.fmean(ds):>+7.2f} "
              f"{statistics.fmean(deltas):>+9.4f} "
              f"{statistics.fmean(stbl):>10.3f} {statistics.fmean(brit):>10.3f} "
              f"{noise_mean:>11.3f} "
              f"{statistics.fmean(collapse)*100:>13.1f}%")

    # Variant-aggregate (across all models, n-weighted)
    print("\n" + "=" * 90)
    print("ALL CELLS (18 models × 4 surface variants)")
    print("=" * 90)
    all_d = [c["metrics"]["token_jaccard"]["cohens_d"] for c in cells]
    all_p = [c["metrics"]["token_jaccard"]["p_two_sided"] for c in cells]
    print(f"  cells: {len(cells)}")
    print(f"  direction-positive: {sum(1 for d in all_d if d>0)}/{len(cells)}")
    print(f"  p<0.05: {sum(1 for p in all_p if p<0.05)}/{len(cells)}")
    print(f"  p<0.001: {sum(1 for p in all_p if p<0.001)}/{len(cells)}")
    print(f"  p<1e-6: {sum(1 for p in all_p if p<1e-6)}/{len(cells)}")
    print(f"  Cohen's d median: {statistics.median(all_d):+.3f}")
    print(f"  Cohen's d mean: {statistics.fmean(all_d):+.3f}")
    print(f"  Cohen's d range: [{min(all_d):+.2f}, {max(all_d):+.2f}]")

    # Per-model aggregate (averaged across 4 surface variants)
    per_model = defaultdict(list)
    for c in cells:
        per_model[c["model"]].append(c)
    print("\n" + "=" * 90)
    print("PER MODEL (averaged across 4 surface variants)")
    print("=" * 90)
    print(f"\n{'Model':<25} {'mean-d':>7} {'mean-stbl':>10} {'mean-brit':>10} "
          f"{'mean-coll%':>11} {'min-p':>9}")
    print("-" * 80)
    rows = []
    # Note: defaultdict grouping never yields an empty list, so the
    # original `if len(cs) == 0: continue` guard was dead code.
    for m, cs in per_model.items():
        d = statistics.fmean(c["metrics"]["token_jaccard"]["cohens_d"] for c in cs)
        s = statistics.fmean(c["metrics"]["token_jaccard"]["stable_median"] for c in cs)
        b = statistics.fmean(c["metrics"]["token_jaccard"]["brittle_median"] for c in cs)
        col = statistics.fmean(c["brittle_collapse_rate"] for c in cs) * 100
        mp = min(c["metrics"]["token_jaccard"]["p_two_sided"] for c in cs)
        rows.append((m, d, s, b, col, mp))
    # Sort models from most to least positive mean effect size.
    for r in sorted(rows, key=lambda r: -r[1]):
        print(f"{r[0]:<25} {r[1]:>+7.2f} {r[2]:>10.3f} {r[3]:>10.3f} {r[4]:>10.1f}% {r[5]:>9.1e}")
+
+
# Entry point guard: run the report only when executed as a script,
# so the module can be imported without side effects.
if __name__ == "__main__":
    main()