diff options
| author | Yuren Hao <yurenh2@illinois.edu> | 2026-04-08 22:06:05 -0500 |
|---|---|---|
| committer | Yuren Hao <yurenh2@illinois.edu> | 2026-04-08 22:06:05 -0500 |
| commit | 05704d0eb2fa59fe727652465b07db40bcb06c38 (patch) | |
| tree | 8904aca836cf552fd1a5ae8c2174e9f91e70bbbc /analysis/aggregate_overlap.py | |
Initial release: GAP framework
- Full pipeline: variant generation, multi-judge verification, evaluation
- Loaders for OpenAI / Anthropic / Google / xAI / OpenRouter / vLLM
- Framework-level mechanism analyses: paired structural overlap, repairability rescue, self-correction probe, cross-model agreement, topic x problem-type interaction
- Unicode -> bare-LaTeX cleaner + audit + spot-check
- Mirrors https://huggingface.co/datasets/blackhao0426/PutnamGAP
Diffstat (limited to 'analysis/aggregate_overlap.py')
| -rw-r--r-- | analysis/aggregate_overlap.py | 91 |
1 files changed, 91 insertions, 0 deletions
# analysis/aggregate_overlap.py
"""Aggregate structural_overlap results by variant type and by model.

Produces a clean rebuttal table: a per-variant headline table, pooled
statistics over all cells, and a per-model table sorted by mean Cohen's d
(largest first). Everything is printed to stdout as plain text.
"""
from __future__ import annotations

import json
import statistics
from collections import defaultdict
from pathlib import Path

RESULTS = Path("/home/yurenh2/gap/analysis/structural_overlap_results.json")
SHORT = {"descriptive_long": "DL", "descriptive_long_confusing": "DLC",
         "descriptive_long_misleading": "DLM", "garbled_string": "GS"}


def _tj(cell: dict) -> dict:
    """Return the token-Jaccard metrics sub-dict of one result cell."""
    return cell["metrics"]["token_jaccard"]


def _group_by(cells: list, key: str) -> dict:
    """Bucket cells by ``cell[key]``, keeping first-seen order of the keys."""
    groups = defaultdict(list)
    for cell in cells:
        groups[cell[key]].append(cell)
    return groups


def _print_variant_table(per_variant: dict) -> None:
    """Print one summary row per variant (the headline table)."""
    print("=" * 90)
    print("HEADLINE TABLE: Surface variants — stable vs brittle structural overlap")
    print("(token Jaccard on canonicalized trajectories, drift cases only)")
    print("=" * 90)
    print(f"\n{'Variant':<6} {'#cells':>7} {'#dir+':>6} {'#p<.05':>8} "
          f"{'med-d':>7} {'mean-d':>7} {'mean-dlt':>9} "
          f"{'mean-stbl':>10} {'mean-brit':>10} {'mean-noise':>11} "
          f"{'mean-collapse%':>14}")
    print("-" * 100)
    for variant, group in per_variant.items():
        metrics = [_tj(c) for c in group]
        ds = [m["cohens_d"] for m in metrics]
        n_pos = sum(1 for d in ds if d > 0)
        n_sig = sum(1 for m in metrics if m["p_two_sided"] < 0.05)
        deltas = [m["delta_median"] for m in metrics]
        stbl = [m["stable_median"] for m in metrics]
        brit = [m["brittle_median"] for m in metrics]
        # The noise floor is optional per cell: missing/None values are skipped.
        noise = [m["noise_floor_median"] for m in metrics
                 if m.get("noise_floor_median") is not None]
        collapse = [c["brittle_collapse_rate"] for c in group]
        # fmean() raises StatisticsError on empty data, so when no cell of
        # this variant carries a noise floor, show a placeholder instead.
        noise_txt = f"{statistics.fmean(noise):.3f}" if noise else "n/a"
        # Unknown variant names fall back to the raw name rather than KeyError.
        label = SHORT.get(variant, variant)
        print(f"{label:<6} {len(group):>7} {n_pos:>6} {n_sig:>8} "
              f"{statistics.median(ds):>+7.2f} {statistics.fmean(ds):>+7.2f} "
              f"{statistics.fmean(deltas):>+9.4f} "
              f"{statistics.fmean(stbl):>10.3f} {statistics.fmean(brit):>10.3f} "
              f"{noise_txt:>11} "
              f"{statistics.fmean(collapse) * 100:>13.1f}%")


def _print_overall(cells: list, n_models: int, n_variants: int) -> None:
    """Print statistics pooled over every cell (each cell counts once)."""
    print("\n" + "=" * 90)
    # Counts come from the loaded data so the header cannot drift from it.
    print(f"ALL CELLS ({n_models} models × {n_variants} surface variants)")
    print("=" * 90)
    all_d = [_tj(c)["cohens_d"] for c in cells]
    all_p = [_tj(c)["p_two_sided"] for c in cells]
    total = len(cells)
    print(f" cells: {total}")
    print(f" direction-positive: {sum(1 for d in all_d if d > 0)}/{total}")
    print(f" p<0.05: {sum(1 for p in all_p if p < 0.05)}/{total}")
    print(f" p<0.001: {sum(1 for p in all_p if p < 0.001)}/{total}")
    print(f" p<1e-6: {sum(1 for p in all_p if p < 1e-6)}/{total}")
    print(f" Cohen's d median: {statistics.median(all_d):+.3f}")
    print(f" Cohen's d mean: {statistics.fmean(all_d):+.3f}")
    print(f" Cohen's d range: [{min(all_d):+.2f}, {max(all_d):+.2f}]")


def _print_model_table(per_model: dict, n_variants: int) -> None:
    """Print one row per model, sorted by mean Cohen's d (descending)."""
    print("\n" + "=" * 90)
    print(f"PER MODEL (averaged across {n_variants} surface variants)")
    print("=" * 90)
    print(f"\n{'Model':<25} {'mean-d':>7} {'mean-stbl':>10} {'mean-brit':>10} "
          f"{'mean-coll%':>11} {'min-p':>9}")
    print("-" * 80)
    rows = []
    for model, group in per_model.items():
        metrics = [_tj(c) for c in group]
        mean_d = statistics.fmean([m["cohens_d"] for m in metrics])
        mean_stable = statistics.fmean([m["stable_median"] for m in metrics])
        mean_brittle = statistics.fmean([m["brittle_median"] for m in metrics])
        mean_collapse = statistics.fmean(
            [c["brittle_collapse_rate"] for c in group]) * 100
        min_p = min(m["p_two_sided"] for m in metrics)
        rows.append((model, mean_d, mean_stable, mean_brittle,
                     mean_collapse, min_p))
    # Negated key keeps insertion order among models with equal mean d.
    for r in sorted(rows, key=lambda r: -r[1]):
        print(f"{r[0]:<25} {r[1]:>+7.2f} {r[2]:>10.3f} {r[3]:>10.3f} "
              f"{r[4]:>10.1f}% {r[5]:>9.1e}")


def main(results_path: Path | str = RESULTS) -> None:
    """Load the results JSON and print the three aggregate reports.

    Args:
        results_path: JSON file to read; defaults to the module-level
            ``RESULTS`` path. The code iterates the top-level value and reads
            ``variant``, ``model``, ``brittle_collapse_rate`` and
            ``metrics.token_jaccard`` from each element, so it is presumably
            a list of per-cell dicts -- confirm against the producer script.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if a cell lacks one of the required fields.
    """
    # Context manager so the handle is closed even if parsing fails.
    with open(results_path, encoding="utf-8") as fh:
        cells = json.load(fh)
    print(f"Loaded {len(cells)} cells.\n")
    if not cells:
        # median()/fmean()/min() all fail on empty data; nothing to report.
        print("No cells to aggregate.")
        return

    per_variant = _group_by(cells, "variant")
    per_model = _group_by(cells, "model")
    _print_variant_table(per_variant)
    _print_overall(cells, len(per_model), len(per_variant))
    _print_model_table(per_model, len(per_variant))


if __name__ == "__main__":
    main()
