"""Aggregate structural_overlap results by variant type and by model.
Produces a clean rebuttal table.
"""
from __future__ import annotations
import json
import statistics
from pathlib import Path
from collections import defaultdict
RESULTS = Path("/home/yurenh2/gap/analysis/structural_overlap_results.json")
SHORT = {"descriptive_long":"DL","descriptive_long_confusing":"DLC",
"descriptive_long_misleading":"DLM","garbled_string":"GS"}
def main():
    # Load the per-(model, variant) cells produced by the structural_overlap run.
    with open(RESULTS) as f:
        cells = json.load(f)
    print(f"Loaded {len(cells)} cells.\n")

    # Per-variant aggregate
    per_variant = defaultdict(list)
    for c in cells:
        per_variant[c["variant"]].append(c)

    print("=" * 90)
    print("HEADLINE TABLE: Surface variants — stable vs brittle structural overlap")
    print("(token Jaccard on canonicalized trajectories, drift cases only)")
    print("=" * 90)
    print(f"\n{'Variant':<6} {'#cells':>7} {'#dir+':>6} {'#p<.05':>8} "
          f"{'med-d':>7} {'mean-d':>7} {'mean-dlt':>9} "
          f"{'mean-stbl':>10} {'mean-brit':>10} {'mean-noise':>11} "
          f"{'mean-collapse%':>14}")
    print("-" * 100)
    for v, cs in per_variant.items():
        ds = [c["metrics"]["token_jaccard"]["cohens_d"] for c in cs]
        ps = [c["metrics"]["token_jaccard"]["p_two_sided"] for c in cs]
        n_pos = sum(1 for d in ds if d > 0)   # cells where stable overlap exceeds brittle
        n_sig = sum(1 for p in ps if p < 0.05)
        deltas = [c["metrics"]["token_jaccard"]["delta_median"] for c in cs]
        stbl = [c["metrics"]["token_jaccard"]["stable_median"] for c in cs]
        brit = [c["metrics"]["token_jaccard"]["brittle_median"] for c in cs]
        # noise_floor_median can be missing for some cells; skip those entries.
        noise = [c["metrics"]["token_jaccard"]["noise_floor_median"] for c in cs
                 if c["metrics"]["token_jaccard"].get("noise_floor_median") is not None]
        noise_mean = statistics.fmean(noise) if noise else float("nan")
        collapse = [c["brittle_collapse_rate"] for c in cs]
        print(f"{SHORT[v]:<6} {len(cs):>7} {n_pos:>6} {n_sig:>8} "
              f"{statistics.median(ds):>+7.2f} {statistics.fmean(ds):>+7.2f} "
              f"{statistics.fmean(deltas):>+9.4f} "
              f"{statistics.fmean(stbl):>10.3f} {statistics.fmean(brit):>10.3f} "
              f"{noise_mean:>11.3f} "
              f"{statistics.fmean(collapse)*100:>13.1f}%")

    # Pooled across all cells (all models × all surface variants, equal weight per cell)
    print("\n" + "=" * 90)
    print("ALL CELLS (18 models × 4 surface variants)")
    print("=" * 90)
    all_d = [c["metrics"]["token_jaccard"]["cohens_d"] for c in cells]
    all_p = [c["metrics"]["token_jaccard"]["p_two_sided"] for c in cells]
    print(f"  cells:              {len(cells)}")
    print(f"  direction-positive: {sum(1 for d in all_d if d > 0)}/{len(cells)}")
    print(f"  p<0.05:             {sum(1 for p in all_p if p < 0.05)}/{len(cells)}")
    print(f"  p<0.001:            {sum(1 for p in all_p if p < 0.001)}/{len(cells)}")
    print(f"  p<1e-6:             {sum(1 for p in all_p if p < 1e-6)}/{len(cells)}")
    print(f"  Cohen's d median:   {statistics.median(all_d):+.3f}")
    print(f"  Cohen's d mean:     {statistics.fmean(all_d):+.3f}")
    print(f"  Cohen's d range:    [{min(all_d):+.2f}, {max(all_d):+.2f}]")

    # Per-model aggregate (averaged across the 4 surface variants)
    per_model = defaultdict(list)
    for c in cells:
        per_model[c["model"]].append(c)

    print("\n" + "=" * 90)
    print("PER MODEL (averaged across 4 surface variants)")
    print("=" * 90)
    print(f"\n{'Model':<25} {'mean-d':>7} {'mean-stbl':>10} {'mean-brit':>10} "
          f"{'mean-coll%':>11} {'min-p':>9}")
    print("-" * 80)
    rows = []
    for m, cs in per_model.items():
        if not cs:
            continue
        d = statistics.fmean(c["metrics"]["token_jaccard"]["cohens_d"] for c in cs)
        s = statistics.fmean(c["metrics"]["token_jaccard"]["stable_median"] for c in cs)
        b = statistics.fmean(c["metrics"]["token_jaccard"]["brittle_median"] for c in cs)
        col = statistics.fmean(c["brittle_collapse_rate"] for c in cs) * 100
        mp = min(c["metrics"]["token_jaccard"]["p_two_sided"] for c in cs)
        rows.append((m, d, s, b, col, mp))
    # Sort models by mean effect size, largest first.
    for r in sorted(rows, key=lambda r: -r[1]):
        print(f"{r[0]:<25} {r[1]:>+7.2f} {r[2]:>10.3f} {r[3]:>10.3f} "
              f"{r[4]:>10.1f}% {r[5]:>9.1e}")


if __name__ == "__main__":
    main()