# analysis/make_figures.py

"""Three rebuttal figures.

Fig1 — Structural Cohen's d heatmap
       18 models × 5 variants (4 surface + KV).
       Surface cells use the self-anchor metric (model's own original under
       inverse rename). KV uses the canonical-anchor metric.

Fig2 — Rescue rebound rates by variant + condition
       Pooled across 4 models. Bar plot with Wilson 95 % CI.
       Three bars per variant: null / canonical_T2 / own_T2 (KV: only 2).

Fig3 — own_T2 vs canonical_T2 per (model, variant)
       Scatter plot of own_T2 rebound rate vs canonical_T2 rebound rate per
       cell, with the y=x line. Points above the diagonal: own outperforms
       canonical (rare); below: canonical outperforms own (typical).
"""
from __future__ import annotations
import json
import math
import statistics
from pathlib import Path
from collections import defaultdict

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np

# Directory the figure functions read their JSON / JSONL inputs from.
# NOTE(review): hard-coded absolute path — presumably specific to one
# machine; confirm before running this script anywhere else.
ROOT = Path("/home/yurenh2/gap/analysis")
# Output directory for the rendered PNGs. It is created (with parents) as a
# side effect of importing this module, not only when main() runs.
FIG_DIR = ROOT / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)

# Variant key -> short label used for axis ticks and legend entries.
VARIANT_LABELS = {
    "descriptive_long": "DL",
    "descriptive_long_confusing": "DLC",
    "descriptive_long_misleading": "DLM",
    "garbled_string": "GS",
    "kernel_variant": "KV",
}
# The four surface variants, in the column order used by the plots.
VARIANT_ORDER_SURF = ["descriptive_long", "descriptive_long_confusing",
                      "descriptive_long_misleading", "garbled_string"]
# Every variant: the surface ones followed by the kernel variant (KV) last.
VARIANT_ORDER_ALL = VARIANT_ORDER_SURF + ["kernel_variant"]

# ----------------------------------------------------------------------
# Fig 1 — Structural Cohen's d heatmap
# ----------------------------------------------------------------------

def fig1_structural_d_heatmap():
    """Render the model × variant heatmap of structural-overlap Cohen's d.

    Surface cells: self-anchor (token Jaccard between model's variant
    trajectory and its own original-correct trajectory after canonicalization).
    Source file: structural_overlap_results.json.

    KV cells: canonical-anchor (token Jaccard between model's KV trajectory and
    the dataset's canonical KV solution).
    Source file: kv_overlap_results.json.

    Both inputs are read from ``ROOT``. Each record is expected to provide
    ``model`` and ``metrics.token_jaccard.cohens_d``; surface records also
    provide ``variant``. Cells that are absent (or whose d is ``null``) stay
    NaN in the matrix and are left un-annotated.

    Side effects: writes ``fig1_structural_d_heatmap.png`` into ``FIG_DIR``
    and prints the saved path.

    Raises:
        OSError: if either input file cannot be opened.
        json.JSONDecodeError: if either input file is not valid JSON.
        KeyError: if a record lacks one of the expected keys.
    """
    # Context managers close the handles deterministically; the bare
    # json.load(open(...)) form leaves them to the garbage collector.
    with open(ROOT / "structural_overlap_results.json", encoding="utf-8") as fh:
        surf = json.load(fh)
    with open(ROOT / "kv_overlap_results.json", encoding="utf-8") as fh:
        kv = json.load(fh)

    # (model, variant) -> Cohen's d. KV records carry no variant field, so
    # they are filed under the fixed "kernel_variant" key.
    by_cell = {}
    for c in surf:
        by_cell[(c["model"], c["variant"])] = c["metrics"]["token_jaccard"]["cohens_d"]
    for c in kv:
        by_cell[(c["model"], "kernel_variant")] = c["metrics"]["token_jaccard"]["cohens_d"]

    def mean_surface_d(m):
        # Mean d over the surface variants only, so KV doesn't bias the order.
        ds = [d for d in (by_cell.get((m, v)) for v in VARIANT_ORDER_SURF)
              if d is not None]
        return statistics.fmean(ds) if ds else 0.0

    # Alphabetical first, then a stable sort by descending mean surface d:
    # ties therefore keep alphabetical order.
    models = sorted({k[0] for k in by_cell})
    models.sort(key=mean_surface_d, reverse=True)

    # Rows = models, columns = variants (DL, DLC, DLM, GS, KV); NaN = no data.
    M = np.full((len(models), len(VARIANT_ORDER_ALL)), np.nan)
    for i, m in enumerate(models):
        for j, v in enumerate(VARIANT_ORDER_ALL):
            d = by_cell.get((m, v))
            if d is not None:
                M[i, j] = d

    fig, ax = plt.subplots(figsize=(7, 9))
    vmin = 0.0
    vmax = 1.4
    cmap = plt.cm.viridis
    im = ax.imshow(M, cmap=cmap, vmin=vmin, vmax=vmax, aspect="auto")
    ax.set_xticks(range(len(VARIANT_ORDER_ALL)))
    ax.set_xticklabels([VARIANT_LABELS[v] for v in VARIANT_ORDER_ALL])
    ax.set_yticks(range(len(models)))
    ax.set_yticklabels(models, fontsize=9)

    # Annotate each populated cell. The text colour flips at 0.7 (the middle
    # of the vmin..vmax range) so labels stay readable on both the dark and
    # the light end of the colormap.
    for (i, j), v in np.ndenumerate(M):
        if math.isnan(v):
            continue
        color = "white" if v < 0.7 else "black"
        ax.text(j, i, f"{v:+.2f}", ha="center", va="center",
                fontsize=8, color=color)

    # Vertical line separating the surface columns from the KV column; placed
    # on the boundary after the last surface variant (3.5 for four variants).
    ax.axvline(x=len(VARIANT_ORDER_SURF) - 0.5, color="white", linewidth=2)
    cbar = fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    cbar.set_label("Cohen's d (stable − brittle)\non canonicalized token Jaccard",
                   fontsize=9)
    ax.set_title("Structural overlap effect size: stable vs brittle\n"
                 "(surface = self-anchor; KV = canonical-anchor)",
                 fontsize=11)
    ax.set_xlabel("Variant family", fontsize=10)
    fig.tight_layout()
    out = FIG_DIR / "fig1_structural_d_heatmap.png"
    fig.savefig(out, dpi=200, bbox_inches="tight")
    # Close this specific figure rather than whichever one is "current".
    plt.close(fig)
    print(f"Saved {out}")


# ----------------------------------------------------------------------
# Fig 2 — Rescue rebound rates with Wilson CI
# ----------------------------------------------------------------------

def wilson_ci(k: int, n: int, z: float = 1.96) -> tuple[float, float, float]:
    """Return the observed proportion and its Wilson score interval.

    Args:
        k: Number of successes; expected to satisfy ``0 <= k <= n``.
        n: Number of trials.
        z: Normal quantile for the desired coverage (1.96 gives ~95 %).

    Returns:
        ``(p, lo, hi)`` where ``p = k / n`` and ``lo <= p <= hi`` are the
        interval bounds clipped to ``[0, 1]``. With ``n == 0`` there is
        nothing to estimate and ``(0.0, 0.0, 0.0)`` is returned.
    """
    if n == 0:
        return (0.0, 0.0, 0.0)
    p = k / n
    z2 = z * z
    denom = 1 + z2 / n
    center = (p + z2 / (2 * n)) / denom
    half = z * math.sqrt(p * (1 - p) / n + z2 / (4 * n * n)) / denom
    # Mathematically the Wilson interval always contains p, but at k == 0 or
    # k == n the bound equals p exactly and rounding can push it a few ulps to
    # the wrong side. Callers compute error-bar lengths as (p - lo) and
    # (hi - p), which must never be negative, so force the bounds to bracket p.
    lo = min(p, max(0.0, center - half))
    hi = max(p, min(1.0, center + half))
    return (p, lo, hi)


def fig2_rescue_rates():
    """Render the grouped bar chart of rescue rebound rates with Wilson CIs.

    Reads ``rescue_results/rescue_30.jsonl`` under ``ROOT`` (one JSON object
    per line, each providing ``variant`` and ``condition`` and optionally
    ``grade``), pools the rows by (variant, condition), and draws one bar per
    condition for every variant in ``VARIANT_ORDER_ALL``. A row counts as a
    rebound when its ``grade`` is ``"CORRECT"``. Error bars come from
    ``wilson_ci``. A (variant, condition) cell with no rows is drawn as a
    zero-height bar with no error bar.

    Side effects: writes ``fig2_rescue_rebound.png`` into ``FIG_DIR`` and
    prints the saved path.

    Raises:
        OSError: if the input file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a row lacks ``variant`` or ``condition``.
    """
    # Read inside a context manager so the handle is closed, and skip blank
    # lines (e.g. a trailing newline) that json.loads would reject.
    with open(ROOT / "rescue_results/rescue_30.jsonl", encoding="utf-8") as fh:
        rows = [json.loads(line) for line in fh if line.strip()]

    # (variant, condition) -> {"k": number CORRECT, "n": number of rows}
    counts = defaultdict(lambda: {"k": 0, "n": 0})
    for r in rows:
        cell = counts[(r["variant"], r["condition"])]
        cell["n"] += 1
        if r.get("grade") == "CORRECT":
            cell["k"] += 1

    conds_full = ["null", "canonical_T2", "own_T2"]
    cond_color = {"null": "#888888", "canonical_T2": "#1f77b4", "own_T2": "#d62728"}
    cond_label = {"null": "null (generic scaffold)",
                  "canonical_T2": "canonical_T2 (item-specific, expert prose)",
                  "own_T2": "own_T2 (item-specific, model's own work, renamed)"}

    fig, ax = plt.subplots(figsize=(8, 5))
    n_var = len(VARIANT_ORDER_ALL)
    width = 0.27
    x = np.arange(n_var)
    for ci, cond in enumerate(conds_full):
        ks, lows, highs, ps = [], [], [], []
        for v in VARIANT_ORDER_ALL:
            # .get() (not indexing) so probing a missing cell does not insert
            # an empty entry into the defaultdict.
            d = counts.get((v, cond))
            if d is None:
                ks.append(0)
                lows.append(0)
                highs.append(0)
                ps.append(0)
                continue
            p, lo, hi = wilson_ci(d["k"], d["n"])
            ps.append(p * 100)
            # Error-bar lengths must be non-negative; clamp in case rounding
            # puts a bound a hair on the wrong side of p.
            lows.append(max(0.0, (p - lo) * 100))
            highs.append(max(0.0, (hi - p) * 100))
            ks.append(d["k"])
        # Centre the three bars of a group on the variant's tick.
        offset = (ci - 1) * width
        ax.bar(x + offset, ps, width=width, color=cond_color[cond], label=cond_label[cond],
               yerr=[lows, highs], capsize=3, error_kw={"elinewidth": 1, "ecolor": "#444444"})
        # Annotate the percentage above each bar that has at least one rebound.
        for xi, p, k in zip(x + offset, ps, ks):
            if k > 0:
                ax.text(xi, p + 0.5, f"{p:.0f}%", ha="center", va="bottom", fontsize=8)

    ax.set_xticks(x)
    ax.set_xticklabels([VARIANT_LABELS[v] for v in VARIANT_ORDER_ALL], fontsize=10)
    ax.set_ylabel("Rebound rate (%) on flip cases", fontsize=10)
    ax.set_title("Repairability rescue: rebound rate by variant and prefix condition\n"
                 "(pooled across 4 models, n ≈ 100–120 per cell, 95% Wilson CI)",
                 fontsize=11)
    ax.set_ylim(0, 60)
    ax.legend(loc="upper right", fontsize=8, framealpha=0.95)
    ax.grid(axis="y", linestyle="--", alpha=0.4)
    ax.set_axisbelow(True)
    fig.tight_layout()
    out = FIG_DIR / "fig2_rescue_rebound.png"
    fig.savefig(out, dpi=200, bbox_inches="tight")
    # Close this specific figure rather than whichever one is "current".
    plt.close(fig)
    print(f"Saved {out}")


# ----------------------------------------------------------------------
# Fig 3 — own_T2 vs canonical_T2 scatter
# ----------------------------------------------------------------------

def fig3_own_vs_canonical_scatter():
    """Render the per-cell scatter of own_T2 vs canonical_T2 rebound rates.

    Reads ``rescue_results/rescue_30.jsonl`` under ``ROOT`` (one JSON object
    per line, each providing ``model``, ``variant`` and ``condition`` and
    optionally ``grade``). For every (model, surface variant) pair that has
    rows under both the ``own_T2`` and ``canonical_T2`` conditions, one point
    is drawn at x = canonical_T2 rate, y = own_T2 rate, where a rate is the
    fraction of rows whose ``grade`` is ``"CORRECT"``. Colour encodes the
    model (gray for models without an assigned colour), marker shape encodes
    the variant, and a dashed y = x line marks parity.

    Side effects: writes ``fig3_own_vs_canonical_scatter.png`` into
    ``FIG_DIR`` and prints the saved path.

    Raises:
        OSError: if the input file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a row lacks ``model``, ``variant`` or ``condition``.
    """
    from matplotlib.lines import Line2D

    # Read inside a context manager so the handle is closed, and skip blank
    # lines (e.g. a trailing newline) that json.loads would reject.
    with open(ROOT / "rescue_results/rescue_30.jsonl", encoding="utf-8") as fh:
        rows = [json.loads(line) for line in fh if line.strip()]

    # (model, variant, condition) -> {"k": number CORRECT, "n": number of rows}
    counts = defaultdict(lambda: {"k": 0, "n": 0})
    for r in rows:
        cell = counts[(r["model"], r["variant"], r["condition"])]
        cell["n"] += 1
        if r.get("grade") == "CORRECT":
            cell["k"] += 1

    fig, ax = plt.subplots(figsize=(7, 7))

    models_in_data = sorted({k[0] for k in counts})
    model_color = {
        "claude-sonnet-4":  "#ff7f0e",
        "gemini-2.5-flash": "#2ca02c",
        "gpt-4.1-mini":     "#1f77b4",
        "gpt-4o-mini":      "#d62728",
    }
    var_marker = {
        "descriptive_long": "o",
        "descriptive_long_confusing": "s",
        "descriptive_long_misleading": "^",
        "garbled_string": "D",
    }

    # Diagonal
    ax.plot([0, 0.7], [0, 0.7], "k--", lw=1, alpha=0.5)
    ax.text(0.62, 0.66, "y = x", fontsize=8, alpha=0.6)

    for m in models_in_data:
        for v in VARIANT_ORDER_SURF:
            # .get() (not indexing) so probing a missing cell does not insert
            # an empty entry into the defaultdict.
            own = counts.get((m, v, "own_T2"))
            can = counts.get((m, v, "canonical_T2"))
            if own is None or can is None or own["n"] == 0 or can["n"] == 0:
                continue
            x = can["k"] / can["n"]
            y = own["k"] / own["n"]
            ax.scatter(x, y, s=110, c=model_color.get(m, "gray"),
                       marker=var_marker[v], alpha=0.85,
                       edgecolors="black", linewidths=0.6)

    # Two legends built from proxy artists: one for model colour, one for
    # variant marker. Only models present in the data get a legend entry.
    model_handles = [Line2D([], [], marker="o", linestyle="", markersize=9,
                            markerfacecolor=c, markeredgecolor="black",
                            markeredgewidth=0.6, label=m)
                     for m, c in model_color.items() if m in models_in_data]
    variant_handles = [Line2D([], [], marker=mk, linestyle="", markersize=9,
                              markerfacecolor="lightgray", markeredgecolor="black",
                              markeredgewidth=0.6, label=VARIANT_LABELS[v])
                       for v, mk in var_marker.items()]
    leg1 = ax.legend(handles=model_handles, loc="upper left", title="Model",
                     fontsize=8, title_fontsize=9, framealpha=0.95)
    # A second ax.legend() call replaces the first, so re-attach the model
    # legend as a plain artist before creating the variant legend.
    ax.add_artist(leg1)
    ax.legend(handles=variant_handles, loc="lower right", title="Variant",
              fontsize=8, title_fontsize=9, framealpha=0.95)

    ax.set_xlim(0, 0.7)
    ax.set_ylim(0, 0.7)
    ax.set_xlabel("canonical_T2 rebound rate", fontsize=10)
    ax.set_ylabel("own_T2 rebound rate", fontsize=10)
    ax.set_title("Per-cell rescue rates: model's own prefix vs canonical prefix\n"
                 "(below diagonal = canonical wins; gpt-4o-mini is the only family above)",
                 fontsize=11)
    ax.grid(linestyle="--", alpha=0.4)
    ax.set_axisbelow(True)
    fig.tight_layout()
    out = FIG_DIR / "fig3_own_vs_canonical_scatter.png"
    fig.savefig(out, dpi=200, bbox_inches="tight")
    # Close this specific figure rather than whichever one is "current".
    plt.close(fig)
    print(f"Saved {out}")


def main():
    """Render the three figures in order, then report the output directory."""
    renderers = (
        fig1_structural_d_heatmap,
        fig2_rescue_rates,
        fig3_own_vs_canonical_scatter,
    )
    for render in renderers:
        render()
    print("\nAll figures written to:", FIG_DIR)


# Script entry point: generate the figures only when executed directly.
if __name__ == "__main__":
    main()