"""Three rebuttal figures.

Fig1 — Structural Cohen's d heatmap
       18 models × 5 variants (4 surface + KV).
       Surface cells use the self-anchor metric (model's own original under
       inverse rename). KV uses the canonical-anchor metric.

Fig2 — Rescue rebound rates by variant + condition
       Pooled across 4 models. Bar plot with Wilson 95 % CI.
       Three bars per variant: null / canonical_T2 / own_T2 (KV: only 2).

Fig3 — own_T2 vs canonical_T2 per (model, variant)
       Scatter plot of own_T2 rebound rate vs canonical_T2 rebound rate per
       cell, with the y=x line. Points above the diagonal: own outperforms
       canonical (rare); below: canonical outperforms own (typical).
"""
from __future__ import annotations
import json
import math
import statistics
from pathlib import Path
from collections import defaultdict

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np

# Root of the analysis tree; every input file in this script is opened
# relative to this path.
ROOT = Path("/home/yurenh2/gap/analysis")
# Directory the rendered PNGs are written to.
# NOTE: the directory is created as a side effect of importing this module.
FIG_DIR = ROOT / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)

# Variant key (as it appears in the result files) -> short label used for
# axis ticks and legends.
VARIANT_LABELS = {
    "descriptive_long": "DL",
    "descriptive_long_confusing": "DLC",
    "descriptive_long_misleading": "DLM",
    "garbled_string": "GS",
    "kernel_variant": "KV",
}
# The four surface variants, in the left-to-right order used by the figures.
VARIANT_ORDER_SURF = ["descriptive_long", "descriptive_long_confusing",
                      "descriptive_long_misleading", "garbled_string"]
# Surface variants followed by the kernel variant, so KV is always the last
# column / bar group.
VARIANT_ORDER_ALL = VARIANT_ORDER_SURF + ["kernel_variant"]

# ----------------------------------------------------------------------
# Fig 1 — Structural Cohen's d heatmap
# ----------------------------------------------------------------------

def fig1_structural_d_heatmap():
    """Render the models x variants heatmap of Cohen's d and save it as a PNG.

    Surface cells: self-anchor (token Jaccard between model's variant
    trajectory and its own original-correct trajectory after canonicalization).
    Source file: structural_overlap_results.json.

    KV cells: canonical-anchor (token Jaccard between model's KV trajectory and
    the dataset's canonical KV solution).
    Source file: kv_overlap_results.json.

    Each record in either file is read as
    ``record["metrics"]["token_jaccard"]["cohens_d"]`` and keyed by
    ``record["model"]`` (plus ``record["variant"]`` for the surface file).
    Rows are ordered by descending mean d over the surface variants only;
    cells without a value stay NaN and are drawn without an annotation.

    The figure is written to ``FIG_DIR / "fig1_structural_d_heatmap.png"``.

    Raises:
        OSError: if either input file cannot be opened.
        json.JSONDecodeError: if either input file is not valid JSON.
        KeyError: if a record lacks one of the keys listed above.
    """
    # Context managers close the handles deterministically instead of leaving
    # them to the garbage collector.
    with open(ROOT / "structural_overlap_results.json", encoding="utf-8") as fh:
        surf = json.load(fh)
    with open(ROOT / "kv_overlap_results.json", encoding="utf-8") as fh:
        kv = json.load(fh)

    # (model, variant) -> Cohen's d. KV records carry no variant field, so
    # they are filed under the fixed "kernel_variant" key.
    by_cell = {}
    for rec in surf:
        by_cell[(rec["model"], rec["variant"])] = rec["metrics"]["token_jaccard"]["cohens_d"]
    for rec in kv:
        by_cell[(rec["model"], "kernel_variant")] = rec["metrics"]["token_jaccard"]["cohens_d"]

    def mean_surface_d(model):
        # Mean over surface variants only, so KV doesn't bias the row order.
        # Models with no surface value at all sort as 0.0.
        present = [d for d in (by_cell.get((model, variant))
                               for variant in VARIANT_ORDER_SURF)
                   if d is not None]
        return statistics.fmean(present) if present else 0.0

    # Alphabetical first, then a stable sort by effect size: ties keep a
    # deterministic order.
    models = sorted({key[0] for key in by_cell})
    models.sort(key=mean_surface_d, reverse=True)

    # Rows = models, columns = variants (DL, DLC, DLM, GS, KV); NaN = no data.
    M = np.full((len(models), len(VARIANT_ORDER_ALL)), np.nan)
    for i, model in enumerate(models):
        for j, variant in enumerate(VARIANT_ORDER_ALL):
            d = by_cell.get((model, variant))
            if d is not None:
                M[i, j] = d

    fig, ax = plt.subplots(figsize=(7, 9))
    vmin = 0.0
    vmax = 1.4
    cmap = plt.cm.viridis
    im = ax.imshow(M, cmap=cmap, vmin=vmin, vmax=vmax, aspect="auto")
    ax.set_xticks(range(len(VARIANT_ORDER_ALL)))
    ax.set_xticklabels([VARIANT_LABELS[v] for v in VARIANT_ORDER_ALL])
    ax.set_yticks(range(len(models)))
    ax.set_yticklabels(models, fontsize=9)

    # Annotate values. Text flips from white to black at the midpoint of the
    # colour range (0.7) so it stays legible on both the dark and light ends.
    text_switch = (vmin + vmax) / 2
    for (i, j), val in np.ndenumerate(M):
        if math.isnan(val):
            continue
        color = "white" if val < text_switch else "black"
        ax.text(j, i, f"{val:+.2f}", ha="center", va="center",
                fontsize=8, color=color)

    # Vertical line separating surface from KV: half a cell past the last
    # surface column (3.5 with four surface variants).
    ax.axvline(x=len(VARIANT_ORDER_SURF) - 0.5, color="white", linewidth=2)
    cbar = fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    cbar.set_label("Cohen's d (stable − brittle)\non canonicalized token Jaccard",
                   fontsize=9)
    ax.set_title("Structural overlap effect size: stable vs brittle\n"
                 "(surface = self-anchor; KV = canonical-anchor)",
                 fontsize=11)
    ax.set_xlabel("Variant family", fontsize=10)
    fig.tight_layout()
    out = FIG_DIR / "fig1_structural_d_heatmap.png"
    fig.savefig(out, dpi=200, bbox_inches="tight")
    plt.close(fig)
    print(f"Saved {out}")


# ----------------------------------------------------------------------
# Fig 2 — Rescue rebound rates with Wilson CI
# ----------------------------------------------------------------------

def wilson_ci(k: int, n: int, z: float = 1.96):
    """Return ``(p, lo, hi)``: the proportion k/n and its Wilson score interval.

    Args:
        k: number of successes. Assumed to satisfy ``0 <= k <= n``; this is
            not validated.
        n: number of trials. ``n == 0`` yields ``(0.0, 0.0, 0.0)``.
        z: normal quantile for the desired coverage (1.96 for ~95 %).

    Returns:
        A 3-tuple of floats ``(p, lo, hi)`` with ``0 <= lo <= p <= hi <= 1``
        for valid inputs.
    """
    if n == 0:
        return (0.0, 0.0, 0.0)
    p = k / n
    denom = 1 + z * z / n
    center = (p + z * z / (2 * n)) / denom
    half = z * math.sqrt(p * (1 - p) / n + z * z / (4 * n * n)) / denom
    # Mathematically the interval always contains p, with lo == p at k == 0
    # and hi == p at k == n. In floating point those boundary cases can land
    # an ulp on the wrong side, which would give callers a negative
    # error-bar length (p - lo or hi - p). Clamp so the bracket is exact.
    lo = min(p, max(0.0, center - half))
    hi = max(p, min(1.0, center + half))
    return (p, lo, hi)


def fig2_rescue_rates():
    """Render the grouped bar chart of rescue rebound rates and save it as a PNG.

    Reads ``rescue_results/rescue_30.jsonl`` under ROOT, one JSON record per
    line. For every (variant, condition) pair it counts the records (n) and
    those whose ``grade`` is ``"CORRECT"`` (k), then draws one bar group per
    variant with one bar per condition (null / canonical_T2 / own_T2).
    Bar height is k/n in percent; whiskers are the Wilson interval from
    ``wilson_ci``. A (variant, condition) pair with no records is drawn as a
    zero-height bar without whiskers or label.

    The figure is written to ``FIG_DIR / "fig2_rescue_rebound.png"``.

    Raises:
        OSError: if the input file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``"variant"`` or ``"condition"``.
    """
    counts = defaultdict(lambda: {"k": 0, "n": 0})
    # Stream the file inside a context manager so the handle is closed, and
    # skip blank lines (e.g. a trailing newline) instead of crashing on them.
    with open(ROOT / "rescue_results/rescue_30.jsonl", encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            r = json.loads(line)
            cell = counts[(r["variant"], r["condition"])]
            cell["n"] += 1
            if r.get("grade") == "CORRECT":
                cell["k"] += 1

    conds_full = ["null", "canonical_T2", "own_T2"]
    cond_color = {"null": "#888888", "canonical_T2": "#1f77b4", "own_T2": "#d62728"}
    cond_label = {"null": "null (generic scaffold)",
                  "canonical_T2": "canonical_T2 (item-specific, expert prose)",
                  "own_T2": "own_T2 (item-specific, model's own work, renamed)"}

    fig, ax = plt.subplots(figsize=(8, 5))
    width = 0.27
    x = np.arange(len(VARIANT_ORDER_ALL))
    for ci, cond in enumerate(conds_full):
        ks, lows, highs, ps = [], [], [], []
        for v in VARIANT_ORDER_ALL:
            # .get() rather than indexing: indexing the defaultdict would
            # create an n == 0 cell as a side effect.
            d = counts.get((v, cond))
            if d is None:
                ks.append(0)
                lows.append(0)
                highs.append(0)
                ps.append(0)
                continue
            p, lo, hi = wilson_ci(d["k"], d["n"])
            ps.append(p * 100)
            # Error-bar lengths must be non-negative for matplotlib. At
            # k == 0 or k == n rounding can make the difference a tiny
            # negative number, so clamp at zero.
            lows.append(max(0.0, p - lo) * 100)
            highs.append(max(0.0, hi - p) * 100)
            ks.append(d["k"])
        # Centre the three bars of a group on the tick: offsets -w, 0, +w.
        offset = (ci - 1) * width
        ax.bar(x + offset, ps, width=width, color=cond_color[cond], label=cond_label[cond],
               yerr=[lows, highs], capsize=3, error_kw={"elinewidth": 1, "ecolor": "#444444"})
        # Annotate the percentage above each bar that has at least one success.
        for xi, p, k in zip(x + offset, ps, ks):
            if k > 0:
                ax.text(xi, p + 0.5, f"{p:.0f}%", ha="center", va="bottom", fontsize=8)

    ax.set_xticks(x)
    ax.set_xticklabels([VARIANT_LABELS[v] for v in VARIANT_ORDER_ALL], fontsize=10)
    ax.set_ylabel("Rebound rate (%) on flip cases", fontsize=10)
    # NOTE(review): the model count and per-cell n in the subtitle are
    # hard-coded, not computed from the data — re-check when the input changes.
    ax.set_title("Repairability rescue: rebound rate by variant and prefix condition\n"
                 "(pooled across 4 models, n ≈ 100–120 per cell, 95% Wilson CI)",
                 fontsize=11)
    ax.set_ylim(0, 60)
    ax.legend(loc="upper right", fontsize=8, framealpha=0.95)
    ax.grid(axis="y", linestyle="--", alpha=0.4)
    ax.set_axisbelow(True)
    fig.tight_layout()
    out = FIG_DIR / "fig2_rescue_rebound.png"
    fig.savefig(out, dpi=200, bbox_inches="tight")
    plt.close(fig)
    print(f"Saved {out}")


# ----------------------------------------------------------------------
# Fig 3 — own_T2 vs canonical_T2 scatter
# ----------------------------------------------------------------------

def fig3_own_vs_canonical_scatter():
    """Render the own_T2 vs canonical_T2 scatter plot and save it as a PNG.

    Reads ``rescue_results/rescue_30.jsonl`` under ROOT, one JSON record per
    line, and counts records (n) and ``grade == "CORRECT"`` records (k) per
    (model, variant, condition). For each model and each surface variant that
    has data under both ``own_T2`` and ``canonical_T2``, one point is drawn at
    (canonical rate, own rate). Colour encodes the model, marker shape the
    variant; models without an assigned colour are drawn in gray. The kernel
    variant is not plotted.

    The figure is written to ``FIG_DIR / "fig3_own_vs_canonical_scatter.png"``.

    Raises:
        OSError: if the input file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``"model"``, ``"variant"`` or
            ``"condition"``.
    """
    from matplotlib.lines import Line2D

    counts = defaultdict(lambda: {"k": 0, "n": 0})
    # Stream the file inside a context manager so the handle is closed, and
    # skip blank lines (e.g. a trailing newline) instead of crashing on them.
    with open(ROOT / "rescue_results/rescue_30.jsonl", encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            r = json.loads(line)
            cell = counts[(r["model"], r["variant"], r["condition"])]
            cell["n"] += 1
            if r.get("grade") == "CORRECT":
                cell["k"] += 1

    fig, ax = plt.subplots(figsize=(7, 7))

    models_in_data = sorted({key[0] for key in counts})
    model_color = {
        "claude-sonnet-4":  "#ff7f0e",
        "gemini-2.5-flash": "#2ca02c",
        "gpt-4.1-mini":     "#1f77b4",
        "gpt-4o-mini":      "#d62728",
    }
    fallback_color = "gray"  # used for any model missing from model_color
    var_marker = {
        "descriptive_long": "o",
        "descriptive_long_confusing": "s",
        "descriptive_long_misleading": "^",
        "garbled_string": "D",
    }

    # Diagonal
    ax.plot([0, 0.7], [0, 0.7], "k--", lw=1, alpha=0.5)
    ax.text(0.62, 0.66, "y = x", fontsize=8, alpha=0.6)

    for m in models_in_data:
        for v in VARIANT_ORDER_SURF:
            # .get() rather than indexing: indexing the defaultdict would
            # create an empty cell as a side effect.
            own = counts.get((m, v, "own_T2"))
            can = counts.get((m, v, "canonical_T2"))
            if own is None or can is None or own["n"] == 0 or can["n"] == 0:
                continue
            x = can["k"] / can["n"]
            y = own["k"] / own["n"]
            ax.scatter(x, y, s=110, c=model_color.get(m, fallback_color),
                       marker=var_marker[v], alpha=0.85,
                       edgecolors="black", linewidths=0.6)

    # Build legend. Iterate over the models actually present so that a model
    # plotted with the fallback colour also gets a legend entry; for the four
    # known models the order is unchanged (the colour table is alphabetical).
    model_handles = [Line2D([], [], marker="o", linestyle="", markersize=9,
                            markerfacecolor=model_color.get(m, fallback_color),
                            markeredgecolor="black",
                            markeredgewidth=0.6, label=m)
                     for m in models_in_data]
    variant_handles = [Line2D([], [], marker=mk, linestyle="", markersize=9,
                              markerfacecolor="lightgray", markeredgecolor="black",
                              markeredgewidth=0.6, label=VARIANT_LABELS[v])
                       for v, mk in var_marker.items()]
    # A second ax.legend() call replaces the first, so the model legend is
    # re-attached as a plain artist before the variant legend is created.
    leg1 = ax.legend(handles=model_handles, loc="upper left", title="Model",
                     fontsize=8, title_fontsize=9, framealpha=0.95)
    ax.add_artist(leg1)
    ax.legend(handles=variant_handles, loc="lower right", title="Variant",
              fontsize=8, title_fontsize=9, framealpha=0.95)

    # NOTE(review): axes are fixed at [0, 0.7]; a cell with a higher rate
    # would fall outside the visible area.
    ax.set_xlim(0, 0.7)
    ax.set_ylim(0, 0.7)
    ax.set_xlabel("canonical_T2 rebound rate", fontsize=10)
    ax.set_ylabel("own_T2 rebound rate", fontsize=10)
    # NOTE(review): the subtitle's statement about gpt-4o-mini is hard-coded,
    # not derived from the plotted points — re-verify when the data changes.
    ax.set_title("Per-cell rescue rates: model's own prefix vs canonical prefix\n"
                 "(below diagonal = canonical wins; gpt-4o-mini is the only family above)",
                 fontsize=11)
    ax.grid(linestyle="--", alpha=0.4)
    ax.set_axisbelow(True)
    fig.tight_layout()
    out = FIG_DIR / "fig3_own_vs_canonical_scatter.png"
    fig.savefig(out, dpi=200, bbox_inches="tight")
    plt.close(fig)
    print(f"Saved {out}")


def main():
    """Render the three figures in order, then print the output directory."""
    renderers = (
        fig1_structural_d_heatmap,
        fig2_rescue_rates,
        fig3_own_vs_canonical_scatter,
    )
    for render in renderers:
        render()
    print("\nAll figures written to:", FIG_DIR)


if __name__ == "__main__":
    main()