1 files changed, 192 insertions, 0 deletions
diff --git a/analysis/sc_success_and_difficulty.py b/analysis/sc_success_and_difficulty.py
new file mode 100644
index 0000000..a8b44db
--- /dev/null
+++ b/analysis/sc_success_and_difficulty.py
@@ -0,0 +1,192 @@
+"""Two follow-up analyses (zero API):
+1. Per-model self-correction success rate: P(correct | SC) vs P(correct | no SC)
+2. Difficulty-stratified surface vs kernel dichotomy
+"""
+from __future__ import annotations
+import json
+import sys
+import statistics
+from pathlib import Path
+from collections import defaultdict
+
+THIS_DIR = Path(__file__).resolve().parent
+sys.path.insert(0, str(THIS_DIR))
+from structural_overlap import find_variant_file, load_problems, RESULTS_DIR, SURFACE_VARIANTS
+from self_correction import has_self_correction
+
+
+# ----------------- 1. SC success rate per model -----------------
+
+def sc_success_rate():
+    base = RESULTS_DIR
+    models = sorted([d.name for d in base.iterdir() if d.is_dir()])
+
+    print("=" * 80)
+    print("PER-MODEL SELF-CORRECTION SUCCESS RATE")
+    print("(does an SC attempt improve probability of being correct?)")
+    print("=" * 80)
+    print()
+
+    rows = []
+    for m in models:
+        mdir = base / m
+        # Aggregate over all variants
+        n_sc_correct = 0
+        n_sc_total = 0
+        n_nosc_correct = 0
+        n_nosc_total = 0
+        for v in ["original"] + SURFACE_VARIANTS + ["kernel_variant"]:
+            vp = find_variant_file(mdir, v)
+            if not vp: continue
+            for p in load_problems(vp):
+                text = (p.get("solve") or {}).get("solution") or ""
+                if not text: continue
+                correct = p.get("correct")
+                if correct is None: continue
+                if has_self_correction(text):
+                    n_sc_total += 1
+                    if correct: n_sc_correct += 1
+                else:
+                    n_nosc_total += 1
+                    if correct: n_nosc_correct += 1
+        if n_sc_total < 5 or n_nosc_total < 5:
+            continue
+        p_sc = n_sc_correct / n_sc_total
+        p_nosc = n_nosc_correct / n_nosc_total
+        delta = p_sc - p_nosc
+        # Wilson 95% CI on each rate
+        rows.append({
+            "model": m,
+            "sc_n": n_sc_total, "sc_correct": n_sc_correct, "p_sc": p_sc,
+            "nosc_n": n_nosc_total, "nosc_correct": n_nosc_correct, "p_nosc": p_nosc,
+            "delta": delta,
+        })
+
+    rows.sort(key=lambda r: -r["sc_n"])
+    print(f"{'Model':<22} {'#SC trials':>11} {'P(corr|SC)':>12} {'P(corr|noSC)':>13} {'Δ':>9}")
+    print("-" * 75)
+    for r in rows:
+        print(f"{r['model']:<22} {r['sc_n']:>11} "
+              f"{r['p_sc']*100:>10.1f}% {r['p_nosc']*100:>11.1f}% "
+              f"{r['delta']*100:>+7.1f}pp")
+
+    json.dump(rows, open(THIS_DIR / "sc_success_per_model.json", "w"), indent=2)
+    return rows
+
+
+# ----------------- 2. Difficulty stratified dichotomy -----------------
+
+DATASET_DIR = Path("/home/yurenh2/gap/putnam-bench-anon/dataset")  # NOTE(review): absolute, machine-specific path; globbed for *.json by load_difficulty_metadata
+
+def load_difficulty_metadata():
+    """Per-problem difficulty assignment using year/section/index heuristic.
+
+    Per the paper's existing exposition, we derive Easy/Medium/Hard from the
+    problem index (1-2 = Easy, 3-4 = Medium, 5-6 = Hard, 7-8 = extra-hard tail)
+    because the dataset's `difficulty` field is heterogeneous.
+    """
+    out = {}
+    for f in sorted(DATASET_DIR.glob("*.json")):
+        d = json.load(open(f))
+        idx = d.get("index")
+        if not idx: continue
+        # Extract problem number from "YEAR-PART-NUM"
+        parts = idx.split("-")
+        if len(parts) != 3: continue
+        try:
+            num = int(parts[2])
+        except ValueError:
+            continue
+        if num <= 2: bucket = "Easy"
+        elif num <= 4: bucket = "Medium"
+        elif num <= 6: bucket = "Hard"
+        else: bucket = "ExtraHard"
+        out[idx] = bucket
+    return out
+
+
+def difficulty_stratified_dichotomy():
+    print("\n\n" + "=" * 80)
+    print("DIFFICULTY-STRATIFIED ACCURACY (mean across 18 models)")
+    print("Easy/Medium/Hard buckets defined by problem index 1-2/3-4/5-6")
+    print("=" * 80)
+    print()
+
+    diff = load_difficulty_metadata()
+    base = RESULTS_DIR
+    models = sorted([d.name for d in base.iterdir() if d.is_dir()])
+
+    # buckets[(model, variant, difficulty)] = (n, n_correct)
+    cells = defaultdict(lambda: [0, 0])
+    for m in models:
+        mdir = base / m
+        for v in ["original"] + SURFACE_VARIANTS + ["kernel_variant"]:
+            vp = find_variant_file(mdir, v)
+            if not vp: continue
+            for p in load_problems(vp):
+                idx = p.get("index")
+                correct = p.get("correct")
+                if idx is None or correct is None: continue
+                bucket = diff.get(idx, "Unknown")
+                cells[(m, v, bucket)][0] += 1
+                if correct: cells[(m, v, bucket)][1] += 1
+
+    # Aggregate per (variant, difficulty) by averaging per-model rates
+    print(f"{'Variant':<24} {'Easy':>8} {'Medium':>8} {'Hard':>8} {'XHard':>8}")
+    print("-" * 60)
+    for v in ["original"] + SURFACE_VARIANTS + ["kernel_variant"]:
+        row = {}
+        for bucket in ["Easy", "Medium", "Hard", "ExtraHard"]:
+            rates = []
+            for m in models:
+                n, c = cells.get((m, v, bucket), [0, 0])
+                if n >= 5:
+                    rates.append(c / n)
+            row[bucket] = statistics.fmean(rates) * 100 if rates else None
+        print(f"{v:<24} "
+              f"{row['Easy']:>7.1f}% " if row['Easy'] is not None else f"{v:<24} {'-':>8}",
+              end="")
+        for bucket in ["Medium", "Hard", "ExtraHard"]:
+            print(f"{row[bucket]:>7.1f}% " if row[bucket] is not None else f"{'-':>8}", end="")
+        print()
+
+    # Compute Δ_orig→KV per difficulty bucket
+    print(f"\n--- Δ original → KV per difficulty bucket ---")
+    for bucket in ["Easy", "Medium", "Hard", "ExtraHard"]:
+        orig_rates = []
+        kv_rates = []
+        for m in models:
+            no, co = cells.get((m, "original", bucket), [0, 0])
+            nk, ck = cells.get((m, "kernel_variant", bucket), [0, 0])
+            if no >= 5 and nk >= 5:
+                orig_rates.append(co / no)
+                kv_rates.append(ck / nk)
+        if orig_rates:
+            mo = statistics.fmean(orig_rates) * 100
+            mk = statistics.fmean(kv_rates) * 100
+            print(f"  {bucket:<10} orig={mo:5.1f}%  kv={mk:5.1f}%  Δ={mk-mo:+.1f}pp")
+
+    # Compute Δ_orig→GS per difficulty bucket
+    print(f"\n--- Δ original → GS (surface, hardest renamer) per difficulty bucket ---")
+    for bucket in ["Easy", "Medium", "Hard", "ExtraHard"]:
+        orig_rates = []
+        gs_rates = []
+        for m in models:
+            no, co = cells.get((m, "original", bucket), [0, 0])
+            ng, cg = cells.get((m, "garbled_string", bucket), [0, 0])
+            if no >= 5 and ng >= 5:
+                orig_rates.append(co / no)
+                gs_rates.append(cg / ng)
+        if orig_rates:
+            mo = statistics.fmean(orig_rates) * 100
+            mg = statistics.fmean(gs_rates) * 100
+            print(f"  {bucket:<10} orig={mo:5.1f}%  GS={mg:5.1f}%  Δ={mg-mo:+.1f}pp")
+
+
+def main():  # script entry point: runs both analyses in order
+    sc_success_rate()  # prints the per-model table and writes sc_success_per_model.json
+    difficulty_stratified_dichotomy()  # prints the difficulty-stratified tables
+
+
+if __name__ == "__main__":
+    main()