diff options
Diffstat (limited to 'analysis/sc_success_and_difficulty.py')
| -rw-r--r-- | analysis/sc_success_and_difficulty.py | 192 |
1 files changed, 192 insertions, 0 deletions
diff --git a/analysis/sc_success_and_difficulty.py b/analysis/sc_success_and_difficulty.py new file mode 100644 index 0000000..a8b44db --- /dev/null +++ b/analysis/sc_success_and_difficulty.py @@ -0,0 +1,192 @@ +"""Two follow-up analyses (zero API): +1. Per-model self-correction success rate: P(correct | SC) vs P(correct | no SC) +2. Difficulty-stratified surface vs kernel dichotomy +""" +from __future__ import annotations +import json +import sys +import statistics +from pathlib import Path +from collections import defaultdict + +THIS_DIR = Path(__file__).resolve().parent +sys.path.insert(0, str(THIS_DIR)) +from structural_overlap import find_variant_file, load_problems, RESULTS_DIR, SURFACE_VARIANTS +from self_correction import has_self_correction + + +# ----------------- 1. SC success rate per model ----------------- + +def sc_success_rate(): + base = RESULTS_DIR + models = sorted([d.name for d in base.iterdir() if d.is_dir()]) + + print("=" * 80) + print("PER-MODEL SELF-CORRECTION SUCCESS RATE") + print("(does an SC attempt improve probability of being correct?)") + print("=" * 80) + print() + + rows = [] + for m in models: + mdir = base / m + # Aggregate over all variants + n_sc_correct = 0 + n_sc_total = 0 + n_nosc_correct = 0 + n_nosc_total = 0 + for v in ["original"] + SURFACE_VARIANTS + ["kernel_variant"]: + vp = find_variant_file(mdir, v) + if not vp: continue + for p in load_problems(vp): + text = (p.get("solve") or {}).get("solution") or "" + if not text: continue + correct = p.get("correct") + if correct is None: continue + if has_self_correction(text): + n_sc_total += 1 + if correct: n_sc_correct += 1 + else: + n_nosc_total += 1 + if correct: n_nosc_correct += 1 + if n_sc_total < 5 or n_nosc_total < 5: + continue + p_sc = n_sc_correct / n_sc_total + p_nosc = n_nosc_correct / n_nosc_total + delta = p_sc - p_nosc + # Wilson 95% CI on each rate + rows.append({ + "model": m, + "sc_n": n_sc_total, "sc_correct": n_sc_correct, "p_sc": p_sc, + "nosc_n": n_nosc_total, "nosc_correct": n_nosc_correct, "p_nosc": p_nosc, + "delta": delta, + }) + + rows.sort(key=lambda r: -r["sc_n"]) + print(f"{'Model':<22} {'#SC trials':>11} {'P(corr|SC)':>12} {'P(corr|noSC)':>13} {'Δ':>9}") + print("-" * 75) + for r in rows: + print(f"{r['model']:<22} {r['sc_n']:>11} " + f"{r['p_sc']*100:>10.1f}% {r['p_nosc']*100:>11.1f}% " + f"{r['delta']*100:>+7.1f}pp") + + json.dump(rows, open(THIS_DIR / "sc_success_per_model.json", "w"), indent=2) + return rows + + +# ----------------- 2. Difficulty stratified dichotomy ----------------- + +DATASET_DIR = Path("/home/yurenh2/gap/putnam-bench-anon/dataset") + +def load_difficulty_metadata(): + """Per-problem difficulty assignment using year/section/index heuristic. + + Per the paper's existing exposition, we derive Easy/Medium/Hard from the + problem index (1-2 = Easy, 3-4 = Medium, 5-6 = Hard, 7-8 = extra-hard tail) + because the dataset's `difficulty` field is heterogeneous. + """ + out = {} + for f in sorted(DATASET_DIR.glob("*.json")): + d = json.load(open(f)) + idx = d.get("index") + if not idx: continue + # Extract problem number from "YEAR-PART-NUM" + parts = idx.split("-") + if len(parts) != 3: continue + try: + num = int(parts[2]) + except ValueError: + continue + if num <= 2: bucket = "Easy" + elif num <= 4: bucket = "Medium" + elif num <= 6: bucket = "Hard" + else: bucket = "ExtraHard" + out[idx] = bucket + return out + + +def difficulty_stratified_dichotomy(): + print("\n\n" + "=" * 80) + print("DIFFICULTY-STRATIFIED ACCURACY (mean across 18 models)") + print("Easy/Medium/Hard buckets defined by problem index 1-2/3-4/5-6") + print("=" * 80) + print() + + diff = load_difficulty_metadata() + base = RESULTS_DIR + models = sorted([d.name for d in base.iterdir() if d.is_dir()]) + + # buckets[(model, variant, difficulty)] = (n, n_correct) + cells = defaultdict(lambda: [0, 0]) + for m in models: + mdir = base / m + for v in ["original"] + SURFACE_VARIANTS + ["kernel_variant"]: + vp = find_variant_file(mdir, v) + if not vp: continue + for p in load_problems(vp): + idx = p.get("index") + correct = p.get("correct") + if idx is None or correct is None: continue + bucket = diff.get(idx, "Unknown") + cells[(m, v, bucket)][0] += 1 + if correct: cells[(m, v, bucket)][1] += 1 + + # Aggregate per (variant, difficulty) by averaging per-model rates + print(f"{'Variant':<24} {'Easy':>8} {'Medium':>8} {'Hard':>8} {'XHard':>8}") + print("-" * 60) + for v in ["original"] + SURFACE_VARIANTS + ["kernel_variant"]: + row = {} + for bucket in ["Easy", "Medium", "Hard", "ExtraHard"]: + rates = [] + for m in models: + n, c = cells.get((m, v, bucket), [0, 0]) + if n >= 5: + rates.append(c / n) + row[bucket] = statistics.fmean(rates) * 100 if rates else None + print(f"{v:<24} " + f"{row['Easy']:>7.1f}% " if row['Easy'] is not None else f"{v:<24} {'-':>8}", + end="") + for bucket in ["Medium", "Hard", "ExtraHard"]: + print(f"{row[bucket]:>7.1f}% " if row[bucket] is not None else f"{'-':>8}", end="") + print() + + # Compute Δ_orig→KV per difficulty bucket + print(f"\n--- Δ original → KV per difficulty bucket ---") + for bucket in ["Easy", "Medium", "Hard", "ExtraHard"]: + orig_rates = [] + kv_rates = [] + for m in models: + no, co = cells.get((m, "original", bucket), [0, 0]) + nk, ck = cells.get((m, "kernel_variant", bucket), [0, 0]) + if no >= 5 and nk >= 5: + orig_rates.append(co / no) + kv_rates.append(ck / nk) + if orig_rates: + mo = statistics.fmean(orig_rates) * 100 + mk = statistics.fmean(kv_rates) * 100 + print(f" {bucket:<10} orig={mo:5.1f}% kv={mk:5.1f}% Δ={mk-mo:+.1f}pp") + + # Compute Δ_orig→GS per difficulty bucket + print(f"\n--- Δ original → GS (surface, hardest renamer) per difficulty bucket ---") + for bucket in ["Easy", "Medium", "Hard", "ExtraHard"]: + orig_rates = [] + gs_rates = [] + for m in models: + no, co = cells.get((m, "original", bucket), [0, 0]) + ng, cg = cells.get((m, "garbled_string", bucket), [0, 0]) + if no >= 5 and ng >= 5: + orig_rates.append(co / no) + gs_rates.append(cg / ng) + if orig_rates: + mo = statistics.fmean(orig_rates) * 100 + mg = statistics.fmean(gs_rates) * 100 + print(f" {bucket:<10} orig={mo:5.1f}% GS={mg:5.1f}% Δ={mg-mo:+.1f}pp") + + +def main(): + sc_success_rate() + difficulty_stratified_dichotomy() + + +if __name__ == "__main__": + main() |
