diff options
| author | Yuren Hao <yurenh2@illinois.edu> | 2026-04-08 22:06:05 -0500 |
|---|---|---|
| committer | Yuren Hao <yurenh2@illinois.edu> | 2026-04-08 22:06:05 -0500 |
| commit | 05704d0eb2fa59fe727652465b07db40bcb06c38 (patch) | |
| tree | 8904aca836cf552fd1a5ae8c2174e9f91e70bbbc /analysis/rescue_pooled.py | |
Initial release: GAP framework
- Full pipeline: variant generation, multi-judge verification, evaluation
- Loaders for OpenAI / Anthropic / Google / xAI / OpenRouter / vLLM
- Framework-level mechanism analyses: paired structural overlap, repairability rescue, self-correction probe, cross-model agreement, topic x problem-type interaction
- Unicode -> bare-LaTeX cleaner + audit + spot-check
- Mirrors https://huggingface.co/datasets/blackhao0426/PutnamGAP
Diffstat (limited to 'analysis/rescue_pooled.py')
| -rw-r--r-- | analysis/rescue_pooled.py | 174 |
1 files changed, 174 insertions, 0 deletions
# File: analysis/rescue_pooled.py
"""Pooled rescue analysis for the rebuttal headline.

Reports:
1. Per-variant pooled rebound rates with Wilson 95% CI for each condition
2. Pooled McNemar (paired) tests across all models per variant
3. Pooled McNemar across the four surface variants (kernel variant separately)
4. Per-model rates pooled over the four surface variants

Items 1, 2 and 4 are additionally written to a JSON summary file.
"""
from __future__ import annotations

import json
import math
import statistics  # noqa: F401  (not used in the code below; kept as a file-level import)
from collections import defaultdict
from pathlib import Path

PATH = Path("/home/yurenh2/gap/analysis/rescue_results/rescue_30.jsonl")
OUT_PATH = Path("/home/yurenh2/gap/analysis/rescue_pooled_summary.json")

# Conditions read from the "condition" field of each input row.
_CONDITIONS = ("null", "canonical_T2", "own_T2")

# (a, b) pairs compared with McNemar's test; "b" counts a-correct/b-incorrect.
_MCNEMAR_PAIRS = (
    ("canonical_T2", "null"),
    ("own_T2", "null"),
    ("own_T2", "canonical_T2"),
)

# Variants pooled together in the "surface" sections of the report.
_SURFACE_VARIANTS = (
    "descriptive_long",
    "descriptive_long_confusing",
    "descriptive_long_misleading",
    "garbled_string",
)

# Reporting order for the per-variant table, and the short labels printed there.
_VARIANTS_ORDER = _SURFACE_VARIANTS + ("kernel_variant",)
_SHORT = {
    "descriptive_long": "DL",
    "descriptive_long_confusing": "DLC",
    "descriptive_long_misleading": "DLM",
    "garbled_string": "GS",
    "kernel_variant": "KV",
}


def wilson_ci(k: int, n: int, z: float = 1.96) -> tuple[float, float, float]:
    """Return ``(p, lo, hi)`` for ``k`` successes out of ``n`` trials.

    ``p`` is the raw proportion ``k / n`` and ``[lo, hi]`` is the Wilson score
    interval for critical value ``z`` (default 1.96), clamped to ``[0, 1]``.
    With ``n == 0`` there is nothing to estimate and ``(0.0, 0.0, 0.0)`` is
    returned.
    """
    if n == 0:
        return (0.0, 0.0, 0.0)
    p = k / n
    denom = 1 + z * z / n
    center = (p + z * z / (2 * n)) / denom
    half = z * math.sqrt(p * (1 - p) / n + z * z / (4 * n * n)) / denom
    return (p, max(0.0, center - half), min(1.0, center + half))


def mcnemar_p(b: int, c: int) -> float:
    """Return the exact two-sided McNemar p-value for discordant counts.

    ``b`` and ``c`` are the two discordant-pair counts.  The p-value is twice
    the Binomial(b + c, 0.5) lower tail at ``min(b, c)``, capped at 1.0.  With
    no discordant pairs the result is 1.0.
    """
    n = b + c
    if n == 0:
        return 1.0
    k = min(b, c)
    # Sum the binomial coefficients as exact integers and divide once at the
    # end: multiplying each coefficient by 0.5 ** n in floating point overflows
    # (huge coefficient -> float) or underflows (0.5 ** n -> 0.0) for large n.
    tail = sum(math.comb(n, i) for i in range(k + 1))
    return min(1.0, 2 * tail / (1 << n))


def _load_case_grades(path):
    """Read the JSONL file and group grades by (model, variant, index).

    Returns ``(case_grades, n_rows)`` where ``case_grades[key][condition]`` is
    the row's ``grade`` (``None`` when the field is absent) and ``n_rows`` is
    the number of JSON rows parsed.  Blank lines are skipped.
    """
    case_grades = defaultdict(dict)
    n_rows = 0
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            row = json.loads(line)
            key = (row["model"], row["variant"], row["index"])
            case_grades[key][row["condition"]] = row.get("grade")
            n_rows += 1
    return case_grades, n_rows


def _iter_grades(case_grades, variants, model=None):
    """Yield the per-condition grade dict of every case in ``variants``.

    When ``model`` is given, only that model's cases are yielded.
    """
    for (case_model, case_variant, _index), grades in case_grades.items():
        if case_variant not in variants:
            continue
        if model is not None and case_model != model:
            continue
        yield grades


def _pooled_counts(case_grades, variants, model=None):
    """Count graded cases (n) and CORRECT grades (k) for each condition."""
    counts = {cond: {"k": 0, "n": 0} for cond in _CONDITIONS}
    for grades in _iter_grades(case_grades, variants, model):
        for cond in _CONDITIONS:
            if cond in grades:
                counts[cond]["n"] += 1
                if grades[cond] == "CORRECT":
                    counts[cond]["k"] += 1
    return counts


def _discordant_counts(case_grades, variants, a, b):
    """Return McNemar discordant counts ``(b_count, c_count)`` for a vs b.

    ``b_count`` counts cases where condition ``a`` is CORRECT and ``b`` is
    INCORRECT; ``c_count`` counts the reverse.  Cases lacking a grade for
    either condition are ignored, as are grades other than those two labels.
    """
    b_count = c_count = 0
    for grades in _iter_grades(case_grades, variants):
        grade_a = grades.get(a)
        grade_b = grades.get(b)
        if grade_a is None or grade_b is None:
            continue
        if grade_a == "CORRECT" and grade_b == "INCORRECT":
            b_count += 1
        elif grade_a == "INCORRECT" and grade_b == "CORRECT":
            c_count += 1
    return b_count, c_count


def main(path: Path = PATH, out_path: Path = OUT_PATH) -> None:
    """Print the pooled rescue report and save the JSON summary.

    Args:
        path: JSONL file with one row per (model, variant, index, condition);
            each row must have ``model``, ``variant``, ``index`` and
            ``condition`` keys and may have ``grade``.
        out_path: Destination of the JSON summary (``per_variant`` and
            ``per_model`` sections).
    """
    case_grades, n_rows = _load_case_grades(path)
    print(f"Loaded {n_rows} rows\n")

    summary = {}

    print("=" * 92)
    print("HEADLINE: Rescue rebound by variant (pooled across 4 models)")
    print("=" * 92)
    print(f"{'Variant':<6} {'Condition':<14} {'k/n':>10} {'rate':>7} "
          f"{'95% Wilson CI':>20} {'Δ vs null':>11}")
    print("-" * 80)
    var_summary = {}
    for v in _VARIANTS_ORDER:
        # Pool counts across models
        counts = _pooled_counts(case_grades, (v,))
        null_p = counts["null"]["k"] / max(1, counts["null"]["n"])
        # Wilson CIs
        per_cond = {}
        for cond in _CONDITIONS:
            cell = counts[cond]
            if cell["n"] == 0:
                continue
            p, lo, hi = wilson_ci(cell["k"], cell["n"])
            delta = (p - null_p) * 100 if cond != "null" else 0.0
            per_cond[cond] = {"k": cell["k"], "n": cell["n"], "p": p,
                              "ci": [lo, hi], "delta_pp": delta}
            print(f"{_SHORT[v]:<6} {cond:<14} {cell['k']:>4}/{cell['n']:>4} "
                  f"{p*100:>5.1f}% [{lo*100:>5.1f}%, {hi*100:>5.1f}%] "
                  f"{'+' if delta > 0 else ('' if delta == 0 else '-')}{abs(delta):>5.1f} pp")
        # Pooled McNemar (own vs null, can vs null, own vs can)
        mc = {}
        for a, b in _MCNEMAR_PAIRS:
            b_count, c_count = _discordant_counts(case_grades, (v,), a, b)
            mc[f"{a}_vs_{b}"] = {"b": b_count, "c": c_count,
                                 "p": mcnemar_p(b_count, c_count)}
        var_summary[v] = {"per_cond": per_cond, "mcnemar": mc}
        print()

    summary["per_variant"] = var_summary

    # Pooled McNemar across all surface variants for canonical vs null and own vs null
    print("\n" + "=" * 92)
    print("POOLED McNEMAR (across all 4 surface variants × 4 models)")
    print("=" * 92)
    for a, b in _MCNEMAR_PAIRS:
        b_count, c_count = _discordant_counts(case_grades, _SURFACE_VARIANTS, a, b)
        p = mcnemar_p(b_count, c_count)
        n = b_count + c_count
        # max(1, c) only guards the division: when c == 0 the printed value is
        # b itself, i.e. a lower bound on the (unbounded) odds ratio.
        odds_ratio = b_count / max(1, c_count)
        print(f" {a:<14} > {b:<14} b={b_count:>4}, c={c_count:>4} "
              f"OR={odds_ratio:>4.2f} McNemar p={p:.2e} (n_discordant={n})")
    # KV separately
    print()
    for a, b in [("canonical_T2", "null")]:
        b_count, c_count = _discordant_counts(case_grades, ("kernel_variant",), a, b)
        p = mcnemar_p(b_count, c_count)
        odds_ratio = b_count / max(1, c_count)
        print(f" KV: {a:<14} > {b:<14} b={b_count:>4}, c={c_count:>4} "
              f"OR={odds_ratio:>4.2f} McNemar p={p:.2e}")

    # Per model summary
    print("\n" + "=" * 92)
    print("PER MODEL (averaged across 4 surface variants)")
    print("=" * 92)
    print(f"{'Model':<22} {'null':>10} {'canonical_T2':>14} {'own_T2':>10} "
          f"{'can-null':>10} {'own-null':>10}")
    per_model = {}
    for model in sorted({key[0] for key in case_grades}):
        cnts = _pooled_counts(case_grades, _SURFACE_VARIANTS, model)
        nul_p = cnts["null"]["k"] / max(1, cnts["null"]["n"])
        can_p = cnts["canonical_T2"]["k"] / max(1, cnts["canonical_T2"]["n"])
        own_p = cnts["own_T2"]["k"] / max(1, cnts["own_T2"]["n"])
        per_model[model] = {
            "null": {"k": cnts["null"]["k"], "n": cnts["null"]["n"], "p": nul_p},
            "canonical_T2": {"k": cnts["canonical_T2"]["k"], "n": cnts["canonical_T2"]["n"], "p": can_p},
            "own_T2": {"k": cnts["own_T2"]["k"], "n": cnts["own_T2"]["n"], "p": own_p},
            "can_minus_null_pp": (can_p - nul_p) * 100,
            "own_minus_null_pp": (own_p - nul_p) * 100,
        }
        print(f" {model:<20} {nul_p*100:>9.1f}% {can_p*100:>13.1f}% {own_p*100:>9.1f}% "
              f"{(can_p-nul_p)*100:>+9.1f}pp {(own_p-nul_p)*100:>+9.1f}pp")
    summary["per_model"] = per_model

    # Context manager so the summary is flushed and closed before reporting it.
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(summary, fh, indent=2)
    print(f"\nSaved -> {out_path}")


if __name__ == "__main__":
    main()
