analysis/topic_problemtype_interaction.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112

"""KV fragility broken down by Topic × Problem-type (proof vs calculation)."""
from __future__ import annotations
import json
import sys
import statistics
from pathlib import Path
from collections import defaultdict

THIS_DIR = Path(__file__).resolve().parent
sys.path.insert(0, str(THIS_DIR))
from structural_overlap import find_variant_file, load_problems, RESULTS_DIR, SURFACE_VARIANTS

DATASET_DIR = Path("/home/yurenh2/gap/putnam-bench-anon/dataset")


def load_metadata():
    out = {}
    for f in sorted(DATASET_DIR.glob("*.json")):
        d = json.load(open(f))
        idx = d.get("index")
        if not idx: continue
        out[idx] = {
            "tag": d.get("tag"),
            "problem_type": d.get("problem_type"),
        }
    return out


def main():
    metadata = load_metadata()
    base = RESULTS_DIR
    models = sorted([d.name for d in base.iterdir() if d.is_dir()])

    # cells[(topic, ptype, model, variant)] = (n, n_correct)
    cells = defaultdict(lambda: [0, 0])
    for m in models:
        mdir = base / m
        for v in ["original"] + SURFACE_VARIANTS + ["kernel_variant"]:
            vp = find_variant_file(mdir, v)
            if not vp: continue
            for p in load_problems(vp):
                idx = p.get("index")
                correct = p.get("correct")
                if idx is None or correct is None: continue
                md = metadata.get(idx, {})
                tag = md.get("tag")
                ptype = md.get("problem_type")
                if not tag or not ptype: continue
                tags = tag if isinstance(tag, list) else [tag]
                for t in tags:
                    if t not in ["ALG", "ANA", "NT", "COMB", "GEO"]: continue
                    cells[(t, ptype, m, v)][0] += 1
                    if correct: cells[(t, ptype, m, v)][1] += 1

    print("=" * 80)
    print("ACCURACY BY TOPIC × PROBLEM-TYPE × VARIANT (mean across 18 models)")
    print("=" * 80)
    print()

    for ptype in ["proof", "calculation"]:
        print(f"\n--- {ptype.upper()} ---\n")
        print(f"{'Topic':<6}", end="")
        for v in ["original", "garbled_string", "kernel_variant"]:
            short = {"original":"orig","garbled_string":"GS","kernel_variant":"KV"}[v]
            print(f"  {short:>6}", end="")
        print(f"  {'Δ_GS':>7} {'Δ_KV':>7}")
        print("-" * 50)
        for t in ["ALG", "ANA", "NT", "COMB", "GEO"]:
            orig_rates = []
            gs_rates = []
            kv_rates = []
            for m in models:
                no, co = cells.get((t, ptype, m, "original"), [0, 0])
                ng, cg = cells.get((t, ptype, m, "garbled_string"), [0, 0])
                nk, ck = cells.get((t, ptype, m, "kernel_variant"), [0, 0])
                if no >= 5 and ng >= 5 and nk >= 5:
                    orig_rates.append(co / no)
                    gs_rates.append(cg / ng)
                    kv_rates.append(ck / nk)
            if not orig_rates: continue
            mo = statistics.fmean(orig_rates) * 100
            mg = statistics.fmean(gs_rates) * 100
            mk = statistics.fmean(kv_rates) * 100
            print(f"{t:<6}  {mo:>5.1f}% {mg:>5.1f}% {mk:>5.1f}%  {mg-mo:>+5.1f}pp {mk-mo:>+5.1f}pp")

    print("\n\n=== KEY DIFFERENTIAL: Δ KV by Topic for proof vs calculation ===\n")
    print(f"{'Topic':<6}  {'proof Δ':>10} {'calc Δ':>10} {'(calc - proof)':>16}")
    print("-" * 50)
    for t in ["ALG", "ANA", "NT", "COMB", "GEO"]:
        deltas = {}
        for ptype in ["proof", "calculation"]:
            orig_rates = []
            kv_rates = []
            for m in models:
                no, co = cells.get((t, ptype, m, "original"), [0, 0])
                nk, ck = cells.get((t, ptype, m, "kernel_variant"), [0, 0])
                if no >= 5 and nk >= 5:
                    orig_rates.append(co / no)
                    kv_rates.append(ck / nk)
            if orig_rates:
                deltas[ptype] = (statistics.fmean(kv_rates) - statistics.fmean(orig_rates)) * 100
        if "proof" in deltas and "calculation" in deltas:
            diff = deltas["calculation"] - deltas["proof"]
            print(f"{t:<6}  {deltas['proof']:>+9.1f}pp {deltas['calculation']:>+9.1f}pp {diff:>+15.1f}pp")
        elif "proof" in deltas:
            print(f"{t:<6}  {deltas['proof']:>+9.1f}pp {'-':>10} {'-':>16}")
        elif "calculation" in deltas:
            print(f"{t:<6}  {'-':>10} {deltas['calculation']:>+9.1f}pp {'-':>16}")


if __name__ == "__main__":
    main()