1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
|
"""KV fragility broken down by Topic × Problem-type (proof vs calculation)."""
from __future__ import annotations
import json
import sys
import statistics
from pathlib import Path
from collections import defaultdict
# Directory containing this script. It is put first on sys.path so that the
# import on the next line resolves before any other module search location.
# NOTE(review): presumably structural_overlap.py sits next to this file — confirm.
THIS_DIR = Path(__file__).resolve().parent
sys.path.insert(0, str(THIS_DIR))
from structural_overlap import find_variant_file, load_problems, RESULTS_DIR, SURFACE_VARIANTS
# Absolute, machine-specific directory whose *.json files are read by
# load_metadata() (one JSON document per file).
DATASET_DIR = Path("/home/yurenh2/gap/putnam-bench-anon/dataset")
def load_metadata(dataset_dir: "Path | None" = None) -> dict:
    """Load the topic tag and problem type of every problem in the dataset.

    Args:
        dataset_dir: Directory whose ``*.json`` files are read. Defaults to
            the module-level ``DATASET_DIR`` when omitted or ``None``.

    Returns:
        A dict mapping each file's ``"index"`` value to
        ``{"tag": ..., "problem_type": ...}``. Files are visited in sorted
        path order. Files whose ``"index"`` is missing or falsy are skipped;
        a missing ``"tag"`` or ``"problem_type"`` is stored as ``None``.
    """
    root = DATASET_DIR if dataset_dir is None else Path(dataset_dir)
    out = {}
    for path in sorted(root.glob("*.json")):
        # Open in binary mode inside a context manager: the handle is closed
        # deterministically, and json detects the UTF-8/16/32 encoding itself
        # instead of depending on the platform's locale default.
        with path.open("rb") as fh:
            record = json.load(fh)
        idx = record.get("index")
        if not idx:
            continue
        out[idx] = {
            "tag": record.get("tag"),
            "problem_type": record.get("problem_type"),
        }
    return out
# Topic tags that get a row in the reports; problems carrying any other tag
# are ignored for that tag.
_TOPICS = ("ALG", "ANA", "NT", "COMB", "GEO")
# Problem types reported, in print order.
_PROBLEM_TYPES = ("proof", "calculation")
# A model contributes to a table row only when every variant compared in that
# row has at least this many graded problems for the (topic, type) cell.
_MIN_SAMPLES = 5


def _tally_cells(metadata, base, models):
    """Count graded problems per (topic, problem_type, model, variant).

    Returns a defaultdict mapping that 4-tuple to ``[n, n_correct]``.
    """
    cells = defaultdict(lambda: [0, 0])
    variants = ["original", *SURFACE_VARIANTS, "kernel_variant"]
    for model in models:
        model_dir = base / model
        for variant in variants:
            variant_path = find_variant_file(model_dir, variant)
            if not variant_path:
                continue
            for problem in load_problems(variant_path):
                idx = problem.get("index")
                correct = problem.get("correct")
                # Entries without an index or without a grade are not counted.
                if idx is None or correct is None:
                    continue
                md = metadata.get(idx, {})
                tag = md.get("tag")
                ptype = md.get("problem_type")
                if not tag or not ptype:
                    continue
                # A tag may be a single value or a list; a problem counts once
                # under each of its tags.
                tags = tag if isinstance(tag, list) else [tag]
                for topic in tags:
                    if topic not in _TOPICS:
                        continue
                    cell = cells[(topic, ptype, model, variant)]
                    cell[0] += 1
                    if correct:
                        cell[1] += 1
    return cells


def _paired_rates(cells, topic, ptype, models, variants):
    """Return ``{variant: [accuracy per model]}`` over models with enough data.

    A model is included only if it has at least ``_MIN_SAMPLES`` problems for
    every requested variant, so all returned lists have equal length.
    """
    rates = {variant: [] for variant in variants}
    for model in models:
        # .get() avoids inserting empty cells into the defaultdict.
        counts = [cells.get((topic, ptype, model, variant), [0, 0]) for variant in variants]
        if all(n >= _MIN_SAMPLES for n, _ in counts):
            for variant, (n, n_correct) in zip(variants, counts):
                rates[variant].append(n_correct / n)
    return rates


def _print_accuracy_tables(cells, models):
    """Print, per problem type, mean accuracy by topic for orig / GS / KV."""
    variants = ("original", "garbled_string", "kernel_variant")
    short_names = {"original": "orig", "garbled_string": "GS", "kernel_variant": "KV"}
    print("=" * 80)
    # Report the number of model directories actually found rather than a
    # fixed figure; individual rows may average fewer models because of the
    # minimum-sample filter.
    print(f"ACCURACY BY TOPIC × PROBLEM-TYPE × VARIANT (mean across {len(models)} models)")
    print("=" * 80)
    print()
    for ptype in _PROBLEM_TYPES:
        print(f"\n--- {ptype.upper()} ---\n")
        print(f"{'Topic':<6}", end="")
        for variant in variants:
            print(f" {short_names[variant]:>6}", end="")
        print(f" {'Δ_GS':>7} {'Δ_KV':>7}")
        print("-" * 50)
        for topic in _TOPICS:
            rates = _paired_rates(cells, topic, ptype, models, variants)
            if not rates["original"]:
                continue
            mo = statistics.fmean(rates["original"]) * 100
            mg = statistics.fmean(rates["garbled_string"]) * 100
            mk = statistics.fmean(rates["kernel_variant"]) * 100
            print(f"{topic:<6} {mo:>5.1f}% {mg:>5.1f}% {mk:>5.1f}% {mg-mo:>+5.1f}pp {mk-mo:>+5.1f}pp")


def _print_kv_differential(cells, models):
    """Print the KV-minus-original accuracy change per topic, proof vs calculation."""
    print("\n\n=== KEY DIFFERENTIAL: Δ KV by Topic for proof vs calculation ===\n")
    print(f"{'Topic':<6} {'proof Δ':>10} {'calc Δ':>10} {'(calc - proof)':>16}")
    print("-" * 50)
    for topic in _TOPICS:
        deltas = {}
        for ptype in _PROBLEM_TYPES:
            rates = _paired_rates(cells, topic, ptype, models, ("original", "kernel_variant"))
            if rates["original"]:
                deltas[ptype] = (
                    statistics.fmean(rates["kernel_variant"]) - statistics.fmean(rates["original"])
                ) * 100
        if "proof" in deltas and "calculation" in deltas:
            diff = deltas["calculation"] - deltas["proof"]
            print(f"{topic:<6} {deltas['proof']:>+9.1f}pp {deltas['calculation']:>+9.1f}pp {diff:>+15.1f}pp")
        elif "proof" in deltas:
            print(f"{topic:<6} {deltas['proof']:>+9.1f}pp {'-':>10} {'-':>16}")
        elif "calculation" in deltas:
            print(f"{topic:<6} {'-':>10} {deltas['calculation']:>+9.1f}pp {'-':>16}")


def main():
    """Print accuracy by topic × problem type × variant, then the KV differential.

    Every sub-directory of ``RESULTS_DIR`` is treated as one model. For each
    model and each variant (original, the surface variants, kernel variant)
    graded problems are tallied per topic tag and problem type, and two text
    reports are written to stdout.
    """
    metadata = load_metadata()
    base = RESULTS_DIR
    models = sorted(d.name for d in base.iterdir() if d.is_dir())
    cells = _tally_cells(metadata, base, models)
    _print_accuracy_tables(cells, models)
    _print_kv_differential(cells, models)
# Script entry point: produce the reports only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|