blob: efe4fc73e4f96266163e5afddf0d1897daea6348 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
|
#!/usr/bin/env python3
import json, os, sys, pathlib, datetime as dt
def loadj(p):
    """Load and return the JSON document stored at path ``p``.

    This is a best-effort reader: a file that is missing, unreadable, not
    valid UTF-8, or not valid JSON yields ``None`` instead of an exception,
    so callers can treat an absent metrics file as "no data".

    Args:
        p: Path of the JSON file (``str`` or ``os.PathLike``).

    Returns:
        The decoded JSON value, or ``None`` if the file could not be read
        or parsed.

    Raises:
        TypeError: If ``p`` is not a valid path argument for ``open`` —
            this is a caller bug and is deliberately not swallowed.
    """
    try:
        with open(p, "r", encoding="utf-8") as f:
            return json.load(f)
    # OSError covers missing/unreadable files; ValueError covers both
    # json.JSONDecodeError and UnicodeDecodeError. A bare ``except`` would
    # also swallow KeyboardInterrupt/SystemExit and hide programming errors.
    except (OSError, ValueError):
        return None
def _fmt_metric(metrics, key, spec):
    """Return ``metrics[key]`` formatted with ``spec``, or ``"n/a"`` if unusable."""
    try:
        return format(metrics.get(key), spec)
    except (TypeError, ValueError):
        # Missing key (-> None) or a non-numeric value: show a placeholder
        # rather than aborting the whole summary over one bad field.
        return "n/a"


def main(root):
    """Write ``summary.md`` under ``root`` from the baseline metrics files.

    Reads these files relative to ``root`` via ``loadj``:
    ``bias/ctf/metrics.json``, ``bias/crows/metrics.json``,
    ``bias/wino/metrics.json``, ``main/math/metrics.json`` and
    ``main/ppl/metrics.json``. A file that fails to load, is empty, or
    whose top-level JSON value is not an object is skipped. Within a
    loaded file, a missing or non-numeric metric is rendered as ``n/a``.

    Args:
        root: Directory holding the evaluation outputs (``str`` or path).
            It is created if it does not exist.

    Returns:
        None. The Markdown summary is written to ``root/summary.md`` and
        its path is printed to stdout.
    """
    root = pathlib.Path(root)
    out = root / "summary.md"

    def load_metrics(rel_path):
        # Keep only non-empty JSON objects; anything else counts as "no data".
        data = loadj(root / rel_path)
        return data if isinstance(data, dict) and data else None

    bias_ctf = load_metrics("bias/ctf/metrics.json")
    bias_crows = load_metrics("bias/crows/metrics.json")
    bias_wino = load_metrics("bias/wino/metrics.json")
    main_math = load_metrics("main/math/metrics.json")
    main_ppl = load_metrics("main/ppl/metrics.json")

    fmt = _fmt_metric
    lines = [
        "# Baseline Summary",
        f"- Generated: {dt.datetime.now().isoformat(timespec='seconds')}",
        "",
        "## Bias",
    ]
    if bias_ctf:
        lines.append(
            f"- **CTF-gap**: {fmt(bias_ctf, 'CTF_gap_mean', '.6f')}"
            f" ± {fmt(bias_ctf, 'CTF_gap_ci95', '.6f')}"
            f" (coverage={fmt(bias_ctf, 'coverage', '.2f')})"
        )
        lines.append(
            f"- **JSD_swap**: {fmt(bias_ctf, 'JSD_swap_mean', '.6f')}"
            f" ± {fmt(bias_ctf, 'JSD_swap_ci95', '.6f')}"
        )
    if bias_crows:
        lines.append(
            f"- **CrowS ΔlogP** (anti−stereo): {fmt(bias_crows, 'delta_logP_mean', '.6f')}"
            f" ± {fmt(bias_crows, 'delta_logP_ci95', '.6f')}"
        )
    if bias_wino:
        lines.append(
            f"- **Wino Acc**: {fmt(bias_wino, 'acc', '.3f')}"
            f" ± {fmt(bias_wino, 'acc_ci95', '.3f')}"
        )

    lines += ["", "## Main"]
    if main_math:
        lines.append(
            f"- **MATH EM**: {fmt(main_math, 'acc', '.3f')}"
            f" ± {fmt(main_math, 'acc_ci95', '.3f')}"
        )
    if main_ppl:
        lines.append(f"- **PPL**: {fmt(main_ppl, 'ppl', '.2f')}")

    # root may not exist yet when no evaluation has produced output.
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text("\n".join(lines) + "\n", encoding="utf-8")
    print("Wrote", out)
if __name__ == "__main__":
    # usage: python scripts/summarize_baseline.py runs/20250910/baseline_eval
    # With no argument given, summarize the default "runs" directory.
    cli_args = sys.argv[1:]
    main(cli_args[0] if cli_args else "runs")
|