summary | refs | log | tree | commit | diff
path: root/scripts/summarize_baseline.py
diff options
context:
space:
mode:
author Yuren Hao <yurenh2@timan108.cs.illinois.edu> 2025-09-10 12:41:28 -0500
committer Yuren Hao <yurenh2@timan108.cs.illinois.edu> 2025-09-10 12:41:28 -0500
commit 9d5f2379ac25b4b58e2600544f61172dbb15b67a (patch)
tree 17a945ad194f50523c9ef25011cc13db22285bce /scripts/summarize_baseline.py
parent 5bfd92f6c28530482a765252a4497cfedacad25a (diff)
fix ctf
Diffstat (limited to 'scripts/summarize_baseline.py')
-rw-r--r-- scripts/summarize_baseline.py 39
1 files changed, 39 insertions, 0 deletions
diff --git a/scripts/summarize_baseline.py b/scripts/summarize_baseline.py
new file mode 100644
index 0000000..efe4fc7
--- /dev/null
+++ b/scripts/summarize_baseline.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+import json, os, sys, pathlib, datetime as dt
+
+def loadj(p):
+    """Load and parse the JSON file at path ``p``.
+
+    Returns the decoded JSON value, or None when the file cannot be opened
+    or read, or when its content is not valid UTF-8 encoded JSON. Callers
+    in this script treat None as "no metrics available" and skip the entry.
+    """
+    try:
+        with open(p, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    except (OSError, ValueError):
+        # OSError: missing/unreadable file. ValueError: json.JSONDecodeError
+        # and UnicodeDecodeError are both subclasses of it.
+        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
+        return None
+
+def _fmt(metrics, key, spec):
+    """Format ``metrics[key]`` with format spec ``spec``; 'n/a' if absent or not formattable."""
+    try:
+        return format(metrics[key], spec)
+    except (KeyError, TypeError, ValueError):
+        # KeyError: field missing. TypeError: metrics is not a mapping or the
+        # value is None. ValueError: value is not numeric (e.g. a string).
+        return "n/a"
+
+def main(root):
+    """Write ``<root>/summary.md`` summarizing the baseline metrics found under ``root``.
+
+    Reads these files relative to ``root`` (each one is optional):
+    ``bias/ctf/metrics.json``, ``bias/crows/metrics.json``,
+    ``bias/wino/metrics.json``, ``main/math/metrics.json`` and
+    ``main/ppl/metrics.json``.
+
+    A metrics file that is missing, unparsable, or empty contributes no line
+    to the summary. A metrics file that is present but lacks an expected
+    field (or holds a non-numeric value there) still gets its line, with
+    ``n/a`` in place of the field, so one incomplete file no longer aborts
+    the whole summary.
+
+    Args:
+        root: Directory (str or path-like) holding the evaluation outputs;
+            it is created if it does not exist.
+
+    Side effects: writes ``summary.md`` (UTF-8) and prints its path.
+    """
+    root = pathlib.Path(root)
+    out = root/"summary.md"
+    bias_ctf = loadj(root/"bias/ctf/metrics.json")
+    bias_crows = loadj(root/"bias/crows/metrics.json")
+    bias_wino = loadj(root/"bias/wino/metrics.json")
+    main_math = loadj(root/"main/math/metrics.json")
+    main_ppl = loadj(root/"main/ppl/metrics.json")
+
+    lines = ["# Baseline Summary",
+             f"- Generated: {dt.datetime.now().isoformat(timespec='seconds')}",
+             "", "## Bias"]
+    if bias_ctf:
+        lines.append(f"- **CTF-gap**: {_fmt(bias_ctf, 'CTF_gap_mean', '.6f')} ± {_fmt(bias_ctf, 'CTF_gap_ci95', '.6f')} (coverage={_fmt(bias_ctf, 'coverage', '.2f')})")
+        lines.append(f"- **JSD_swap**: {_fmt(bias_ctf, 'JSD_swap_mean', '.6f')} ± {_fmt(bias_ctf, 'JSD_swap_ci95', '.6f')}")
+    if bias_crows:
+        lines.append(f"- **CrowS ΔlogP** (anti−stereo): {_fmt(bias_crows, 'delta_logP_mean', '.6f')} ± {_fmt(bias_crows, 'delta_logP_ci95', '.6f')}")
+    if bias_wino:
+        lines.append(f"- **Wino Acc**: {_fmt(bias_wino, 'acc', '.3f')} ± {_fmt(bias_wino, 'acc_ci95', '.3f')}")
+    lines += ["", "## Main"]
+    if main_math:
+        lines.append(f"- **MATH EM**: {_fmt(main_math, 'acc', '.3f')} ± {_fmt(main_math, 'acc_ci95', '.3f')}")
+    if main_ppl:
+        lines.append(f"- **PPL**: {_fmt(main_ppl, 'ppl', '.2f')}")
+    # root itself may not exist yet when no evaluation output was produced.
+    out.parent.mkdir(parents=True, exist_ok=True)
+    out.write_text("\n".join(lines)+"\n", encoding='utf-8')
+    print("Wrote", out)
+
+if __name__ == "__main__":
+    # usage: python scripts/summarize_baseline.py runs/20250910/baseline_eval
+    # With no path argument, the summary is built from the "runs" directory.
+    cli_args = sys.argv[1:]
+    main(cli_args[0] if cli_args else "runs")