summaryrefslogtreecommitdiff
path: root/ep_run/watch_runs.py
diff options
context:
space:
mode:
authorYuren Hao <yurenh2@illinois.edu>2026-07-03 05:56:50 -0500
committerYuren Hao <yurenh2@illinois.edu>2026-07-03 05:56:50 -0500
commitb83947778e2c776f757a07d4719b7ce961d7ed55 (patch)
treeb9cc01d7adda691d9156d9d04f4fb2f644674e96 /ep_run/watch_runs.py
Initial commit: ept — backprop-free equilibrium transformer (EP)
Code (ep_run/), organized docs (docs/{method,campaign,hardware,outreach,paper}), analysis scripts (scripts/), ONBOARDING.md entry point. Large data/checkpoints git-ignored (share separately). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_014FAPDWQ49M5Ye3NpTndTpn
Diffstat (limited to 'ep_run/watch_runs.py')
-rw-r--r--ep_run/watch_runs.py39
1 files changed, 39 insertions, 0 deletions
diff --git a/ep_run/watch_runs.py b/ep_run/watch_runs.py
new file mode 100644
index 0000000..92acf16
--- /dev/null
+++ b/ep_run/watch_runs.py
@@ -0,0 +1,39 @@
+"""Watch specnorm (contraction test) + bptt_ctrl (premise test). Fires on a decisive event."""
+import time, os, re
# Runs to watch, one (tag, log file path, PID) triple per run.
# The tag selects which thresholds the watch loop applies, the log file is
# scanned for "step ..." lines, and the PID is probed for liveness.
RUNS = [
    (
        "specnorm",
        "/home/yurenh2/ept/ep_run/runs/ep_specnorm09_scratch.log",
        1435898,
    ),
    (
        "bptt",
        "/home/yurenh2/ept/ep_run/runs/bptt_ctrl.log",
        1511172,
    ),
]
def alive(pid):
    """Return True if a process with the given PID currently exists.

    Probes the process with ``os.kill(pid, 0)``: signal 0 performs the
    existence and permission checks without delivering a signal.

    Args:
        pid: Process ID to probe.

    Returns:
        True if the process exists, including when it belongs to another
        user and the probe is refused with ``PermissionError``. False if
        there is no such process or ``pid`` is not a usable process ID.
    """
    try:
        os.kill(pid, 0)
    except PermissionError:
        # EPERM: the process exists, we are just not allowed to signal it.
        return True
    except (OSError, OverflowError, TypeError, ValueError):
        # No such process, or pid cannot be interpreted as a process ID.
        return False
    return True
# Matches e.g. "step 200/20000 ... val CE 2.4000 ... res=9.00e-03".
_STEP_RE = re.compile(r"step (\d+)/.*val CE ([\d.eE+-]+).*res=([\d.eE+-]+)")


def latest(log):
    """Return the most recent parseable step record from a training log.

    Only lines that start with ``"step"`` are considered. They are scanned
    from newest to oldest and the first one that matches the step pattern
    and has numeric metrics wins, so a half-written trailing line or a step
    line without validation metrics does not hide earlier valid data.

    Args:
        log: Path of the log file to read.

    Returns:
        A tuple ``(step, val_ce, res, line)`` where ``line`` is the matched
        log line with surrounding whitespace stripped, or ``None`` if the
        file does not exist or contains no parseable step line.
    """
    try:
        # ``with`` closes the handle promptly; undecodable bytes are replaced
        # rather than raising, since the log may be mid-write.
        with open(log, errors="replace") as fh:
            step_lines = [line for line in fh if line.startswith("step")]
    except FileNotFoundError:
        return None

    for line in reversed(step_lines):
        m = _STEP_RE.search(line)
        if m is None:
            continue
        try:
            return (
                int(m.group(1)),
                float(m.group(2)),
                float(m.group(3)),
                line.strip(),
            )
        except ValueError:
            # The character class also accepts non-numbers such as "1.2.3";
            # skip such a line instead of crashing the watcher.
            continue
    return None
def status():
    """Build a multi-line summary of every run listed in ``RUNS``.

    Returns:
        One line per run, joined with newlines, in the form
        ``[tag] ALIVE|DEAD | <last parsed step line or 'no steps'>``.
    """
    report = []
    for tag, log_path, pid in RUNS:
        parsed = latest(log_path)
        state = "ALIVE" if alive(pid) else "DEAD"
        detail = parsed[3] if parsed else "no steps"
        report.append(f"[{tag}] {state} | {detail}")
    return "\n".join(report)
def _metric_trigger(tag, step, val, res):
    """Return the trigger message for a run's latest metrics, or None.

    Args:
        tag: Run tag from ``RUNS`` ("specnorm" or "bptt").
        step: Step number parsed from the latest step line.
        val: Validation CE parsed from that line.
        res: Residual (``res=``) parsed from that line.

    Returns:
        A human-readable trigger string if a threshold is crossed, else None.
        Checks are evaluated in order; the first one that matches wins.
    """
    if tag == "specnorm":
        if res > 0.2 or val > 15:
            return f"specnorm DIVERGED res={res:.2e} val={val:.2f} step {step}"
        if step >= 10200 and res < 0.06 and val < 2.5:
            return f"specnorm CLEARED step {step} val {val:.4f} res {res:.2e}"
    elif tag == "bptt":
        if val < 1.95:
            return f"bptt reached GOOD loss val {val:.4f} (res {res:.2e}) step {step} -> READY to probe rho(S^-1 A) at the good solution"
        if res > 0.25:
            return f"bptt res HIGH res={res:.2e} val={val:.2f} step {step} (BPTT riding a NON-converged state?)"
    return None


# Poll every 180 s for up to 15 h, stopping at the first decisive event.
# A monotonic clock is used for the budget so wall-clock adjustments
# (NTP steps, manual changes) cannot shorten or extend the watch window.
t0 = time.monotonic()
fired = None
while fired is None and time.monotonic() - t0 < 15 * 3600:
    for tag, log_path, pid in RUNS:
        d = latest(log_path)
        if d:
            step, val, res, _ = d
            fired = _metric_trigger(tag, step, val, res)
            if fired:
                break
        # Metric triggers take precedence; a dead process is only reported
        # when the latest metrics crossed no threshold (or none were parsed).
        if not alive(pid):
            fired = f"{tag} process EXITED; last: {d[3] if d else 'no steps (early crash/OOM?)'}"
            break
    if fired:
        # Skip the sleep so the report is printed immediately.
        break
    time.sleep(180)

print("=== WATCHER FIRED ===")
print("trigger:", fired or "15h timeout, no decisive event")
print(status())