summary | refs | log | tree | commit | diff
path: root/ep_run/ep_sn_monitor.py
diff options
context:
space:
mode:
Diffstat (limited to 'ep_run/ep_sn_monitor.py')
-rw-r--r-- ep_run/ep_sn_monitor.py 43
1 files changed, 43 insertions, 0 deletions
diff --git a/ep_run/ep_sn_monitor.py b/ep_run/ep_sn_monitor.py
new file mode 100644
index 0000000..523fd6d
--- /dev/null
+++ b/ep_run/ep_sn_monitor.py
@@ -0,0 +1,43 @@
+"""Combined monitor for ep_sn (hr=0.2 + specnorm 0.9): watch loss + probe cos per ckpt.
+Fire on: diverge / val<2.0 (cleared wall) / cos<0.82 (specnorm not holding gradient) / exit."""
+import time, os, re, subprocess, shutil
+# Directory the monitor runs from; every relative path below is resolved against it.
+WD = "/home/yurenh2/ept/ep_run"
+os.chdir(WD)
+# Log that is polled for the newest "step ..." line (step, val CE, res).
+LOG = "runs/ep_sn.log"
+# Checkpoint that gets snapshotted before each probe run.
+CK = "runs/ep_sn.pt"
+# Snapshot copy of CK that is handed to asym_probe.py via --ckpt.
+FROZEN = "runs/ep_sn_cosprobe.pt"
+# Append-only log of the per-checkpoint cos / val / res readings.
+COSLOG = "runs/cos_monitor_sn.log"
+# PID whose liveness is checked on every poll (presumably the ep_sn training process — confirm).
+PID = 2428946
+# Second log, only read for the final summary line.
+BLOG = "runs/bptt_clean.log"
+def alive(p):
+    """Return True if a process with PID ``p`` currently exists.
+
+    Uses ``os.kill(p, 0)``: signal 0 delivers nothing and only performs the
+    existence/permission check.
+    """
+    try:
+        os.kill(p, 0)
+    except PermissionError:
+        # The process exists but is owned by another user, so it is alive;
+        # treating this as "dead" would make the monitor report a false exit.
+        return True
+    except (OSError, OverflowError):
+        # ProcessLookupError (no such PID) or a PID the OS cannot represent.
+        return False
+    return True
+def latest(log):
+    """Parse the newest progress line of the log file ``log``.
+
+    Only the last line that starts with "step" is considered.
+
+    Returns:
+        ``(step, val_ce, res)`` as ``(int, float, float)``, or None when the
+        file cannot be read, contains no "step" line, or that line lacks
+        parseable ``val CE`` / ``res=`` fields.
+    """
+    newest = None
+    try:
+        # Context manager so the handle is closed on every poll; keep only the
+        # last matching line instead of materialising all of them.
+        with open(log) as fh:
+            for l in fh:
+                if l.startswith("step"):
+                    newest = l
+    except (OSError, UnicodeError):
+        return None
+    if newest is None:
+        return None
+    # NOTE(review): the numeric character classes do not match "nan"/"inf",
+    # so a line reporting such values yields None rather than a tuple.
+    m = re.search(r"step (\d+)/.*val CE ([\d.eE+-]+).*res=([\d.eE+-]+)", newest)
+    if not m:
+        return None
+    try:
+        return (int(m.group(1)), float(m.group(2)), float(m.group(3)))
+    except ValueError:
+        # The character classes also accept non-numbers such as "1e-3." or
+        # "-"; treat those as an unparseable line instead of crashing.
+        return None
+# Main monitor: poll every 120 s for up to 18 h and stop at the first trigger
+# (process exit, divergence, val < 2.0, or probe cos < 0.82).
+with open(COSLOG, "a") as fh:
+    fh.write("# ep_sn monitor (hr=0.2 + specnorm 0.9)\n")
+last = -1      # step of the most recent checkpoint that was probed
+fired = None   # trigger message; stays None if the 18 h budget runs out
+# Monotonic clock so the 18 h deadline is unaffected by wall-clock adjustments.
+t0 = time.monotonic()
+while fired is None and time.monotonic() - t0 < 18 * 3600:
+    time.sleep(120)
+    if not alive(PID):
+        fired = f"ep_sn EXITED; last {latest(LOG)}"
+        break
+    d = latest(LOG)
+    if not d:
+        continue
+    step, val, res = d
+    if res > 0.2 or val > 15:
+        fired = f"ep_sn DIVERGED step {step} val {val:.2f} res {res:.2e} — specnorm did NOT prevent it"
+        break
+    if val < 2.0:
+        fired = f"ep_sn reached val {val:.4f} step {step} res {res:.2e} — CLEARED the wall (hr+specnorm worked)"
+        break
+    # Probe at most once per 450 steps.
+    if step < last + 450:
+        continue
+    try:
+        # A missing checkpoint raises OSError here, so no separate exists()
+        # check (and no race with the file being replaced in between).
+        if os.path.getsize(CK) <= 1e6:
+            continue
+        # Snapshot the checkpoint so the probe reads a file that no longer changes.
+        shutil.copy2(CK, FROZEN)
+    except OSError:
+        # Best effort: retry on the next poll.
+        continue
+    last = step
+    env = dict(os.environ, CUDA_VISIBLE_DEVICES="0", PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")
+    cosv = "?"
+    try:
+        r = subprocess.run(["python3", "asym_probe.py", "--ckpt", FROZEN, "--B", "8"],
+                           env=env, capture_output=True, text=True, timeout=600)
+        m = re.search(r"cos\(g_EP, ?g_transpose\)=([+-][0-9.]+)", r.stdout + r.stderr)
+        cosv = float(m.group(1)) if m else "?"
+    except (OSError, subprocess.SubprocessError, ValueError) as e:
+        # Launch failure, 600 s timeout, or an unparseable number: record the
+        # error in place of the value and keep monitoring.
+        cosv = f"err:{e}"
+    line = f"step {step}: cos={cosv} val={val:.4f} res={res:.2e}"
+    with open(COSLOG, "a") as fh:
+        fh.write(line + "\n")
+    print(line, flush=True)
+    if isinstance(cosv, float) and cosv < 0.82:
+        fired = f"ep_sn COS DEGRADED to {cosv:.3f} step {step} (res {res:.2e}) — specnorm not holding the gradient"
+        break
+print("=== EP_SN MONITOR FIRED ===")
+print("trigger:", fired or "18h timeout")
+print("ep_sn:", latest(LOG), "| bptt:", latest(BLOG))