summary | refs | log | tree | commit | diff
path: root/ep_run/cos_monitor.py
diff options
context:
space:
mode:
Diffstat (limited to 'ep_run/cos_monitor.py')
-rw-r--r-- ep_run/cos_monitor.py 49
1 files changed, 49 insertions, 0 deletions
diff --git a/ep_run/cos_monitor.py b/ep_run/cos_monitor.py
new file mode 100644
index 0000000..12a4fe9
--- /dev/null
+++ b/ep_run/cos_monitor.py
@@ -0,0 +1,49 @@
+"""Lightweight cos monitor for ep_hr02: probe a frozen copy of the checkpoint about every 450 steps, log step->cos(g_EP, exact-adjoint).
+Fire on: cos degrades <0.82 (gradient going bad) / survived to step>=9500 (cleared old danger zone) / death / 18h timeout."""
+import time, os, re, subprocess, shutil
+# Working directory of the run; every relative path below is resolved against it.
+WD = "/home/yurenh2/ept/ep_run"
+os.chdir(WD)
+
+LOG = "runs/ep_hr02.log"  # training log scanned for lines starting with "step"
+CK = "runs/ep_hr02.pt"  # checkpoint file that gets copied before each probe
+FROZEN = "runs/ep_hr02_cosprobe.pt"  # copy of CK that is passed to asym_probe.py
+COSLOG = "runs/cos_monitor.log"  # log file this monitor appends to
+PID = 1684249  # process id polled by alive()
+def alive():
+    """Return True if the process with id PID still exists.
+
+    Signal 0 is sent, which only performs the existence/permission check and
+    delivers nothing to the target process.
+    """
+    try:
+        os.kill(PID, 0)
+    except ProcessLookupError:
+        return False  # no process with this id
+    except PermissionError:
+        return True  # the process exists, but this user may not signal it
+    except OSError:
+        return False  # any other failure: keep the best-effort "treat as dead"
+    return True
+def cur_step():
+    """Return the step number from the last LOG line that starts with "step".
+
+    Returns 0 when the log cannot be opened or read, when it has no such line,
+    or when that line does not match "step <digits>".
+    """
+    last_line = None
+    try:
+        # errors="replace": undecodable bytes must not abort the scan.
+        with open(LOG, errors="replace") as fh:
+            for line in fh:
+                if line.startswith("step"):
+                    last_line = line
+    except OSError:
+        return 0
+    if last_line is None:
+        return 0
+    m = re.search(r"step (\d+)", last_line)
+    return int(m.group(1)) if m else 0
+# Mark the start of this monitoring session in the monitor log; the context
+# manager closes the handle right away instead of leaving it to the garbage collector.
+with open(COSLOG, "a") as fh:
+    fh.write("# cos monitor start (ep_hr02, hr=0.2)\n")
+
+last = -1  # step of the most recent probe; -1 until the first one
+fired = None  # trigger message; stays None until a stop condition is met
+traj = []  # one (step, cos, val) tuple per probe
+t0 = time.time()  # start time, used for the 18 h cap of the polling loop
+# Loop-invariant setup, built once instead of on every poll.
+probe_env = dict(os.environ, CUDA_VISIBLE_DEVICES="0", PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")
+# The sign is optional and an exponent is accepted, so unsigned or scientific
+# probe output parses as well as the explicitly signed form.
+cos_re = re.compile(r"cos\(g_EP, ?g_transpose\)=([+-]?[0-9.]+(?:[eE][+-]?[0-9]+)?)")
+zres_re = re.compile(r"z\* residual.*?step_rel=([0-9.eE+-]+)")
+val_re = re.compile(r"val CE ([\d.eE+-]+)")
+
+# Poll every 120 s for at most 18 h. Each time the training log has advanced
+# by at least 450 steps since the last probe, copy the checkpoint, run
+# asym_probe.py on the copy and log the result. The loop ends when the
+# monitored process is gone, cos drops below 0.82, or step 9500 is reached.
+while fired is None and time.time() - t0 < 18 * 3600:
+    time.sleep(120)
+    if not alive():
+        fired = f"ep_hr02 EXITED at step {cur_step()}"
+        break
+    step = cur_step()
+    if step < last + 450:
+        continue
+    try:
+        # A missing checkpoint raises here as well, so no separate exists()
+        # call (and no window between the two checks) is needed.
+        if os.path.getsize(CK) <= 1e6:
+            continue
+        shutil.copy2(CK, FROZEN)  # the probe runs on the copy, not on CK itself
+    except OSError:
+        continue  # checkpoint missing or not copyable right now; retry next poll
+    last = step
+
+    cosv, zres = None, "?"
+    try:
+        r = subprocess.run(["python3", "asym_probe.py", "--ckpt", FROZEN, "--B", "8"],
+                           env=probe_env, capture_output=True, text=True, timeout=600)
+        out = r.stdout + r.stderr
+        m = cos_re.search(out)
+        zr = zres_re.search(out)
+        cosv = float(m.group(1)) if m else None
+        if zr:
+            zres = zr.group(1)
+        elif m is None and r.returncode != 0:
+            # Nothing parsable and a failing exit status: record the status
+            # instead of leaving a bare "?" in the log.
+            zres = f"probe-exit:{r.returncode}"
+    except (OSError, subprocess.SubprocessError, ValueError) as e:
+        zres = f"probe-err:{e}"
+
+    # also grab current val from the newest "step" line of the training log
+    val = "?"
+    try:
+        with open(LOG, errors="replace") as fh:
+            step_lines = [ln for ln in fh if ln.startswith("step")]
+        vm = val_re.search(step_lines[-1]) if step_lines else None
+        if vm:
+            val = vm.group(1)
+    except OSError:
+        pass  # val stays "?"
+
+    line = f"step {step}: cos={cosv} val={val} z_res={zres}"
+    traj.append((step, cosv, val))
+    with open(COSLOG, "a") as fh:
+        fh.write(line + "\n")
+    print(line, flush=True)
+
+    if cosv is not None and cosv < 0.82:  # below the historical per-batch floor (~0.85) => real degradation, not variance
+        fired = f"COS DEGRADED to {cosv:.3f} at step {step} (val {val}) — below historical floor, real gradient degradation"
+        break
+    if step >= 9500:
+        if cosv is None:
+            # The probe produced no cos value, so do not claim that cos stayed high.
+            fired = f"ep_hr02 SURVIVED to step {step} (val {val}) but cos probe gave no value (z_res={zres}) — cleared the old ~9200 danger zone"
+        else:
+            fired = f"ep_hr02 SURVIVED to step {step} (val {val}) with cos staying high — cleared the old ~9200 danger zone"
+        break
+# Final report: banner, the trigger message (or the timeout note when nothing
+# fired), then one row per recorded probe.
+print("=== COS MONITOR FIRED ===")
+print("trigger:", fired or "18h timeout")
+print("cos trajectory (step | cos | val):")
+for probe_step, probe_cos, probe_val in traj:
+    print(f" {probe_step:6d} | {probe_cos} | {probe_val}")