diff options
| author | Yuren Hao <yurenh2@illinois.edu> | 2026-07-03 05:56:50 -0500 |
|---|---|---|
| committer | Yuren Hao <yurenh2@illinois.edu> | 2026-07-03 05:56:50 -0500 |
| commit | b83947778e2c776f757a07d4719b7ce961d7ed55 (patch) | |
| tree | b9cc01d7adda691d9156d9d04f4fb2f644674e96 /ep_run/cos_monitor.py | |
Initial commit: ept — backprop-free equilibrium transformer (EP)
Code (ep_run/), organized docs (docs/{method,campaign,hardware,outreach,paper}),
analysis scripts (scripts/), ONBOARDING.md entry point. Large data/checkpoints
git-ignored (share separately).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_014FAPDWQ49M5Ye3NpTndTpn
Diffstat (limited to 'ep_run/cos_monitor.py')
| -rw-r--r-- | ep_run/cos_monitor.py | 49 |
1 files changed, 49 insertions, 0 deletions
# ep_run/cos_monitor.py
"""Lightweight cos monitor for ep_hr02.

Polls the training run, and every time the logged step has advanced by at
least ``PROBE_EVERY`` steps it freezes a copy of the checkpoint, runs
``asym_probe.py`` on the copy and logs step -> cos(g_EP, exact-adjoint).

The monitor stops ("fires") on the first of:

* cos drops below ``COS_FLOOR`` (0.82, just under the historical per-batch
  floor of ~0.85, so a real degradation rather than variance),
* the run survives to step >= ``SURVIVE_STEP`` (cleared the old danger zone),
* the training process exits,
* ``MAX_HOURS`` of wall-clock time elapse.

Run as a script; importing the module only defines the helpers.
"""
import os
import re
import shutil
import subprocess
import time

WD = "/home/yurenh2/ept/ep_run"
LOG, CK, FROZEN, COSLOG, PID = (
    "runs/ep_hr02.log",
    "runs/ep_hr02.pt",
    "runs/ep_hr02_cosprobe.pt",
    "runs/cos_monitor.log",
    1684249,
)

POLL_SECONDS = 120      # sleep between polls of the training run
MAX_HOURS = 18          # give up after this much wall-clock time
PROBE_EVERY = 450       # minimum step advance between two probes
MIN_CKPT_BYTES = 1e6    # anything smaller is treated as a partial checkpoint
PROBE_TIMEOUT = 600     # seconds allowed for one asym_probe.py run
COS_FLOOR = 0.82        # below the historical per-batch floor (~0.85)
SURVIVE_STEP = 9500     # past the old ~9200 danger zone

_STEP_RE = re.compile(r"step (\d+)")
_VAL_RE = re.compile(r"val CE ([\d.eE+-]+)")
# Sign is optional and an exponent is accepted, so "0.93" and "+8.1e-01"
# both parse to the value the probe printed.
_COS_RE = re.compile(
    r"cos\(g_EP, ?g_transpose\)=([+-]?[0-9.]+(?:[eE][+-]?[0-9]+)?)"
)
_ZRES_RE = re.compile(r"z\* residual.*?step_rel=([0-9.eE+-]+)")


def alive(pid=PID):
    """Return True if a process with id *pid* exists.

    Uses signal 0, which performs the existence/permission check without
    delivering a signal.
    """
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # The process exists but belongs to another user.
        return True
    except OSError:
        return False
    return True


def _last_step_line(log_path):
    """Return the last line of *log_path* starting with "step", or None."""
    found = None
    try:
        with open(log_path, errors="replace") as fh:
            for text in fh:
                if text.startswith("step"):
                    found = text
    except OSError:
        return None
    return found


def cur_step(log_path=LOG):
    """Return the step number on the last "step" line of the log, else 0."""
    text = _last_step_line(log_path)
    hit = _STEP_RE.search(text) if text else None
    return int(hit.group(1)) if hit else 0


def cur_val(log_path=LOG):
    """Return the "val CE" value on the last "step" line as text, else "?"."""
    text = _last_step_line(log_path)
    hit = _VAL_RE.search(text) if text else None
    return hit.group(1) if hit else "?"


def parse_probe_output(out):
    """Extract ``(cos, z_residual)`` from the probe's combined output.

    *cos* is a float, or None when the cos line is missing or malformed.
    *z_residual* is the matched text, or "?" when it is missing.
    """
    cos_hit = _COS_RE.search(out)
    res_hit = _ZRES_RE.search(out)
    cosv = None
    if cos_hit:
        try:
            cosv = float(cos_hit.group(1))
        except ValueError:
            cosv = None
    return cosv, (res_hit.group(1) if res_hit else "?")


def _ckpt_ready(path):
    """Return True if *path* exists and is larger than MIN_CKPT_BYTES."""
    try:
        return os.path.getsize(path) > MIN_CKPT_BYTES
    except OSError:
        # Missing, or replaced by the trainer between two polls.
        return False


def _run_probe(ckpt_path):
    """Run asym_probe.py on *ckpt_path* and return ``(cos, z_residual)``.

    A probe that cannot be started or times out yields
    ``(None, "probe-err:<reason>")`` instead of raising.
    """
    env = dict(
        os.environ,
        CUDA_VISIBLE_DEVICES="0",
        PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True",
    )
    try:
        done = subprocess.run(
            ["python3", "asym_probe.py", "--ckpt", ckpt_path, "--B", "8"],
            env=env, capture_output=True, text=True, timeout=PROBE_TIMEOUT,
        )
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        return None, f"probe-err:{e}"
    return parse_probe_output(done.stdout + done.stderr)


def _append_line(path, text):
    """Append *text* plus a newline to *path*."""
    with open(path, "a") as fh:
        fh.write(text + "\n")


def main():
    """Run the monitor loop until a trigger fires, then print a summary."""
    os.chdir(WD)  # all paths above are relative to the run directory
    _append_line(COSLOG, "# cos monitor start (ep_hr02, hr=0.2)")
    last = -1
    fired = None
    traj = []
    t0 = time.time()
    while fired is None and time.time() - t0 < MAX_HOURS * 3600:
        time.sleep(POLL_SECONDS)
        if not alive():
            fired = f"ep_hr02 EXITED at step {cur_step()}"
            break
        step = cur_step()
        if step < last + PROBE_EVERY or not _ckpt_ready(CK):
            continue
        try:
            # Probe a frozen copy so the trainer can keep overwriting CK.
            shutil.copy2(CK, FROZEN)
        except OSError:
            continue
        last = step
        cosv, zres = _run_probe(FROZEN)
        # Read after the probe: this is the current val from the training log.
        val = cur_val()
        line = f"step {step}: cos={cosv} val={val} z_res={zres}"
        traj.append((step, cosv, val))
        _append_line(COSLOG, line)
        print(line, flush=True)
        if cosv is not None and cosv < COS_FLOOR:
            fired = f"COS DEGRADED to {cosv:.3f} at step {step} (val {val}) — below historical floor, real gradient degradation"
        elif step >= SURVIVE_STEP:
            fired = f"ep_hr02 SURVIVED to step {step} (val {val}) with cos staying high — cleared the old ~9200 danger zone"
    print("=== COS MONITOR FIRED ===")
    print("trigger:", fired or f"{MAX_HOURS}h timeout")
    print("cos trajectory (step | cos | val):")
    for s, c, v in traj:
        print(f" {s:6d} | {c} | {v}")


if __name__ == "__main__":
    main()
