diff options
Diffstat (limited to 'ep_run/watch_runs.py')
| -rw-r--r-- | ep_run/watch_runs.py | 39 |
1 files changed, 39 insertions, 0 deletions
"""Watch specnorm (contraction test) + bptt_ctrl (premise test). Fires on a decisive event."""
import math
import os
import re
import time

# (tag, log file path, pid of the training process) for every watched run.
RUNS = [
    ("specnorm", "/home/yurenh2/ept/ep_run/runs/ep_specnorm09_scratch.log", 1435898),
    ("bptt", "/home/yurenh2/ept/ep_run/runs/bptt_ctrl.log", 1511172),
]

POLL_SECONDS = 180           # pause between two polls of the logs
TIMEOUT_SECONDS = 15 * 3600  # give up when nothing decisive happened by then

# A metric value as printed in the log: the original digit/exponent character
# class, plus nan/inf so that a run whose loss went non-finite still parses.
_NUM = r"([+-]?(?:[nN][aA][nN]|[iI][nN][fF])|[\d.eE+-]+)"
_STEP_RE = re.compile(r"step (\d+)/.*val CE " + _NUM + r".*res=" + _NUM)


def alive(pid):
    """Return True when a process with id *pid* exists.

    Signal 0 performs the existence/permission check without delivering a
    signal.  A PermissionError means the process exists but belongs to
    another user, so it counts as alive.  Any other failure is reported as
    "not alive".
    """
    try:
        os.kill(pid, 0)
    except PermissionError:
        return True
    except Exception:
        return False
    return True


def latest(log):
    """Parse the last line of *log* that starts with "step".

    Returns a tuple ``(step, val, res, stripped_line)`` or None when the file
    cannot be read, contains no step line, or its last step line does not
    carry parseable ``val CE`` / ``res=`` values.
    """
    last = None
    try:
        # Stream the file and keep only the most recent step line; undecodable
        # bytes are replaced so a stray byte cannot abort the watcher.
        with open(log, errors="replace") as fh:
            for line in fh:
                if line.startswith("step"):
                    last = line
    except OSError:
        return None
    if last is None:
        return None
    m = _STEP_RE.search(last)
    if m is None:
        return None
    try:
        return int(m.group(1)), float(m.group(2)), float(m.group(3)), last.strip()
    except ValueError:
        # The character class also admits non-numbers such as "1.2.3".
        return None


def status():
    """Return one summary line per watched run: liveness and last step line."""
    rows = []
    for tag, log, pid in RUNS:
        data = latest(log)
        rows.append(f"[{tag}] {'ALIVE' if alive(pid) else 'DEAD'} | {data[3] if data else 'no steps'}")
    return "\n".join(rows)


def _threshold_event(tag, data):
    """Return the trigger message for *tag* given parsed log *data*, or None."""
    step, val, res, _ = data
    # Every comparison against NaN is False, so non-finite metrics are tested
    # explicitly instead of slipping through all thresholds.
    non_finite = math.isnan(val) or math.isnan(res)
    if tag == "specnorm":
        if non_finite or res > 0.2 or val > 15:
            return f"specnorm DIVERGED res={res:.2e} val={val:.2f} step {step}"
        if step >= 10200 and res < 0.06 and val < 2.5:
            return f"specnorm CLEARED step {step} val {val:.4f} res {res:.2e}"
    elif tag == "bptt":
        if val < 1.95:
            return f"bptt reached GOOD loss val {val:.4f} (res {res:.2e}) step {step} -> READY to probe rho(S^-1 A) at the good solution"
        if res > 0.25:
            return f"bptt res HIGH res={res:.2e} val={val:.2f} step {step} (BPTT riding a NON-converged state?)"
        if non_finite:
            return f"bptt NON-FINITE res={res:.2e} val={val:.2f} step {step}"
    return None


def _check_run(tag, log, pid):
    """Return the decisive event for a single run, or None if there is none."""
    data = latest(log)
    event = _threshold_event(tag, data) if data else None
    if event:
        return event
    # Liveness is only consulted when no metric threshold fired.
    if not alive(pid):
        return f"{tag} process EXITED; last: {data[3] if data else 'no steps (early crash/OOM?)'}"
    return None


def main():
    """Poll all runs until one produces a decisive event or the timeout hits."""
    fired = None
    # Monotonic clock: wall-clock adjustments must not stretch or cut the wait.
    deadline = time.monotonic() + TIMEOUT_SECONDS
    while fired is None and time.monotonic() < deadline:
        for tag, log, pid in RUNS:
            fired = _check_run(tag, log, pid)
            if fired:
                break
        if fired:
            break
        time.sleep(POLL_SECONDS)
    print("=== WATCHER FIRED ===")
    print("trigger:", fired or "15h timeout, no decisive event")
    print(status())


if __name__ == "__main__":
    main()
