summary | refs | log | tree | commit | diff
path: root/ep_run/watch_hr.py
diff options
context:
space:
mode:
Diffstat (limited to 'ep_run/watch_hr.py')
-rw-r--r-- ep_run/watch_hr.py 33
1 files changed, 33 insertions, 0 deletions
diff --git a/ep_run/watch_hr.py b/ep_run/watch_hr.py
new file mode 100644
index 0000000..33a44bb
--- /dev/null
+++ b/ep_run/watch_hr.py
@@ -0,0 +1,33 @@
+"""Watch EP(hr=0.2) [the fix] + BPTT. Fire on: EP diverges (fix failed) / EP reaches good loss (fix worked) / exit."""
+import time, os, re
# Training runs under watch. Each entry is (tag, log file path, process id).
# The tag selects which thresholds the watch loop applies to the run; the pid
# is probed to detect that the run has exited.
RUNS = [
    (
        "ep_hr02",
        "/home/yurenh2/ept/ep_run/runs/ep_hr02.log",
        1684249,
    ),
    (
        "bptt",
        "/home/yurenh2/ept/ep_run/runs/bptt_clean.log",
        1646261,
    ),
]
def alive(pid):
    """Return True if a process with id *pid* currently exists.

    The probe is ``os.kill(pid, 0)``: signal 0 performs the existence and
    permission checks without delivering a signal.

    Args:
        pid: Process id to probe.

    Returns:
        True when the process exists (including when it exists but belongs
        to another user), False otherwise. Never raises for bad input.
    """
    # pid 0 and negative pids address process groups in kill(2), so the probe
    # would "succeed" without saying anything about a single process.
    if not isinstance(pid, int) or pid <= 0:
        return False
    try:
        os.kill(pid, 0)
    except PermissionError:
        # EPERM: the process exists, we are just not allowed to signal it.
        return True
    except (OSError, OverflowError):
        # ESRCH (ProcessLookupError) or any other failure: treat as not alive.
        return False
    return True
# A logged number: either nan/inf (any case, optional sign) or the original
# loose digit/exponent character run. The nan/inf alternative comes first so
# that a diverged run's "nan" is captured instead of failing the match.
_NUM = r"([+-]?(?i:nan|inf(?:inity)?)|[\d.eE+-]+)"
_STEP_RE = re.compile(r"step (\d+)/.*val CE " + _NUM + r".*res=" + _NUM)


def latest(log):
    """Return the most recent parseable ``step`` record from a log file.

    Only lines that start with ``"step"`` are considered. They are scanned
    from newest to oldest and the first one that both matches the expected
    ``step N/... val CE X ... res=Y`` layout and converts cleanly to numbers
    is returned. Scanning backwards means a half-written or otherwise
    unparseable trailing line does not hide earlier valid data.

    Args:
        log: Path of the log file to read.

    Returns:
        ``(step, val_ce, res, line)`` where ``step`` is an int, ``val_ce`` and
        ``res`` are floats (possibly nan/inf) and ``line`` is the stripped
        source line; or None when the file does not exist or holds no
        parseable step line.
    """
    try:
        # errors="replace": the file may be read while it is being written,
        # so a truncated multi-byte sequence must not abort the read.
        with open(log, errors="replace") as fh:
            step_lines = [line for line in fh if line.startswith("step")]
    except FileNotFoundError:
        return None

    for line in reversed(step_lines):
        m = _STEP_RE.search(line)
        if m is None:
            continue
        try:
            return (int(m.group(1)), float(m.group(2)), float(m.group(3)), line.strip())
        except ValueError:
            # The loose character class can capture junk such as "1.2.3";
            # skip that line instead of crashing the watcher.
            continue
    return None
def status():
    """Build a multi-line summary with one row per entry in ``RUNS``.

    Each row has the form ``[tag] ALIVE|DEAD | <last step line>``, where the
    last step line comes from ``latest`` and falls back to ``no steps`` when
    nothing could be parsed.

    Returns:
        The rows joined with newlines.
    """
    rows = []
    for tag, log_path, pid in RUNS:
        state = "ALIVE" if alive(pid) else "DEAD"
        parsed = latest(log_path)
        last_line = parsed[3] if parsed else "no steps"
        rows.append(f"[{tag}] {state} | {last_line}")
    return "\n".join(rows)
def check_run(tag, parsed):
    """Decide whether the latest record of one run should fire the watcher.

    Args:
        tag: Run tag; ``"ep_hr02"`` and ``"bptt"`` have thresholds, any other
            tag never fires here.
        parsed: ``(step, val, res, line)`` as returned by ``latest``, or None.

    Returns:
        The trigger message, or None when nothing should fire.
    """
    if not parsed:
        return None
    step, val, res, _ = parsed
    if tag == "ep_hr02":
        # Written as a negated "within bounds" test so that NaN, for which
        # every comparison is False, counts as divergence instead of
        # slipping through unnoticed.
        if not (res <= 0.2 and val <= 15):
            return f"EP(hr=0.2) STILL DIVERGED res={res:.2e} val={val:.2f} step {step} -> hr was not the (only) cause"
        if val < 2.00:
            return f"EP(hr=0.2) reached GOOD loss val {val:.4f} (res {res:.2e}) step {step} -> the hr fix WORKED (past the old 2.09 wall region)"
    elif tag == "bptt":
        if val < 1.95:
            return f"bptt reached GOOD loss val {val:.4f} (res {res:.2e}) step {step}"
    return None


def main(timeout_s=18 * 3600, poll_s=180):
    """Poll every run in ``RUNS`` until one fires or the timeout elapses.

    On each pass, every run's latest record is checked against its
    thresholds, then the run's pid is probed. The first run that crosses a
    threshold or has exited ends the watch. A final report with the trigger
    and the status of all runs is printed.

    NOTE(review): the source this was recovered from had its indentation
    flattened. The exit check is applied to every run whether or not a step
    line was parsed, matching the module docstring's "/ exit" trigger.
    Confirm against the original file.

    Args:
        timeout_s: Maximum watch duration in seconds.
        poll_s: Pause between polling passes in seconds.
    """
    # monotonic() so a wall-clock adjustment cannot stretch or cut the timeout.
    start = time.monotonic()
    fired = None
    while fired is None and time.monotonic() - start < timeout_s:
        for tag, log, pid in RUNS:
            fired = check_run(tag, latest(log))
            if fired is None and not alive(pid):
                # Re-read so the message carries whatever the run wrote last.
                last = latest(log)
                fired = f"{tag} EXITED; last: {last[3] if last else 'no steps'}"
            if fired is not None:
                break
        else:
            # No run fired on this pass; wait before polling again.
            time.sleep(poll_s)
    print("=== HR-FIX WATCHER FIRED ===")
    print("trigger:", fired or f"{timeout_s / 3600:g}h timeout")
    print(status())


if __name__ == "__main__":
    main()