1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
|
"""Watch specnorm (contraction test) + bptt_ctrl (premise test). Fires on a decisive event."""
import time, os, re
# Runs to watch. Each entry is (tag, log file path, pid): the tag selects the
# trigger rules applied in the watch loop, the log file is parsed by latest(),
# and the pid is probed by alive().
RUNS = [
("specnorm", "/home/yurenh2/ept/ep_run/runs/ep_specnorm09_scratch.log", 1435898),
("bptt", "/home/yurenh2/ept/ep_run/runs/bptt_ctrl.log", 1511172),
]
def alive(pid):
    """Return True if a process with id ``pid`` currently exists.

    Probes with ``os.kill(pid, 0)``; signal 0 runs the existence and
    permission checks without delivering a signal.

    Args:
        pid: Process id to probe.

    Returns:
        True when the probe succeeds, or when it is refused with
        ``PermissionError`` (the process exists but we may not signal it).
        False when the process is not found or the probe fails for any
        other reason.
    """
    try:
        os.kill(pid, 0)
    except PermissionError:
        # EPERM means the target exists; reporting it as dead would make the
        # watcher fire a bogus "process EXITED" event.
        return True
    except Exception:
        # Best-effort probe: any other failure (no such process, invalid pid
        # value, ...) is treated as "not alive".
        return False
    return True
# Captures the step number, the value after "val CE" and the value after
# "res=" from a single "step ..." log line.
_STEP_LINE_RE = re.compile(r"step (\d+)/.*val CE ([\d.eE+-]+).*res=([\d.eE+-]+)")


def latest(log):
    """Parse the most recent complete ``step`` line of a log file.

    Args:
        log: Path of the log file to read.

    Returns:
        ``(step, val, res, line)`` where ``step`` is an int, ``val`` and
        ``res`` are floats taken from the ``val CE`` and ``res=`` fields, and
        ``line`` is the stripped source line. Returns None when the file does
        not exist, has no complete ``step`` line, or the last such line does
        not match the expected format or carries malformed numbers.
    """
    newest = None
    try:
        # errors="replace": an undecodable byte in the log must not abort
        # the poll with UnicodeDecodeError.
        with open(log, errors="replace") as handle:
            for line in handle:
                # A line without its trailing newline may still be in the
                # middle of being written; a truncated number (for example
                # "res=3.2" cut from "res=3.2e-02") would otherwise be
                # parsed as a real value. Wait for the completed line.
                if line.startswith("step") and line.endswith("\n"):
                    newest = line
    except FileNotFoundError:
        return None
    if newest is None:
        return None
    match = _STEP_LINE_RE.search(newest)
    if match is None:
        return None
    try:
        step = int(match.group(1))
        val = float(match.group(2))
        res = float(match.group(3))
    except ValueError:
        # The character classes also accept non-numbers such as "3.2e-" or
        # "."; treat those as unparseable rather than crashing the caller.
        return None
    return (step, val, res, newest.strip())
def status():
    """Build a one-line-per-run summary of every entry in ``RUNS``.

    Each line has the form ``[tag] ALIVE|DEAD | <last step line>``; when
    ``latest`` yields nothing for a run, the text ``no steps`` is shown in
    place of the step line.

    Returns:
        The summary lines joined with newlines.
    """
    report = []
    for tag, log_path, pid in RUNS:
        snapshot = latest(log_path)
        liveness = "ALIVE" if alive(pid) else "DEAD"
        last_line = snapshot[3] if snapshot else "no steps"
        report.append(f"[{tag}] {liveness} | {last_line}")
    return "\n".join(report)
# Watch loop: poll every run until one decisive event fires or the time
# budget runs out, then print the trigger and a final status summary.
WATCH_BUDGET_S = 15 * 3600  # stop watching after 15 hours
POLL_INTERVAL_S = 180  # seconds to sleep between polls

# time.monotonic() instead of time.time(): the budget is an elapsed duration,
# and the wall clock can jump (NTP step, manual change) and cut the watch
# short or stretch it.
started_at = time.monotonic()
fired = None
while fired is None and time.monotonic() - started_at < WATCH_BUDGET_S:
    for tag, log_path, pid in RUNS:
        snap = latest(log_path)
        if snap:
            step, val, res, _ = snap
            if tag == "specnorm":
                if res > 0.2 or val > 15:
                    fired = f"specnorm DIVERGED res={res:.2e} val={val:.2f} step {step}"
                    break
                if step >= 10200 and res < 0.06 and val < 2.5:
                    fired = f"specnorm CLEARED step {step} val {val:.4f} res {res:.2e}"
                    break
            elif tag == "bptt":
                if val < 1.95:
                    fired = f"bptt reached GOOD loss val {val:.4f} (res {res:.2e}) step {step} -> READY to probe rho(S^-1 A) at the good solution"
                    break
                if res > 0.25:
                    fired = f"bptt res HIGH res={res:.2e} val={val:.2f} step {step} (BPTT riding a NON-converged state?)"
                    break
        # Checked even when no step line could be parsed, so a run that died
        # before logging anything is still reported.
        if not alive(pid):
            fired = f"{tag} process EXITED; last: {snap[3] if snap else 'no steps (early crash/OOM?)'}"
            break
    if fired:
        # Skip the sleep so the report is printed immediately.
        break
    time.sleep(POLL_INTERVAL_S)
print("=== WATCHER FIRED ===")
print("trigger:", fired or "15h timeout, no decisive event")
print(status())
|