# path: root/ep_run/watch_clean.py
# blob: 3f9f06ce2b97b53ac8442488d2013729c30bbe05
"""Watch the clean-code re-run: pure EP (ep_clean) + BPTT control (bptt_clean). Fire on a decisive event."""
import time, os, re
# Runs to watch: one (tag, log path, pid) tuple per training process.
# The tag selects which threshold rules the polling loop applies, the log is
# scanned for "step" lines, and the pid is probed to detect process exit.
RUNS = [
    ("ep",   "/home/yurenh2/ept/ep_run/runs/ep_clean.log",   1646260),
    ("bptt", "/home/yurenh2/ept/ep_run/runs/bptt_clean.log", 1646261),
]
def alive(pid):
    """Return True if a process with id ``pid`` currently exists.

    The probe is ``os.kill(pid, 0)``: signal 0 runs the existence and
    permission checks without delivering a signal.

    Returns:
        True when the probe succeeds, or when it is refused with
        ``PermissionError`` (the process exists but is owned by another
        user). False when there is no such process or ``pid`` is not a
        value ``os.kill`` accepts.
    """
    try:
        os.kill(pid, 0)
    except PermissionError:
        # EPERM means the target exists; we just may not signal it.
        return True
    except (OSError, OverflowError, TypeError, ValueError):
        # ESRCH (no such process) or an unusable pid value.
        return False
    return True
# One "step ..." progress line: captures the step index, the "val CE" value and
# the "res=" value. Non-finite spellings (nan / inf) are accepted as numbers so
# that such a line is still parsed instead of being treated as missing.
_STEP_RE = re.compile(
    r"step (\d+)/.*val CE ([+-]?(?:nan|inf)|[\d.eE+-]+)"
    r".*res=([+-]?(?:nan|inf)|[\d.eE+-]+)"
)


def latest(log):
    """Return the newest complete ``step`` record found in ``log``.

    Args:
        log: Path of the log file to scan.

    Returns:
        ``(step, val_ce, res, line)`` parsed from the last newline-terminated
        line that starts with ``"step"``, where ``line`` is that line with
        surrounding whitespace stripped. ``None`` if the file cannot be
        opened or read, contains no such line, or that line does not parse.
    """
    last = None
    try:
        # errors="replace" keeps a stray undecodable byte from aborting the scan.
        with open(log, errors="replace") as fh:
            for line in fh:
                # A line without its trailing newline may still be mid-write;
                # a truncated number (e.g. "1.2" from "1.2e-05") would misparse.
                if line.startswith("step") and line.endswith("\n"):
                    last = line
    except OSError:
        return None
    if last is None:
        return None
    m = _STEP_RE.search(last)
    if m is None:
        return None
    try:
        return (int(m.group(1)), float(m.group(2)), float(m.group(3)), last.strip())
    except ValueError:
        # The character class can match non-numbers such as "1.2.3" or "-".
        return None
def status():
    """Build a multi-line summary: one row per entry of RUNS.

    Each row shows the run tag, whether its pid is ALIVE or DEAD, and the
    newest parsed step line (or "no steps" when none is available).
    """
    rows = []
    for tag, log_path, pid in RUNS:
        state = "ALIVE" if alive(pid) else "DEAD"
        record = latest(log_path)
        detail = record[3] if record else "no steps"
        rows.append(f"[{tag}] {state} | {detail}")
    return "\n".join(rows)
# Poll every run once per 3 minutes until one decisive event fires or 16 hours
# have passed. The deadline is measured with time.monotonic() so wall-clock
# adjustments cannot shorten or stretch the watch window.
t0 = time.monotonic()
fired = None
while fired is None and time.monotonic() - t0 < 16 * 3600:
    for t, l, p in RUNS:
        d = latest(l)
        if d:
            step, val, res, _ = d
            if t == "ep":
                # Written as not(<=): identical to "res > 0.2 or val > 15" for
                # ordinary numbers, but a NaN metric (which compares False to
                # every threshold) is also treated as divergence.
                if not (res <= 0.2 and val <= 15):
                    fired = f"EP DIVERGED res={res:.2e} val={val:.2f} step {step} (clean-code baseline of the problem)"
                    break
                if val < 2.20:
                    fired = f"EP reached val {val:.4f} (res {res:.2e}) step {step} — good converged ckpt to probe"
                    break
            if t == "bptt":
                # The good-loss rule is checked first, so it wins when both
                # conditions hold on the same line.
                if val < 1.95:
                    fired = f"bptt reached GOOD loss val {val:.4f} (res {res:.2e}) step {step} -> probe rho/g_transpose here"
                    break
                # not(<=) so that a NaN residual is flagged as well.
                if not (res <= 0.25):
                    fired = f"bptt res HIGH res={res:.2e} val={val:.2f} step {step} (BPTT riding NON-converged state?)"
                    break
        if not alive(p):
            # Re-read the log here: the process may have written a final line
            # between the parse above and its exit.
            fired = f"{t} process EXITED; last: {(latest(l) or [None,None,None,'no steps (early crash/OOM?)'])[3]}"
            break
    if fired:
        break
    time.sleep(180)
print("=== CLEAN-RERUN WATCHER FIRED ===")
print("trigger:", fired or "16h timeout, no decisive event")
print(status())