# ep_run/watch_contraction.py (blob 64f6a9f2f38aa257d3b878aa57ad1fcbfb544cd2)
"""Watcher for the two contraction experiments (c3 + specnorm). Polls each run's log every 5 minutes
and fires (exits) when either run hits a decisive state: DIVERGED (res>0.2 or val>15), CLEARED the
danger zone (step>=10200, res<0.06, val<2.5 -> survived past step ~9400, where the unconstrained run
blew up), or process EXITED. Gives up after 15 hours of wall time if nothing decisive happens."""
import time, os, re
# One entry per watched run: (short tag used in messages, path of the log file
# that is scanned for "step" lines, PID that is probed to see whether the run's
# process still exists).
_C3_RUN = ("c3", "/home/yurenh2/ept/ep_run/runs/ep_c3_scratch.log", 1429784)
_SPECNORM_RUN = ("specnorm", "/home/yurenh2/ept/ep_run/runs/ep_specnorm09_scratch.log", 1435898)
RUNS = [_C3_RUN, _SPECNORM_RUN]
def alive(pid):
    """Return True if a process with ID *pid* currently exists, else False.

    The probe sends signal 0, which only runs the existence/permission checks
    and delivers nothing to the target process.
    """
    try:
        os.kill(pid, 0)
    except PermissionError:
        # The PID exists but belongs to another user: the process is alive,
        # we just are not allowed to signal it.
        return True
    except (OSError, OverflowError, TypeError, ValueError):
        # ProcessLookupError (no such PID) or a value that cannot be a PID.
        return False
    return True
# A numeric field as printed in the log: plain or scientific float, or nan/inf
# (any letter case). Unlike a bare character class, this never matches a token
# that float() cannot parse.
_FLOAT_TOKEN = r"[-+]?(?:(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|(?i:nan|inf(?:inity)?))"
_STEP_LINE_RE = re.compile(rf"step (\d+)/.*val CE ({_FLOAT_TOKEN}).*res=({_FLOAT_TOKEN})")


def latest(log):
    """Return the most recent parseable metrics line from *log*.

    Args:
        log: Path of the log file to scan.

    Returns:
        A tuple ``(step, val_ce, res, line)`` taken from the last complete line
        that starts with ``"step"`` and carries both a ``val CE`` and a ``res=``
        value, where ``line`` is that line stripped of surrounding whitespace.
        Returns None if the file cannot be read or holds no such line.
    """
    found = None
    try:
        # errors="replace": stray undecodable bytes must not abort the scan.
        with open(log, errors="replace") as fh:
            for line in fh:
                if not line.startswith("step"):
                    continue
                # The log may be appended to while it is read; a final line
                # without its newline can be cut off mid-number, so it is
                # skipped until the writer has finished it.
                if not line.endswith("\n"):
                    continue
                m = _STEP_LINE_RE.search(line)
                if m:
                    found = (int(m.group(1)), float(m.group(2)), float(m.group(3)), line.strip())
    except OSError:
        # Missing or unreadable log: same "nothing yet" answer as a missing file.
        return None
    return found
def status_all():
    """Return a multi-line summary with one line per entry in RUNS.

    Each line shows the run's tag, whether its PID is ALIVE or DEAD, and the
    text of the latest parsed step line (or 'no steps yet' when there is none).
    """
    def describe(tag, log, pid):
        parsed = latest(log)
        state = "ALIVE" if alive(pid) else "DEAD"
        tail = parsed[3] if parsed else "no steps yet"
        return f"[{tag}] {state} | {tail}"

    return "\n".join(describe(*run) for run in RUNS)
# Poll every run until one reaches a decisive state or the 15-hour budget is
# spent. The budget is measured with time.monotonic() so that wall-clock
# adjustments cannot stretch or shorten it.
t0 = time.monotonic()
fired = None
while fired is None and time.monotonic() - t0 < 15 * 3600:
    for tag, log, pid in RUNS:
        d = latest(log)
        if d:
            step, val, res, _ = d
            # Written as a negated "<=" rather than ">" so that a NaN metric,
            # which compares false against every threshold, counts as diverged.
            if not (res <= 0.2 and val <= 15):
                fired = f"{tag} DIVERGED (res={res:.2e}, val={val:.2f}) at step {step}"
                break
            if step >= 10200 and res < 0.06 and val < 2.5:
                fired = f"{tag} CLEARED danger zone: step {step}, val {val:.4f}, res {res:.2e} (survived past ~9400)"
                break
        # Metrics are checked first, so a run that wrote a decisive line and
        # then exited is reported by its metrics, not merely as EXITED.
        if not alive(pid):
            fired = f"{tag} process EXITED (abort_res / crash / done); last: {d[3] if d else 'no steps'}"
            break
    if fired:
        break
    time.sleep(300)  # 5 minutes between polls
print("=== CONTRACTION WATCHER FIRED ===")
print("trigger:", fired if fired else "max wall-time (15h) reached, no decisive event")
print(status_all())