1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
|
"""Watcher for the two contraction experiments (c3 + specnorm). Fires (exits) when either run
hits a decisive state: DIVERGED (res>0.2 or val>15), CLEARED the danger zone (step>=10200, res<0.06,
val<2.5 -> survived past step ~9400, where the unconstrained run blew up), or process EXITED.
Logs are polled every 5 minutes; the watcher gives up after 15 hours if nothing decisive happens."""
import math
import os
import re
import time
# Runs being watched. Each entry is (tag, log path, pid):
#   tag - short label used in every printed message
#   log - file scanned for lines beginning with "step"
#   pid - process id probed for liveness (presumably the process writing that log)
RUNS = [
    (
        "c3",
        "/home/yurenh2/ept/ep_run/runs/ep_c3_scratch.log",
        1429784,
    ),
    (
        "specnorm",
        "/home/yurenh2/ept/ep_run/runs/ep_specnorm09_scratch.log",
        1435898,
    ),
]
def alive(pid):
    """Return True if a process with the given pid currently exists.

    Probes the process with ``os.kill(pid, 0)``: signal 0 delivers nothing and
    only performs the existence/permission check.

    Args:
        pid: integer process id to probe.

    Returns:
        True if the process exists (even when owned by another user),
        False if it does not exist or the probe fails for any other OS reason.
    """
    try:
        os.kill(pid, 0)
    except PermissionError:
        # EPERM: the process exists but we may not signal it. It is still
        # running, so reporting it as dead would trigger a false "EXITED".
        return True
    except (OSError, OverflowError):
        # ProcessLookupError (no such pid), any other OS-level failure, or a
        # pid too large for the C pid type: treat as not alive.
        return False
    return True
# One logged number. The nan/inf alternative must come first: otherwise the
# numeric character class would match just the sign of "-inf" and stop there.
_NUMBER = r"([+-]?(?i:nan|inf)|[\d.eE+-]+)"
_STEP_LINE = re.compile(r"step (\d+)/.*val CE " + _NUMBER + r".*res=" + _NUMBER)


def latest(log):
    """Parse the most recent "step" line of a training log.

    Only the last line that starts with "step" is considered; it must contain
    ``step <n>/``, ``val CE <number>`` and ``res=<number>`` in that order.
    Non-finite values (nan / inf) are parsed too, so a blown-up run is still
    reported instead of looking like "no data".

    Args:
        log: path of the log file to scan.

    Returns:
        ``(step, val, res, line)`` where ``line`` is the stripped source line,
        or None if the file does not exist, has no "step" line, or the last
        "step" line cannot be parsed.
    """
    last_step_line = None
    try:
        # Stream the file and remember only the last candidate, so memory stays
        # constant however long the log grows. errors="replace" keeps a
        # partially written multi-byte character from aborting the scan.
        with open(log, errors="replace") as handle:
            for line in handle:
                if line.startswith("step"):
                    last_step_line = line
    except FileNotFoundError:
        return None
    if last_step_line is None:
        return None

    match = _STEP_LINE.search(last_step_line)
    if not match:
        return None
    try:
        step = int(match.group(1))
        val = float(match.group(2))
        res = float(match.group(3))
    except ValueError:
        # The character class also accepts junk such as "1.2.3" or a bare "-";
        # treat an unparsable line as missing data rather than crash the watcher.
        return None
    return step, val, res, last_step_line.strip()
def status_all():
    """Return a newline-joined status report with one line per entry in RUNS.

    Each line has the form ``[tag] ALIVE|DEAD | <last parsed step line>``;
    the last part reads "no steps yet" when latest() yields nothing.
    """
    def describe(tag, log, pid):
        parsed = latest(log)
        state = "ALIVE" if alive(pid) else "DEAD"
        detail = parsed[3] if parsed else "no steps yet"
        return f"[{tag}] {state} | {detail}"

    return "\n".join(describe(tag, log, pid) for tag, log, pid in RUNS)
# DIVERGED: res or val above these bounds (or not a finite number at all).
DIVERGED_RES = 0.2
DIVERGED_VAL = 15
# CLEARED: at or past this step with res and val both below these bounds.
CLEARED_STEP = 10200
CLEARED_RES = 0.06
CLEARED_VAL = 2.5
MAX_WALL_SECONDS = 15 * 3600  # stop watching after 15 hours
POLL_SECONDS = 300  # pause between two scans of the logs


def check_run(tag, d, is_alive):
    """Decide whether a single run has reached a decisive state.

    Checks are ordered DIVERGED, then CLEARED, then EXITED, so a run whose
    process died right after logging a decisive line is reported by its
    metrics rather than as a plain exit.

    Args:
        tag: label of the run, used as the message prefix.
        d: ``(step, val, res, line)`` as returned by latest(), or None when
            no step line could be parsed yet.
        is_alive: whether the run's process still exists.

    Returns:
        The trigger message, or None if the run is not decisive yet.
    """
    if d:
        step, val, res, _ = d
        # NaN compares False against any threshold, so non-finite metrics
        # must be tested explicitly or a blown-up run would go unnoticed.
        non_finite = not (math.isfinite(res) and math.isfinite(val))
        if non_finite or res > DIVERGED_RES or val > DIVERGED_VAL:
            return f"{tag} DIVERGED (res={res:.2e}, val={val:.2f}) at step {step}"
        if step >= CLEARED_STEP and res < CLEARED_RES and val < CLEARED_VAL:
            return f"{tag} CLEARED danger zone: step {step}, val {val:.4f}, res {res:.2e} (survived past ~9400)"
    if not is_alive:
        return f"{tag} process EXITED (abort_res / crash / done); last: {d[3] if d else 'no steps'}"
    return None


def main():
    """Poll every run until one is decisive or the wall-time budget runs out, then report."""
    t0 = time.time()
    fired = None
    while fired is None and time.time() - t0 < MAX_WALL_SECONDS:
        for tag, log, pid in RUNS:
            fired = check_run(tag, latest(log), alive(pid))
            if fired:
                break
        else:
            # No run was decisive in this scan: wait before looking again.
            time.sleep(POLL_SECONDS)
    print("=== CONTRACTION WATCHER FIRED ===")
    print("trigger:", fired if fired else "max wall-time (15h) reached, no decisive event")
    print(status_all())


if __name__ == "__main__":
    main()
|