"""Watcher for the two contraction experiments (c3 + specnorm).

Fires (exits) when either run hits a decisive state:

* DIVERGED: res > 0.2 or val > 15;
* CLEARED the danger zone: step >= 10200, res < 0.06 and val < 2.5, i.e. the
  run survived past the step ~9400 where the unconstrained run blew;
* the process EXITED.

If nothing decisive happens within 15 hours the watcher gives up and reports
that instead.  In every case a one-line status per run is printed at the end.
"""
import os
import re
import time

# (tag, log path, PID of the watched process) for each run.
RUNS = [
    ("c3", "/home/yurenh2/ept/ep_run/runs/ep_c3_scratch.log", 1429784),
    ("specnorm", "/home/yurenh2/ept/ep_run/runs/ep_specnorm09_scratch.log", 1435898),
]

# Divergence: either metric above its limit fires immediately.
DIVERGED_RES = 0.2
DIVERGED_VAL = 15
# Cleared: far enough along AND both metrics comfortably low.
CLEARED_STEP = 10200
CLEARED_RES = 0.06
CLEARED_VAL = 2.5
# Polling cadence and overall wall-time budget, in seconds.
POLL_SECONDS = 300
MAX_WALL_SECONDS = 15 * 3600

# Extracts step, "val CE" and "res=" from a progress line.  Compiled once
# because it is applied on every poll.
# NOTE(review): the numeric character class cannot match "nan" or "inf", so a
# line reporting such a value does not parse and yields no record.
_STEP_RE = re.compile(r"step (\d+)/.*val CE ([\d.eE+-]+).*res=([\d.eE+-]+)")


def alive(pid):
    """Return True if a process with the given PID currently exists.

    Uses signal 0, which performs the existence/permission check without
    delivering a signal.
    """
    try:
        os.kill(pid, 0)
    except PermissionError:
        # The process exists but we are not allowed to signal it; that still
        # means it is running, so it must not be reported as dead.
        return True
    except (OSError, OverflowError):
        # ProcessLookupError (no such PID), any other OS-level failure, or a
        # PID too large for the platform: treat as not running.
        return False
    return True


def latest(log):
    """Return the most recent progress record found in a log file.

    Args:
        log: Path of the log file to scan.

    Returns:
        A tuple ``(step, val, res, line)`` parsed from the last line that
        starts with ``"step"`` (``line`` is that line, stripped), or ``None``
        when the file cannot be read, contains no such line, or that line
        cannot be parsed.
    """
    last = None
    try:
        # Stream the file and keep only the newest candidate instead of
        # materialising every matching line; ``with`` closes the handle on
        # every poll.  Undecodable bytes are replaced rather than raising so
        # a stray byte in the log cannot kill the watcher.
        with open(log, errors="replace") as handle:
            for line in handle:
                if line.startswith("step"):
                    last = line
    except OSError:
        return None
    if last is None:
        return None

    match = _STEP_RE.search(last)
    if match is None:
        return None
    try:
        return (
            int(match.group(1)),
            float(match.group(2)),
            float(match.group(3)),
            last.strip(),
        )
    except ValueError:
        # The pattern can capture text float() rejects (for example a
        # half-written "1.2e-" while the line is still being flushed).
        return None


def status_all():
    """Return one status line per run: liveness plus its latest step line."""
    out = []
    for tag, log, pid in RUNS:
        d = latest(log)
        out.append(
            f"[{tag}] {'ALIVE' if alive(pid) else 'DEAD'} | "
            f"{d[3] if d else 'no steps yet'}"
        )
    return "\n".join(out)


def _trigger(tag, log, pid):
    """Return the trigger message for one run, or None if nothing decisive."""
    d = latest(log)
    if d:
        step, val, res, _ = d
        if res > DIVERGED_RES or val > DIVERGED_VAL:
            return f"{tag} DIVERGED (res={res:.2e}, val={val:.2f}) at step {step}"
        if step >= CLEARED_STEP and res < CLEARED_RES and val < CLEARED_VAL:
            return (
                f"{tag} CLEARED danger zone: step {step}, val {val:.4f}, "
                f"res {res:.2e} (survived past ~9400)"
            )
    # Checked after the metrics so that a run which diverged and then died is
    # reported as DIVERGED rather than merely EXITED.
    if not alive(pid):
        return (
            f"{tag} process EXITED (abort_res / crash / done); "
            f"last: {d[3] if d else 'no steps'}"
        )
    return None


def main():
    """Poll all runs until one fires or the wall-time budget is spent."""
    # A monotonic clock keeps the budget immune to system clock adjustments.
    deadline = time.monotonic() + MAX_WALL_SECONDS
    fired = None
    while fired is None and time.monotonic() < deadline:
        for tag, log, pid in RUNS:
            fired = _trigger(tag, log, pid)
            if fired:
                break
        if fired:
            break
        time.sleep(POLL_SECONDS)

    print("=== CONTRACTION WATCHER FIRED ===")
    print("trigger:", fired if fired else "max wall-time (15h) reached, no decisive event")
    print(status_all())


if __name__ == "__main__":
    main()