1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
import time, os, re, subprocess
# Switch to the experiment directory so the relative paths in this script
# are resolved against it.
os.chdir("/home/yurenh2/ept/ep_run")

# Training log that the watchdog polls (relative to the directory above).
LOG = "runs/ep_jacreg.log"
def alive():
    """Report whether the watched process still appears to be running.

    Runs ``pgrep -f "ckpt runs/ep_jacreg.pt"`` and returns True when pgrep
    exits with status 0, False for any other exit status. pgrep's output is
    captured and discarded.
    """
    probe = subprocess.run(
        ["pgrep", "-f", "ckpt runs/ep_jacreg.pt"],
        capture_output=True,
    )
    return probe.returncode == 0
def steps(path=None):
    """Parse the log into a list of ``(step, val_ce, jr, res)`` tuples.

    A line is kept only if it starts with ``step`` and contains a step
    number, a ``val CE <number>`` field and a ``jr=<number>`` field. ``res``
    is taken from ``res=<number>`` when present and is ``0`` otherwise.

    Args:
        path: Log file to read. When omitted, the module-level ``LOG`` is
            used, so existing ``steps()`` calls behave as before.

    Returns:
        Tuples in file order. An empty list if the file cannot be opened
        (for example because it has not been created yet).
    """
    if path is None:
        path = LOG
    rows = []
    try:
        # errors="replace": a stray undecodable byte must not abort the parse.
        with open(path, errors="replace") as fh:
            for line in fh:
                if not line.startswith("step"):
                    continue
                ms = re.search(r"step\s+(\d+)", line)
                mv = re.search(r"val CE ([\d.]+)", line)
                mj = re.search(r"jr=([\d.eE+-]+)", line)
                mr = re.search(r"res=([\d.eE+-]+)", line)
                if not (ms and mv and mj):
                    continue
                try:
                    row = (
                        int(ms.group(1)),
                        float(mv.group(1)),
                        float(mj.group(1)),
                        float(mr.group(1)) if mr else 0,
                    )
                except ValueError:
                    # The patterns are permissive ("2.44." or a half-written
                    # "1e" match but are not valid floats). Skip only this
                    # line instead of discarding everything after it.
                    continue
                rows.append(row)
    except OSError:
        # Best-effort: a missing or unreadable log yields whatever was
        # parsed so far (usually nothing) rather than an exception.
        pass
    return rows
# Watchdog loop: poll the log until the watched process exits, a new log row
# crosses one of the thresholds below, or the timeout elapses; then print a
# one-line verdict followed by the last five parsed rows.
POLL_SECONDS = 60            # pause between polls
TIMEOUT_SECONDS = 4 * 3600   # give up after four hours
VAL_DIVERGED = 15            # val CE above this -> DIVERGED
RES_DIVERGED = 0.3           # res above this -> DIVERGED
JR_HIGH = 8                  # jr at or above this counts toward saturation
JR_HIGH_STREAK = 3           # consecutive new rows with high jr -> SATURATING
JR_RELAXED = 2.0             # jr below this ...
VAL_RECOVERED = 2.55         # ... together with val CE below this -> SUPPRESSED

fired = None
t0 = time.time()
hi = 0  # current run of consecutive new rows with jr >= JR_HIGH
# ignore already-seen (incl the 6250 spike): only rows logged after startup
# are evaluated.
seen = {row[0] for row in steps()}

while fired is None and time.time() - t0 < TIMEOUT_SECONDS:
    time.sleep(POLL_SECONDS)
    if not alive():
        # Parse the log once and reuse it for the message.
        rows = steps()
        fired = f"EXITED last={rows[-1] if rows else None}"
        break
    for step, val, jr, res in steps():
        if step in seen:
            continue
        seen.add(step)
        if val > VAL_DIVERGED or res > RES_DIVERGED:
            fired = f"DIVERGED @{step} val{val:.2f} res{res:.1e} jr{jr:.1f}"
            break
        # The streak is updated before the recovery check, as in the original
        # ordering; any row below JR_HIGH resets it.
        hi = hi + 1 if jr >= JR_HIGH else 0
        if jr < JR_RELAXED and val < VAL_RECOVERED:
            fired = f"SUPPRESSED @{step}: jr relaxed to {jr:.1f}, CE recovered {val:.3f} (best2.4381) res{res:.1e} -> controller WON the spike"
            break
        if hi >= JR_HIGH_STREAK:
            fired = f"jr SATURATING @{step}: jr>=8 for {hi} logged-steps (now {jr:.1f}), val{val:.3f} res{res:.1e} -> controller maxed, not relaxing (early hijack/saturation)"
            break
    # No explicit break needed here: the while condition stops the loop as
    # soon as `fired` is set.

print("=== EP_JACREG SPIKE-RECOVERY ===")
print(fired or "4h timeout")
print("last5:", steps()[-5:])
|