summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Yuren Hao <yurenh2@illinois.edu> 2026-07-03 18:21:21 -0500
committer Yuren Hao <yurenh2@illinois.edu> 2026-07-03 18:21:21 -0500
commit 6e78420da6e613964d93da06156b556e1a91caef (patch)
tree 9329b0bc134fbc0627a80e6fd095651fcf9e4975
parent bcec9560cf5c9b113e9381a52d1a941daa8865f2 (diff)
floss-ept: graded finite-horizon LE penalty (--floss) + three-arm from-scratch ablation queue (HEAD, master)
- ep_step: floss block after resreg — unroll q=10 steps past z_T1 on a sub-batch WITH graph, rho_hat = mean per-step delta growth, one-sided relu(rho_hat - 0.995)^2, ramp keyed on (rho_hat - target) NOT resT1 (de-cliffed resreg: same fundamental path-LE quantity, linear early signal), capped at floss fraction of task-grad norm (resreg convention).
- smoke: below-target = untouched (cos 1.0000); force-fire = finite grads, capped perturbation (cos 0.9803).
- runs/abl3_queue.sh (runner live): waits for free GPU slots (0/1/3, GPU2 excluded), launches abl_floss (floss-only) / abl_resreg (resreg-only, never cleanly run) / abl_pair (proven 2.09 recipe, control) with identical remaining flags + seed.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_014FAPDWQ49M5Ye3NpTndTpn
-rw-r--r--ep_run/floss_smoke.log8
-rw-r--r--ep_run/floss_smoke.py39
-rw-r--r--ep_run/lt_ep_train.py36
-rwxr-xr-xep_run/runs/abl3_queue.sh45
4 files changed, 125 insertions, 3 deletions
diff --git a/ep_run/floss_smoke.log b/ep_run/floss_smoke.log
new file mode 100644
index 0000000..a476b9f
--- /dev/null
+++ b/ep_run/floss_smoke.log
@@ -0,0 +1,8 @@
+[off ] res=5.006e-02 n_grads=18
+/home/yurenh2/ept/ep_run/lt_ep_train.py:248: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior.
+Consider using tensor.detach() first. (Triggered internally at /pytorch/torch/csrc/autograd/generated/python_variable_methods.cpp:836.)
+ gf = torch.autograd.grad(Rf, blk.block, allow_unused=True) if float(Rf) > 0 else None
+[floss] res=5.006e-02 rho_hat=0.9919 finite=True n_grads=18
+cos(g_off, g_floss)=1.0000 (below-target: should be 1.0 = untouched)
+[fire ] rho_hat=0.9919 finite=True cos(g_off, g_fire)=0.9803 (should be <1 but >0.9: capped perturbation)
+FLOSS_SMOKE_DONE
diff --git a/ep_run/floss_smoke.py b/ep_run/floss_smoke.py
new file mode 100644
index 0000000..371f98a
--- /dev/null
+++ b/ep_run/floss_smoke.py
@@ -0,0 +1,59 @@
+"""floss-ept mechanics smoke test: three ep_step calls on one fixed batch.
+
+(a) floss off: reference task gradient g_off (default-off path unchanged).
+(b) floss on, default target: the penalty is one-sided, so whenever rho_hat is below the target the
+    gradient must be untouched (cos(g_off, g_floss) = 1.0).
+(c) floss on, target forced below the measured rho_hat (floss_rho=0.90): the penalty path runs
+    end-to-end, grads stay finite, and the floss contribution is a bounded perturbation of the task
+    gradient (cos(g_off, g_fire) stays high — the 0.2 task-norm cap).
+"""
+import math
+
+import torch
+
+import lt_ep_train as L
+
+
+def grads_finite(grads):
+    """Return True when every non-None tensor in the gradient dict is finite everywhere."""
+    return all(torch.isfinite(v).all().item() for v in grads.values() if v is not None)
+
+
+def grad_cos(params, ga, gb):
+    """Cosine similarity of two gradient dicts keyed by id(param), over params present in both."""
+    dot = na = nb = 0.0
+    for p in params:
+        a, b = ga.get(id(p)), gb.get(id(p))
+        if a is None or b is None:
+            continue
+        dot += float((a * b).sum())
+        na += float((a * a).sum())
+        nb += float((b * b).sum())
+    return dot / math.sqrt(na * nb + 1e-20)  # epsilon keeps the all-zero case finite
+
+
+torch.manual_seed(0)
+blk = L.EQBlock(512, 16, 256, 256, c=1.0, attn_mode='thick')
+blk.qknorm = True
+torch.manual_seed(7)
+idx, y = L.get_batch('train', 8, 256)
+
+# (a) floss off: reference gradient
+g0, r0 = L.ep_step(blk, idx, y, 30, 8, 0.1, 0.02, jacreg=0.0)
+print(f"[off ] res={r0:.3e} n_grads={len(g0)}", flush=True)
+
+# (b) floss on with the default target; ep_step records the measured rate on blk._floss_rho
+g1, r1 = L.ep_step(blk, idx, y, 30, 8, 0.1, 0.02, jacreg=0.0, floss=0.2, floss_q=8, floss_bsub=4)
+rho = getattr(blk, '_floss_rho', float('nan'))  # print nan instead of crashing the format if unset
+fin = grads_finite(g1)
+cos1 = grad_cos(blk.block, g0, g1)
+print(f"[floss] res={r1:.3e} rho_hat={rho:.4f} finite={fin} n_grads={len(g1)}", flush=True)
+print(f"cos(g_off, g_floss)={cos1:.4f} (below-target: should be 1.0 = untouched)", flush=True)
+
+# (c) force-fire the penalty path (target below the measured rho) — exercises grad/lam/accumulate end-to-end
+g2, _ = L.ep_step(blk, idx, y, 30, 8, 0.1, 0.02, jacreg=0.0, floss=0.2, floss_q=8, floss_bsub=4, floss_rho=0.90)
+rho2 = getattr(blk, '_floss_rho', float('nan'))
+fin2 = grads_finite(g2)
+cos2 = grad_cos(blk.block, g0, g2)
+print(f"[fire ] rho_hat={rho2:.4f} finite={fin2} cos(g_off, g_fire)={cos2:.4f} (should be <1 but >0.9: capped perturbation)", flush=True)
+print("FLOSS_SMOKE_DONE", flush=True)
diff --git a/ep_run/lt_ep_train.py b/ep_run/lt_ep_train.py
index 4e7b8b1..99a1811 100644
--- a/ep_run/lt_ep_train.py
+++ b/ep_run/lt_ep_train.py
@@ -138,7 +138,8 @@ def ce(blk, z, y):
def ep_step(blk, idx, y, T1, T2, eps, beta, jacreg=0.0, holo=0, hr=0.02, t1max=0, res_est=1e-4, t2sel=0,
- corr_every=1, res_gate=0.0, resreg=0.0, eigreg=0.0, eig_margin=1.0):
+ corr_every=1, res_gate=0.0, resreg=0.0, eigreg=0.0, eig_margin=1.0,
+ floss=0.0, floss_q=10, floss_rho=0.995, floss_bsub=4):
xin0 = blk.embed(idx).detach()
zs = relax(blk, xin0.clone(), xin0, T1, eps)
res = (relax(blk, zs, xin0, 1, eps) - zs).norm().item() / (zs.norm().item() + 1e-9)
@@ -229,6 +230,30 @@ def ep_step(blk, idx, y, T1, T2, eps, beta, jacreg=0.0, holo=0, hr=0.02, t1max=0
for p, g in zip(blk.block, grr):
if g is not None:
grads[id(p)] = g * lam if grads.get(id(p)) is None else grads[id(p)] + lam * g
+ if floss > 0: # "floss-ept": GRADED finite-horizon LE penalty (aep 'floss' ported).
+ # Same fundamental quantity as resreg (path contraction rate = what EP-estimator validity needs;
+ # broadband, state-contamination-free) but a LINEAR early signal instead of resreg's rho^T1 cliff:
+ # unroll q steps from z_T1 on a sub-batch WITH graph, rho_hat = mean per-step growth of the update
+ # delta, one-sided penalty above rho target. Ramp keys on (rho_hat - target), NOT on resT1.
+ zb, xb = zT[:floss_bsub].detach(), xin0[:floss_bsub].detach()
+ with torch.enable_grad():
+ zc, ds = zb, []
+ for _ in range(floss_q):
+ d = eps * blk.tforce(zc, xb) # deterministic thick force, graph kept through the path
+ ds.append(d.pow(2).sum())
+ zc = zc + d
+ rho_hat = (ds[-1] / (ds[0] + 1e-20)) ** (0.5 / max(1, floss_q - 1)) # (||d_q||/||d_1||)^(1/(q-1))
+ blk._floss_rho = float(rho_hat.detach()) # logged by the train loop
+ Rf = torch.relu(rho_hat - floss_rho) ** 2
+ gf = torch.autograd.grad(Rf, blk.block, allow_unused=True) if float(Rf) > 0 else None
+ if gf is not None:
+ ratio = floss * min(1.0, float(rho_hat - floss_rho) / 0.005) # graded: full strength 0.5% over target
+ gtask = math.sqrt(sum(float((grads[id(p)] ** 2).sum()) for p in blk.block if grads.get(id(p)) is not None) + 1e-20)
+ gfl = math.sqrt(sum(float((g ** 2).sum()) for g in gf if g is not None) + 1e-20)
+ lam = ratio * gtask / gfl # cap at `ratio` of the task-grad norm (resreg convention)
+ for p, g in zip(blk.block, gf):
+ if g is not None:
+ grads[id(p)] = g * lam if grads.get(id(p)) is None else grads[id(p)] + lam * g
if eigreg > 0: # #2 v2: TRUE leading map-eigenvalue control (aep 'spectral', soft one-sided)
from eig_control import spec_penalty # (omega/numerical-abscissa version refuted 2026-07-03, eig_recheck)
ge, _rho, _mu = spec_penalty(blk, zs, eps, blk.c, eigreg, eig_margin,
@@ -378,6 +403,10 @@ def main():
ap.add_argument('--resreg', type=float, default=0.0) # T1-residual penalty: defend z_T1 (cap ratio vs task grad); run res_gate=0
ap.add_argument('--eigreg', type=float, default=0.0) # #2 v2: soft penalty on TRUE |lam|_lead(I+eps*J_F) — aep 'spectral' at C512
ap.add_argument('--eig_margin', type=float, default=0.995) # rho target: penalize |lam|_lead above this (<1 = contracting relaxation map)
+ ap.add_argument('--floss', type=float, default=0.0) # floss-ept: graded finite-horizon LE penalty (de-cliffed resreg; FINDINGS 2026-07-03)
+ ap.add_argument('--floss_q', type=int, default=10) # unroll horizon (steps past z_T1, with graph, sub-batch)
+ ap.add_argument('--floss_rho', type=float, default=0.995) # per-step contraction target (one-sided; matches eig_margin)
+ ap.add_argument('--floss_bsub', type=int, default=4) # sub-batch rows for the graphed unroll (memory)
ap.add_argument('--diag_cos', type=int, default=0) # #1: every N steps, log cos(EP grad, exact BPTT grad) + res
ap.add_argument('--fingerprint', action='store_true') # load --init_ckpt, print (res,cos,abscissa,val) fingerprint, exit
ap.add_argument('--opt', choices=['adamw', 'lion', 'lionlars', 'sgdm', 'sgdsai'], default='adamw')
@@ -531,7 +560,7 @@ def main():
sw = hw_swap() if hw_on else None
grads, res = ep_step(blk, idx, y, cfg.T1, cfg.T2, cfg.eps, cfg.beta, jr, cfg.holo, cfg.hr,
cfg.t1max, cfg.res_est, cfg.t2sel, cfg.corr_every, cfg.res_gate, cfg.resreg,
- cfg.eigreg, cfg.eig_margin)
+ cfg.eigreg, cfg.eig_margin, cfg.floss, cfg.floss_q, cfg.floss_rho, cfg.floss_bsub)
if sw is not None:
hw_restore(sw)
if cfg.jacreg > 0: # continuous controller: drive residual -> res_target (smooth)
@@ -619,7 +648,8 @@ def main():
torch.save({'allp': [p.detach().cpu() for p in blk.allp],
'pema': [s.cpu() for s in pema] if pema is not None else None,
'step': step, 'best': best}, cfg.ckpt)
- print(f"step {step:4d}/{cfg.steps} | val CE {v:.4f}{etag} (best {best:.4f}) | jr={jr:.1f} res={res:.1e} | {step/(time.time()-t0):.2f} it/s", flush=True)
+ ftag = f" rho={blk._floss_rho:.4f}" if cfg.floss > 0 and hasattr(blk, '_floss_rho') else ""
+ print(f"step {step:4d}/{cfg.steps} | val CE {v:.4f}{etag} (best {best:.4f}) | jr={jr:.1f} res={res:.1e}{ftag} | {step/(time.time()-t0):.2f} it/s", flush=True)
save_state(step) # full-state checkpoint each log interval (Colab resume)
print(f"[{cfg.mode}] DONE best val CE {best:.4f} (random baseline ln({vocab})={math.log(vocab):.3f})", flush=True)
out_dir = Path('runs')
diff --git a/ep_run/runs/abl3_queue.sh b/ep_run/runs/abl3_queue.sh
new file mode 100755
index 0000000..e7c6bde
--- /dev/null
+++ b/ep_run/runs/abl3_queue.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Three-arm from-scratch reg ablation (docs/campaign/FINDINGS.md 2026-07-03 verdict):
+# arm0 abl_floss — floss-only (graded finite-horizon LE penalty; the "one fundamental reg" candidate)
+# arm1 abl_resreg — resreg-only (never cleanly run before; jacreg fully off)
+# arm2 abl_pair — proven pair (resreg 0.2 + FROZEN jr 0.1; the ★2.09 recipe = control arm)
+# All arms share every other flag with the proven ep_resreg_scratch cmd (EP_BELOW210:97-101), same seed.
+# Queue: poll GPUs 0/1/3 (GPU2 = japardi2 NV-Embed server, DO NOT TOUCH); a slot is free when
+# mem.used < 38 GB AND util < 30% for 3 consecutive 60 s polls; launch the next arm per freed slot.
+cd /home/yurenh2/ept/ep_run || exit 1
+LOG=runs/abl3_queue.log
+echo "[$(date)] queue runner up (pid $$)" >> "$LOG"
+
+# Flags shared verbatim by all three arms, kept in one place so the arms cannot drift apart;
+# only the regulariser flags and the output paths differ per arm.
+COMMON=(--mode ep --attn_mode thick --B 24 --C 512 --H 16 --T 256 --c 1.0
+        --holo 2 --hr 0.02 --t2sel 40 --track --pema 0.999 --t1max 300 --res_est 1e-4 --res_gate 0
+        --qknorm --resinit 0.1 --warmup 800 --T1 150 --T2 20 --lr 6e-4 --wsd 0.25 --steps 32000
+        --log 200 --save_every 500 --abort_res 0.3 --data data/tinystories_bpe)
+
+launch () { # $1 = gpu id, $2 = arm index (0..2); logs and returns 1 on any other index
+  local gpu=$1 arm=$2 name desc
+  local -a reg
+  case $arm in
+    0) name=abl_floss;  desc="floss-only";           reg=(--jacreg 0 --resreg 0 --floss 0.2) ;;
+    1) name=abl_resreg; desc="resreg-only";          reg=(--jacreg 0 --resreg 0.2) ;;
+    2) name=abl_pair;   desc="proven pair, control"; reg=(--jacreg 0.1 --jr_floor 0.1 --jr_max 0.1 --resreg 0.2) ;;
+    *) echo "[$(date)] launch: unknown arm index '$arm' (GPU$gpu), nothing started" >> "$LOG"; return 1 ;;
+  esac
+  CUDA_VISIBLE_DEVICES=$gpu nohup python3 lt_ep_train.py "${COMMON[@]}" "${reg[@]}" \
+    --ckpt "runs/$name.pt" --state "runs/$name.state" > "runs/$name.log" 2>&1 &
+  echo "[$(date)] $name ($desc) -> GPU$gpu pid $!" >> "$LOG"
+}
+
+i=0                   # index of the next arm to launch
+declare -A CNT USED   # CNT[gpu] = consecutive free polls; USED[gpu] = 1 once an arm was started there
+while [ "$i" -lt 3 ]; do
+  for g in 0 1 3; do
+    [ "$i" -ge 3 ] && break
+    [ -n "${USED[$g]}" ] && continue
+    read -r mem util <<< "$(nvidia-smi --query-gpu=memory.used,utilization.gpu --format=csv,noheader,nounits -i "$g" 2>/dev/null | awk -F',' '{gsub(/ /,""); print $1" "$2}')"
+    # A poll counts as free only when both readings are plain integers under the thresholds;
+    # an empty or non-numeric reading (nvidia-smi failure, "[N/A]") resets the streak quietly.
+    if [[ $mem =~ ^[0-9]+$ && $util =~ ^[0-9]+$ ]] && [ "$mem" -lt 38000 ] && [ "$util" -lt 30 ]; then
+      CNT[$g]=$(( ${CNT[$g]:-0} + 1 ))
+    else
+      CNT[$g]=0
+    fi
+    if [ "${CNT[$g]:-0}" -ge 3 ]; then
+      launch "$g" "$i"
+      USED[$g]=1
+      i=$((i + 1))
+      sleep 120 # let nvidia-smi register the new job before next slot check
+    fi
+  done
+  sleep 60
+done
+echo "[$(date)] all three arms launched — queue runner exiting" >> "$LOG"