summary refs log tree commit diff
path: root/ep_run/runs
diff options
context:
space:
mode:
Diffstat (limited to 'ep_run/runs')
-rwxr-xr-x ep_run/runs/abl3_queue.sh 45
1 files changed, 45 insertions, 0 deletions
diff --git a/ep_run/runs/abl3_queue.sh b/ep_run/runs/abl3_queue.sh
new file mode 100755
index 0000000..e7c6bde
--- /dev/null
+++ b/ep_run/runs/abl3_queue.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Three-arm from-scratch reg ablation (docs/campaign/FINDINGS.md 2026-07-03 verdict):
+# arm0 abl_floss — floss-only (graded finite-horizon LE penalty; the "one fundamental reg" candidate)
+# arm1 abl_resreg — resreg-only (never cleanly run before; jacreg fully off)
+# arm2 abl_pair — proven pair (resreg 0.2 + FROZEN jr 0.1; the ★2.09 recipe = control arm)
+# All arms share every other flag with the proven ep_resreg_scratch cmd (EP_BELOW210:97-101), same seed.
+# Queue: poll GPUs 0/1/3 (GPU2 = japardi2 NV-Embed server, DO NOT TOUCH); a slot is free when
+# memory.used < 38000 MiB AND util < 30% for 3 consecutive 60 s polls; launch the next arm per freed slot (at most one arm per GPU).
+cd /home/yurenh2/ept/ep_run || exit 1 # every runs/... and data/... path below is relative to this directory
+LOG=runs/abl3_queue.log # the queue runner's own log, appended to; each arm writes its own runs/abl_*.log
+echo "[$(date)] queue runner up (pid $$)" >> "$LOG"
+
+launch () { # $1 = gpu id, $2 = arm index
+ case $2 in
+ 0) CUDA_VISIBLE_DEVICES=$1 nohup python3 lt_ep_train.py --mode ep --attn_mode thick --B 24 --C 512 --H 16 --T 256 --c 1.0 --jacreg 0 --resreg 0 --floss 0.2 --holo 2 --hr 0.02 --t2sel 40 --track --pema 0.999 --t1max 300 --res_est 1e-4 --res_gate 0 --qknorm --resinit 0.1 --warmup 800 --T1 150 --T2 20 --lr 6e-4 --wsd 0.25 --steps 32000 --log 200 --save_every 500 --abort_res 0.3 --data data/tinystories_bpe --ckpt runs/abl_floss.pt --state runs/abl_floss.state > runs/abl_floss.log 2>&1 &
+ echo "[$(date)] abl_floss (floss-only) -> GPU$1 pid $!" >> "$LOG" ;;
+ 1) CUDA_VISIBLE_DEVICES=$1 nohup python3 lt_ep_train.py --mode ep --attn_mode thick --B 24 --C 512 --H 16 --T 256 --c 1.0 --jacreg 0 --resreg 0.2 --holo 2 --hr 0.02 --t2sel 40 --track --pema 0.999 --t1max 300 --res_est 1e-4 --res_gate 0 --qknorm --resinit 0.1 --warmup 800 --T1 150 --T2 20 --lr 6e-4 --wsd 0.25 --steps 32000 --log 200 --save_every 500 --abort_res 0.3 --data data/tinystories_bpe --ckpt runs/abl_resreg.pt --state runs/abl_resreg.state > runs/abl_resreg.log 2>&1 &
+ echo "[$(date)] abl_resreg (resreg-only) -> GPU$1 pid $!" >> "$LOG" ;;
+ 2) CUDA_VISIBLE_DEVICES=$1 nohup python3 lt_ep_train.py --mode ep --attn_mode thick --B 24 --C 512 --H 16 --T 256 --c 1.0 --jacreg 0.1 --jr_floor 0.1 --jr_max 0.1 --resreg 0.2 --holo 2 --hr 0.02 --t2sel 40 --track --pema 0.999 --t1max 300 --res_est 1e-4 --res_gate 0 --qknorm --resinit 0.1 --warmup 800 --T1 150 --T2 20 --lr 6e-4 --wsd 0.25 --steps 32000 --log 200 --save_every 500 --abort_res 0.3 --data data/tinystories_bpe --ckpt runs/abl_pair.pt --state runs/abl_pair.state > runs/abl_pair.log 2>&1 &
+ echo "[$(date)] abl_pair (proven pair, control) -> GPU$1 pid $!" >> "$LOG" ;;
+ esac
+}
+
+i=0 # index of the next arm to launch (0..2); the runner stops once it reaches 3
+declare -A CNT USED # CNT[g] = current streak of "free" polls for GPU g; USED[g] is set once this runner placed an arm on g
+while [ "$i" -lt 3 ]; do
+  for g in 0 1 3; do
+    [ "$i" -ge 3 ] && break
+    [ -n "${USED[$g]}" ] && continue # one arm per GPU: skip slots this runner has already filled
+    read -r mem util <<< "$(nvidia-smi --query-gpu=memory.used,utilization.gpu --format=csv,noheader,nounits -i "$g" 2>/dev/null | awk -F',' '{gsub(/ /,""); print $1" "$2}')"
+    if [[ $mem =~ ^[0-9]+$ && $util =~ ^[0-9]+$ ]] && [ "$mem" -lt 38000 ] && [ "$util" -lt 30 ]; then # digits-only check first, so a failed query or a non-numeric field never reaches the integer tests
+      CNT[$g]=$(( ${CNT[$g]:-0} + 1 ))
+    else
+      CNT[$g]=0 # busy, query failed, or unparsable reading: the streak starts over
+    fi
+    if [ "${CNT[$g]:-0}" -ge 3 ]; then
+      launch "$g" "$i"
+      USED[$g]=1
+      i=$((i + 1))
+      sleep 120 # let nvidia-smi register the new job before next slot check
+    fi
+  done
+  sleep 60
+done
+echo "[$(date)] all three arms launched — queue runner exiting" >> "$LOG"