blob: e7c6bde578ceeffe270cddc39780d12f8e02472c (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
#!/bin/bash
# Three-arm from-scratch reg ablation (docs/campaign/FINDINGS.md 2026-07-03 verdict):
# arm0 abl_floss — floss-only (graded finite-horizon LE penalty; the "one fundamental reg" candidate)
# arm1 abl_resreg — resreg-only (never cleanly run before; jacreg fully off)
# arm2 abl_pair — proven pair (resreg 0.2 + FROZEN jr 0.1; the ★2.09 recipe = control arm)
# All arms share every other flag with the proven ep_resreg_scratch cmd (EP_BELOW210:97-101), same seed.
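# Queue: poll GPUs 0/1/3 (GPU2 = japardi2 NV-Embed server, DO NOT TOUCH); a slot is free when
# memory.used < 38000 MiB (~38 GB) AND util < 30% on 3 consecutive polls (polls are 60 s apart, plus a
# 120 s pause after each launch); launch the next arm on each freed slot, at most one arm per GPU.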
# Work from the run directory: the runs/... and data/... paths used by this script are relative to it.
# Bail out if it is missing rather than writing logs/checkpoints somewhere unexpected.
if ! cd /home/yurenh2/ept/ep_run; then
  exit 1
fi

# Append-only log of the queue runner's own activity (start-up, launches, exit).
LOG=runs/abl3_queue.log
printf '[%s] queue runner up (pid %s)\n' "$(date)" "$$" >> "$LOG"
# launch GPU ARM - start one ablation arm in the background on a single GPU.
#
# Arguments:
#   $1  GPU id; passed to the trainer via CUDA_VISIBLE_DEVICES.
#   $2  arm index: 0 = abl_floss, 1 = abl_resreg, 2 = abl_pair.
# Globals:
#   LOG (read) - queue log file; one line per launch is appended to it.
# Outputs:
#   Trainer stdout+stderr go to runs/<arm name>.log (truncated on launch).
# Returns:
#   Status of the log append for a known arm; 1 (nothing started) for an unknown arm index.
#
# The three arms differ ONLY in their regulariser flags and in the run name used for the
# ckpt/state/log paths. Everything else is kept in one shared list so the arms cannot drift apart.
launch () { # $1 = gpu id, $2 = arm index
  local gpu=$1 arm=$2
  local name label
  local -a reg

  case "$arm" in
    0) name=abl_floss;  label='floss-only'
       reg=(--jacreg 0 --resreg 0 --floss 0.2) ;;
    1) name=abl_resreg; label='resreg-only'
       reg=(--jacreg 0 --resreg 0.2) ;;
    2) name=abl_pair;   label='proven pair, control'
       reg=(--jacreg 0.1 --jr_floor 0.1 --jr_max 0.1 --resreg 0.2) ;;
    *) # Unknown index: refuse loudly instead of silently doing nothing.
       echo "[$(date)] launch: unknown arm index '$arm' (GPU$gpu) - nothing started" >> "$LOG"
       return 1 ;;
  esac

  # Full trainer command; argument order is: shared model flags, per-arm reg flags,
  # shared training flags, per-arm output paths.
  local -a cmd=(
    python3 lt_ep_train.py
    --mode ep --attn_mode thick --B 24 --C 512 --H 16 --T 256 --c 1.0
    "${reg[@]}"
    --holo 2 --hr 0.02 --t2sel 40 --track --pema 0.999 --t1max 300
    --res_est 1e-4 --res_gate 0 --qknorm --resinit 0.1 --warmup 800
    --T1 150 --T2 20 --lr 6e-4 --wsd 0.25 --steps 32000 --log 200
    --save_every 500 --abort_res 0.3 --data data/tinystories_bpe
    --ckpt "runs/${name}.pt" --state "runs/${name}.state"
  )

  # nohup + '&': the job keeps running after this runner (or its terminal) goes away.
  CUDA_VISIBLE_DEVICES=$gpu nohup "${cmd[@]}" > "runs/${name}.log" 2>&1 &
  echo "[$(date)] ${name} (${label}) -> GPU${gpu} pid $!" >> "$LOG"
}
# Queue loop: hand out arms 0, 1, 2 (in that order) to GPUs 0/1/3 as each becomes idle.
# A GPU counts as idle once it has passed the memory/utilisation check on 3 polls in a row;
# any failed or unreadable poll resets its streak. Each GPU receives at most one arm.
i=0                   # index of the next arm to launch
declare -A CNT USED   # CNT[g]: current streak of idle polls on GPU g; USED[g]: non-empty once GPU g got an arm
while [ "$i" -lt 3 ]; do
  for g in 0 1 3; do
    [ "$i" -ge 3 ] && break             # all arms placed mid-pass: stop polling
    [ -n "${USED[$g]}" ] && continue    # this GPU already runs one of our arms

    # One CSV line "mem, util" per query; awk strips the blanks and re-emits the two fields
    # space-separated so `read` can split them. A failing nvidia-smi leaves both empty.
    read -r mem util <<< "$(nvidia-smi --query-gpu=memory.used,utilization.gpu --format=csv,noheader,nounits -i "$g" 2>/dev/null | awk -F',' '{gsub(/ /,""); print $1" "$2}')"

    # Only compare when BOTH fields are plain unsigned integers. Error text or "[N/A]" from
    # nvidia-smi is then treated as "not idle" without `[` printing "integer expression expected".
    if [[ $mem =~ ^[0-9]+$ && $util =~ ^[0-9]+$ ]] && [ "$mem" -lt 38000 ] && [ "$util" -lt 30 ]; then
      CNT[$g]=$(( ${CNT[$g]:-0} + 1 ))
    else
      CNT[$g]=0
    fi

    if [ "${CNT[$g]:-0}" -ge 3 ]; then
      launch "$g" "$i"
      USED[$g]=1
      i=$((i + 1))
      sleep 120 # let nvidia-smi register the new job before next slot check
    fi
  done
  sleep 60
done
echo "[$(date)] all three arms launched — queue runner exiting" >> "$LOG"
|