1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
|
#!/usr/bin/env bash
# Phase-1 queue (experiment_framework.md): E5 horizon sweeps, E2 run-level replication,
# E6 matched-objective step9 pairs. Waits for a free GPU (12h fallback), runs sequentially.
set -o pipefail

# Fail fast on a broken environment. Without these guards a failed cd or conda
# activation goes unnoticed: the script would still wait for a GPU and then
# launch every job from the wrong directory or with the wrong interpreter.
cd /home/yurenh2/rrm/research/flossing \
  || { echo "cannot cd to /home/yurenh2/rrm/research/flossing" >&2; exit 1; }
# shellcheck source=/dev/null
source /home/yurenh2/miniconda3/etc/profile.d/conda.sh \
  || { echo "cannot source conda.sh" >&2; exit 1; }
conda activate rrm \
  || { echo "cannot activate conda env 'rrm'" >&2; exit 1; }

# Output directory (relative to the working directory above); the queue status
# log lives inside it.
OUTDIR=analysis_2x2/phase1
mkdir -p "$OUTDIR" \
  || { echo "cannot create output directory $OUTDIR" >&2; exit 1; }
STATUS="$OUTDIR/queue_status.log"

# Checkpoint roots passed to the diagnose scripts via --ckpt-root.
TRM_OFF="/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro"
TRM_SGL="/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_singleGPU"
# NOTE: this path contains spaces, so it must stay quoted wherever it is expanded.
HRM_ROOT="/home/yurenh2/rrm/hrm/checkpoints/Sudoku-extreme-1k-aug-1000 ACT-torch/HierarchicalReasoningModel_ACTV1 righteous-python"
# Directory holding the step9_* checkpoint folders (same path as the working directory).
S9=/home/yurenh2/rrm/research/flossing
# Append one timestamped line to the queue status file.
# Globals:   STATUS (read) - path of the file that is appended to
# Arguments: message words, joined into a single string via "$*"
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '[%s] %s\n' "$stamp" "$*" >> "$STATUS"
}
# Print the index of the first GPU that looks idle, or nothing if none does.
# A GPU qualifies when its reported utilization is below 30 and its used
# memory is below 8000 (values as printed by nvidia-smi with "nounits").
# Outputs: the GPU index on stdout (at most one line)
free_gpu() {
  # NF == 3 and the numeric-index test reject lines that are not real CSV rows.
  # awk treats missing fields as 0, so without them any stray line (for
  # example a diagnostic message) would pass both thresholds and be printed
  # whole as if it were a GPU index.
  nvidia-smi --query-gpu=index,utilization.gpu,memory.used \
      --format=csv,noheader,nounits \
    | awk -F', ' '
        NF == 3 && $1 ~ /^[0-9]+$/ && $2 < 30 && $3 < 8000 { print $1; exit }
      '
}
log "phase-1 queue started (E5 horizon sweeps, E2 step9_E replication, E6 step9 pairs)"

# Claim a GPU. A GPU counts as free only when two probes 60 s apart report the
# same index; otherwise poll again after 5 minutes. Once 12 hours have passed,
# stop waiting and take the GPU with the least memory in use.
DEADLINE=$(( $(date +%s) + 12*3600 ))
GPU=""
while true; do
  g1="$(free_gpu)"
  if [[ -n "$g1" ]]; then
    sleep 60; g2="$(free_gpu)"
    if [[ "$g2" == "$g1" ]]; then GPU="$g1"; break; fi
  fi
  if (( $(date +%s) > DEADLINE )); then
    # Sort "index, memory.used" rows numerically by memory and keep the index
    # of the first (smallest) row.
    GPU="$(nvidia-smi --query-gpu=index,memory.used --format=csv,noheader,nounits | sort -t, -k2 -n | head -1 | cut -d, -f1)"
    log "12h fallback: taking GPU $GPU"
    break
  fi
  sleep 300
done

# Refuse to continue without a numeric GPU index. If nvidia-smi failed, GPU is
# empty or holds non-index text; exporting that would presumably leave every
# queued job without a usable device, so abort with a clear status entry.
if [[ ! "$GPU" =~ ^[0-9]+$ ]]; then
  log "no valid GPU index obtained (got '$GPU'); aborting queue"
  exit 1
fi
log "claimed GPU $GPU"
export CUDA_VISIBLE_DEVICES="$GPU"
# Run one diagnosis job unless its result file already exists.
# Globals:   OUTDIR (read) - directory for <name>.npz and <name>.log
# Arguments: $1 - job name (basename of the result and log files)
#            $2 - horizon, passed to the job through the DIAG_HORIZON env var
#            $3.. - python script followed by its arguments; "--out <npz>" is appended
# Outputs:   job stdout+stderr go to $OUTDIR/<name>.log; progress goes through log()
# Returns:   the status of the final log call; a failing job does not stop the queue
run_job() {
  local job="$1"
  local horizon="$2"
  shift 2
  local result="$OUTDIR/${job}.npz"
  local joblog="$OUTDIR/${job}.log"

  # An existing result file means the job is skipped on re-runs.
  if [[ -f "$result" ]]; then
    log "skip $job"
    return 0
  fi

  log "start $job"
  local verdict="done"
  DIAG_HORIZON="$horizon" python "$@" --out "$result" > "$joblog" 2>&1 \
    || verdict="FAILED"
  log "$verdict $job"
}
# --- E5: TRM horizon sweep (h=4 already exists in retest/) ---
# The same checkpoint (step_58590 under $TRM_OFF) is diagnosed once per
# horizon; only the horizon and the job name change between iterations.
TRM_SWEEP_ARGS=(
  --ckpt-root "$TRM_OFF" --ckpt-name step_58590
  --n-samples 2048 --batch-size 16
  --k-lyap 8 --t-ons 1 --seed 0
)
for H in 2 6 8 10 12; do
  run_job "trm_official58590_h${H}_n2048" "$H" \
    diagnose_trm_joint_horizon.py "${TRM_SWEEP_ARGS[@]}"
done
# --- E5: HRM horizon sweep ---
# The same checkpoint (step_26040 under $HRM_ROOT) is diagnosed once per
# horizon; only the horizon and the job name change between iterations.
HRM_SWEEP_ARGS=(
  --ckpt-root "$HRM_ROOT" --ckpt-name step_26040
  --n-samples 2048 --batch-size 32
  --k-lyap 8 --t-ons 1 --seed 0
)
for H in 2 6 8 10 12; do
  run_job "hrm26040_h${H}_n2048" "$H" \
    diagnose_hrm_joint_horizon.py "${HRM_SWEEP_ARGS[@]}"
done
# --- E2: HRM second training run (step9_E fixed-unroll baseline), full window ---
# Diagnose the best.pt and final.pt checkpoints of the step9_E run, in that
# order, with horizon 16 and 2048 samples.
STEP9_E_DIR="$S9/step9_E_hrm_baseline_parallel_fixed_26040_50k_ckpts"
for TAG in best final; do
  run_job "step9E_hrm_${TAG}_full_n2048" 16 diagnose_hrm_joint.py \
    --ckpt-root "$HRM_ROOT" --ckpt-name "$STEP9_E_DIR/${TAG}.pt" \
    --n-samples 2048 --batch-size 32 --k-lyap 8 --t-ons 1 --seed 0
done
# --- E6: matched-objective pairs (n=512): HRM E vs F, TRM G vs H ---
# Queue one n=512, horizon-16 diagnosis.
# Arguments: $1 - job-name prefix (e.g. step9E_hrm)
#            $2 - diagnose script
#            $3 - value for --ckpt-root
#            $4 - checkpoint folder name under $S9
#            $5 - batch size
#            $6 - checkpoint stem (file is <stem>.pt)
e6_job() {
  local prefix="$1" script="$2" root="$3" ckdir="$4" bs="$5" stem="$6"
  run_job "${prefix}_${stem}_n512" 16 "$script" \
    --ckpt-root "$root" --ckpt-name "$S9/${ckdir}/${stem}.pt" \
    --n-samples 512 --batch-size "$bs" --k-lyap 8 --t-ons 1 --seed 0
}

# For every checkpoint stem the four runs are queued in the order E, F, G, H.
for CK in step_12500 step_25000 best final; do
  e6_job step9E_hrm diagnose_hrm_joint.py "$HRM_ROOT" \
    step9_E_hrm_baseline_parallel_fixed_26040_50k_ckpts 32 "$CK"
  e6_job step9F_hrm diagnose_hrm_joint.py "$HRM_ROOT" \
    step9_F_hrm_multi4_loguniform_ramp_26040_50k_ckpts 32 "$CK"
  e6_job step9G_trm diagnose_trm_joint.py "$TRM_SGL" \
    step9_G_trm_baseline_parallel_fixed_26041_batch4_50k_ckpts 16 "$CK"
  e6_job step9H_trm diagnose_trm_joint.py "$TRM_SGL" \
    step9_H_trm_multi4_loguniform_ramp_26041_batch4_50k_ckpts 16 "$CK"
done

log "phase-1 queue finished"
|