summary | refs | log | tree | commit | diff
path: root/research/flossing/analysis_2x2/run_retest_2x2.sh
diff options
context:
space:
mode:
author: YurenHao0426 <blackhao0426@gmail.com>, 2026-06-13 12:35:36 -0500
committer: YurenHao0426 <blackhao0426@gmail.com>, 2026-06-13 12:35:36 -0500
commit: 66e0d8b9fd4d0f7a2231d689c055e26fdf1cf04a (patch)
tree: c29cba61124018755a19b02c9d33e3ad5f2e05cc /research/flossing/analysis_2x2/run_retest_2x2.sh
rrm workspace: TRM/HRM/SRM code, Maze dataset, dynamical-analysis pipeline (HEAD, main)
Curated export for clone-and-run Maze training (2x A6000) + diagnostics. trm/hrm pretrain.py carry trajectory-augmentation code (backward-compatible). Heavy artifacts (checkpoints/wandb/npz) gitignored; see PROVENANCE.md. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Diffstat (limited to 'research/flossing/analysis_2x2/run_retest_2x2.sh')
-rwxr-xr-x research/flossing/analysis_2x2/run_retest_2x2.sh 75
1 files changed, 75 insertions, 0 deletions
diff --git a/research/flossing/analysis_2x2/run_retest_2x2.sh b/research/flossing/analysis_2x2/run_retest_2x2.sh
new file mode 100755
index 0000000..9cdcc13
--- /dev/null
+++ b/research/flossing/analysis_2x2/run_retest_2x2.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+# Queue: wait for a free GPU, then run the 2x2 re-test diagnostics:
+# 1. TRM official_gbs768 @ step_58590 (86.9% ckpt) full-window n=2048
+# 2. TRM official_gbs768 @ step_58590 early-window n=2048 (first 4 ACT steps)
+# 3. HRM righteous-python @ step_26040 (joint est.) full-window n=2048
+# 4. HRM righteous-python @ step_26040 early-window n=2048
+# Same --seed 0 and same n across full/short pairs so idx fields pair up.
+set -o pipefail
+
+cd /home/yurenh2/rrm/research/flossing
+source /home/yurenh2/miniconda3/etc/profile.d/conda.sh
+conda activate rrm
+
+OUTDIR=analysis_2x2/retest
+mkdir -p "$OUTDIR"
+STATUS="$OUTDIR/queue_status.log"
+TRM_ROOT="/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro"
+HRM_ROOT="/home/yurenh2/rrm/hrm/checkpoints/Sudoku-extreme-1k-aug-1000 ACT-torch/HierarchicalReasoningModel_ACTV1 righteous-python"
+
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$STATUS"; }
+
+free_gpu() {
+ # print index of a GPU with util<30% and mem<8GB, else empty
+ nvidia-smi --query-gpu=index,utilization.gpu,memory.used --format=csv,noheader,nounits \
+ | awk -F', ' '$2<30 && $3<8000 {print $1; exit}'
+}
+
+# Poll until one GPU looks idle on two checks taken 60s apart, retrying every
+# 5 minutes. Once the 12h deadline has passed, stop waiting and take the GPU
+# with the least memory in use, whatever its utilization.
+log "queue started, waiting for a free GPU (util<30%, mem<8GB, two checks 60s apart; 12h fallback)"
+DEADLINE=$(( $(date +%s) + 12*3600 ))
+GPU=""
+while :; do
+  first_pick="$(free_gpu)"
+  if [[ -n "$first_pick" ]]; then
+    # Re-check after a minute so a momentary dip in load is not mistaken for
+    # an idle GPU; the same index must be reported both times.
+    sleep 60
+    second_pick="$(free_gpu)"
+    if [[ "$second_pick" == "$first_pick" ]]; then
+      GPU="$first_pick"
+      break
+    fi
+  fi
+  now="$(date +%s)"
+  if (( now > DEADLINE )); then
+    # Rows are "index, memory.used"; sort numerically on the memory column and
+    # keep the index of the first (lowest-usage) row.
+    GPU="$(nvidia-smi --query-gpu=index,memory.used --format=csv,noheader,nounits \
+      | sort -t, -k2 -n \
+      | head -1 \
+      | cut -d, -f1)"
+    log "12h fallback: taking GPU $GPU (most free memory) despite utilization"
+    break
+  fi
+  sleep 300
+done
+log "claimed GPU $GPU"
+# Restrict every child process started below to the claimed GPU.
+export CUDA_VISIBLE_DEVICES="$GPU"
+
+run_job() {
+ local name="$1"; shift
+ if [[ -f "$OUTDIR/${name}.npz" ]]; then log "skip $name (output exists)"; return 0; fi
+ log "start $name"
+ if python "$@" --out "$OUTDIR/${name}.npz" > "$OUTDIR/${name}.log" 2>&1; then
+ log "done $name"
+ else
+ log "FAILED $name (see $OUTDIR/${name}.log)"
+ fi
+}
+
+run_job trm_gbs768_step58590_full_n2048 \
+ diagnose_trm_joint.py --ckpt-root "$TRM_ROOT" --ckpt-name step_58590 \
+ --n-samples 2048 --batch-size 16 --k-lyap 8 --t-ons 1 --seed 0
+
+run_job trm_gbs768_step58590_short_n2048 \
+ diagnose_trm_joint_short.py --ckpt-root "$TRM_ROOT" --ckpt-name step_58590 \
+ --n-samples 2048 --batch-size 16 --k-lyap 8 --t-ons 1 --seed 0
+
+run_job hrm_righteous_step26040_full_n2048 \
+ diagnose_hrm_joint.py --ckpt-root "$HRM_ROOT" --ckpt-name step_26040 \
+ --n-samples 2048 --batch-size 32 --k-lyap 8 --t-ons 1 --seed 0
+
+run_job hrm_righteous_step26040_short_n2048 \
+ diagnose_hrm_joint_short.py --ckpt-root "$HRM_ROOT" --ckpt-name step_26040 \
+ --n-samples 2048 --batch-size 32 --k-lyap 8 --t-ons 1 --seed 0
+
+log "all retest diagnostics finished"