From 66e0d8b9fd4d0f7a2231d689c055e26fdf1cf04a Mon Sep 17 00:00:00 2001 From: YurenHao0426 Date: Sat, 13 Jun 2026 12:35:36 -0500 Subject: rrm workspace: TRM/HRM/SRM code, Maze dataset, dynamical-analysis pipeline Curated export for clone-and-run Maze training (2x A6000) + diagnostics. trm/hrm pretrain.py carry trajectory-augmentation code (backward-compatible). Heavy artifacts (checkpoints/wandb/npz) gitignored; see PROVENANCE.md. Co-Authored-By: Claude Fable 5 --- research/flossing/analysis_2x2/run_phase1_queue.sh | 92 ++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100755 research/flossing/analysis_2x2/run_phase1_queue.sh (limited to 'research/flossing/analysis_2x2/run_phase1_queue.sh') diff --git a/research/flossing/analysis_2x2/run_phase1_queue.sh b/research/flossing/analysis_2x2/run_phase1_queue.sh new file mode 100755 index 0000000..989c7ee --- /dev/null +++ b/research/flossing/analysis_2x2/run_phase1_queue.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +# Phase-1 queue (experiment_framework.md): E5 horizon sweeps, E2 run-level replication, +# E6 matched-objective step9 pairs. Waits for a free GPU (12h fallback), runs sequentially. 
# Deliberately no `set -e`: a failing diagnostic job is logged by run_job and
# the queue moves on to the next one. Only environment-setup problems (wrong
# directory, missing conda env, no usable GPU index) abort the whole queue.
set -o pipefail

# Print a message to stderr and abort the queue with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# OUTDIR and the diagnose_*.py scripts below are given as relative paths, so
# everything depends on this cd succeeding.
WORKDIR=/home/yurenh2/rrm/research/flossing
CONDA_SH=/home/yurenh2/miniconda3/etc/profile.d/conda.sh
cd "$WORKDIR" || die "cannot cd to $WORKDIR"
[[ -r "$CONDA_SH" ]] || die "conda init script not readable: $CONDA_SH"
# shellcheck disable=SC1090  # path is machine-specific, not followable
source "$CONDA_SH"
conda activate rrm || die "cannot activate conda env 'rrm'"

OUTDIR=analysis_2x2/phase1
mkdir -p "$OUTDIR" || die "cannot create $OUTDIR"
STATUS="$OUTDIR/queue_status.log"
TRM_OFF="/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro"
TRM_SGL="/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_singleGPU"
# NOTE: this path contains spaces; it must always be expanded inside quotes.
HRM_ROOT="/home/yurenh2/rrm/hrm/checkpoints/Sudoku-extreme-1k-aug-1000 ACT-torch/HierarchicalReasoningModel_ACTV1 righteous-python"
# Directory holding the step9_* checkpoint folders referenced by E2/E6.
S9=/home/yurenh2/rrm/research/flossing

# Append a timestamped line to the queue status log.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$STATUS"
}

# Print the index of the first GPU reported by nvidia-smi with utilization
# below 30 and used memory below 8000 (units as reported with `nounits`).
# Prints nothing when no GPU qualifies.
free_gpu() {
  nvidia-smi --query-gpu=index,utilization.gpu,memory.used \
    --format=csv,noheader,nounits \
    | awk -F', ' '$2<30 && $3<8000 {print $1; exit}'
}

log "phase-1 queue started (E5 horizon sweeps, E2 step9_E replication, E6 step9 pairs)"

# --- Claim a GPU -----------------------------------------------------------
# A GPU is claimed only if the same index looks free on two probes 60 s
# apart. After 12 h of waiting, fall back to the GPU with the least memory
# in use, free or not.
DEADLINE=$(( $(date +%s) + 12*3600 ))
GPU=""
while true; do
  g1="$(free_gpu)"
  if [[ -n "$g1" ]]; then
    sleep 60
    g2="$(free_gpu)"
    if [[ "$g2" == "$g1" ]]; then
      GPU="$g1"
      break
    fi
  fi
  if (( $(date +%s) > DEADLINE )); then
    GPU="$(nvidia-smi --query-gpu=index,memory.used --format=csv,noheader,nounits \
      | sort -t, -k2 -n | head -1 | cut -d, -f1)"
    log "12h fallback: taking GPU $GPU"
    break
  fi
  sleep 300
done

# Guard against a missing/broken nvidia-smi: its output would then be empty
# or an error message rather than an index. An empty CUDA_VISIBLE_DEVICES
# hides every GPU from CUDA, so refuse to continue instead of queueing
# GPU diagnostics on a device list that is empty or garbage.
if [[ ! "$GPU" =~ ^[0-9]+$ ]]; then
  log "ABORT: no valid GPU index from nvidia-smi (got '$GPU')"
  die "no valid GPU index from nvidia-smi (got '$GPU')"
fi
log "claimed GPU $GPU"
export CUDA_VISIBLE_DEVICES="$GPU"

# Run one diagnostic job unless its output already exists.
# Arguments:
#   $1    - job name; output goes to $OUTDIR/<name>.npz, log to <name>.log
#   $2    - value exported to the job as DIAG_HORIZON
#   $3... - python script followed by its arguments
# Returns 0 always; success/failure is recorded in the status log so that one
# failing job does not stop the rest of the queue.
# NOTE(review): the skip test only checks that the .npz exists; a truncated
# file left by a killed job would also be skipped - confirm the diagnose
# scripts only create the output file when they finish.
run_job() { # name horizon script args...
  local name="$1" hor="$2"
  shift 2
  if [[ -f "$OUTDIR/${name}.npz" ]]; then
    log "skip $name"
    return 0
  fi
  log "start $name"
  if DIAG_HORIZON="$hor" python "$@" --out "$OUTDIR/${name}.npz" \
    > "$OUTDIR/${name}.log" 2>&1; then
    log "done $name"
  else
    log "FAILED $name"
  fi
}

# --- E5: TRM horizon sweep (h=4 already exists in retest/) ---
for H in 2 6 8 10 12; do
  run_job "trm_official58590_h${H}_n2048" "$H" diagnose_trm_joint_horizon.py \
    --ckpt-root "$TRM_OFF" --ckpt-name step_58590 --n-samples 2048 --batch-size 16 \
    --k-lyap 8 --t-ons 1 --seed 0
done

# --- E5: HRM horizon sweep ---
for H in 2 6 8 10 12; do
  run_job "hrm26040_h${H}_n2048" "$H" diagnose_hrm_joint_horizon.py \
    --ckpt-root "$HRM_ROOT" --ckpt-name step_26040 --n-samples 2048 --batch-size 32 \
    --k-lyap 8 --t-ons 1 --seed 0
done

# --- E2: HRM second training run (step9_E fixed-unroll baseline), full window ---
run_job "step9E_hrm_best_full_n2048" 16 diagnose_hrm_joint.py \
  --ckpt-root "$HRM_ROOT" --ckpt-name "$S9/step9_E_hrm_baseline_parallel_fixed_26040_50k_ckpts/best.pt" \
  --n-samples 2048 --batch-size 32 --k-lyap 8 --t-ons 1 --seed 0
run_job "step9E_hrm_final_full_n2048" 16 diagnose_hrm_joint.py \
  --ckpt-root "$HRM_ROOT" --ckpt-name "$S9/step9_E_hrm_baseline_parallel_fixed_26040_50k_ckpts/final.pt" \
  --n-samples 2048 --batch-size 32 --k-lyap 8 --t-ons 1 --seed 0

# --- E6: matched-objective pairs (n=512): HRM E vs F, TRM G vs H ---
for CK in step_12500 step_25000 best final; do
  run_job "step9E_hrm_${CK}_n512" 16 diagnose_hrm_joint.py \
    --ckpt-root "$HRM_ROOT" --ckpt-name "$S9/step9_E_hrm_baseline_parallel_fixed_26040_50k_ckpts/${CK}.pt" \
    --n-samples 512 --batch-size 32 --k-lyap 8 --t-ons 1 --seed 0
  run_job "step9F_hrm_${CK}_n512" 16 diagnose_hrm_joint.py \
    --ckpt-root "$HRM_ROOT" --ckpt-name "$S9/step9_F_hrm_multi4_loguniform_ramp_26040_50k_ckpts/${CK}.pt" \
    --n-samples 512 --batch-size 32 --k-lyap 8 --t-ons 1 --seed 0
  run_job "step9G_trm_${CK}_n512" 16 diagnose_trm_joint.py \
    --ckpt-root "$TRM_SGL" --ckpt-name "$S9/step9_G_trm_baseline_parallel_fixed_26041_batch4_50k_ckpts/${CK}.pt" \
    --n-samples 512 --batch-size 16 --k-lyap 8 --t-ons 1 --seed 0
  run_job "step9H_trm_${CK}_n512" 16 diagnose_trm_joint.py \
    --ckpt-root "$TRM_SGL" --ckpt-name "$S9/step9_H_trm_multi4_loguniform_ramp_26041_batch4_50k_ckpts/${CK}.pt" \
    --n-samples 512 --batch-size 16 --k-lyap 8 --t-ons 1 --seed 0
done

log "phase-1 queue finished"