summaryrefslogtreecommitdiff
path: root/analysis_2x2/queue_maze_followups.sh
diff options
context:
space:
mode:
Diffstat (limited to 'analysis_2x2/queue_maze_followups.sh')
-rwxr-xr-xanalysis_2x2/queue_maze_followups.sh50
1 files changed, 50 insertions, 0 deletions
diff --git a/analysis_2x2/queue_maze_followups.sh b/analysis_2x2/queue_maze_followups.sh
new file mode 100755
index 0000000..2257013
--- /dev/null
+++ b/analysis_2x2/queue_maze_followups.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+# Pre-HRM-Maze queue (user request): (1) per-cell failure structure of TRM-Maze (cheap inference),
+# (2) continue-train TRM-Maze from step_130200 to test saturation (does acc climb toward ~1.0?).
+# Waits for a genuinely free GPU; good-neighbor on the shared box.
+set -o pipefail
+cd /home/yurenh2/rrm/research/flossing || exit 1  # OUTDIR/ST and maze_pred_dump.py below are relative to this dir
+source /home/yurenh2/miniconda3/etc/profile.d/conda.sh || exit 1
+conda activate rrm || exit 1  # stop rather than run the python steps outside the rrm env
+OUTDIR=analysis_2x2/maze_followup; mkdir -p "$OUTDIR" || exit 1
+ST="$OUTDIR/queue.log"; log(){ echo "[$(date '+%m-%d %H:%M:%S')] $*" >> "$ST"; }  # log MSG: append a timestamped line to $ST
+CKDIR=/home/yurenh2/rrm/trm/checkpoints/maze-30x30-hard-1k-ACT-torch/pretrain_att_maze30x30_2gpu_gbs384
+DATA=/home/yurenh2/rrm/data/maze-30x30-hard-1k
+free_gpu(){ nvidia-smi --format=csv,noheader,nounits --query-gpu=index,utilization.gpu,memory.used \
+ | awk -F', ' '{ if ($2 < 20 && $3 < 4000) { print $1; exit } }'; }  # prints the first index with util<20 and mem<4000, else nothing
+
+log "queue started; waiting for a free GPU (util<20, mem<4G; 12h fallback)"
+GPU=""; DEADLINE=$(( $(date +%s) + 12*3600 ))
+while :; do  # claim a GPU only when free_gpu names it twice, 60 s apart; past DEADLINE take the one with least memory used
+ cand="$(free_gpu)"; if [[ -n "$cand" ]]; then sleep 60; if [[ "$(free_gpu)" == "$cand" ]]; then GPU="$cand"; break; fi; fi
+ now=$(date +%s); if (( now > DEADLINE )); then GPU="$(nvidia-smi --query-gpu=index,memory.used --format=csv,noheader,nounits | sort -t, -k2 -n | head -n 1 | cut -d, -f1)"; log "fallback GPU $GPU"; break; fi
+ sleep 300
+done
+log "claimed GPU $GPU"; export CUDA_VISIBLE_DEVICES="$GPU"
+
+# run_dump TAG CKPT_ROOT CKPT_NAME DATA_DIR OUT_STEM: call maze_pred_dump.py with --n 512 and --out $OUTDIR/OUT_STEM.npz,
+# capture its stdout+stderr in $OUTDIR/preds_TAG.log, then log "done TAG dump" or "FAILED TAG dump" by exit status.
+run_dump(){ if python maze_pred_dump.py --ckpt-root "$2" --ckpt-name "$3" --data "$4" \
+ --n 512 --out "$OUTDIR/$5.npz" > "$OUTDIR/preds_$1.log" 2>&1; then log "done $1 dump"; else log "FAILED $1 dump"; fi; }
+# (1) Solution-space dynamics + per-cell structure: TRM-Maze final ckpt (saves preds, inputs,
+# labels, AND per-step decoded-answer Hamming drift over solution-space cells).
+log "start MAZE solution-space dump (inference, no JVP)"
+run_dump maze "$CKDIR" step_130200 "$DATA" maze_preds_step130200
+# (1b) SUDOKU control: solution-space dynamics where full-space DID separate.
+SUD_CK=/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro
+SUD_DATA=/home/yurenh2/rrm/data/sudoku-extreme-1k-aug-1000
+log "start SUDOKU solution-space control dump"
+run_dump sudoku "$SUD_CK" step_58590 "$SUD_DATA" sudoku_preds_step58590
+
+# (2) continue-train TRM-Maze from step_130200 (saturation test). gbs 192 on 1 GPU; constant LR.
+# NOTE(review): the log text previously said +25000 epochs but the command ran epochs=12500; both now use EPOCHS — confirm 12500.
+EPOCHS=12500; EVAL_EVERY=2500; RUN=pretrain_att_maze30x30_CONTINUE_from130200
+log "start continue-train TRM-Maze from step_130200 (+$EPOCHS epochs, eval every $EVAL_EVERY)"
+cd /home/yurenh2/rrm/trm || { log "FAILED cd to trm; continue-train skipped"; exit 1; }  # cwd unchanged on failure, so log still works
+export WANDB_MODE=offline
+nohup python pretrain.py arch=trm "data_paths=[$DATA]" "evaluators=[]" epochs="$EPOCHS" eval_interval="$EVAL_EVERY" \
+ lr=1e-4 puzzle_emb_lr=1e-4 weight_decay=1.0 puzzle_emb_weight_decay=1.0 global_batch_size=192 \
+ arch.L_layers=2 arch.H_cycles=3 arch.L_cycles=4 +run_name="$RUN" ema=True \
+ +checkpoint_every_eval=true "+load_checkpoint=$CKDIR/step_130200" \
+ > "/home/yurenh2/rrm/research/flossing/$OUTDIR/continue_train.log" 2>&1
+rc=$?; echo "continue-train exited (status $rc)" >> "/home/yurenh2/rrm/research/flossing/$ST"  # absolute path: cwd is now trm and $ST is relative