diff options
Diffstat (limited to 'scripts/queue_trm_sudoku.sh')
| -rwxr-xr-x | scripts/queue_trm_sudoku.sh | 35 |
1 files changed, 35 insertions, 0 deletions
#!/usr/bin/env bash
# Queue TRM Sudoku 1k 4-GPU training: wait for Step 3 to finish, then launch.
#
# Usage:
#   nohup bash scripts/queue_trm_sudoku.sh > runs/trm_queue.log 2>&1 &
#
# Optional environment overrides (the defaults reproduce the original,
# hard-coded setup):
#   RRM_ROOT   project root containing runs/, trm/ and data/
#              (default: /home/yurenh2/rrm)
#   CONDA_ENV  conda environment to activate (default: rrm)
#
# Exit status: 0 when training succeeds; otherwise the failing command's
# status (torchrun's exit code if the training itself failed).
set -euo pipefail

RRM_ROOT="${RRM_ROOT:-/home/yurenh2/rrm}"
CONDA_ENV="${CONDA_ENV:-rrm}"

LOG_DIR="$RRM_ROOT/runs"
mkdir -p "$LOG_DIR"
TRM_LOG="$LOG_DIR/trm_sudoku_$(date +%Y%m%d_%H%M%S).log"

# Without pgrep the wait loop below would fall through immediately and start
# training while Step 3 may still be running, so refuse to continue.
if ! command -v pgrep >/dev/null; then
  echo "[$(date)] pgrep not found; cannot wait for step3_train processes" >&2
  exit 1
fi

echo "[$(date)] queued, waiting for step3_train processes to exit..."

# Poll once a minute until no process command line matches the Step 3 trainer.
while pgrep -f step3_train_with_rf >/dev/null; do
  sleep 60
done

echo "[$(date)] Step 3 done — launching TRM Sudoku 1k 4-GPU"

# Resolve the conda base in its own assignment so that a failing `conda info`
# aborts the script instead of silently yielding "/etc/profile.d/conda.sh".
conda_base="$(conda info --base)"

# conda's shell hook and activate.d scripts may read unset variables, so
# nounset is relaxed only while they run.
set +u
# shellcheck disable=SC1091 -- path is only known at run time
source "$conda_base/etc/profile.d/conda.sh"
conda activate "$CONDA_ENV"
set -u

cd "$RRM_ROOT/trm"

# Official TRM Sudoku 1k pretrain_mlp_t_sudoku command, scaled to 4-GPU torchrun.
# The exit status is captured (rather than letting `set -e` kill the script)
# so that a failure is recorded in the queue log.
rc=0
WANDB_MODE=offline OMP_NUM_THREADS=8 \
  torchrun --standalone --nproc-per-node 4 pretrain.py \
  arch=trm \
  data_paths="[$RRM_ROOT/data/sudoku-extreme-1k-aug-1000]" \
  evaluators="[]" \
  epochs=50000 eval_interval=5000 \
  lr=1e-4 puzzle_emb_lr=1e-4 weight_decay=1.0 puzzle_emb_weight_decay=1.0 \
  arch.mlp_t=True arch.pos_encodings=none \
  arch.L_layers=2 \
  arch.H_cycles=3 arch.L_cycles=6 \
  +run_name=pretrain_mlp_t_sudoku ema=True \
  >"$TRM_LOG" 2>&1 || rc=$?

if ((rc != 0)); then
  echo "[$(date)] TRM training FAILED (exit $rc). log: $TRM_LOG" >&2
  exit "$rc"
fi

echo "[$(date)] TRM training finished. log: $TRM_LOG"
