#!/usr/bin/env bash
#
# Queue TRM Sudoku 1k 4-GPU training: wait for Step 3 to finish, then launch.
#
# Run via:
#   nohup bash scripts/queue_trm_sudoku.sh > runs/trm_queue.log 2>&1 &
#
# Environment:
#   RRM_ROOT  Project root holding runs/, trm/ and data/
#             (default: /home/yurenh2/rrm).
#
# Exit status:
#   0 when training completes successfully; otherwise the non-zero status of
#   the step that failed (for torchrun, its own exit code is propagated).

set -e

RRM_ROOT="${RRM_ROOT:-/home/yurenh2/rrm}"
LOG_DIR="$RRM_ROOT/runs"
mkdir -p "$LOG_DIR"
TRM_LOG="$LOG_DIR/trm_sudoku_$(date +%Y%m%d_%H%M%S).log"

# Without pgrep the wait loop below would end immediately (a missing command
# is just a false loop condition) and training would start right away, so
# fail loudly here instead.
if ! command -v pgrep >/dev/null; then
  echo "[$(date)] pgrep not found; cannot wait for step3_train processes" >&2
  exit 1
fi

echo "[$(date)] queued, waiting for step3_train processes to exit..."
# Poll once a minute until no process command line matches the Step 3 script.
while pgrep -f step3_train_with_rf >/dev/null; do
  sleep 60
done
echo "[$(date)] Step 3 done — launching TRM Sudoku 1k 4-GPU"

source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate rrm
cd "$RRM_ROOT/trm"

# Official TRM Sudoku 1k pretrain_mlp_t_sudoku command, scaled to 4-GPU torchrun.
# The exit status is captured with `|| status=$?` so that `set -e` does not
# abort the script before the outcome is written to the queue log.
status=0
WANDB_MODE=offline OMP_NUM_THREADS=8 \
  torchrun --standalone --nproc-per-node 4 pretrain.py \
    arch=trm \
    data_paths="[${RRM_ROOT}/data/sudoku-extreme-1k-aug-1000]" \
    evaluators="[]" \
    epochs=50000 eval_interval=5000 \
    lr=1e-4 puzzle_emb_lr=1e-4 weight_decay=1.0 puzzle_emb_weight_decay=1.0 \
    arch.mlp_t=True arch.pos_encodings=none \
    arch.L_layers=2 \
    arch.H_cycles=3 arch.L_cycles=6 \
    +run_name=pretrain_mlp_t_sudoku ema=True \
    > "$TRM_LOG" 2>&1 || status=$?

if (( status != 0 )); then
  echo "[$(date)] TRM training FAILED (exit $status). log: $TRM_LOG" >&2
  exit "$status"
fi

echo "[$(date)] TRM training finished. log: $TRM_LOG"