summary refs log tree commit diff
path: root/scripts/queue_trm_sudoku.sh
blob: fa447ffac4f3f870cd3f5555f6b6d283a1a6992a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/env bash
# Queue TRM Sudoku 1k 4-GPU training: wait for Step 3 to finish then launch.
# Run via: nohup bash scripts/queue_trm_sudoku.sh > runs/trm_queue.log 2>&1 &
# Abort the script as soon as any unguarded command fails.
set -e

# Directory that holds the run logs; created if it does not exist yet.
LOG_DIR=/home/yurenh2/rrm/runs
mkdir -p "$LOG_DIR"

# Per-run training log, named with the time this script was started.
# TRM_LOG is read again by the launch section further down.
stamp="$(date +%Y%m%d_%H%M%S)"
TRM_LOG="${LOG_DIR}/trm_sudoku_${stamp}.log"

echo "[$(date)] queued, waiting for step3_train processes to exit..."

# Poll once a minute; leave the loop as soon as pgrep finds no process whose
# full command line matches step3_train_with_rf.
until ! pgrep -f step3_train_with_rf >/dev/null; do
  sleep 60
done

echo "[$(date)] Step 3 done — launching TRM Sudoku 1k 4-GPU"

# Resolve the conda base in its own assignment: if `conda info` fails, errexit
# stops the script here instead of sourcing a path built from an empty prefix.
conda_base="$(conda info --base)"
# shellcheck source=/dev/null
source "${conda_base}/etc/profile.d/conda.sh"
conda activate rrm

cd /home/yurenh2/rrm/trm

# Official TRM Sudoku 1k pretrain_mlp_t_sudoku command, scaled to 4-GPU torchrun.
# All torchrun output (stdout and stderr) goes to $TRM_LOG.
# The exit status is captured with `|| trm_status=$?` so that errexit does not
# terminate the script before the outcome has been written to the queue log.
trm_status=0
WANDB_MODE=offline OMP_NUM_THREADS=8 \
  torchrun --standalone --nproc-per-node 4 pretrain.py \
    arch=trm \
    data_paths="[/home/yurenh2/rrm/data/sudoku-extreme-1k-aug-1000]" \
    evaluators="[]" \
    epochs=50000 eval_interval=5000 \
    lr=1e-4 puzzle_emb_lr=1e-4 weight_decay=1.0 puzzle_emb_weight_decay=1.0 \
    arch.mlp_t=True arch.pos_encodings=none \
    arch.L_layers=2 \
    arch.H_cycles=3 arch.L_cycles=6 \
    +run_name=pretrain_mlp_t_sudoku ema=True \
    > "$TRM_LOG" 2>&1 || trm_status=$?

# Report a failed run explicitly and propagate torchrun's exit code, so the
# queue log never ends without an outcome line.
if (( trm_status != 0 )); then
  echo "[$(date)] TRM training FAILED (exit ${trm_status}). log: $TRM_LOG" >&2
  exit "$trm_status"
fi

echo "[$(date)] TRM training finished. log: $TRM_LOG"