#!/usr/bin/env bash
# Portable TRM Maze-Hard launcher — run on any machine with the TRM repo + rrm conda env.
#
# Required environment variables:
#   TRM_DIR   = path to the TinyRecursiveModels repo clone (contains pretrain.py)
#   DATA_DIR  = path to the maze-30x30-hard-1k dataset (from this bundle);
#               a relative path is resolved from TRM_DIR, because the script
#               changes into TRM_DIR before launching
# Optional environment variables:
#   CONDA_SH  = path to conda.sh (default tries common locations)
#   CONDA_ENV = name of the conda environment to activate (default: rrm)
#
# Positional arguments:
#   NGPU = number of worker processes (default 2); values > 1 launch via torchrun,
#          otherwise plain `python pretrain.py` is used
#   GBS  = global batch size passed to pretrain.py (default 384)
#
# Usage: TRM_DIR=~/TinyRecursiveModels DATA_DIR=~/maze-30x30-hard-1k bash launch_maze_trm_portable.sh [NGPU] [GBS]
#   2x A6000 (48G): ... bash launch_maze_trm_portable.sh 2 384
#   2x A5000 (24G): ... bash launch_maze_trm_portable.sh 2 192   (-> 128 if OOM)
#
# The training job is started in the background under nohup; this script only
# prints where to find the log and checkpoints and then returns.

# nounset (-u) is left off, as in the original: the sourced conda.sh and the
# environment's activation hooks are third-party code that may not be
# nounset-clean (assumption — enable -u only after confirming on the target box).
set -eo pipefail

# Print an error to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Print a non-fatal warning to stderr.
warn() {
  printf 'warning: %s\n' "$*" >&2
}

TRM_DIR="${TRM_DIR:?set TRM_DIR to the TinyRecursiveModels repo path}"
DATA_DIR="${DATA_DIR:?set DATA_DIR to the maze-30x30-hard-1k dataset path}"
NGPU="${1:-2}"
GBS="${2:-384}"

# Validate before use: [[ x -gt y ]] evaluates its operands as arithmetic
# expressions, so an unchecked value could be misread (or even executed).
[[ "${NGPU}" =~ ^[0-9]+$ ]] || die "NGPU must be a non-negative integer, got '${NGPU}'"
[[ "${GBS}" =~ ^[0-9]+$ ]] || die "GBS must be a non-negative integer, got '${GBS}'"

RUN_NAME="pretrain_att_maze30x30_${NGPU}gpu_gbs${GBS}"

# Single source of truth for the eval cadence; also used in the status message.
readonly EVAL_INTERVAL=5000

# conda: use CONDA_SH if given, else probe the usual install locations.
CONDA_SH="${CONDA_SH:-}"
if [[ -z "${CONDA_SH}" ]]; then
  for candidate in \
    "$HOME/miniconda3/etc/profile.d/conda.sh" \
    "$HOME/anaconda3/etc/profile.d/conda.sh" \
    "/opt/conda/etc/profile.d/conda.sh"; do
    if [[ -f "${candidate}" ]]; then
      CONDA_SH="${candidate}"
      break
    fi
  done
fi

if [[ -f "${CONDA_SH}" ]]; then
  # shellcheck disable=SC1090  # path is only known at runtime
  source "${CONDA_SH}"
  conda activate "${CONDA_ENV:-rrm}"
else
  # Best-effort, as before: keep going with whatever python/torchrun is on
  # PATH, but say so instead of skipping activation silently.
  warn "conda.sh not found (CONDA_SH='${CONDA_SH}'); continuing with the current environment"
fi

cd "${TRM_DIR}" || die "cannot cd to TRM_DIR='${TRM_DIR}'"

# Fail fast here: the job is backgrounded, so a missing entry point would
# otherwise only show up in the log after "launched" was already printed.
[[ -f pretrain.py ]] || die "pretrain.py not found in '${TRM_DIR}'"
[[ -e "${DATA_DIR}" ]] \
  || warn "DATA_DIR='${DATA_DIR}' does not exist (relative paths are resolved from '${TRM_DIR}')"

export WANDB_MODE=offline

# key=value overrides handed to pretrain.py unchanged.
ARGS=(
  arch=trm
  "data_paths=[${DATA_DIR}]"
  "evaluators=[]"
  epochs=50000
  eval_interval="${EVAL_INTERVAL}"
  lr=1e-4
  puzzle_emb_lr=1e-4
  weight_decay=1.0
  puzzle_emb_weight_decay=1.0
  global_batch_size="${GBS}"
  arch.L_layers=2
  arch.H_cycles=3
  arch.L_cycles=4
  +run_name="${RUN_NAME}"
  ema=True
  +checkpoint_every_eval=true
)

LOG="maze_${RUN_NAME}.log"

# 10# forces base-10 so a zero-padded value such as "08" is not read as octal.
if (( 10#${NGPU} > 1 )); then
  # localhost:0 with the c10d backend lets torchrun pick a free rendezvous port.
  nohup torchrun --nproc-per-node "${NGPU}" --rdzv_backend=c10d --rdzv_endpoint=localhost:0 \
    --nnodes=1 pretrain.py "${ARGS[@]}" > "${LOG}" 2>&1 &
else
  nohup python pretrain.py "${ARGS[@]}" > "${LOG}" 2>&1 &
fi
job_pid=$!

echo "launched ${RUN_NAME} (pid ${job_pid})"
echo "log: ${TRM_DIR}/${LOG}"
echo "ckpts: ${TRM_DIR}/checkpoints/maze-30x30-hard-1k.../${RUN_NAME}/ (1 per ${EVAL_INTERVAL} epochs)"
echo "watch: tail -f ${TRM_DIR}/${LOG} | grep -E 'exact|accuracy'"
echo
echo "When done: rsync the run's checkpoint dir back to the lab box for the diagnostic pipeline,"
echo "or run diagnostics here (see TRANSFER_README.md, note the attention-arch + n=512 caveats)."