summaryrefslogtreecommitdiff
path: root/research/flossing/maze_package/launch_maze_trm_portable.sh
diff options
context:
space:
mode:
Diffstat (limited to 'research/flossing/maze_package/launch_maze_trm_portable.sh')
-rwxr-xr-x research/flossing/maze_package/launch_maze_trm_portable.sh 58
1 files changed, 58 insertions, 0 deletions
diff --git a/research/flossing/maze_package/launch_maze_trm_portable.sh b/research/flossing/maze_package/launch_maze_trm_portable.sh
new file mode 100755
index 0000000..d801ceb
--- /dev/null
+++ b/research/flossing/maze_package/launch_maze_trm_portable.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Portable TRM Maze-Hard launcher — run on any machine with the TRM repo + rrm conda env.
+#
+# Set these paths for the target machine (env vars or edit here); the first two are required:
+# TRM_DIR = path to the TinyRecursiveModels repo clone (contains pretrain.py)
+# DATA_DIR = path to the maze-30x30-hard-1k dataset (from this bundle)
+# CONDA_SH = optional path to conda.sh (default tries common locations); CONDA_ENV = conda env name (default rrm)
+#
+# Usage: TRM_DIR=~/TinyRecursiveModels DATA_DIR=~/maze-30x30-hard-1k bash launch_maze_trm_portable.sh [NGPU] [GBS]
+# 2x A6000 (48G): ... bash launch_maze_trm_portable.sh 2 384
+# 2x A5000 (24G): ... bash launch_maze_trm_portable.sh 2 192 (-> 128 if OOM)
+set -eo pipefail  # abort on an unhandled command failure; a pipeline fails if any stage fails
+
+: "${TRM_DIR:?set TRM_DIR to the TinyRecursiveModels repo path}"  # required: aborts if unset or empty
+: "${DATA_DIR:?set DATA_DIR to the maze-30x30-hard-1k dataset path}"  # required: aborts if unset or empty
+NGPU="${1:-2}" GBS="${2:-384}"  # optional positionals: worker count (default 2), global batch size (default 384)
+# Both values are embedded in the run name, which is reused for the log file name and passed as +run_name.
+RUN_NAME="pretrain_att_maze30x30_${NGPU}gpu_gbs${GBS}"
+
+# conda: honour CONDA_SH when given, else take the first conda.sh found in the usual install locations.
+CONDA_SH="${CONDA_SH:-}"
+for p in "$HOME/miniconda3/etc/profile.d/conda.sh" "$HOME/anaconda3/etc/profile.d/conda.sh" \
+  "/opt/conda/etc/profile.d/conda.sh"; do [[ -z "${CONDA_SH}" && -f "$p" ]] && CONDA_SH="$p"; done
+if [[ -f "${CONDA_SH}" ]]; then
+  source "${CONDA_SH}" && conda activate "${CONDA_ENV:-rrm}"
+else  # best effort: keep going, but say so, since the run then uses whatever python is on PATH
+  echo "warning: conda.sh not found (CONDA_SH='${CONDA_SH}'); using python/torchrun from PATH" >&2
+fi
+# A relative DATA_DIR that exists here is pinned to the caller's cwd; after cd it would resolve under TRM_DIR.
+[[ "${DATA_DIR}" == /* || ! -e "${DATA_DIR}" ]] || DATA_DIR="${PWD}/${DATA_DIR}"
+cd "${TRM_DIR}"
+export WANDB_MODE=offline  # Weights & Biases offline mode: run data is written locally, not uploaded
+LOG="maze_${RUN_NAME}.log"  # relative path, so the log lands in TRM_DIR (the cwd at launch)
+# key=value overrides handed to pretrain.py; an array keeps each override a single word when expanded.
+ARGS=(
+  arch=trm "data_paths=[${DATA_DIR}]" "evaluators=[]"
+  epochs=50000 eval_interval=5000
+  lr=1e-4 puzzle_emb_lr=1e-4
+  weight_decay=1.0 puzzle_emb_weight_decay=1.0
+  global_batch_size="${GBS}"
+  arch.L_layers=2 arch.H_cycles=3 arch.L_cycles=4
+  +run_name="${RUN_NAME}" ema=True
+  +checkpoint_every_eval=true
+)
+
+# NGPU must be numeric: an arithmetic test on a malformed value fails and would silently pick the 1-process launch.
+[[ "${NGPU}" =~ ^[0-9]+$ ]] || { echo "NGPU must be a non-negative integer, got '${NGPU}'" >&2; exit 2; }
+LAUNCH=(python pretrain.py)  # default: single process; torchrun with NGPU workers on one node when NGPU > 1
+(( 10#${NGPU} > 1 )) && LAUNCH=(torchrun --nproc-per-node "${NGPU}" --rdzv_backend=c10d \
+  --rdzv_endpoint=localhost:0 --nnodes=1 pretrain.py)
+nohup "${LAUNCH[@]}" "${ARGS[@]}" > "${LOG}" 2>&1 &  # detached from this shell; stdout+stderr go to LOG
+# Tell the user where the detached run lives; $! is the PID of the most recent background job.
+printf '%s\n' \
+  "launched ${RUN_NAME} (pid $!)" "log: ${TRM_DIR}/${LOG}" \
+  "ckpts: ${TRM_DIR}/checkpoints/maze-30x30-hard-1k.../${RUN_NAME}/ (1 per 5000 epochs)" \
+  "watch: tail -f ${TRM_DIR}/${LOG} | grep -E 'exact|accuracy'" "" \
+  "When done: rsync the run's checkpoint dir back to the lab box for the diagnostic pipeline," \
+  "or run diagnostics here (see TRANSFER_README.md, note the attention-arch + n=512 caveats)."