#!/usr/bin/env bash
# Portable TRM Maze-Hard launcher — run on any machine with the TRM repo + rrm conda env.
#
# Configure for the target machine via env vars (or edit the assignments below):
#   TRM_DIR   = (required) path to the TinyRecursiveModels repo clone (contains pretrain.py)
#   DATA_DIR  = (required) path to the maze-30x30-hard-1k dataset (from this bundle)
#   CONDA_SH  = (optional) path to conda.sh (default tries common locations)
#   CONDA_ENV = (optional) conda environment to activate (default: rrm)
#
# Usage: TRM_DIR=~/TinyRecursiveModels DATA_DIR=~/maze-30x30-hard-1k bash launch_maze_trm_portable.sh [NGPU] [GBS]
#   2x A6000 (48G):  ... bash launch_maze_trm_portable.sh 2 384
#   2x A5000 (24G):  ... bash launch_maze_trm_portable.sh 2 192   (-> 128 if OOM)
# Strict mode: stop on the first unhandled failure, including inside pipelines.
# NOTE(review): `-u` (nounset) is not enabled; this script sources conda.sh
# further down, so confirm conda's shell hooks tolerate nounset before adding it.
set -eo pipefail

# Required inputs; `:?` aborts with the quoted message when unset or empty.
TRM_DIR="${TRM_DIR:?set TRM_DIR to the TinyRecursiveModels repo path}"
DATA_DIR="${DATA_DIR:?set DATA_DIR to the maze-30x30-hard-1k dataset path}"

# The script changes into TRM_DIR before launching, which would silently
# re-anchor a relative DATA_DIR. When the relative path exists from the
# caller's directory, pin it to an absolute path now; otherwise leave it
# untouched so a path given relative to TRM_DIR keeps working as before.
if [[ "${DATA_DIR}" != /* && -e "${DATA_DIR}" ]]; then
  DATA_DIR="${PWD}/${DATA_DIR}"
fi

# Optional positional arguments: $1 = number of GPUs, $2 = global batch size.
NGPU="${1:-2}"
GBS="${2:-384}"

# NGPU later drives an integer comparison that picks the launcher; a malformed
# value would make that test fail and fall through to the single-GPU branch.
# Reject anything that is not a positive decimal integer up front.
if [[ ! "${NGPU}" =~ ^[1-9][0-9]*$ ]]; then
  echo "error: NGPU must be a positive integer (got '${NGPU}')" >&2
  exit 2
fi
if [[ ! "${GBS}" =~ ^[1-9][0-9]*$ ]]; then
  echo "error: GBS must be a positive integer (got '${GBS}')" >&2
  exit 2
fi

# Run name encodes the GPU count and batch size; it also names the log file.
RUN_NAME="pretrain_att_maze30x30_${NGPU}gpu_gbs${GBS}"

# conda
# Locate conda's shell hook and activate the training environment.
#   CONDA_SH  - explicit path to conda.sh; when unset or empty, the usual
#               install locations are probed.
#   CONDA_ENV - environment to activate (default: rrm).
# Activation remains best-effort (the launch proceeds without it), but every
# skipped activation is now reported on stderr instead of passing silently.

# Print the first existing conda.sh among the usual install locations.
# Prints nothing when none exists; always returns 0 so callers running under
# `set -e` are not aborted by a miss.
find_conda_sh() {
  local candidate
  for candidate in \
    "${HOME}/miniconda3/etc/profile.d/conda.sh" \
    "${HOME}/anaconda3/etc/profile.d/conda.sh" \
    "/opt/conda/etc/profile.d/conda.sh"; do
    if [[ -f "${candidate}" ]]; then
      printf '%s\n' "${candidate}"
      return 0
    fi
  done
  return 0
}

CONDA_SH="${CONDA_SH:-}"
if [[ -z "${CONDA_SH}" ]]; then
  CONDA_SH="$(find_conda_sh)"
fi

if [[ -z "${CONDA_SH}" ]]; then
  echo "warning: no conda.sh found in the default locations; not activating '${CONDA_ENV:-rrm}' (set CONDA_SH to override)" >&2
elif [[ ! -f "${CONDA_SH}" ]]; then
  echo "warning: CONDA_SH='${CONDA_SH}' is not a file; not activating '${CONDA_ENV:-rrm}'" >&2
elif ! source "${CONDA_SH}"; then
  # Sourcing inside the condition keeps errexit suppressed while conda.sh runs,
  # matching the original `&&` chain.
  echo "warning: failed to source '${CONDA_SH}'; not activating '${CONDA_ENV:-rrm}'" >&2
else
  # A failed activation still aborts the script under `set -e`, as before.
  conda activate "${CONDA_ENV:-rrm}"
fi

# Work from the repo directory; pretrain.py and the log file are addressed
# relative to it.
cd "${TRM_DIR}"

# Put Weights & Biases into offline mode for this run.
WANDB_MODE=offline
export WANDB_MODE

# Overrides handed to pretrain.py, assembled group by group.
ARGS=()
# Architecture, dataset location, and an empty evaluator list.
ARGS+=(arch=trm "data_paths=[${DATA_DIR}]" "evaluators=[]")
# Training length and evaluation cadence.
ARGS+=(epochs=50000 eval_interval=5000)
# Learning rates and weight decay for the model and the puzzle embeddings.
ARGS+=(lr=1e-4 puzzle_emb_lr=1e-4)
ARGS+=(weight_decay=1.0 puzzle_emb_weight_decay=1.0)
# Global batch size comes from the second positional argument.
ARGS+=("global_batch_size=${GBS}")
# Layer and cycle counts for the architecture.
ARGS+=(arch.L_layers=2 arch.H_cycles=3 arch.L_cycles=4)
# Run naming, EMA, and a checkpoint at every evaluation.
ARGS+=("+run_name=${RUN_NAME}" ema=True)
ARGS+=(+checkpoint_every_eval=true)
# Log file, relative to the current directory (the repo, after the cd above).
LOG="maze_${RUN_NAME}.log"

# Pre-flight checks. The job is started in the background with its output
# redirected, so a missing entry point or launcher would otherwise only show up
# in the log while this script still reports a successful launch.
if [[ ! -f pretrain.py ]]; then
  echo "error: pretrain.py not found in ${PWD} (check TRM_DIR)" >&2
  exit 1
fi

# Choose the launcher: torchrun for multi-GPU runs, plain python otherwise.
# NOTE(review): rendezvous endpoint port 0 is expected to let torchrun pick a
# free port, per the torchrun documentation; confirm for the installed version.
if [[ "${NGPU}" -gt 1 ]]; then
  LAUNCHER=(torchrun --nproc-per-node "${NGPU}" --rdzv_backend=c10d
            --rdzv_endpoint=localhost:0 --nnodes=1)
else
  LAUNCHER=(python)
fi

if ! command -v "${LAUNCHER[0]}" >/dev/null 2>&1; then
  echo "error: '${LAUNCHER[0]}' not found on PATH (is the conda env active?)" >&2
  exit 1
fi

# Detach from the terminal; stdout and stderr both go to the log file.
nohup "${LAUNCHER[@]}" pretrain.py "${ARGS[@]}" > "${LOG}" 2>&1 &

# `$!` is the PID of the background job started just above.
echo "launched ${RUN_NAME} (pid $!)"
echo "log:  ${TRM_DIR}/${LOG}"
echo "ckpts: ${TRM_DIR}/checkpoints/maze-30x30-hard-1k.../${RUN_NAME}/ (1 per 5000 epochs)"
echo "watch: tail -f ${TRM_DIR}/${LOG} | grep -E 'exact|accuracy'"
echo
echo "When done: rsync the run's checkpoint dir back to the lab box for the diagnostic pipeline,"
echo "or run diagnostics here (see TRANSFER_README.md, note the attention-arch + n=512 caveats)."