#!/usr/bin/env bash
# path: root/research/flossing/launch_interfloss_queue.sh
# blob: 1c5807fb3167cc6d406581887bbae4642901ad58 (plain)
#
# Usage: launch_interfloss_queue.sh MODE [wait_pid]
#   MODE      hrm_engelken | trm_engelken
#   wait_pid  optional PID to wait on before starting (default 0 = start now)
set -eo pipefail

# Directory that activate_env changes into; step7_interfloss.py and its
# relative --out / log paths are resolved from here.
FLOSS_DIR='/home/yurenh2/rrm/research/flossing'
# Conda shell-integration script sourced before `conda activate`.
CONDA_SH='/home/yurenh2/miniconda3/etc/profile.d/conda.sh'
# Checkpoint roots handed to --ckpt-root. HRM_ROOT contains spaces, so every
# expansion of it must stay double-quoted.
HRM_ROOT='/home/yurenh2/rrm/hrm/checkpoints/Sudoku-extreme-1k-aug-1000 ACT-torch/HierarchicalReasoningModel_ACTV1 righteous-python'
TRM_ROOT='/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_singleGPU'

#######################################
# Block until the process with the given PID can no longer be signalled.
# Arguments:
#   $1 - PID to wait on; empty or 0 means "nothing to wait for".
# Outputs:
#   A diagnostic on stderr when $1 is not a plain decimal number.
# Returns:
#   0 once the process is gone (or no wait was requested), 2 on an invalid PID.
#######################################
wait_for_pid() {
  local pid="${1:-0}"

  # Accept only plain decimal PIDs. A typo would make `kill -0` fail at once
  # (so the queued job would start immediately), and a negative value would
  # address a process group instead of a single process.
  if [[ ! "${pid}" =~ ^[0-9]+$ ]]; then
    printf 'wait_for_pid: invalid pid: %s\n' "${pid}" >&2
    return 2
  fi

  # Any spelling of zero ("0", "00", ...) means "do not wait"; passing it to
  # kill would target the caller's own process group and never fail.
  if [[ "${pid}" =~ ^0+$ ]]; then
    return 0
  fi

  # `kill -0` delivers no signal; it only reports whether the PID can be
  # signalled. Its stderr is discarded because failure is the expected way
  # out of this loop.
  # NOTE: it also fails for a live process owned by another user, so this
  # only works for PIDs the caller is allowed to signal.
  while kill -0 "${pid}" 2>/dev/null; do
    sleep 60
  done
}

#######################################
# Load conda's shell integration, activate the `rrm` environment and change
# into the flossing working directory.
# Globals:
#   CONDA_SH  (read) - conda shell-integration script to source
#   FLOSS_DIR (read) - directory to change into
# Outputs:
#   A diagnostic on stderr naming the step that failed.
# Returns:
#   0 on success, 1 as soon as any step fails.
#######################################
activate_env() {
  # Every step is checked explicitly rather than left to the caller's
  # `set -e`: bash ignores errexit while a function runs inside a conditional
  # (`if activate_env`, `activate_env || ...`), which would otherwise let the
  # job continue in the wrong environment or directory.
  # shellcheck source=/dev/null
  source "${CONDA_SH}" || {
    printf 'activate_env: failed to source %s\n' "${CONDA_SH}" >&2
    return 1
  }
  conda activate rrm || {
    printf 'activate_env: conda activate rrm failed\n' >&2
    return 1
  }
  cd "${FLOSS_DIR}" || {
    printf 'activate_env: cannot cd to %s\n' "${FLOSS_DIR}" >&2
    return 1
  }
}

#######################################
# Run the HRM engelken_l2 interfloss job (checkpoint step_26040, k-lyap 8).
# Globals:
#   HRM_ROOT (read) - passed as --ckpt-root
# Arguments:
#   $1 - optional PID to wait on first (defaults to 0, i.e. no wait)
# Outputs:
#   Python's stdout and stderr go to
#   step7_A_hrm_engelken_interfloss_26040_k8_10k.log, relative to the
#   directory activate_env leaves the shell in.
#######################################
run_hrm_engelken() {
  wait_for_pid "${1:-0}"
  activate_env

  # Shared stem for the JSON result (--out) and the log file.
  local run_stem="step7_A_hrm_engelken_interfloss_26040_k8_10k"

  # One option per line so individual hyper-parameters are easy to diff.
  local -a interfloss_argv=(
    step7_interfloss.py
    --model hrm
    --ckpt-root "${HRM_ROOT}"
    --ckpt-name step_26040
    --train-steps 10000
    --batch-size 8
    --train-lr 1e-5
    --floss-lr 1e-4
    --floss-steps 500
    --interfloss-at 0,500
    --floss-mode engelken_l2
    --lambda-star 0
    --k-lyap 8
    --lyap-act-steps 4
    --seed 42
    --eval-every 1000
    --eval-n 512
    --eval-batch-size 32
    --floss-log-every 10
    --out "${run_stem}.json"
  )

  # Only CUDA device 0 is exposed to this run.
  CUDA_VISIBLE_DEVICES=0 python "${interfloss_argv[@]}" \
    > "${run_stem}.log" 2>&1
}

#######################################
# Run the TRM engelken_l2 interfloss job (checkpoint step_26041, k-lyap 4,
# batch size 4).
# Globals:
#   TRM_ROOT (read) - passed as --ckpt-root
# Arguments:
#   $1 - optional PID to wait on first (defaults to 0, i.e. no wait)
# Outputs:
#   Python's stdout and stderr go to
#   step7_B_trm_engelken_interfloss_26041_k4_batch4_10k.log, relative to the
#   directory activate_env leaves the shell in.
#######################################
run_trm_engelken() {
  wait_for_pid "${1:-0}"
  activate_env

  # Shared stem for the JSON result (--out) and the log file.
  local run_stem="step7_B_trm_engelken_interfloss_26041_k4_batch4_10k"

  # One option per line so individual hyper-parameters are easy to diff.
  local -a interfloss_argv=(
    step7_interfloss.py
    --model trm
    --ckpt-root "${TRM_ROOT}"
    --ckpt-name step_26041
    --train-steps 10000
    --batch-size 4
    --train-lr 1e-5
    --floss-lr 1e-4
    --floss-steps 500
    --interfloss-at 0,500
    --floss-mode engelken_l2
    --lambda-star 0
    --k-lyap 4
    --lyap-act-steps 4
    --seed 42
    --eval-every 1000
    --eval-n 512
    --eval-batch-size 32
    --floss-log-every 10
    --out "${run_stem}.json"
  )

  # Only CUDA device 2 is exposed to this run.
  CUDA_VISIBLE_DEVICES=2 python "${interfloss_argv[@]}" \
    > "${run_stem}.log" 2>&1
}

# Entry point. The first argument selects the run and is mandatory (the
# script aborts with the usage text when it is missing); the optional second
# argument is a PID to wait on before launching, defaulting to 0 (no wait).
cmd="${1:?usage: launch_interfloss_queue.sh MODE [wait_pid]}"
wait_pid="${2:-0}"

if [[ "${cmd}" == "hrm_engelken" ]]; then
  run_hrm_engelken "${wait_pid}"
elif [[ "${cmd}" == "trm_engelken" ]]; then
  run_trm_engelken "${wait_pid}"
else
  # Unrecognised mode: report on stderr and exit with status 2.
  echo "unknown command: ${cmd}" >&2
  exit 2
fi