summaryrefslogtreecommitdiff
path: root/research/flossing/launch_10k_queue.sh
blob: 8376ead9a8341297dbb86f1d787a48261b91a7ee (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env bash
# Abort on the first failing command and on a failure in any pipeline stage.
# NOTE(review): -u is not enabled; presumably the sourced conda scripts may
# read unset variables -- confirm before tightening.
set -eo pipefail

# Fixed locations shared by every job below. They are marked readonly so that
# nothing executed later in this shell (including sourced scripts) can
# silently repoint a run at a different directory or checkpoint.
# HRM_ROOT contains spaces, so every expansion of it must stay quoted.
readonly FLOSS_DIR="/home/yurenh2/rrm/research/flossing"
readonly CONDA_SH="/home/yurenh2/miniconda3/etc/profile.d/conda.sh"
readonly HRM_ROOT="/home/yurenh2/rrm/hrm/checkpoints/Sudoku-extreme-1k-aug-1000 ACT-torch/HierarchicalReasoningModel_ACTV1 righteous-python"
readonly TRM_ROOT="/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_singleGPU"

#######################################
# Block until the process with the given PID can no longer be signalled.
# Arguments:
#   $1 - PID to wait for; zero means "nothing to wait for".
#   $2 - optional polling interval handed to sleep (default: 60).
# Outputs:
#   Writes a diagnostic to stderr when $1 is not a non-negative integer.
# Returns:
#   0 once kill -0 fails for the PID (or the PID is zero), 2 on an invalid PID.
#######################################
wait_for_pid() {
  local pid="$1"
  local poll_interval="${2:-60}"

  # Accept only plain non-negative integers. A typo or empty value would make
  # kill fail immediately and skip the wait without notice, while a negative
  # value addresses a process group (or, for -1, every process) and would keep
  # kill -0 succeeding indefinitely.
  if [[ ! "${pid}" =~ ^[0-9]+$ ]]; then
    printf 'wait_for_pid: invalid pid: %s\n' "${pid}" >&2
    return 2
  fi

  # Every spelling of zero ("0", "00", ...) is the no-wait sentinel;
  # kill -0 0 targets the caller's own process group and never fails.
  if (( 10#${pid} == 0 )); then
    return 0
  fi

  # kill -0 delivers no signal; it only reports whether the PID is signalable.
  while kill -0 "${pid}" 2>/dev/null; do
    sleep "${poll_interval}"
  done
}

#######################################
# Run step3_train_with_rf.py on the HRM checkpoint step_26040 with
# --alpha-rf 0 and --k-lyap 0, pinned to CUDA device 0.
# Globals:
#   CONDA_SH, FLOSS_DIR, HRM_ROOT (read)
# Arguments:
#   $1 - optional PID to wait for before starting (default 0 = start at once).
# Outputs:
#   stdout and stderr of the python process go to
#   step3_L_baseline_26040_fast_10k.log inside FLOSS_DIR.
# Returns:
#   The exit status of the python process.
#######################################
run_hrm_baseline() {
  local blocker_pid="${1:-0}"
  wait_for_pid "${blocker_pid}"

  # Activate the "rrm" conda environment, then work from the script directory
  # so the relative script, result and log paths resolve there.
  source "${CONDA_SH}"
  conda activate rrm
  cd "${FLOSS_DIR}"

  local -a train_args=(
    --ckpt-root "${HRM_ROOT}"
    --ckpt-name step_26040
    --n-steps 10000
    --batch-size 8
    --lr 1e-5
    --alpha-rf 0
    --lambda-star 0
    --rf-mode volume_cf
    --k-lyap 0
    --lyap-act-steps 4
    --seed 42
    --eval-every 1000
    --eval-n 512
    --eval-batch-size 32
    --out step3_L_baseline_26040_fast_10k.json
  )

  CUDA_VISIBLE_DEVICES=0 python step3_train_with_rf.py "${train_args[@]}" \
    > step3_L_baseline_26040_fast_10k.log 2>&1
}

#######################################
# Run step3_train_with_rf.py on the HRM checkpoint step_26040 with
# --alpha-rf 10, --lambda-star -0.15 and --k-lyap 8, pinned to CUDA device 1.
# Globals:
#   CONDA_SH, FLOSS_DIR, HRM_ROOT (read)
# Arguments:
#   $1 - optional PID to wait for before starting (default 0 = start at once).
# Outputs:
#   stdout and stderr of the python process go to
#   step3_M_volume_cf_26040_lstar_neg015_k8_a10_10k.log inside FLOSS_DIR.
# Returns:
#   The exit status of the python process.
#######################################
run_hrm_volume() {
  local blocker_pid="${1:-0}"
  wait_for_pid "${blocker_pid}"

  # Activate the "rrm" conda environment, then work from the script directory
  # so the relative script, result and log paths resolve there.
  source "${CONDA_SH}"
  conda activate rrm
  cd "${FLOSS_DIR}"

  local -a train_args=(
    --ckpt-root "${HRM_ROOT}"
    --ckpt-name step_26040
    --n-steps 10000
    --batch-size 8
    --lr 1e-5
    --alpha-rf 10
    --lambda-star -0.15
    --rf-mode volume_cf
    --k-lyap 8
    --lyap-act-steps 4
    --seed 42
    --eval-every 1000
    --eval-n 512
    --eval-batch-size 32
    --out step3_M_volume_cf_26040_lstar_neg015_k8_a10_10k.json
  )

  CUDA_VISIBLE_DEVICES=1 python step3_train_with_rf.py "${train_args[@]}" \
    > step3_M_volume_cf_26040_lstar_neg015_k8_a10_10k.log 2>&1
}

#######################################
# Run step5_train_trm_cf.py on the TRM checkpoint step_26041 with
# --alpha-rf 0 and --k-lyap 0, pinned to CUDA device 2.
# Globals:
#   CONDA_SH, FLOSS_DIR, TRM_ROOT (read)
# Arguments:
#   $1 - optional PID to wait for before starting (default 0 = start at once).
# Outputs:
#   stdout and stderr of the python process go to
#   step5_L_trm_baseline_26041_batch4_fast_10k.log inside FLOSS_DIR.
# Returns:
#   The exit status of the python process.
#######################################
run_trm_baseline() {
  local blocker_pid="${1:-0}"
  wait_for_pid "${blocker_pid}"

  # Activate the "rrm" conda environment, then work from the script directory
  # so the relative script, result and log paths resolve there.
  source "${CONDA_SH}"
  conda activate rrm
  cd "${FLOSS_DIR}"

  local -a train_args=(
    --ckpt-root "${TRM_ROOT}"
    --ckpt-name step_26041
    --n-steps 10000
    --batch-size 4
    --lr 1e-5
    --alpha-rf 0
    --lambda-star 0.02
    --rf-mode volume_cf
    --k-lyap 0
    --lyap-act-steps 4
    --seed 42
    --eval-every 1000
    --eval-n 512
    --eval-batch-size 32
    --out step5_L_trm_baseline_26041_batch4_fast_10k.json
  )

  CUDA_VISIBLE_DEVICES=2 python step5_train_trm_cf.py "${train_args[@]}" \
    > step5_L_trm_baseline_26041_batch4_fast_10k.log 2>&1
}

#######################################
# Run step5_train_trm_cf.py on the TRM checkpoint step_26041 with
# --alpha-rf 10, --lambda-star 0.02 and --k-lyap 4, pinned to CUDA device 3.
# Globals:
#   CONDA_SH, FLOSS_DIR, TRM_ROOT (read)
# Arguments:
#   $1 - optional PID to wait for before starting (default 0 = start at once).
# Outputs:
#   stdout and stderr of the python process go to
#   step5_M_trm_volume_cf_26041_lstar002_batch4_k4_a10_10k.log inside FLOSS_DIR.
# Returns:
#   The exit status of the python process.
#######################################
run_trm_volume() {
  local blocker_pid="${1:-0}"
  wait_for_pid "${blocker_pid}"

  # Activate the "rrm" conda environment, then work from the script directory
  # so the relative script, result and log paths resolve there.
  source "${CONDA_SH}"
  conda activate rrm
  cd "${FLOSS_DIR}"

  local -a train_args=(
    --ckpt-root "${TRM_ROOT}"
    --ckpt-name step_26041
    --n-steps 10000
    --batch-size 4
    --lr 1e-5
    --alpha-rf 10
    --lambda-star 0.02
    --rf-mode volume_cf
    --k-lyap 4
    --lyap-act-steps 4
    --seed 42
    --eval-every 1000
    --eval-n 512
    --eval-batch-size 32
    --out step5_M_trm_volume_cf_26041_lstar002_batch4_k4_a10_10k.json
  )

  CUDA_VISIBLE_DEVICES=3 python step5_train_trm_cf.py "${train_args[@]}" \
    > step5_M_trm_volume_cf_26041_lstar002_batch4_k4_a10_10k.log 2>&1
}

# Entry point. The first argument selects the job to run; the optional second
# argument is a PID that must exit before the job starts (0 = start at once).
cmd="${1:?usage: launch_10k_queue.sh MODE [wait_pid]}"
wait_pid="${2:-0}"

case "${cmd}" in
  hrm_baseline | hrm_volume | trm_baseline | trm_volume)
    # Each accepted mode maps onto the run_<mode> function of the same name.
    "run_${cmd}" "${wait_pid}"
    ;;
  *)
    echo "unknown command: ${cmd}" >&2
    exit 2
    ;;
esac