blob: 1c5807fb3167cc6d406581887bbae4642901ad58 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
#!/usr/bin/env bash
#
# launch_interfloss_queue.sh - queue a step7_interfloss.py run, optionally
# after another process has finished.
#
# Usage: launch_interfloss_queue.sh MODE [wait_pid]

# Stop at the first failing command and let a failing pipeline stage fail the
# whole pipeline. Note that -u (error on unset variables) is not enabled.
set -e
set -o pipefail

# Directory the runs are started from.
FLOSS_DIR='/home/yurenh2/rrm/research/flossing'

# Script that is sourced before `conda activate` is called.
CONDA_SH='/home/yurenh2/miniconda3/etc/profile.d/conda.sh'

# Checkpoint roots handed to --ckpt-root. The HRM path contains spaces, so it
# must always be expanded inside double quotes.
HRM_ROOT='/home/yurenh2/rrm/hrm/checkpoints/Sudoku-extreme-1k-aug-1000 ACT-torch/HierarchicalReasoningModel_ACTV1 righteous-python'
TRM_ROOT='/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_singleGPU'
#######################################
# Block until the process with the given PID is no longer running.
# Arguments:
#   $1 - PID to wait for; 0 (or empty/unset) means there is nothing to wait for.
#   $2 - optional polling interval handed to `sleep` (default: 60).
# Outputs:
#   Writes a diagnostic to stderr when $1 is not a non-negative integer.
# Returns:
#   0 once the process is gone or no wait was requested, 2 on an invalid PID.
#######################################
wait_for_pid() {
  local pid="${1:-0}"
  local poll_interval="${2:-60}"

  # Accept plain digits only. A negative value would make `kill` address a
  # process group (or every process for -1), so the loop below would never
  # end; a typo would make `kill` fail and be mistaken for "already finished".
  if [[ ! "${pid}" =~ ^[0-9]+$ ]]; then
    printf 'wait_for_pid: invalid pid: %s\n' "${pid}" >&2
    return 2
  fi

  # Force base 10 so that "00" is recognised as 0 instead of reaching `kill`,
  # where PID 0 means "the caller's own process group".
  if (( 10#${pid} == 0 )); then
    return 0
  fi

  # `kill -0` delivers no signal; it only reports whether the PID can be
  # signalled. It also fails when permission is denied, which this loop
  # treats the same as "process gone".
  while kill -0 "${pid}" 2>/dev/null; do
    sleep "${poll_interval}"
  done
}
#######################################
# Prepare the current shell for a run: source the conda shell script, activate
# the "rrm" environment, then change into the experiment directory.
# Globals:
#   CONDA_SH  (read) - script sourced before `conda activate`; presumably it
#                      defines the `conda` shell function (confirm).
#   FLOSS_DIR (read) - directory that becomes the working directory.
# Arguments:
#   None
#######################################
activate_env() {
  # Sourced rather than executed so its definitions land in this shell.
  . "${CONDA_SH}"

  conda activate rrm

  # Later relative paths (script, .json and .log files) resolve from here.
  cd "${FLOSS_DIR}"
}
#######################################
# Run step7_interfloss.py for the HRM checkpoint step_26040 on GPU 0.
# Waits for an optional predecessor process first, then activates the
# environment and starts the run with stdout and stderr captured in a log.
# Globals:
#   HRM_ROOT (read) - value passed to --ckpt-root
# Arguments:
#   $1 - PID to wait for before starting (default: 0, i.e. start immediately)
# Outputs:
#   step7_A_hrm_engelken_interfloss_26040_k8_10k.log in the working directory
#   (truncated on every call); the .json name is passed to --out.
#######################################
run_hrm_engelken() {
  wait_for_pid "${1:-0}"
  activate_env

  # The result file and the log share one base name.
  local -r run_tag="step7_A_hrm_engelken_interfloss_26040_k8_10k"

  local -a py_args=(
    step7_interfloss.py
    --model hrm
    --ckpt-root "${HRM_ROOT}"
    --ckpt-name step_26040
    --train-steps 10000
    --batch-size 8
    --train-lr 1e-5
    --floss-lr 1e-4
    --floss-steps 500
    --interfloss-at 0,500
    --floss-mode engelken_l2
    --lambda-star 0
    --k-lyap 8
    --lyap-act-steps 4
    --seed 42
    --eval-every 1000
    --eval-n 512
    --eval-batch-size 32
    --floss-log-every 10
    --out "${run_tag}.json"
  )

  CUDA_VISIBLE_DEVICES=0 python "${py_args[@]}" > "${run_tag}.log" 2>&1
}
#######################################
# Run step7_interfloss.py for the TRM checkpoint step_26041 on GPU 2.
# Waits for an optional predecessor process first, then activates the
# environment and starts the run with stdout and stderr captured in a log.
# Globals:
#   TRM_ROOT (read) - value passed to --ckpt-root
# Arguments:
#   $1 - PID to wait for before starting (default: 0, i.e. start immediately)
# Outputs:
#   step7_B_trm_engelken_interfloss_26041_k4_batch4_10k.log in the working
#   directory (truncated on every call); the .json name is passed to --out.
#######################################
run_trm_engelken() {
  wait_for_pid "${1:-0}"
  activate_env

  # The result file and the log share one base name.
  local -r run_tag="step7_B_trm_engelken_interfloss_26041_k4_batch4_10k"

  local -a py_args=(
    step7_interfloss.py
    --model trm
    --ckpt-root "${TRM_ROOT}"
    --ckpt-name step_26041
    --train-steps 10000
    --batch-size 4
    --train-lr 1e-5
    --floss-lr 1e-4
    --floss-steps 500
    --interfloss-at 0,500
    --floss-mode engelken_l2
    --lambda-star 0
    --k-lyap 4
    --lyap-act-steps 4
    --seed 42
    --eval-every 1000
    --eval-n 512
    --eval-batch-size 32
    --floss-log-every 10
    --out "${run_tag}.json"
  )

  CUDA_VISIBLE_DEVICES=2 python "${py_args[@]}" > "${run_tag}.log" 2>&1
}
# Entry point: the first argument selects the run and is mandatory (the
# expansion below aborts with the usage text when it is missing); the second
# is an optional PID to wait for, where 0 means "start right away".
cmd="${1:?usage: launch_interfloss_queue.sh MODE [wait_pid]}"
wait_pid="${2:-0}"

if [[ "${cmd}" == "hrm_engelken" ]]; then
  run_hrm_engelken "${wait_pid}"
elif [[ "${cmd}" == "trm_engelken" ]]; then
  run_trm_engelken "${wait_pid}"
else
  # Anything else is a usage error: report on stderr and exit with status 2.
  echo "unknown command: ${cmd}" >&2
  exit 2
fi
|