summary refs log tree commit diff
path: root/research/flossing/launch_10k_queue.sh
diff options
context:
space:
mode:
Diffstat (limited to 'research/flossing/launch_10k_queue.sh')
-rwxr-xr-x research/flossing/launch_10k_queue.sh 124
1 files changed, 124 insertions, 0 deletions
diff --git a/research/flossing/launch_10k_queue.sh b/research/flossing/launch_10k_queue.sh
new file mode 100755
index 0000000..8376ead
--- /dev/null
+++ b/research/flossing/launch_10k_queue.sh
@@ -0,0 +1,124 @@
+#!/usr/bin/env bash
+set -eo pipefail
+# Fixed locations shared by every job. HRM_ROOT contains spaces: keep expansions quoted.
+readonly FLOSS_DIR="/home/yurenh2/rrm/research/flossing"
+readonly CONDA_SH="/home/yurenh2/miniconda3/etc/profile.d/conda.sh"
+readonly HRM_ROOT="/home/yurenh2/rrm/hrm/checkpoints/Sudoku-extreme-1k-aug-1000 ACT-torch/HierarchicalReasoningModel_ACTV1 righteous-python"
+readonly TRM_ROOT="/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_singleGPU"
+
+wait_for_pid() {
+ local pid="$1"
+ if [[ "${pid}" == "0" ]]; then
+ return 0
+ fi
+ while kill -0 "${pid}" 2>/dev/null; do
+ sleep 60
+ done
+}
+
+run_hrm_baseline() {
+ wait_for_pid "${1:-0}"
+ source "${CONDA_SH}"
+ conda activate rrm
+ cd "${FLOSS_DIR}"
+ CUDA_VISIBLE_DEVICES=0 python step3_train_with_rf.py \
+ --ckpt-root "${HRM_ROOT}" \
+ --ckpt-name step_26040 \
+ --n-steps 10000 \
+ --batch-size 8 \
+ --lr 1e-5 \
+ --alpha-rf 0 \
+ --lambda-star 0 \
+ --rf-mode volume_cf \
+ --k-lyap 0 \
+ --lyap-act-steps 4 \
+ --seed 42 \
+ --eval-every 1000 \
+ --eval-n 512 \
+ --eval-batch-size 32 \
+ --out step3_L_baseline_26040_fast_10k.json \
+ > step3_L_baseline_26040_fast_10k.log 2>&1
+}
+
+run_hrm_volume() {
+ wait_for_pid "${1:-0}"
+ source "${CONDA_SH}"
+ conda activate rrm
+ cd "${FLOSS_DIR}"
+ CUDA_VISIBLE_DEVICES=1 python step3_train_with_rf.py \
+ --ckpt-root "${HRM_ROOT}" \
+ --ckpt-name step_26040 \
+ --n-steps 10000 \
+ --batch-size 8 \
+ --lr 1e-5 \
+ --alpha-rf 10 \
+ --lambda-star -0.15 \
+ --rf-mode volume_cf \
+ --k-lyap 8 \
+ --lyap-act-steps 4 \
+ --seed 42 \
+ --eval-every 1000 \
+ --eval-n 512 \
+ --eval-batch-size 32 \
+ --out step3_M_volume_cf_26040_lstar_neg015_k8_a10_10k.json \
+ > step3_M_volume_cf_26040_lstar_neg015_k8_a10_10k.log 2>&1
+}
+
+run_trm_baseline() {
+ wait_for_pid "${1:-0}"
+ source "${CONDA_SH}"
+ conda activate rrm
+ cd "${FLOSS_DIR}"
+ CUDA_VISIBLE_DEVICES=2 python step5_train_trm_cf.py \
+ --ckpt-root "${TRM_ROOT}" \
+ --ckpt-name step_26041 \
+ --n-steps 10000 \
+ --batch-size 4 \
+ --lr 1e-5 \
+ --alpha-rf 0 \
+ --lambda-star 0.02 \
+ --rf-mode volume_cf \
+ --k-lyap 0 \
+ --lyap-act-steps 4 \
+ --seed 42 \
+ --eval-every 1000 \
+ --eval-n 512 \
+ --eval-batch-size 32 \
+ --out step5_L_trm_baseline_26041_batch4_fast_10k.json \
+ > step5_L_trm_baseline_26041_batch4_fast_10k.log 2>&1
+}
+
+run_trm_volume() {
+ wait_for_pid "${1:-0}"
+ source "${CONDA_SH}"
+ conda activate rrm
+ cd "${FLOSS_DIR}"
+ CUDA_VISIBLE_DEVICES=3 python step5_train_trm_cf.py \
+ --ckpt-root "${TRM_ROOT}" \
+ --ckpt-name step_26041 \
+ --n-steps 10000 \
+ --batch-size 4 \
+ --lr 1e-5 \
+ --alpha-rf 10 \
+ --lambda-star 0.02 \
+ --rf-mode volume_cf \
+ --k-lyap 4 \
+ --lyap-act-steps 4 \
+ --seed 42 \
+ --eval-every 1000 \
+ --eval-n 512 \
+ --eval-batch-size 32 \
+ --out step5_M_trm_volume_cf_26041_lstar002_batch4_k4_a10_10k.json \
+ > step5_M_trm_volume_cf_26041_lstar002_batch4_k4_a10_10k.log 2>&1
+}
+
+cmd="${1:?usage: launch_10k_queue.sh MODE [wait_pid]}"
+wait_pid="${2:-0}"
+
+case "${cmd}" in
+ hrm_baseline) run_hrm_baseline "${wait_pid}" ;;
+ hrm_volume) run_hrm_volume "${wait_pid}" ;;
+ trm_baseline) run_trm_baseline "${wait_pid}" ;;
+ trm_volume) run_trm_volume "${wait_pid}" ;;
+ *) echo "unknown command: ${cmd}" >&2; exit 2 ;;
+esac