summaryrefslogtreecommitdiff
path: root/research/flossing/flossing_suite/launch_trm_variant_suite.sh
blob: 86c2e6402427282bd90c5249ca6d4f7e05f7c54a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env bash
set -euo pipefail

# Fixed locations used by this suite: the repository checkout, the Python
# interpreter that runs the trainer, the directory that receives the generated
# launcher/log/pid files, and the checkpoint root handed to the trainer.
ROOT=/home/yurenh2/rrm
PY=/home/yurenh2/miniconda3/envs/rrm/bin/python
OUT_DIR=${ROOT}/research/flossing/flossing_suite/results/trm_variants
CKPT_ROOT=${ROOT}/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro
mkdir -p "${OUT_DIR}"

# Tunables. Each one keeps a non-empty value inherited from the environment
# and otherwise falls back to the default written here.

# Device selection, one per launched variant.
: "${GPU_TOP1:=0}"
: "${GPU_VOLUME:=1}"
: "${GPU_KL:=3}"

# Training length and flossing schedule.
: "${TRAIN_STEPS:=20000}"
: "${FLOSS_STEPS:=100}"
: "${INTERFLOSS_EVERY:=2000}"
: "${INTERFLOSS_START:=2000}"
: "${INTERFLOSS_STOP:=10000}"

# Evaluation size and batch sizes.
: "${EVAL_N:=1000}"
: "${TASK_BATCH_SIZE:=32}"
: "${FLOSS_BATCH_SIZE:=4}"

#######################################
# Generate a self-contained launcher script for one training variant and start
# it as a detached background job.
#
# Every value written into the generated script is shell-escaped with
# `printf %q`, so paths, run names, or overrides containing spaces, quotes,
# `$`, backticks, or backslashes reach the trainer verbatim instead of being
# re-interpreted when the generated script runs.
#
# Globals (read):
#   OUT_DIR, ROOT, PY, CKPT_ROOT, TRAIN_STEPS, TASK_BATCH_SIZE,
#   FLOSS_BATCH_SIZE, FLOSS_STEPS, INTERFLOSS_EVERY, INTERFLOSS_START,
#   INTERFLOSS_STOP, EVAL_N
# Arguments:
#   $1 - value exported as CUDA_VISIBLE_DEVICES for the job
#   $2 - run name; basename of the .cmd.sh, .log and .pid files created in
#        OUT_DIR, and of the .json path passed to the trainer as --out
#   $3 - value passed as --floss-mode
#   $4 - value passed as --kl-beta
# Outputs:
#   Prints "<name>: pid <pid> on GPU <gpu>" to stdout.
#######################################
write_and_launch() {
  local gpu="$1"
  local name="$2"
  local floss_mode="$3"
  local kl_beta="$4"
  local cmd="${OUT_DIR}/${name}.cmd.sh"
  local log="${OUT_DIR}/${name}.log"
  local pid="${OUT_DIR}/${name}.pid"

  # Trainer arguments as an array so every word keeps its exact boundaries.
  local -a train_args=(
    --model trm
    --ckpt-root "${CKPT_ROOT}"
    --ckpt-name __random__
    --init-seed 123
    --train-steps "${TRAIN_STEPS}"
    --batch-size "${TASK_BATCH_SIZE}"
    --task-batch-size "${TASK_BATCH_SIZE}"
    --floss-batch-size "${FLOSS_BATCH_SIZE}"
    --train-lr 1e-4
    --floss-lr 1e-4
    --floss-mode "${floss_mode}"
    --lambda-star 0
    --k-lyap 4
    --lyap-act-steps 4
    --seed 42
    --eval-every 1000
    --eval-n "${EVAL_N}"
    --eval-batch-size 64
    --floss-log-every 10
    --train-puzzle-emb
    --puzzle-emb-lr 1e-4
    --puzzle-emb-weight-decay 1.0
    --floss-steps "${FLOSS_STEPS}"
    --interfloss-at 0
    --interfloss-every "${INTERFLOSS_EVERY}"
    --interfloss-start "${INTERFLOSS_START}"
    --interfloss-stop "${INTERFLOSS_STOP}"
    --kl-beta "${kl_beta}"
    --kl-replay-size 64
    --kl-batch-size 8
    --kl-temperature 1.0
    --out "${OUT_DIR}/${name}.json"
  )

  local arg
  {
    printf '%s\n' '#!/usr/bin/env bash' 'set -euo pipefail'
    printf 'cd %q\n' "${ROOT}"
    printf 'export CUDA_VISIBLE_DEVICES=%q\n' "${gpu}"
    printf '%s\n' 'export PYTHONUNBUFFERED=1'
    printf 'exec %q %q' "${PY}" research/flossing/step7_interfloss.py
    for arg in "${train_args[@]}"; do
      if [[ "${arg}" == --* ]]; then
        # Put each flag on its own continuation line so the generated
        # script stays readable as a record of the run.
        printf ' \\\n  %q' "${arg}"
      else
        printf ' %q' "${arg}"
      fi
    done
    printf '\n'
  } > "${cmd}"
  chmod +x "${cmd}"

  # setsid runs the job in a new session; stdin comes from /dev/null and
  # stdout/stderr go to the per-run log.
  setsid bash "${cmd}" > "${log}" 2>&1 < /dev/null &
  local launched_pid=$!
  printf '%s\n' "${launched_pid}" > "${pid}"
  echo "${name}: pid ${launched_pid} on GPU ${gpu}"
}

# Launch the three variants of the suite. Every run name ends in the same
# suffix, built from the interfloss period, the task and floss batch sizes, a
# fixed "k4" marker, and the number of training steps.
suite_tag="periodic${INTERFLOSS_EVERY}_tb${TASK_BATCH_SIZE}_fb${FLOSS_BATCH_SIZE}_k4_${TRAIN_STEPS}"

# Arguments: GPU, run name, floss mode, KL beta.
write_and_launch "${GPU_TOP1}" "trm_seed123_top1_cf_${suite_tag}" top1_cf 0
write_and_launch "${GPU_VOLUME}" "trm_seed123_volume_cf_${suite_tag}" volume_cf 0
write_and_launch "${GPU_KL}" "trm_seed123_volume_cf_kl10_${suite_tag}" volume_cf 10

printf '%s\n' "queued TRM CF/volume variant suite in ${OUT_DIR}"