summaryrefslogtreecommitdiff
path: root/research/flossing/analysis_2x2/run_retest_2x2.sh
blob: 9cdcc1396dd1a7f89ef026ac35dd50120bf9e689 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env bash
# Queue: wait for a free GPU, then run the 2x2 re-test diagnostics:
#   1. TRM official_gbs768 @ step_58590 (86.9% ckpt)  full-window  n=2048
#   2. TRM official_gbs768 @ step_58590               early-window n=2048 (first 4 ACT steps)
#   3. HRM righteous-python @ step_26040 (joint est.) full-window  n=2048
#   4. HRM righteous-python @ step_26040              early-window n=2048
# Same --seed 0 and same n across full/short pairs so idx fields pair up.
# Make a pipeline fail when any stage fails, not only the last one.
set -o pipefail

# The diagnostic scripts and OUTDIR below are given as relative paths, so they
# only resolve correctly from this directory. Abort instead of continuing from
# the wrong place (which would create OUTDIR elsewhere and fail every job).
cd /home/yurenh2/rrm/research/flossing || {
  echo "cannot cd to /home/yurenh2/rrm/research/flossing" >&2
  exit 1
}

# Load conda's shell integration and activate the "rrm" environment; without
# it, the later `python` calls would use whatever interpreter is on PATH.
# shellcheck source=/dev/null
source /home/yurenh2/miniconda3/etc/profile.d/conda.sh || {
  echo "cannot source conda.sh" >&2
  exit 1
}
conda activate rrm || {
  echo "cannot activate conda environment 'rrm'" >&2
  exit 1
}

# Directory receiving the per-job .npz results and .log files.
OUTDIR=analysis_2x2/retest
mkdir -p "$OUTDIR" || {
  echo "cannot create output directory $OUTDIR" >&2
  exit 1
}
# Timestamped progress log for the whole queue (appended to by log()).
STATUS="$OUTDIR/queue_status.log"
# Checkpoint roots passed as --ckpt-root. HRM_ROOT contains spaces, so it must
# always be expanded inside double quotes.
TRM_ROOT="/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_mlp_t_sudoku_official_gbs768_repro"
HRM_ROOT="/home/yurenh2/rrm/hrm/checkpoints/Sudoku-extreme-1k-aug-1000 ACT-torch/HierarchicalReasoningModel_ACTV1 righteous-python"

# Append one timestamped line to the queue status log.
# Globals:   STATUS (read) - path of the log file to append to
# Arguments: any number of words; they are joined via "$*" into one message
# Outputs:   a line of the form "[YYYY-MM-DD HH:MM:SS] message" in $STATUS
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '[%s] %s\n' "$stamp" "$*" >> "$STATUS"
}

# Print the index of the first GPU reported by nvidia-smi whose utilization is
# below 30 (%) and whose used memory is below 8000 (MiB, per nounits output);
# print nothing when no GPU qualifies.
# Outputs: a single GPU index on stdout, or empty output.
free_gpu() {
  local -a gpu_query=(
    nvidia-smi
    --query-gpu=index,utilization.gpu,memory.used
    --format=csv,noheader,nounits
  )
  # Each CSV row is "index, util, mem"; stop at the first row that passes.
  "${gpu_query[@]}" \
    | awk -F', ' '{ if ($2 < 30 && $3 < 8000) { print $1; exit } }'
}

# Claim a GPU for the jobs below. Poll until free_gpu reports the same index on
# two checks taken 60 s apart (so a momentary idle dip is not mistaken for a
# free card); between unsuccessful attempts wait 5 minutes. Once 12 hours have
# passed, stop waiting and take the GPU with the least memory in use.
log "queue started, waiting for a free GPU (util<30%, mem<8GB, two checks 60s apart; 12h fallback)"
DEADLINE=$(( $(date +%s) + 12*3600 ))
GPU=""
while true; do
  g1="$(free_gpu)"
  if [[ -n "$g1" ]]; then
    sleep 60
    g2="$(free_gpu)"
    if [[ "$g2" == "$g1" ]]; then GPU="$g1"; break; fi
  fi
  if (( $(date +%s) > DEADLINE )); then
    # Rows are "index, memory.used"; numeric sort on the second field puts the
    # GPU with the least used memory first.
    GPU="$(nvidia-smi --query-gpu=index,memory.used --format=csv,noheader,nounits | sort -t, -k2 -n | head -1 | cut -d, -f1)"
    log "12h fallback: taking GPU $GPU (most free memory) despite utilization"
    break
  fi
  sleep 300
done

# The fallback query yields nothing if nvidia-smi fails. Exporting an empty
# CUDA_VISIBLE_DEVICES would hide every GPU from the jobs, so stop here with a
# logged error rather than launching them.
if [[ -z "$GPU" ]]; then
  log "ERROR: no GPU index could be determined (nvidia-smi failed?); aborting"
  exit 1
fi
log "claimed GPU $GPU"
export CUDA_VISIBLE_DEVICES="$GPU"

# Run one diagnostic under python unless its result file already exists.
# Globals:   OUTDIR (read) - directory holding <name>.npz and <name>.log
# Arguments: $1   - job name, used as the basename of the result and log files
#            $2.. - python script and its options; "--out <result>" is appended
# Outputs:   python's stdout and stderr go to $OUTDIR/<name>.log; progress
#            (skip/start/done/FAILED) is reported through log()
# Returns:   0 when skipped; otherwise the status of the final log call, so a
#            failed job does not stop the remaining jobs.
run_job() {
  local job="$1"
  shift
  local result="$OUTDIR/${job}.npz"
  local joblog="$OUTDIR/${job}.log"

  # An existing result marks the job as already done, which makes re-running
  # the queue resume where it left off.
  if [[ -f "$result" ]]; then
    log "skip $job (output exists)"
    return 0
  fi

  log "start $job"
  local rc=0
  python "$@" --out "$result" > "$joblog" 2>&1 || rc=$?
  case "$rc" in
    0) log "done $job" ;;
    *) log "FAILED $job (see $joblog)" ;;
  esac
}

# Options common to all four diagnostics.
shared_opts=(--k-lyap 8 --t-ons 1 --seed 0)

# Per-model options. Each full/short pair gets the same --n-samples and --seed
# so that the idx fields of the two outputs pair up.
trm_opts=(
  --ckpt-root "$TRM_ROOT" --ckpt-name step_58590
  --n-samples 2048 --batch-size 16 "${shared_opts[@]}"
)
hrm_opts=(
  --ckpt-root "$HRM_ROOT" --ckpt-name step_26040
  --n-samples 2048 --batch-size 32 "${shared_opts[@]}"
)

# TRM official_gbs768 @ step_58590: full window, then early window.
run_job trm_gbs768_step58590_full_n2048 diagnose_trm_joint.py "${trm_opts[@]}"
run_job trm_gbs768_step58590_short_n2048 diagnose_trm_joint_short.py "${trm_opts[@]}"

# HRM righteous-python @ step_26040: full window, then early window.
run_job hrm_righteous_step26040_full_n2048 diagnose_hrm_joint.py "${hrm_opts[@]}"
run_job hrm_righteous_step26040_short_n2048 diagnose_hrm_joint_short.py "${hrm_opts[@]}"

log "all retest diagnostics finished"