summaryrefslogtreecommitdiff
path: root/research/flossing/watch_trm_directional_multi4_long.sh
blob: 67531b735e5350c901ca7bd2a286f108a42c201d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/env bash
#
# Status watcher for a single training run.
#
# Periodically appends a one-line summary (process state, newest checkpoint
# file, progress counter, GPU 3 memory/utilisation) to STATUS_LOG, and stops
# once no process whose command line contains RUN_NAME can be found.

# Strict mode: abort on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Run identifier: matched against process command lines and used as the
# final component of the checkpoint directory.
RUN_NAME="pretrain_mlp_t_sudoku_directional_multi4_parallel_c4_eps003_sigma003"

# Directory scanned (non-recursively) for files named step_*.
CKPT_DIR="/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/${RUN_NAME}"

# Training output that is grepped for progress counters and tailed into the
# status log.
TRAIN_LOG="/home/yurenh2/rrm/research/flossing/trm_directional_multi4_long_gpu3.log"

# Append-only file that receives this watcher's status lines.
STATUS_LOG="/home/yurenh2/rrm/research/flossing/trm_directional_multi4_long_status.log"

# Polling loop.
#
# Each pass gathers four facts about the run and appends them as one line to
# STATUS_LOG:
#   proc         "running" if pgrep finds a command line containing RUN_NAME,
#                otherwise "stopped"
#   latest_ckpt  version-sorted last step_* file directly under CKPT_DIR
#   progress     last "<digits>/<progress_total>" token found in TRAIN_LOG
#   gpu3_*       memory used and utilisation nvidia-smi reports for index 3
#
# When the newest checkpoint name differs from the previous pass, the last 20
# lines of TRAIN_LOG and a blank separator are appended after the status line.
# The loop writes one final entry and exits once the process is "stopped";
# otherwise it sleeps 300 seconds between passes.

# Denominator of the progress counters searched for in TRAIN_LOG.
progress_total=65104

last_step=""
while true; do
  ts="$(date '+%Y-%m-%d %H:%M:%S')"

  # The -a listing is written to a fixed file in /tmp; this script never reads
  # it back, only pgrep's exit status is used here.
  if pgrep -af "${RUN_NAME}" >/tmp/trm_directional_pgrep.txt; then
    proc="running"
  else
    proc="stopped"
  fi

  # All three probes are best-effort. Under `set -e` with pipefail a failing
  # pipeline inside a plain assignment would terminate the watcher, so each
  # one ends in `|| true` and relies on the ${var:-fallback} defaults below.
  latest_step="$(find "${CKPT_DIR}" -maxdepth 1 -type f -name 'step_*' -printf '%f\n' 2>/dev/null | sort -V | tail -n 1 || true)"
  gpu_line="$(nvidia-smi --query-gpu=index,memory.used,utilization.gpu --format=csv,noheader,nounits 2>/dev/null | awk -F', ' '$1==3 {print "gpu3_mem_mb="$2" gpu3_util="$3"%"}' || true)"
  # Extended regex: one or more digits followed by "/<total>". The earlier
  # basic-regex form '[0-9]\\+/...' required a literal backslash and plus sign
  # and therefore never matched a real counter.
  progress="$(grep -aoE "[0-9]+/${progress_total}" "${TRAIN_LOG}" 2>/dev/null | tail -n 1 || true)"

  status_line="${ts} proc=${proc} latest_ckpt=${latest_step:-none} progress=${progress:-unknown} ${gpu_line:-gpu3_unknown}"

  if [[ "${latest_step}" != "${last_step}" ]]; then
    # Newest checkpoint changed since the previous pass: add log context.
    last_step="${latest_step}"
    {
      printf '%s\n' "${status_line}"
      tail -n 20 "${TRAIN_LOG}" 2>/dev/null || true
      echo
    } >> "${STATUS_LOG}"
  else
    printf '%s\n' "${status_line}" >> "${STATUS_LOG}"
  fi

  # The final "stopped" entry has already been written above.
  [[ "${proc}" == "running" ]] || break
  sleep 300
done