#!/usr/bin/env bash
#
# Periodic status monitor for one training run.
#
# Every POLL_SECONDS the script appends one status line to STATUS_LOG with:
#   - whether any process whose command line matches RUN_NAME is running,
#   - the newest `step_*` file directly inside CKPT_DIR (version-sorted),
#   - the last "<n>/TOTAL_STEPS" progress token found in TRAIN_LOG,
#   - memory use and utilization reported by nvidia-smi for GPU_INDEX.
# When the newest checkpoint name differs from the previous poll, the last
# 20 lines of TRAIN_LOG and a blank separator are appended as well.
# The loop ends after the first poll in which no matching process is found.
set -euo pipefail

readonly RUN_NAME="pretrain_mlp_t_sudoku_directional_multi4_parallel_c4_eps003_sigma003"
readonly CKPT_DIR="/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/${RUN_NAME}"
readonly TRAIN_LOG="/home/yurenh2/rrm/research/flossing/trm_directional_multi4_long_gpu3.log"
readonly STATUS_LOG="/home/yurenh2/rrm/research/flossing/trm_directional_multi4_long_status.log"
# Receives the `pgrep -af` listing from the most recent poll.
readonly PGREP_OUT="/tmp/trm_directional_pgrep.txt"
readonly GPU_INDEX=3         # nvidia-smi index of the GPU to report
readonly TOTAL_STEPS=65104   # denominator of the "<n>/<total>" progress token
readonly POLL_SECONDS=300

last_step=""

while true; do
  ts="$(date '+%Y-%m-%d %H:%M:%S')"

  # pgrep -f matches against the full command line; -a lists it in the output.
  if pgrep -af "${RUN_NAME}" > "${PGREP_OUT}"; then
    proc="running"
  else
    proc="stopped"
  fi

  # Best effort: a missing or empty checkpoint directory yields an empty value.
  latest_step="$(
    find "${CKPT_DIR}" -maxdepth 1 -type f -name 'step_*' -printf '%f\n' 2>/dev/null \
      | sort -V \
      | tail -1 \
      || true
  )"

  # Best effort: with `set -e` and `pipefail`, a failing nvidia-smi would
  # otherwise terminate the monitor instead of falling back to "unknown".
  gpu_line="$(
    nvidia-smi --query-gpu=index,memory.used,utilization.gpu \
        --format=csv,noheader,nounits \
      | awk -F', ' -v gpu="${GPU_INDEX}" \
          '$1 == gpu {print "gpu" gpu "_mem_mb=" $2 " gpu" gpu "_util=" $3 "%"}' \
      || true
  )"

  # Extended regex: one or more digits followed by "/TOTAL_STEPS". The former
  # basic-regex pattern '[0-9]\\+/...' demanded a literal backslash and plus
  # sign, so it never matched. -a makes grep treat the log as text.
  progress="$(
    grep -aoE "[0-9]+/${TOTAL_STEPS}" "${TRAIN_LOG}" 2>/dev/null \
      | tail -1 \
      || true
  )"

  status_line="${ts} proc=${proc} latest_ckpt=${latest_step:-none}"
  status_line+=" progress=${progress:-unknown} ${gpu_line:-gpu${GPU_INDEX}_unknown}"

  if [[ "${latest_step}" != "${last_step}" ]]; then
    # Newest checkpoint changed since the last poll: add log context.
    last_step="${latest_step}"
    {
      echo "${status_line}"
      tail -20 "${TRAIN_LOG}" 2>/dev/null || true
      echo
    } >> "${STATUS_LOG}"
  else
    echo "${status_line}" >> "${STATUS_LOG}"
  fi

  # The final "stopped" status is recorded above before leaving the loop.
  [[ "${proc}" == "running" ]] || break
  sleep "${POLL_SECONDS}"
done