summary refs log tree commit diff
path: root/research/flossing/watch_trm_directional_multi4_long.sh
diff options
context:
space:
mode:
author YurenHao0426 <blackhao0426@gmail.com> 2026-06-13 12:35:36 -0500
committer YurenHao0426 <blackhao0426@gmail.com> 2026-06-13 12:35:36 -0500
commit 66e0d8b9fd4d0f7a2231d689c055e26fdf1cf04a (patch)
tree c29cba61124018755a19b02c9d33e3ad5f2e05cc /research/flossing/watch_trm_directional_multi4_long.sh
rrm workspace: TRM/HRM/SRM code, Maze dataset, dynamical-analysis pipeline (HEAD, main)
Curated export for clone-and-run Maze training (2x A6000) + diagnostics. trm/hrm pretrain.py carry trajectory-augmentation code (backward-compatible). Heavy artifacts (checkpoints/wandb/npz) gitignored; see PROVENANCE.md. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Diffstat (limited to 'research/flossing/watch_trm_directional_multi4_long.sh')
-rwxr-xr-x research/flossing/watch_trm_directional_multi4_long.sh 35
1 files changed, 35 insertions, 0 deletions
diff --git a/research/flossing/watch_trm_directional_multi4_long.sh b/research/flossing/watch_trm_directional_multi4_long.sh
new file mode 100755
index 0000000..67531b7
--- /dev/null
+++ b/research/flossing/watch_trm_directional_multi4_long.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+#
+# Watchdog for one TRM training run. Every POLL_SECONDS it appends a status
+# line (process state, newest checkpoint file, progress counter, GPU usage) to
+# STATUS_LOG, and it exits after logging once no process matches RUN_NAME.
+set -euo pipefail
+
+readonly RUN_NAME="pretrain_mlp_t_sudoku_directional_multi4_parallel_c4_eps003_sigma003"
+readonly CKPT_DIR="/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/${RUN_NAME}"
+readonly TRAIN_LOG="/home/yurenh2/rrm/research/flossing/trm_directional_multi4_long_gpu3.log"
+readonly STATUS_LOG="/home/yurenh2/rrm/research/flossing/trm_directional_multi4_long_status.log"
+readonly GPU_INDEX=3           # nvidia-smi index of the GPU to report
+readonly PROGRESS_TOTAL=65104  # number after the slash in the "<n>/<total>" counter searched for in TRAIN_LOG
+readonly POLL_SECONDS=300      # pause between two polls
+
+last_step=""
+while true; do
+  ts="$(date '+%Y-%m-%d %H:%M:%S')"
+
+  # pgrep -f matches RUN_NAME against full command lines. The match list is
+  # dumped to a scratch file that nothing in this script reads back.
+  if pgrep -af "${RUN_NAME}" >/tmp/trm_directional_pgrep.txt; then
+    proc="running"
+  else
+    proc="stopped"
+  fi
+
+  # All three probes are best-effort: with `set -e` and `pipefail` a failing
+  # stage would otherwise terminate the watcher, so each falls back to an
+  # empty string and the status line prints a placeholder instead.
+  latest_step="$(find "${CKPT_DIR}" -maxdepth 1 -type f -name 'step_*' -printf '%f\n' 2>/dev/null | sort -V | tail -1 || true)"
+  gpu_line="$(nvidia-smi --query-gpu=index,memory.used,utilization.gpu --format=csv,noheader,nounits 2>/dev/null | awk -F', ' -v idx="${GPU_INDEX}" '$1==idx {print "gpu" idx "_mem_mb=" $2 " gpu" idx "_util=" $3 "%"}' || true)"
+  # -E makes `+` a quantifier, so counters such as "1234/65104" match.
+  progress="$(grep -aoE "[0-9]+/${PROGRESS_TOTAL}" "${TRAIN_LOG}" 2>/dev/null | tail -1 || true)"
+
+  status="${ts} proc=${proc} latest_ckpt=${latest_step:-none} progress=${progress:-unknown} ${gpu_line:-gpu${GPU_INDEX}_unknown}"
+
+  if [[ "${latest_step}" != "${last_step}" ]]; then
+    # Newest checkpoint changed since the last poll: also keep the tail of
+    # the training log next to the status line.
+    last_step="${latest_step}"
+    {
+      printf '%s\n' "${status}"
+      tail -20 "${TRAIN_LOG}" 2>/dev/null || true
+      echo
+    } >> "${STATUS_LOG}"
+  else
+    printf '%s\n' "${status}" >> "${STATUS_LOG}"
+  fi
+
+  # The status line above is written before leaving, so the log ends with
+  # the "stopped" observation.
+  [[ "${proc}" == "running" ]] || break
+  sleep "${POLL_SECONDS}"
+done