#!/usr/bin/env bash
#
# watch_trm_directional_multi4_long.sh
#
# Provenance: added as research/flossing/watch_trm_directional_multi4_long.sh
# (mode 100755) by commit 66e0d8b9fd4d0f7a2231d689c055e26fdf1cf04a,
# "rrm workspace: TRM/HRM/SRM code, Maze dataset, dynamical-analysis pipeline".
#
# Purpose: poll one training run every POLL_SECONDS seconds and append a
# one-line status record to STATUS_LOG on every poll:
#
#   <timestamp> proc=<running|stopped> latest_ckpt=<file|none> \
#     progress=<N/65104|unknown> <gpu3_mem_mb=.. gpu3_util=..%|gpu3_unknown>
#
# When the newest checkpoint file name differs from the one seen on the
# previous poll, the last 20 lines of TRAIN_LOG and a blank line are appended
# after the status record. The loop ends after the first poll in which no
# process command line matches RUN_NAME (that final poll is still logged).
#
# Usage: ./watch_trm_directional_multi4_long.sh   (no arguments)
set -euo pipefail

readonly RUN_NAME="pretrain_mlp_t_sudoku_directional_multi4_parallel_c4_eps003_sigma003"
readonly CKPT_DIR="/home/yurenh2/rrm/trm/checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/${RUN_NAME}"
readonly TRAIN_LOG="/home/yurenh2/rrm/research/flossing/trm_directional_multi4_long_gpu3.log"
readonly STATUS_LOG="/home/yurenh2/rrm/research/flossing/trm_directional_multi4_long_status.log"
# Receives the pgrep output (pid + command line of every match) of the latest poll.
readonly PGREP_SNAPSHOT="/tmp/trm_directional_pgrep.txt"
readonly POLL_SECONDS=300

last_step=""
while true; do
  ts="$(date '+%Y-%m-%d %H:%M:%S')"

  # pgrep -f matches RUN_NAME against full command lines; its exit status
  # tells us whether at least one matching process exists.
  if pgrep -af "${RUN_NAME}" >"${PGREP_SNAPSHOT}"; then
    proc="running"
  else
    proc="stopped"
  fi

  # All three probes below are best-effort: a failing probe must leave its
  # variable empty (rendered via the ${var:-fallback} defaults) rather than
  # abort the watcher under `set -e -o pipefail`.

  # Newest checkpoint by version-sorted file name; empty if none / dir missing.
  latest_step="$(
    find "${CKPT_DIR}" -maxdepth 1 -type f -name 'step_*' -printf '%f\n' 2>/dev/null \
      | sort -V \
      | tail -1 \
      || true
  )"

  # Memory/utilisation of the GPU whose index column is 3. stderr is left
  # visible so a broken nvidia-smi is still noticeable on the console.
  gpu_line="$(
    nvidia-smi --query-gpu=index,memory.used,utilization.gpu \
      --format=csv,noheader,nounits \
      | awk -F', ' '$1==3 {print "gpu3_mem_mb="$2" gpu3_util="$3"%"}' \
      || true
  )"

  # Last "<digits>/65104" token in the training log. Extended regex so that
  # `+` is a quantifier; -a because the log may contain non-text bytes.
  # NOTE(review): 65104 is presumably the run's total step count - confirm.
  progress="$(
    grep -aoE '[0-9]+/65104' "${TRAIN_LOG}" 2>/dev/null \
      | tail -1 \
      || true
  )"

  status_line="${ts} proc=${proc} latest_ckpt=${latest_step:-none} progress=${progress:-unknown} ${gpu_line:-gpu3_unknown}"

  if [[ "${latest_step}" != "${last_step}" ]]; then
    # Checkpoint changed since the previous poll: log extra context.
    last_step="${latest_step}"
    {
      printf '%s\n' "${status_line}"
      tail -20 "${TRAIN_LOG}" 2>/dev/null || true
      printf '\n'
    } >>"${STATUS_LOG}"
  else
    printf '%s\n' "${status_line}" >>"${STATUS_LOG}"
  fi

  [[ "${proc}" == "running" ]] || break
  sleep "${POLL_SECONDS}"
done