summaryrefslogtreecommitdiff
path: root/analysis_2x2/run_maze_diag_queue.sh
diff options
context:
space:
mode:
Diffstat (limited to 'analysis_2x2/run_maze_diag_queue.sh')
-rwxr-xr-xanalysis_2x2/run_maze_diag_queue.sh40
1 files changed, 40 insertions, 0 deletions
diff --git a/analysis_2x2/run_maze_diag_queue.sh b/analysis_2x2/run_maze_diag_queue.sh
new file mode 100755
index 0000000..7dfc757
--- /dev/null
+++ b/analysis_2x2/run_maze_diag_queue.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# Maze FTLE diagnostics (att+rope, math SDP backend). Final checkpoint first (answers the
+# headline separation question), then two more for an evolution view. Waits for a GPU.
+#
+# Setup: enter the project directory, activate the conda environment, and define the
+# checkpoint root, the output directory and the status log path used by the rest of the script.
+set -o pipefail
+# Fail fast if the project directory is missing: OUTDIR and the python script path below are
+# relative to it, so continuing would write into whatever directory the caller started from.
+cd /home/yurenh2/rrm/research/flossing || {
+  echo "cannot cd to /home/yurenh2/rrm/research/flossing" >&2
+  exit 1
+}
+source /home/yurenh2/miniconda3/etc/profile.d/conda.sh
+conda activate rrm
+
+CKPT_ROOT=/home/yurenh2/rrm/trm/checkpoints/maze-30x30-hard-1k-ACT-torch/pretrain_att_maze30x30_2gpu_gbs384
+OUTDIR=analysis_2x2/maze
+# The status log and every per-checkpoint log live under OUTDIR, so it must exist before logging.
+mkdir -p "$OUTDIR" || {
+  echo "cannot create output directory $OUTDIR" >&2
+  exit 1
+}
+STATUS="$OUTDIR/queue_status.log"
+# Append one timestamped entry to the queue status file named by $STATUS.
+# All arguments are joined by the first IFS character, a space by default, to form the message.
+log(){ local stamp; stamp="$(date '+%Y-%m-%d %H:%M:%S')"; printf '%s\n' "[$stamp] $*" >> "$STATUS"; }
+# Print the index of the first GPU whose utilization.gpu is below 25 and whose memory.used
+# is below 6000, as reported by nvidia-smi in unitless CSV form. Prints nothing if none qualifies.
+free_gpu(){ local fields=index,utilization.gpu,memory.used
+ nvidia-smi "--query-gpu=$fields" --format=csv,noheader,nounits |
+ awk -F', ' '{ if ($2<25 && $3<6000) { print $1; exit } }'; }
+
+# Wait for a GPU. Each round asks free_gpu for a candidate; a candidate is claimed only when a
+# second sample taken 60 seconds later names the same index. Rounds are spaced 300 seconds
+# apart. Once 12 hours have passed, stop waiting and take the GPU with the lowest memory.used.
+log "maze diag queue started (n=512, batch 2, math SDP; final ckpt first)"
+GPU=""
+DEADLINE=$(( $(date +%s) + 12*3600 ))
+while :; do
+  first_pick="$(free_gpu)"
+  if [[ -n "$first_pick" ]]; then
+    sleep 60
+    second_pick="$(free_gpu)"
+    # Require two agreeing samples, presumably so a brief idle dip is not taken for a free GPU.
+    if [[ "$second_pick" == "$first_pick" ]]; then
+      GPU="$first_pick"
+      break
+    fi
+  fi
+  if (( $(date +%s) > DEADLINE )); then
+    # Fallback: sort GPUs numerically by the memory.used column and keep the first index.
+    GPU="$(nvidia-smi --query-gpu=index,memory.used --format=csv,noheader,nounits \
+      | sort -t, -k2 -n | head -1 | cut -d, -f1)"
+    log "12h fallback: GPU $GPU"
+    break
+  fi
+  sleep 300
+done
+log "claimed GPU $GPU"
+# Restrict everything launched from here on to the chosen device.
+export CUDA_VISIBLE_DEVICES="$GPU"
+
+for CK in step_130200 step_65100 step_26040; do
+ OUT="$OUTDIR/maze_${CK}_n512.npz"
+ [[ -f "$OUT" ]] && { log "skip $CK"; continue; }
+ log "start $CK"
+ if python diagnose_trm_joint_maze.py --ckpt-root "$CKPT_ROOT" --ckpt-name "$CK" \
+ --n-samples 512 --batch-size 2 --k-lyap 8 --t-ons 1 --seed 0 \
+ --out "$OUT" > "$OUTDIR/${CK}.log" 2>&1; then
+ acc=$(grep -oE "acc=[0-9.]+" "$OUTDIR/${CK}.log" | tail -1)
+ log "done $CK ($acc)"
+ else
+ log "FAILED $CK (see $OUTDIR/${CK}.log)"
+ fi
+done
+log "maze diag queue finished"