From 66e0d8b9fd4d0f7a2231d689c055e26fdf1cf04a Mon Sep 17 00:00:00 2001
From: YurenHao0426
Date: Sat, 13 Jun 2026 12:35:36 -0500
Subject: rrm workspace: TRM/HRM/SRM code, Maze dataset, dynamical-analysis pipeline

Curated export for clone-and-run Maze training (2x A6000) + diagnostics.
trm/hrm pretrain.py carry trajectory-augmentation code (backward-compatible).
Heavy artifacts (checkpoints/wandb/npz) gitignored; see PROVENANCE.md.

Co-Authored-By: Claude Fable 5
---
 scripts/build_datasets.sh                          | 86 ++++++++++++++++++++++
 scripts/check_ptrm_gram.sh                         | 62 ++++++++++++++++
 .../ptrm_gram_watch/SamsungSAILMontreal_repos.txt  | 23 ++++++
 scripts/ptrm_gram_watch/ahn-ml_repos.txt           | 16 ++++
 scripts/ptrm_gram_watch/search_known.txt           |  3 +
 scripts/queue_trm_sudoku.sh                        | 35 +++++++++
 scripts/run_hrm_sudoku.sh                          | 15 ++++
 scripts/run_trm_sudoku.sh                          | 21 ++++++
 scripts/smoke_test.sh                              | 38 ++++++++++
 9 files changed, 299 insertions(+)
 create mode 100755 scripts/build_datasets.sh
 create mode 100755 scripts/check_ptrm_gram.sh
 create mode 100644 scripts/ptrm_gram_watch/SamsungSAILMontreal_repos.txt
 create mode 100644 scripts/ptrm_gram_watch/ahn-ml_repos.txt
 create mode 100644 scripts/ptrm_gram_watch/search_known.txt
 create mode 100755 scripts/queue_trm_sudoku.sh
 create mode 100755 scripts/run_hrm_sudoku.sh
 create mode 100755 scripts/run_trm_sudoku.sh
 create mode 100755 scripts/smoke_test.sh

(limited to 'scripts')

diff --git a/scripts/build_datasets.sh b/scripts/build_datasets.sh
new file mode 100755
index 0000000..df22e84
--- /dev/null
+++ b/scripts/build_datasets.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# Build the datasets shared by HRM and TRM and store them under rrm/data/.
+# TRM's build scripts add an extra total_puzzles field to the metadata; HRM can still load it (Pydantic ignores extras).
+# Usage: bash scripts/build_datasets.sh [sudoku|maze|arc1|arc2|all]
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+DATA_DIR="$REPO_ROOT/data"
+TRM_DIR="$REPO_ROOT/trm"
+HRM_DIR="$REPO_ROOT/hrm"
+
+source "$(conda info --base)/etc/profile.d/conda.sh"
+conda activate rrm
+
+mkdir -p "$DATA_DIR"
+
+target="${1:-all}"
+
+build_sudoku() {
+  if [[ ! -d "$DATA_DIR/sudoku-extreme-1k-aug-1000" ]]; then
+    cd "$TRM_DIR"
+    python dataset/build_sudoku_dataset.py \
+      --output-dir "$DATA_DIR/sudoku-extreme-1k-aug-1000" \
+      --subsample-size 1000 --num-aug 1000
+  else
+    echo "[skip] sudoku-extreme-1k-aug-1000 already exists"
+  fi
+}
+
+build_maze() {
+  if [[ ! -d "$DATA_DIR/maze-30x30-hard-1k" ]]; then
+    cd "$TRM_DIR"
+    python dataset/build_maze_dataset.py --output-dir "$DATA_DIR/maze-30x30-hard-1k"
+  else
+    echo "[skip] maze-30x30-hard-1k already exists"
+  fi
+}
+
+# ARC needs HRM's git submodules (ARC-AGI / ARC-AGI-2 / ConceptARC); fetch them only when the checkout looks empty.
+prepare_arc_submodules() {
+  cd "$HRM_DIR"
+  if [[ ! -f dataset/raw-data/ARC-AGI/data/training/.gitkeep ]] 2>/dev/null && \
+     [[ -z "$(ls -A dataset/raw-data/ARC-AGI 2>/dev/null)" ]]; then
+    git submodule update --init --recursive
+  fi
+}
+
+build_arc1() {
+  prepare_arc_submodules
+  if [[ ! -d "$DATA_DIR/arc1concept-aug-1000" ]]; then
+    cd "$TRM_DIR"
+    python -m dataset.build_arc_dataset \
+      --input-file-prefix "$HRM_DIR/dataset/raw-data/ARC-AGI/data" \
+      --output-dir "$DATA_DIR/arc1concept-aug-1000" \
+      --subsets training evaluation concept \
+      --test-set-name evaluation
+  else
+    echo "[skip] arc1concept-aug-1000 already exists"
+  fi
+}
+
+build_arc2() {
+  prepare_arc_submodules
+  if [[ ! -d "$DATA_DIR/arc2concept-aug-1000" ]]; then
+    cd "$TRM_DIR"
+    python -m dataset.build_arc_dataset \
+      --input-file-prefix "$HRM_DIR/dataset/raw-data/ARC-AGI-2/data" \
+      --output-dir "$DATA_DIR/arc2concept-aug-1000" \
+      --subsets training2 evaluation2 concept \
+      --test-set-name evaluation2
+  else
+    echo "[skip] arc2concept-aug-1000 already exists"
+  fi
+}
+
+case "$target" in
+  sudoku) build_sudoku ;;
+  maze)   build_maze ;;
+  arc1)   build_arc1 ;;
+  arc2)   build_arc2 ;;
+  all)    build_sudoku; build_maze; build_arc1; build_arc2 ;;
+  *)      echo "unknown target: $target" >&2; exit 1 ;;
+esac
+
+echo "==> datasets ready under $DATA_DIR"
+ls -la "$DATA_DIR"
diff --git a/scripts/check_ptrm_gram.sh b/scripts/check_ptrm_gram.sh
new file mode 100755
index 0000000..26f5e3e
--- /dev/null
+++ b/scripts/check_ptrm_gram.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+# Weekly check: have PTRM / GRAM official implementations been released?
+# - diffs repo lists of SamsungSAILMontreal (PTRM authors) and ahn-ml (GRAM authors)
+# - GitHub global search for ptrm / gram-reasoning style repos
+# - GRAM project page: does "coming soon" still gate the code link?
+# On any hit: writes ALERT_ptrm_gram.txt next to the log. Designed for cron (no env deps): state dir derives from the script location, sort/comm are pinned to LC_ALL=C.
+set -o pipefail
+
+DIR="$(cd "$(dirname "$0")" && pwd)/ptrm_gram_watch"
+mkdir -p "$DIR"
+LOG="$DIR/check.log"
+TS="$(date '+%Y-%m-%d %H:%M:%S')"
+alert() { echo "[$TS] ALERT: $*" >> "$DIR/ALERT_ptrm_gram.txt"; echo "[$TS] ALERT: $*" >> "$LOG"; }
+note()  { echo "[$TS] $*" >> "$LOG"; }
+
+fetch_repos() { # org -> repo names in byte order (LC_ALL=C so the order does not depend on the caller's locale)
+  curl -sL --max-time 60 "https://api.github.com/orgs/$1/repos?per_page=100" \
+    | grep -o '"full_name": *"[^"]*"' | cut -d'"' -f4 | LC_ALL=C sort
+}
+
+for org in SamsungSAILMontreal ahn-ml; do
+  cur="$DIR/${org}_repos.txt"
+  new="$DIR/${org}_repos.new"
+  fetch_repos "$org" > "$new"
+  if [[ ! -s "$new" ]]; then note "$org: fetch failed/empty, skipping diff"; rm -f "$new"; continue; fi
+  if [[ -f "$cur" ]]; then
+    added="$(LC_ALL=C comm -13 <(LC_ALL=C sort "$cur") "$new")"  # re-sort the snapshot: it may have been written under a different collation
+    if [[ -n "$added" ]]; then alert "$org new repos: $(echo "$added" | tr '\n' ' ')"; fi
+  fi
+  mv "$new" "$cur"
+  note "$org: $(wc -l < "$cur") repos"
+done
+
+# global search (unauthenticated; weekly cadence well under rate limits)
+for q in "PTRM+recursive+reasoning" "probabilistic+tiny+recursive" "GRAM+recursive+reasoning+model"; do
+  hits="$(curl -sL --max-time 60 "https://api.github.com/search/repositories?q=${q}&sort=updated&per_page=5" \
+    | grep -o '"full_name": *"[^"]*"' | cut -d'"' -f4)"
+  known="$DIR/search_known.txt"; touch "$known"
+  while IFS= read -r r; do
+    [[ -z "$r" ]] && continue
+    if ! grep -qxF "$r" "$known"; then
+      echo "$r" >> "$known"
+      alert "new search hit ($q): $r"
+    fi
+  done <<< "$hits"
+done
+
+# GRAM project page: code still "coming soon"?
+page="$(curl -sL --max-time 60 "https://ahn-ml.github.io/gram-website/" || true)"
+if [[ -n "$page" ]]; then
+  if echo "$page" | grep -qi "coming soon"; then
+    note "GRAM page: still 'coming soon'"
+  else
+    alert "GRAM page no longer says 'coming soon' — check for code link"
+  fi
+  link="$(echo "$page" | grep -oiE 'href="[^"]*github\.com[^"]*"' | grep -vi "ahn-ml.github.io\|Academic-project-page-template" | head -3)"
+  if [[ -n "$link" ]]; then alert "GRAM page github link(s): $link"; fi
+else
+  note "GRAM page fetch failed"
+fi
+
+note "check complete"
diff --git a/scripts/ptrm_gram_watch/SamsungSAILMontreal_repos.txt b/scripts/ptrm_gram_watch/SamsungSAILMontreal_repos.txt
new file mode 100644
index 0000000..7db0678
--- /dev/null
+++ b/scripts/ptrm_gram_watch/SamsungSAILMontreal_repos.txt
@@ -0,0 +1,23 @@
+SamsungSAILMontreal/AnyMolGenCritic
+SamsungSAILMontreal/AVR-Eval-Agent
+SamsungSAILMontreal/ByteCraft
+SamsungSAILMontreal/cont-diffsubmin
+SamsungSAILMontreal/difference-submodular-min
+SamsungSAILMontreal/fair-matroid-submodular-max
+SamsungSAILMontreal/ForestDiffusion
+SamsungSAILMontreal/GGM-metrics
+SamsungSAILMontreal/ghn3
+SamsungSAILMontreal/GuidedQuant
+SamsungSAILMontreal/hyper-representation
+SamsungSAILMontreal/l2o_pytorch
+SamsungSAILMontreal/layer-merge
+SamsungSAILMontreal/LoGAH
+SamsungSAILMontreal/molecular-orbitals
+SamsungSAILMontreal/mulo
+SamsungSAILMontreal/multiset-equivariance
+SamsungSAILMontreal/nino
+SamsungSAILMontreal/PAPA
+SamsungSAILMontreal/ream
+SamsungSAILMontreal/STGG-AL
+SamsungSAILMontreal/subpruning
+SamsungSAILMontreal/TinyRecursiveModels
diff --git a/scripts/ptrm_gram_watch/ahn-ml_repos.txt b/scripts/ptrm_gram_watch/ahn-ml_repos.txt
new file mode 100644
index 0000000..1a31659
--- /dev/null
+++ b/scripts/ptrm_gram_watch/ahn-ml_repos.txt
@@ -0,0 +1,16 @@
+ahn-ml/crafter
+ahn-ml/ctm
+ahn-ml/dreamweaver-release
+ahn-ml/drstrategy
+ahn-ml/gpu_testing
+ahn-ml/gram-website
+ahn-ml/G-SWM
+ahn-ml/IPR
+ahn-ml/lddm
+ahn-ml/mctd
+ahn-ml/OCVT
+ahn-ml/openreview-crawl-citation-stats
+ahn-ml/SCALOR
+ahn-ml/SPACE
+ahn-ml/Understanding-LoRA-as-Knowledge-Memory
+ahn-ml/vqgan
diff --git a/scripts/ptrm_gram_watch/search_known.txt b/scripts/ptrm_gram_watch/search_known.txt
new file mode 100644
index 0000000..9ffba66
--- /dev/null
+++ b/scripts/ptrm_gram_watch/search_known.txt
@@ -0,0 +1,3 @@
+DeadByDawn101/GRAM-MLX
+edu-ide/wgram-lm
+ad3002/gram
diff --git a/scripts/queue_trm_sudoku.sh b/scripts/queue_trm_sudoku.sh
new file mode 100755
index 0000000..fa447ff
--- /dev/null
+++ b/scripts/queue_trm_sudoku.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Queue TRM Sudoku 1k 4-GPU training: wait for Step 3 to finish then launch.
+# Run via: nohup bash scripts/queue_trm_sudoku.sh > runs/trm_queue.log 2>&1 &
+set -e
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"  # repo-relative like the sibling scripts, so a clone in any location works
+LOG_DIR="$REPO_ROOT/runs"
+mkdir -p "$LOG_DIR"
+TRM_LOG="$LOG_DIR/trm_sudoku_$(date +%Y%m%d_%H%M%S).log"
+echo "[$(date)] queued, waiting for step3_train processes to exit..."
+
+while pgrep -f step3_train_with_rf >/dev/null; do
+  sleep 60
+done
+
+echo "[$(date)] Step 3 done — launching TRM Sudoku 1k 4-GPU"
+
+source "$(conda info --base)/etc/profile.d/conda.sh"
+conda activate rrm
+cd "$REPO_ROOT/trm"
+
+# Official TRM Sudoku 1k pretrain_mlp_t_sudoku command, scaled to 4-GPU torchrun
+WANDB_MODE=offline OMP_NUM_THREADS=8 \
+  torchrun --standalone --nproc-per-node 4 pretrain.py \
+  arch=trm \
+  data_paths="[$REPO_ROOT/data/sudoku-extreme-1k-aug-1000]" \
+  evaluators="[]" \
+  epochs=50000 eval_interval=5000 \
+  lr=1e-4 puzzle_emb_lr=1e-4 weight_decay=1.0 puzzle_emb_weight_decay=1.0 \
+  arch.mlp_t=True arch.pos_encodings=none \
+  arch.L_layers=2 \
+  arch.H_cycles=3 arch.L_cycles=6 \
+  +run_name=pretrain_mlp_t_sudoku ema=True \
+  > "$TRM_LOG" 2>&1
+
+echo "[$(date)] TRM training finished. log: $TRM_LOG"
diff --git a/scripts/run_hrm_sudoku.sh b/scripts/run_hrm_sudoku.sh
new file mode 100755
index 0000000..04d3a1c
--- /dev/null
+++ b/scripts/run_hrm_sudoku.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Launch HRM Sudoku 1k training (HRM's officially recommended configuration).
+# Single GPU: about 10h on an RTX 4070; an A6000 should be faster. Extra arguments are passed through to pretrain.py.
+set -euo pipefail
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+source "$(conda info --base)/etc/profile.d/conda.sh"
+conda activate rrm
+
+cd "$REPO_ROOT/hrm"
+OMP_NUM_THREADS=${OMP_NUM_THREADS:-8} \
+WANDB_MODE=${WANDB_MODE:-online} \
+python pretrain.py \
+  data_path="$REPO_ROOT/data/sudoku-extreme-1k-aug-1000" \
+  epochs=20000 eval_interval=2000 global_batch_size=384 \
+  lr=7e-5 puzzle_emb_lr=7e-5 weight_decay=1.0 puzzle_emb_weight_decay=1.0 "$@"
diff --git a/scripts/run_trm_sudoku.sh b/scripts/run_trm_sudoku.sh
new file mode 100755
index 0000000..ae6db21
--- /dev/null
+++ b/scripts/run_trm_sudoku.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Launch TRM Sudoku 1k training (TRM's official pretrain_mlp_t_sudoku configuration).
+# Single GPU: about 18h on an L40S 48GB; an A6000 should be similar. Extra arguments are passed through to pretrain.py.
+set -euo pipefail
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+source "$(conda info --base)/etc/profile.d/conda.sh"
+conda activate rrm
+
+cd "$REPO_ROOT/trm"
+OMP_NUM_THREADS=${OMP_NUM_THREADS:-8} \
+WANDB_MODE=${WANDB_MODE:-online} \
+python pretrain.py \
+  arch=trm \
+  data_paths="[$REPO_ROOT/data/sudoku-extreme-1k-aug-1000]" \
+  evaluators="[]" \
+  epochs=50000 eval_interval=5000 \
+  lr=1e-4 puzzle_emb_lr=1e-4 weight_decay=1.0 puzzle_emb_weight_decay=1.0 \
+  arch.mlp_t=True arch.pos_encodings=none \
+  arch.L_layers=2 \
+  arch.H_cycles=3 arch.L_cycles=6 \
+  +run_name=pretrain_mlp_t_sudoku ema=True "$@"
diff --git a/scripts/smoke_test.sh b/scripts/smoke_test.sh
new file mode 100755
index 0000000..248e9a2
--- /dev/null
+++ b/scripts/smoke_test.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Verify that both HRM and TRM can start their training loop on the shared dataset.
+# No full training: HRM runs for up to 90s and TRM for up to 120s; a run fails on a bad exit status or on error/traceback text in its log.
+set -euo pipefail
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+source "$(conda info --base)/etc/profile.d/conda.sh"
+conda activate rrm
+
+if [[ ! -d "$REPO_ROOT/data/sudoku-extreme-1k-aug-1000" ]]; then
+  echo "datasets not built; run scripts/build_datasets.sh sudoku first"; exit 1
+fi
+
+echo "=== HRM smoke ==="
+cd "$REPO_ROOT/hrm"
+( timeout 90 env WANDB_MODE=offline OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES=0 \
+    python pretrain.py \
+    data_path="$REPO_ROOT/data/sudoku-extreme-1k-aug-1000" \
+    epochs=1 eval_interval=1 global_batch_size=64 \
+    lr=7e-5 puzzle_emb_lr=7e-5 weight_decay=1.0 puzzle_emb_weight_decay=1.0 \
+    > /tmp/hrm_smoke.log 2>&1 ) && rc=0 || rc=$?  # keep the status: 124 means the timeout expired, i.e. training was still running
+if (( rc != 0 && rc != 124 )) || grep -iE "error|traceback" /tmp/hrm_smoke.log; then echo "HRM smoke FAILED (exit $rc)"; tail -30 /tmp/hrm_smoke.log; exit 1; else echo "HRM smoke OK"; fi
+
+echo "=== TRM smoke ==="
+cd "$REPO_ROOT/trm"
+( timeout 120 env WANDB_MODE=offline OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES=0 \
+    python pretrain.py \
+    arch=trm \
+    data_paths="[$REPO_ROOT/data/sudoku-extreme-1k-aug-1000]" \
+    evaluators="[]" \
+    epochs=1 eval_interval=1 global_batch_size=64 \
+    lr=1e-4 puzzle_emb_lr=1e-4 weight_decay=1.0 puzzle_emb_weight_decay=1.0 \
+    arch.mlp_t=True arch.pos_encodings=none \
+    arch.L_layers=2 arch.H_cycles=3 arch.L_cycles=6 \
+    +run_name=smoke_test ema=True \
+    > /tmp/trm_smoke.log 2>&1 ) && rc=0 || rc=$?  # keep the status: 124 means the timeout expired, i.e. training was still running
+if (( rc != 0 && rc != 124 )) || grep -iE "error|traceback" /tmp/trm_smoke.log; then echo "TRM smoke FAILED (exit $rc)"; tail -30 /tmp/trm_smoke.log; exit 1; else echo "TRM smoke OK"; fi
+
+echo "==> all smoke tests passed"
--
cgit v1.2.3