summaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/build_datasets.sh86
-rwxr-xr-xscripts/check_ptrm_gram.sh62
-rw-r--r--scripts/ptrm_gram_watch/SamsungSAILMontreal_repos.txt23
-rw-r--r--scripts/ptrm_gram_watch/ahn-ml_repos.txt16
-rw-r--r--scripts/ptrm_gram_watch/search_known.txt3
-rwxr-xr-xscripts/queue_trm_sudoku.sh35
-rwxr-xr-xscripts/run_hrm_sudoku.sh15
-rwxr-xr-xscripts/run_trm_sudoku.sh21
-rwxr-xr-xscripts/smoke_test.sh38
9 files changed, 299 insertions, 0 deletions
diff --git a/scripts/build_datasets.sh b/scripts/build_datasets.sh
new file mode 100755
index 0000000..df22e84
--- /dev/null
+++ b/scripts/build_datasets.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# Build the datasets shared by HRM and TRM and store them under rrm/data/.
+# TRM's build scripts add an extra total_puzzles field to the metadata; HRM can still load it (Pydantic ignores extra fields).
+# Usage: bash scripts/build_datasets.sh [sudoku|maze|arc1|arc2|all]
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"  # parent of the directory that holds this script
+DATA_DIR="$REPO_ROOT/data"  # output root passed to every dataset builder below
+TRM_DIR="$REPO_ROOT/trm"
+HRM_DIR="$REPO_ROOT/hrm"
+
+source "$(conda info --base)/etc/profile.d/conda.sh"  # load conda's shell setup before activating the env
+conda activate rrm
+
+mkdir -p "$DATA_DIR"
+
+target="${1:-all}"  # first CLI argument; falls back to "all" when omitted
+
+build_sudoku() {  # Build the Sudoku dataset with TRM's builder, or print a skip notice when its directory exists.
+  local out_dir="$DATA_DIR/sudoku-extreme-1k-aug-1000"
+  if [[ -d "$out_dir" ]]; then
+    echo "[skip] sudoku-extreme-1k-aug-1000 already exists"
+    return 0
+  fi
+  cd "$TRM_DIR"
+  python dataset/build_sudoku_dataset.py --output-dir "$out_dir" \
+    --subsample-size 1000 --num-aug 1000
+}
+
+build_maze() {  # Build the maze dataset with TRM's builder, or print a skip notice when its directory exists.
+  if [[ -d "$DATA_DIR/maze-30x30-hard-1k" ]]; then
+    echo "[skip] maze-30x30-hard-1k already exists"
+    return 0
+  fi
+  cd "$TRM_DIR"
+  python dataset/build_maze_dataset.py --output-dir "$DATA_DIR/maze-30x30-hard-1k"
+}
+
+# ARC builds need HRM's git submodules (ARC-AGI / ARC-AGI-2 / ConceptARC).
+# Both raw-data checkouts read by build_arc1/build_arc2 are tested: looking at ARC-AGI alone would skip the fetch while ARC-AGI-2 is still empty.
+prepare_arc_submodules() {
+  cd "$HRM_DIR"
+  if [[ -z "$(ls -A dataset/raw-data/ARC-AGI 2>/dev/null)" || -z "$(ls -A dataset/raw-data/ARC-AGI-2 2>/dev/null)" ]]; then
+    git submodule update --init --recursive
+  fi
+}
+
+build_arc1() {  # Build arc1concept-aug-1000 from HRM's ARC-AGI raw data, or print a skip notice when it exists.
+  prepare_arc_submodules
+  local out_dir="$DATA_DIR/arc1concept-aug-1000"
+  if [[ -d "$out_dir" ]]; then
+    echo "[skip] arc1concept-aug-1000 already exists"
+    return 0
+  fi
+  cd "$TRM_DIR"
+  local -a cli=(--input-file-prefix "$HRM_DIR/dataset/raw-data/ARC-AGI/data"
+    --output-dir "$out_dir"
+    --subsets training evaluation concept --test-set-name evaluation)
+  python -m dataset.build_arc_dataset "${cli[@]}"
+}
+
+build_arc2() {  # Build arc2concept-aug-1000 from HRM's ARC-AGI-2 raw data, or print a skip notice when it exists.
+  prepare_arc_submodules
+  local out_dir="$DATA_DIR/arc2concept-aug-1000"
+  if [[ -d "$out_dir" ]]; then
+    echo "[skip] arc2concept-aug-1000 already exists"
+    return 0
+  fi
+  cd "$TRM_DIR"
+  local -a cli=(--input-file-prefix "$HRM_DIR/dataset/raw-data/ARC-AGI-2/data"
+    --output-dir "$out_dir"
+    --subsets training2 evaluation2 concept --test-set-name evaluation2)
+  python -m dataset.build_arc_dataset "${cli[@]}"
+}
+
+case "$target" in  # Route the requested target to its build_* function; anything else aborts with status 1.
+  sudoku | maze | arc1 | arc2) "build_$target" ;;
+  all)
+    for name in sudoku maze arc1 arc2; do "build_$name"; done ;;
+  *)
+    echo "unknown target: $target"
+    exit 1 ;;
+esac
+
+echo "==> datasets ready under $DATA_DIR"
+ls -la "$DATA_DIR"
diff --git a/scripts/check_ptrm_gram.sh b/scripts/check_ptrm_gram.sh
new file mode 100755
index 0000000..26f5e3e
--- /dev/null
+++ b/scripts/check_ptrm_gram.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+# Weekly check: have PTRM / GRAM official implementations been released?
+# - diffs repo lists of SamsungSAILMontreal (PTRM authors) and ahn-ml (GRAM authors)
+# - GitHub global search for ptrm / gram-reasoning style repos
+# - GRAM project page: does "coming soon" still gate the code link?
+# On any hit: writes ALERT_ptrm_gram.txt next to the log. Designed for cron (no env deps).
+set -o pipefail  # only pipefail is enabled; failed fetches are handled explicitly below
+
+DIR="/home/yurenh2/rrm/scripts/ptrm_gram_watch"  # state directory: baselines, log and alert file
+mkdir -p "$DIR"
+LOG="$DIR/check.log"
+TS="$(date '+%Y-%m-%d %H:%M:%S')"  # one timestamp shared by every line written in this run
+note() { echo "[$TS] $*" >> "$LOG"; }  # append one timestamped line to the log
+alert() { echo "[$TS] ALERT: $*" >> "$DIR/ALERT_ptrm_gram.txt"; note "ALERT: $*"; }  # same line goes to the alert file, then the log
+
+fetch_repos() {  # $1 = GitHub org; prints the "full_name" values from its repo listing, sorted, one per line
+  local api_url="https://api.github.com/orgs/$1/repos?per_page=100"
+  curl -sL --max-time 60 "$api_url" | grep -o '"full_name": *"[^"]*"' | cut -d'"' -f4 | sort
+}
+
+for org in SamsungSAILMontreal ahn-ml; do  # compare each org's current repo list with the stored baseline
+  cur="$DIR/${org}_repos.txt"
+  new="$DIR/${org}_repos.new"
+  fetch_repos "$org" > "$new"
+  if [[ ! -s "$new" ]]; then note "$org: fetch failed/empty, skipping diff"; rm -f "$new"; continue; fi
+  if [[ -f "$cur" ]]; then  # comm needs both inputs in one collation; the baseline may have been sorted under another locale, so re-sort both bytewise
+    added="$(LC_ALL=C comm -13 <(LC_ALL=C sort "$cur") <(LC_ALL=C sort "$new"))"
+    if [[ -n "$added" ]]; then alert "$org new repos: $(echo "$added" | tr '\n' ' ')"; fi
+  fi
+  mv "$new" "$cur"
+  note "$org: $(wc -l < "$cur") repos"
+done
+
+# global search (unauthenticated; weekly cadence well under rate limits)
+known="$DIR/search_known.txt"
+for q in "PTRM+recursive+reasoning" "probabilistic+tiny+recursive" "GRAM+recursive+reasoning+model"; do
+  touch "$known"
+  search_url="https://api.github.com/search/repositories?q=${q}&sort=updated&per_page=5"
+  hits="$(curl -sL --max-time 60 "$search_url" | grep -o '"full_name": *"[^"]*"' | cut -d'"' -f4)"
+  while IFS= read -r repo; do
+    # Blank lines and names already recorded are skipped; anything else is recorded and alerted once.
+    if [[ -z "$repo" ]] || grep -qxF "$repo" "$known"; then continue; fi
+    echo "$repo" >> "$known"
+    alert "new search hit ($q): $repo"
+  done <<< "$hits"
+done
+
+# GRAM project page: is the code still gated by "coming soon"? Logs a note or raises alerts accordingly.
+page="$(curl -sL --max-time 60 "https://ahn-ml.github.io/gram-website/" || true)"
+if [[ -n "$page" ]]; then
+  if grep -qi "coming soon" <<< "$page"; then  # here-string: under pipefail, `echo | grep -q` can fail via SIGPIPE and raise a false alert
+    note "GRAM page: still 'coming soon'"
+  else
+    alert "GRAM page no longer says 'coming soon' — check for code link"
+  fi
+  link="$(grep -oiE 'href="[^"]*github\.com[^"]*"' <<< "$page" | grep -vi "ahn-ml.github.io\|Academic-project-page-template" | head -3)"
+  if [[ -n "$link" ]]; then alert "GRAM page github link(s): $link"; fi
+else
+  note "GRAM page fetch failed"
+fi
+
+note "check complete"
diff --git a/scripts/ptrm_gram_watch/SamsungSAILMontreal_repos.txt b/scripts/ptrm_gram_watch/SamsungSAILMontreal_repos.txt
new file mode 100644
index 0000000..7db0678
--- /dev/null
+++ b/scripts/ptrm_gram_watch/SamsungSAILMontreal_repos.txt
@@ -0,0 +1,23 @@
+SamsungSAILMontreal/AnyMolGenCritic
+SamsungSAILMontreal/AVR-Eval-Agent
+SamsungSAILMontreal/ByteCraft
+SamsungSAILMontreal/cont-diffsubmin
+SamsungSAILMontreal/difference-submodular-min
+SamsungSAILMontreal/fair-matroid-submodular-max
+SamsungSAILMontreal/ForestDiffusion
+SamsungSAILMontreal/GGM-metrics
+SamsungSAILMontreal/ghn3
+SamsungSAILMontreal/GuidedQuant
+SamsungSAILMontreal/hyper-representation
+SamsungSAILMontreal/l2o_pytorch
+SamsungSAILMontreal/layer-merge
+SamsungSAILMontreal/LoGAH
+SamsungSAILMontreal/molecular-orbitals
+SamsungSAILMontreal/mulo
+SamsungSAILMontreal/multiset-equivariance
+SamsungSAILMontreal/nino
+SamsungSAILMontreal/PAPA
+SamsungSAILMontreal/ream
+SamsungSAILMontreal/STGG-AL
+SamsungSAILMontreal/subpruning
+SamsungSAILMontreal/TinyRecursiveModels
diff --git a/scripts/ptrm_gram_watch/ahn-ml_repos.txt b/scripts/ptrm_gram_watch/ahn-ml_repos.txt
new file mode 100644
index 0000000..1a31659
--- /dev/null
+++ b/scripts/ptrm_gram_watch/ahn-ml_repos.txt
@@ -0,0 +1,16 @@
+ahn-ml/crafter
+ahn-ml/ctm
+ahn-ml/dreamweaver-release
+ahn-ml/drstrategy
+ahn-ml/gpu_testing
+ahn-ml/gram-website
+ahn-ml/G-SWM
+ahn-ml/IPR
+ahn-ml/lddm
+ahn-ml/mctd
+ahn-ml/OCVT
+ahn-ml/openreview-crawl-citation-stats
+ahn-ml/SCALOR
+ahn-ml/SPACE
+ahn-ml/Understanding-LoRA-as-Knowledge-Memory
+ahn-ml/vqgan
diff --git a/scripts/ptrm_gram_watch/search_known.txt b/scripts/ptrm_gram_watch/search_known.txt
new file mode 100644
index 0000000..9ffba66
--- /dev/null
+++ b/scripts/ptrm_gram_watch/search_known.txt
@@ -0,0 +1,3 @@
+DeadByDawn101/GRAM-MLX
+edu-ide/wgram-lm
+ad3002/gram
diff --git a/scripts/queue_trm_sudoku.sh b/scripts/queue_trm_sudoku.sh
new file mode 100755
index 0000000..fa447ff
--- /dev/null
+++ b/scripts/queue_trm_sudoku.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Queue TRM Sudoku 1k 4-GPU training: wait for Step 3 to finish then launch.
+# Run via: nohup bash scripts/queue_trm_sudoku.sh > runs/trm_queue.log 2>&1 &
+set -e
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"  # repo root from this script's location, so nothing depends on one user's home directory
+LOG_DIR="$REPO_ROOT/runs"
+mkdir -p "$LOG_DIR"
+TRM_LOG="$LOG_DIR/trm_sudoku_$(date +%Y%m%d_%H%M%S).log"
+echo "[$(date)] queued, waiting for step3_train processes to exit..."
+
+while pgrep -f step3_train_with_rf >/dev/null; do  # poll once a minute until no command line matches
+  sleep 60
+done
+
+echo "[$(date)] Step 3 done — launching TRM Sudoku 1k 4-GPU"
+
+source "$(conda info --base)/etc/profile.d/conda.sh"
+conda activate rrm
+
+cd "$REPO_ROOT/trm"
+
+# Official TRM Sudoku 1k pretrain_mlp_t_sudoku command, scaled to 4-GPU torchrun
+WANDB_MODE=offline OMP_NUM_THREADS=8 \
+  torchrun --standalone --nproc-per-node 4 pretrain.py \
+  arch=trm \
+  data_paths="[$REPO_ROOT/data/sudoku-extreme-1k-aug-1000]" \
+  evaluators="[]" \
+  epochs=50000 eval_interval=5000 \
+  lr=1e-4 puzzle_emb_lr=1e-4 weight_decay=1.0 puzzle_emb_weight_decay=1.0 \
+  arch.mlp_t=True arch.pos_encodings=none \
+  arch.L_layers=2 arch.H_cycles=3 arch.L_cycles=6 \
+  +run_name=pretrain_mlp_t_sudoku ema=True \
+  > "$TRM_LOG" 2>&1
+
+echo "[$(date)] TRM training finished. log: $TRM_LOG"
diff --git a/scripts/run_hrm_sudoku.sh b/scripts/run_hrm_sudoku.sh
new file mode 100755
index 0000000..04d3a1c
--- /dev/null
+++ b/scripts/run_hrm_sudoku.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Launch HRM Sudoku 1k training (HRM's officially recommended configuration).
+# Single GPU: about 10h on an RTX 4070; an A6000 should be faster.
+set -euo pipefail
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+source "$(conda info --base)/etc/profile.d/conda.sh"
+conda activate rrm
+
+cd "$REPO_ROOT/hrm"
+# Thread count and W&B mode keep any value the caller already set; extra CLI arguments are appended last.
+export OMP_NUM_THREADS="${OMP_NUM_THREADS:-8}" WANDB_MODE="${WANDB_MODE:-online}"
+overrides=(data_path="$REPO_ROOT/data/sudoku-extreme-1k-aug-1000"
+  epochs=20000 eval_interval=2000 global_batch_size=384
+  lr=7e-5 puzzle_emb_lr=7e-5 weight_decay=1.0 puzzle_emb_weight_decay=1.0)
+python pretrain.py "${overrides[@]}" "$@"
diff --git a/scripts/run_trm_sudoku.sh b/scripts/run_trm_sudoku.sh
new file mode 100755
index 0000000..ae6db21
--- /dev/null
+++ b/scripts/run_trm_sudoku.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Launch TRM Sudoku 1k training (TRM's official pretrain_mlp_t_sudoku configuration).
+# Single GPU: about 18h on an L40S 48GB; an A6000 should be close.
+set -euo pipefail
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+source "$(conda info --base)/etc/profile.d/conda.sh"
+conda activate rrm
+
+cd "$REPO_ROOT/trm"
+export OMP_NUM_THREADS="${OMP_NUM_THREADS:-8}" WANDB_MODE="${WANDB_MODE:-online}"  # caller-set values win over the defaults
+overrides=(
+  arch=trm
+  data_paths="[$REPO_ROOT/data/sudoku-extreme-1k-aug-1000]"
+  evaluators="[]"
+  epochs=50000 eval_interval=5000
+  lr=1e-4 puzzle_emb_lr=1e-4 weight_decay=1.0 puzzle_emb_weight_decay=1.0
+  arch.mlp_t=True arch.pos_encodings=none
+  arch.L_layers=2 arch.H_cycles=3 arch.L_cycles=6
+  +run_name=pretrain_mlp_t_sudoku ema=True
+)
+python pretrain.py "${overrides[@]}" "$@"
diff --git a/scripts/smoke_test.sh b/scripts/smoke_test.sh
new file mode 100755
index 0000000..248e9a2
--- /dev/null
+++ b/scripts/smoke_test.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Verify that both HRM and TRM can start a training loop on the shared dataset.
+# No full training: each side runs under a time limit (90s HRM, 120s TRM) and is checked for crashes.
+set -euo pipefail
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+source "$(conda info --base)/etc/profile.d/conda.sh"
+conda activate rrm
+
+if [[ ! -d "$REPO_ROOT/data/sudoku-extreme-1k-aug-1000" ]]; then
+  echo "datasets not built; run scripts/build_datasets.sh sudoku first"; exit 1
+fi
+
+echo "=== HRM smoke ==="
+cd "$REPO_ROOT/hrm"
+timeout 90 env WANDB_MODE=offline OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES=0 \
+  python pretrain.py \
+  data_path="$REPO_ROOT/data/sudoku-extreme-1k-aug-1000" \
+  epochs=1 eval_interval=1 global_batch_size=64 \
+  lr=7e-5 puzzle_emb_lr=7e-5 weight_decay=1.0 puzzle_emb_weight_decay=1.0 \
+  > /tmp/hrm_smoke.log 2>&1 && rc=0 || rc=$?  # keep the exit status; 124 only means the time limit was reached
+if [[ "$rc" != 0 && "$rc" != 124 ]] || grep -iE "error|traceback" /tmp/hrm_smoke.log; then echo "HRM smoke FAILED (exit $rc)"; tail -30 /tmp/hrm_smoke.log; exit 1; else echo "HRM smoke OK"; fi
+
+echo "=== TRM smoke ==="
+cd "$REPO_ROOT/trm"
+timeout 120 env WANDB_MODE=offline OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES=0 \
+  python pretrain.py \
+  arch=trm \
+  data_paths="[$REPO_ROOT/data/sudoku-extreme-1k-aug-1000]" \
+  evaluators="[]" \
+  epochs=1 eval_interval=1 global_batch_size=64 \
+  lr=1e-4 puzzle_emb_lr=1e-4 weight_decay=1.0 puzzle_emb_weight_decay=1.0 \
+  arch.mlp_t=True arch.pos_encodings=none \
+  arch.L_layers=2 arch.H_cycles=3 arch.L_cycles=6 \
+  +run_name=smoke_test ema=True \
+  > /tmp/trm_smoke.log 2>&1 && rc=0 || rc=$?  # a crash that prints no "error"/"traceback" still shows up in rc
+if [[ "$rc" != 0 && "$rc" != 124 ]] || grep -iE "error|traceback" /tmp/trm_smoke.log; then echo "TRM smoke FAILED (exit $rc)"; tail -30 /tmp/trm_smoke.log; exit 1; else echo "TRM smoke OK"; fi
+
+echo "==> all smoke tests passed"