#!/usr/bin/env bash
#
# Provenance (from the patch this script was exported in):
#   commit:  66e0d8b9fd4d0f7a2231d689c055e26fdf1cf04a
#   author:  YurenHao0426, Sat, 13 Jun 2026 12:35:36 -0500
#   subject: rrm workspace: TRM/HRM/SRM code, Maze dataset,
#            dynamical-analysis pipeline
#   Curated export for clone-and-run Maze training (2x A6000) + diagnostics.
#   trm/hrm pretrain.py carry trajectory-augmentation code
#   (backward-compatible). Heavy artifacts (checkpoints/wandb/npz) gitignored;
#   see PROVENANCE.md.
#   Co-Authored-By: Claude Fable 5
#   New file scripts/build_datasets.sh, mode 100755 (rendered by cgit v1.2.3).
#
# Build the datasets shared by HRM and TRM and store them under rrm/data/.
# TRM's build scripts add an extra `total_puzzles` field to the metadata; HRM
# can still load it (Pydantic ignores extra fields).
#
# Usage: bash scripts/build_datasets.sh [sudoku|maze|arc1|arc2|all]
#        (default target: all)
set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
DATA_DIR="$REPO_ROOT/data"
TRM_DIR="$REPO_ROOT/trm"
HRM_DIR="$REPO_ROOT/hrm"
readonly REPO_ROOT DATA_DIR TRM_DIR HRM_DIR

# Output directory of the build currently in flight ("" when none). Used to
# discard half-written datasets so a later run does not skip them.
PENDING_OUT_DIR=""

# Print an error message to stderr and abort with status 1.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Print the usage line to stderr.
usage() {
  printf 'usage: bash scripts/build_datasets.sh [sudoku|maze|arc1|arc2|all]\n' >&2
}

# Remove the output directory of an unfinished build, if there is one.
# Installed as the EXIT trap and also called directly when a build fails.
# Only directories this run created are ever removed: PENDING_OUT_DIR is set
# solely after the "does not exist yet" check in build_if_missing.
discard_partial_output() {
  if [[ -n "$PENDING_OUT_DIR" && -d "$PENDING_OUT_DIR" ]]; then
    printf 'warning: removing partial output %s\n' "$PENDING_OUT_DIR" >&2
    rm -rf -- "${PENDING_OUT_DIR:?}"
  fi
  PENDING_OUT_DIR=""
}
trap discard_partial_output EXIT

# Activate the `rrm` conda environment in this shell.
activate_env() {
  command -v conda >/dev/null 2>&1 || die "conda not found on PATH"
  local conda_base
  conda_base="$(conda info --base)" || die "'conda info --base' failed"
  # conda's shell hook and packages' activation scripts may read unset
  # variables, which would abort the script under nounset; relax it only
  # for the activation itself.
  set +u
  # shellcheck disable=SC1091
  source "$conda_base/etc/profile.d/conda.sh"
  conda activate rrm
  set -u
}

# build_if_missing NAME CMD [ARG...]
# Run CMD from inside $TRM_DIR unless $DATA_DIR/NAME already exists.
# Returns CMD's exit status; on failure any partially written
# $DATA_DIR/NAME is removed so that a re-run rebuilds it instead of
# printing "[skip]".
build_if_missing() {
  local name=$1
  shift
  local out_dir="$DATA_DIR/$name"

  if [[ -d "$out_dir" ]]; then
    echo "[skip] $name already exists"
    return 0
  fi

  PENDING_OUT_DIR="$out_dir"
  local rc=0
  # Subshell keeps the `cd` from leaking into the rest of the script.
  (cd "$TRM_DIR" && "$@") || rc=$?
  if ((rc != 0)); then
    discard_partial_output
    return "$rc"
  fi
  PENDING_OUT_DIR=""
}

build_sudoku() {
  build_if_missing "sudoku-extreme-1k-aug-1000" \
    python dataset/build_sudoku_dataset.py \
    --output-dir "$DATA_DIR/sudoku-extreme-1k-aug-1000" \
    --subsample-size 1000 --num-aug 1000
}

build_maze() {
  build_if_missing "maze-30x30-hard-1k" \
    python dataset/build_maze_dataset.py \
    --output-dir "$DATA_DIR/maze-30x30-hard-1k"
}

# prepare_arc_submodules [NAME...]
# ARC needs HRM's git submodules (ARC-AGI / ARC-AGI-2 / ConceptARC).
# For each NAME (default: ARC-AGI), check that
# $HRM_DIR/dataset/raw-data/NAME is a non-empty directory; if any is missing
# or empty, initialise all of HRM's submodules.
# NOTE(review): ConceptARC's checkout path is not visible in this script, so
# it is not checked here -- confirm whether the `concept` subset needs it.
prepare_arc_submodules() {
  local -a required=("$@")
  if ((${#required[@]} == 0)); then
    required=(ARC-AGI)
  fi

  local name
  for name in "${required[@]}"; do
    # `ls -A` prints nothing for an empty directory and fails (silenced) for
    # a missing one; both cases mean the submodule is not checked out.
    if [[ -z "$(ls -A "$HRM_DIR/dataset/raw-data/$name" 2>/dev/null)" ]]; then
      git -C "$HRM_DIR" submodule update --init --recursive
      return 0
    fi
  done
}

build_arc1() {
  prepare_arc_submodules ARC-AGI
  build_if_missing "arc1concept-aug-1000" \
    python -m dataset.build_arc_dataset \
    --input-file-prefix "$HRM_DIR/dataset/raw-data/ARC-AGI/data" \
    --output-dir "$DATA_DIR/arc1concept-aug-1000" \
    --subsets training evaluation concept \
    --test-set-name evaluation
}

build_arc2() {
  prepare_arc_submodules ARC-AGI-2
  build_if_missing "arc2concept-aug-1000" \
    python -m dataset.build_arc_dataset \
    --input-file-prefix "$HRM_DIR/dataset/raw-data/ARC-AGI-2/data" \
    --output-dir "$DATA_DIR/arc2concept-aug-1000" \
    --subsets training2 evaluation2 concept \
    --test-set-name evaluation2
}

# Entry point: validate the target, set up the environment, run the builds.
main() {
  local target="${1:-all}"

  # Reject bad targets before paying for conda activation or touching disk.
  case "$target" in
    sudoku | maze | arc1 | arc2 | all) ;;
    *)
      echo "unknown target: $target" >&2
      usage
      exit 1
      ;;
  esac

  activate_env
  mkdir -p "$DATA_DIR"

  case "$target" in
    sudoku) build_sudoku ;;
    maze) build_maze ;;
    arc1) build_arc1 ;;
    arc2) build_arc2 ;;
    all)
      build_sudoku
      build_maze
      build_arc1
      build_arc2
      ;;
  esac

  echo "==> datasets ready under $DATA_DIR"
  ls -la "$DATA_DIR"
}

main "$@"