#!/usr/bin/env bash
#
# Build the datasets shared by HRM and TRM and store them under rrm/data/.
#
# TRM's build scripts add an extra `total_puzzles` field to the metadata; HRM
# can still load the result (Pydantic ignores extra fields).
#
# Usage: bash scripts/build_datasets.sh [sudoku|maze|arc1|arc2|all]
#
# Requires: conda with an environment named `rrm`; git (for the ARC targets).

set -euo pipefail

REPO_ROOT="$(cd -- "$(dirname -- "$0")/.." && pwd)"
readonly REPO_ROOT
readonly DATA_DIR="$REPO_ROOT/data"
readonly TRM_DIR="$REPO_ROOT/trm"
readonly HRM_DIR="$REPO_ROOT/hrm"

# Output directory of the build currently in flight; empty when idle.
# Read by cleanup_partial_output so a failed build never leaves a directory
# that a later run would mistake for a finished dataset.
PARTIAL_OUTPUT_DIR=""

# Print an error message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# EXIT handler: remove the output directory of a build that did not finish.
cleanup_partial_output() {
  if [[ -n "$PARTIAL_OUTPUT_DIR" && -d "$PARTIAL_OUTPUT_DIR" ]]; then
    echo "[cleanup] removing incomplete $PARTIAL_OUTPUT_DIR" >&2
    rm -rf -- "$PARTIAL_OUTPUT_DIR"
  fi
}

#######################################
# Run a dataset build command from inside $TRM_DIR unless its output
# directory already exists.
# Globals:   TRM_DIR (read), PARTIAL_OUTPUT_DIR (written)
# Arguments: $1   - absolute path of the dataset output directory
#            $2.. - the build command and its arguments
# Outputs:   a "[skip]" line on stdout when the directory already exists
# Returns:   0 on success; under `set -e` a failing build aborts the script
#######################################
run_dataset_build() {
  local out_dir=$1
  shift

  if [[ -d "$out_dir" ]]; then
    echo "[skip] ${out_dir##*/} already exists"
    return 0
  fi

  PARTIAL_OUTPUT_DIR="$out_dir"
  # Subshell keeps the working-directory change local to this build.
  (cd "$TRM_DIR" && "$@")
  PARTIAL_OUTPUT_DIR=""
}

# Build the Sudoku dataset (1000 subsampled puzzles, 1000 augmentations).
build_sudoku() {
  local out_dir="$DATA_DIR/sudoku-extreme-1k-aug-1000"
  run_dataset_build "$out_dir" \
    python dataset/build_sudoku_dataset.py \
    --output-dir "$out_dir" \
    --subsample-size 1000 --num-aug 1000
}

# Build the maze dataset.
build_maze() {
  local out_dir="$DATA_DIR/maze-30x30-hard-1k"
  run_dataset_build "$out_dir" \
    python dataset/build_maze_dataset.py --output-dir "$out_dir"
}

#######################################
# ARC needs HRM's git submodules (ARC-AGI / ARC-AGI-2 / ConceptARC).
# Initialise them when any raw-data checkout used by this script is missing
# or empty.
# Globals:   HRM_DIR (read)
# Returns:   0 on success; a failing git command aborts the script
#######################################
prepare_arc_submodules() {
  local raw_root="$HRM_DIR/dataset/raw-data"
  local submodule

  # Check every checkout the ARC builds read from, not just ARC-AGI, so that
  # a partially initialised repository is still repaired.
  # NOTE(review): ConceptARC's checkout path is not visible in this script,
  # so it is not probed here; `--recursive` initialises it alongside the rest.
  for submodule in ARC-AGI ARC-AGI-2; do
    # `ls -A` prints nothing for an empty directory and fails (silenced) for
    # a missing one; either way the result is an empty string.
    if [[ -z "$(ls -A "$raw_root/$submodule" 2>/dev/null)" ]]; then
      (cd "$HRM_DIR" && git submodule update --init --recursive)
      return 0
    fi
  done
}

# Build the ARC-AGI-1 + concept dataset; test set is `evaluation`.
build_arc1() {
  local out_dir="$DATA_DIR/arc1concept-aug-1000"
  prepare_arc_submodules
  run_dataset_build "$out_dir" \
    python -m dataset.build_arc_dataset \
    --input-file-prefix "$HRM_DIR/dataset/raw-data/ARC-AGI/data" \
    --output-dir "$out_dir" \
    --subsets training evaluation concept \
    --test-set-name evaluation
}

# Build the ARC-AGI-2 + concept dataset; test set is `evaluation2`.
build_arc2() {
  local out_dir="$DATA_DIR/arc2concept-aug-1000"
  prepare_arc_submodules
  run_dataset_build "$out_dir" \
    python -m dataset.build_arc_dataset \
    --input-file-prefix "$HRM_DIR/dataset/raw-data/ARC-AGI-2/data" \
    --output-dir "$out_dir" \
    --subsets training2 evaluation2 concept \
    --test-set-name evaluation2
}

target="${1:-all}"

# Reject a bad target before paying for conda activation.
case "$target" in
  sudoku | maze | arc1 | arc2 | all) ;;
  *)
    echo "unknown target: $target" >&2
    echo "usage: bash scripts/build_datasets.sh [sudoku|maze|arc1|arc2|all]" >&2
    exit 1
    ;;
esac

command -v conda >/dev/null 2>&1 ||
  die "conda not found on PATH; it is needed to activate the 'rrm' environment"

conda_base="$(conda info --base)"
# conda's shell hooks and per-package activate scripts may reference unset
# variables, so nounset is relaxed only while they run.
set +u
# shellcheck source=/dev/null
source "$conda_base/etc/profile.d/conda.sh"
conda activate rrm
set -u

mkdir -p "$DATA_DIR"

trap cleanup_partial_output EXIT
# Convert fatal signals into a normal exit so the EXIT handler always runs.
trap 'exit 130' INT
trap 'exit 143' TERM

case "$target" in
  sudoku) build_sudoku ;;
  maze) build_maze ;;
  arc1) build_arc1 ;;
  arc2) build_arc2 ;;
  all)
    build_sudoku
    build_maze
    build_arc1
    build_arc2
    ;;
esac

echo "==> datasets ready under $DATA_DIR"
ls -la "$DATA_DIR"