summaryrefslogtreecommitdiff
path: root/scripts/build_datasets.sh
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/build_datasets.sh')
-rwxr-xr-xscripts/build_datasets.sh86
1 files changed, 86 insertions, 0 deletions
diff --git a/scripts/build_datasets.sh b/scripts/build_datasets.sh
new file mode 100755
index 0000000..df22e84
--- /dev/null
+++ b/scripts/build_datasets.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# Build the datasets shared by HRM and TRM and store them under rrm/data/.
+# TRM's build scripts add an extra total_puzzles field to the metadata; HRM can
+# still load the result (Pydantic ignores extra fields).
+# Usage: bash scripts/build_datasets.sh [sudoku|maze|arc1|arc2|all]
+set -euo pipefail
+
+# Absolute paths derived from this script's location (scripts/ sits one level
+# below the repository root).
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+DATA_DIR="$REPO_ROOT/data"
+TRM_DIR="$REPO_ROOT/trm"
+HRM_DIR="$REPO_ROOT/hrm"
+
+# Fail early with a clear message instead of a confusing
+# "/etc/profile.d/conda.sh: No such file" error when conda is not installed.
+if ! command -v conda >/dev/null 2>&1; then
+  echo "error: conda not found on PATH" >&2
+  exit 1
+fi
+# Separate assignment so a failing `conda info` aborts the script under set -e.
+conda_base="$(conda info --base)"
+
+# conda's activation hooks can reference unset variables, which would abort
+# the script under `set -u`; relax nounset only while activating.
+set +u
+source "$conda_base/etc/profile.d/conda.sh"
+conda activate rrm
+set -u
+
+mkdir -p "$DATA_DIR"
+
+# Requested build target; defaults to building everything.
+target="${1:-all}"
+
+# Build the sudoku dataset into $DATA_DIR/sudoku-extreme-1k-aug-1000.
+# Skips the build when that directory already exists; otherwise changes into
+# $TRM_DIR and runs dataset/build_sudoku_dataset.py there.
+build_sudoku() {
+  local out_dir="$DATA_DIR/sudoku-extreme-1k-aug-1000"
+
+  # Guard clause: an existing output directory is treated as already built.
+  if [[ -d "$out_dir" ]]; then
+    echo "[skip] sudoku-extreme-1k-aug-1000 already exists"
+    return 0
+  fi
+
+  cd "$TRM_DIR"
+  python dataset/build_sudoku_dataset.py \
+    --output-dir "$out_dir" \
+    --subsample-size 1000 \
+    --num-aug 1000
+}
+
+# Build the maze dataset into $DATA_DIR/maze-30x30-hard-1k.
+# Skips the build when that directory already exists; otherwise changes into
+# $TRM_DIR and runs dataset/build_maze_dataset.py there.
+build_maze() {
+  local out_dir="$DATA_DIR/maze-30x30-hard-1k"
+
+  # Guard clause: an existing output directory is treated as already built.
+  if [[ -d "$out_dir" ]]; then
+    echo "[skip] maze-30x30-hard-1k already exists"
+    return 0
+  fi
+
+  cd "$TRM_DIR"
+  python dataset/build_maze_dataset.py --output-dir "$out_dir"
+}
+
+# The ARC builds need HRM's git submodules (ARC-AGI / ARC-AGI-2 / ConceptARC).
+# Changes into $HRM_DIR and runs `git submodule update --init --recursive`
+# when any raw-data checkout that this script reads from is missing or empty.
+# The working directory is left at $HRM_DIR on return.
+# NOTE(review): ConceptARC is named above but its path is not referenced
+# anywhere in this script, so it is not checked here -- confirm its location.
+prepare_arc_submodules() {
+  local name
+  local need_init=0
+
+  cd "$HRM_DIR"
+
+  # Check every raw-data directory the ARC builds read from, not only ARC-AGI,
+  # so a partially initialised checkout still triggers the submodule update.
+  for name in ARC-AGI ARC-AGI-2; do
+    # `ls -A` prints nothing for an empty directory and fails (silenced) for a
+    # missing one; both cases mean the submodule has not been fetched.
+    if [[ -z "$(ls -A "dataset/raw-data/$name" 2>/dev/null)" ]]; then
+      need_init=1
+    fi
+  done
+
+  if (( need_init )); then
+    git submodule update --init --recursive
+  fi
+}
+
+# Build the ARC-1 + concept dataset into $DATA_DIR/arc1concept-aug-1000.
+# Always calls prepare_arc_submodules first, then skips the build when the
+# output directory already exists; otherwise changes into $TRM_DIR and runs
+# the dataset.build_arc_dataset module there.
+build_arc1() {
+  local out_dir="$DATA_DIR/arc1concept-aug-1000"
+  # Command-line arguments for the builder, kept as an array so each value
+  # stays a single word.
+  local -a build_args=(
+    --input-file-prefix "$HRM_DIR/dataset/raw-data/ARC-AGI/data"
+    --output-dir "$out_dir"
+    --subsets training evaluation concept
+    --test-set-name evaluation
+  )
+
+  prepare_arc_submodules
+
+  if [[ -d "$out_dir" ]]; then
+    echo "[skip] arc1concept-aug-1000 already exists"
+    return 0
+  fi
+
+  cd "$TRM_DIR"
+  python -m dataset.build_arc_dataset "${build_args[@]}"
+}
+
+# Build the ARC-2 + concept dataset into $DATA_DIR/arc2concept-aug-1000.
+# Always calls prepare_arc_submodules first, then skips the build when the
+# output directory already exists; otherwise changes into $TRM_DIR and runs
+# the dataset.build_arc_dataset module there.
+build_arc2() {
+  local out_dir="$DATA_DIR/arc2concept-aug-1000"
+  # Command-line arguments for the builder, kept as an array so each value
+  # stays a single word.
+  local -a build_args=(
+    --input-file-prefix "$HRM_DIR/dataset/raw-data/ARC-AGI-2/data"
+    --output-dir "$out_dir"
+    --subsets training2 evaluation2 concept
+    --test-set-name evaluation2
+  )
+
+  prepare_arc_submodules
+
+  if [[ -d "$out_dir" ]]; then
+    echo "[skip] arc2concept-aug-1000 already exists"
+    return 0
+  fi
+
+  cd "$TRM_DIR"
+  python -m dataset.build_arc_dataset "${build_args[@]}"
+}
+
+# Dispatch on the requested target; "all" runs every builder in sequence.
+case "$target" in
+  sudoku)
+    build_sudoku
+    ;;
+  maze)
+    build_maze
+    ;;
+  arc1)
+    build_arc1
+    ;;
+  arc2)
+    build_arc2
+    ;;
+  all)
+    build_sudoku
+    build_maze
+    build_arc1
+    build_arc2
+    ;;
+  *)
+    # Diagnostics go to stderr so they never mix with regular output.
+    echo "unknown target: $target" >&2
+    echo "usage: bash scripts/build_datasets.sh [sudoku|maze|arc1|arc2|all]" >&2
+    exit 1
+    ;;
+esac
+
+# Summary: show what is now present in the data directory.
+echo "==> datasets ready under $DATA_DIR"
+ls -la "$DATA_DIR"