blob: df22e8403f0aa75c0ad659ec0ac9f0c3b0da3734 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
|
#!/usr/bin/env bash
# Build the datasets shared by HRM and TRM and store them under rrm/data/.
# TRM's build scripts add an extra `total_puzzles` field to the metadata; HRM
# can still load it (Pydantic ignores extra fields).
# Usage: bash scripts/build_datasets.sh [sudoku|maze|arc1|arc2|all]
set -euo pipefail

# The repository root is the parent of the directory containing this script.
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
DATA_DIR="$REPO_ROOT/data"
TRM_DIR="$REPO_ROOT/trm"
HRM_DIR="$REPO_ROOT/hrm"

if ! command -v conda >/dev/null 2>&1; then
  echo "error: conda not found on PATH; it is needed to activate the 'rrm' environment" >&2
  exit 1
fi

# Resolve the conda base in its own assignment so that a failing `conda info`
# aborts the script. Inside `source "$(...)"` the substitution's exit status is
# discarded and "/etc/profile.d/conda.sh" would be sourced instead.
conda_base="$(conda info --base)"

# conda's shell hook and per-package activation scripts may reference unset
# variables, so nounset is relaxed only while they run.
set +u
# shellcheck source=/dev/null
source "$conda_base/etc/profile.d/conda.sh"
conda activate rrm
set -u

mkdir -p "$DATA_DIR"

# Dataset to build; defaults to every dataset when no argument is given.
target="${1:-all}"
#######################################
# Build the Sudoku dataset into $DATA_DIR/sudoku-extreme-1k-aug-1000 by running
# dataset/build_sudoku_dataset.py from $TRM_DIR with --subsample-size 1000 and
# --num-aug 1000. Does nothing if the output directory already exists.
# Globals:  DATA_DIR (read), TRM_DIR (read)
# Outputs:  "[skip] ..." on stdout when skipping; an error line on stderr on failure
# Returns:  0 on success or skip, otherwise the builder's exit status
#######################################
build_sudoku() {
  local out_dir="$DATA_DIR/sudoku-extreme-1k-aug-1000"
  local status=0

  if [[ -d "$out_dir" ]]; then
    echo "[skip] sudoku-extreme-1k-aug-1000 already exists"
    return 0
  fi

  # Run in a subshell so the `cd` does not leak into the caller's working
  # directory.
  (
    cd "$TRM_DIR" &&
      python dataset/build_sudoku_dataset.py \
        --output-dir "$out_dir" \
        --subsample-size 1000 --num-aug 1000
  ) || status=$?

  if ((status != 0)); then
    # The directory's existence is the "already built" marker, so a partially
    # written one must not survive a failed build or later runs would skip it.
    # It did not exist on entry, so everything in it came from this run.
    rm -rf -- "${out_dir:?}"
    echo "error: sudoku dataset build failed (exit $status); removed partial output $out_dir" >&2
    return "$status"
  fi
}
#######################################
# Build the maze dataset into $DATA_DIR/maze-30x30-hard-1k by running
# dataset/build_maze_dataset.py from $TRM_DIR. Does nothing if the output
# directory already exists.
# Globals:  DATA_DIR (read), TRM_DIR (read)
# Outputs:  "[skip] ..." on stdout when skipping; an error line on stderr on failure
# Returns:  0 on success or skip, otherwise the builder's exit status
#######################################
build_maze() {
  local out_dir="$DATA_DIR/maze-30x30-hard-1k"
  local status=0

  if [[ -d "$out_dir" ]]; then
    echo "[skip] maze-30x30-hard-1k already exists"
    return 0
  fi

  # Run in a subshell so the `cd` does not leak into the caller's working
  # directory.
  (
    cd "$TRM_DIR" &&
      python dataset/build_maze_dataset.py --output-dir "$out_dir"
  ) || status=$?

  if ((status != 0)); then
    # The directory's existence is the "already built" marker, so a partially
    # written one must not survive a failed build or later runs would skip it.
    # It did not exist on entry, so everything in it came from this run.
    rm -rf -- "${out_dir:?}"
    echo "error: maze dataset build failed (exit $status); removed partial output $out_dir" >&2
    return "$status"
  fi
}
#######################################
# The ARC datasets need HRM's git submodules (ARC-AGI / ARC-AGI-2 / ConceptARC).
# Runs `git submodule update --init --recursive` in $HRM_DIR when a raw-data
# directory used by the ARC builds is missing or empty.
# Globals:  HRM_DIR (read)
# Returns:  0 if nothing needed doing or the update succeeded, otherwise the
#           exit status of the failed `cd`/git command
#######################################
prepare_arc_submodules() {
  local raw_root="$HRM_DIR/dataset/raw-data"
  local name
  local needs_init=0

  # Check every raw-data directory the ARC builds in this script read from, not
  # just ARC-AGI: a checkout with only ARC-AGI populated would otherwise leave
  # the ARC-AGI-2 build without input.
  # NOTE(review): ConceptARC is not checked because its path is not visible in
  # this script; add it here once confirmed against hrm/.gitmodules.
  for name in ARC-AGI ARC-AGI-2; do
    # `ls -A` prints nothing for an empty directory and fails (output
    # discarded) for a missing one, so both cases yield an empty string.
    if [[ -z "$(ls -A -- "$raw_root/$name" 2>/dev/null)" ]]; then
      needs_init=1
      break
    fi
  done

  if ((needs_init)); then
    # Subshell keeps the `cd` from leaking into the caller's working directory.
    (cd "$HRM_DIR" && git submodule update --init --recursive)
  fi
}
#######################################
# Build the ARC-1 dataset into $DATA_DIR/arc1concept-aug-1000 by running
# `python -m dataset.build_arc_dataset` from $TRM_DIR against the raw data
# under $HRM_DIR/dataset/raw-data/ARC-AGI/data (subsets: training, evaluation,
# concept; test set: evaluation). Does nothing if the output directory already
# exists.
# Globals:  DATA_DIR (read), TRM_DIR (read), HRM_DIR (read)
# Outputs:  "[skip] ..." on stdout when skipping; an error line on stderr on failure
# Returns:  0 on success or skip, otherwise the failing step's exit status
#######################################
build_arc1() {
  local out_dir="$DATA_DIR/arc1concept-aug-1000"
  local status=0

  if [[ -d "$out_dir" ]]; then
    echo "[skip] arc1concept-aug-1000 already exists"
    return 0
  fi

  # Raw data is only needed when actually building, so submodules are prepared
  # after the skip check; a re-run over existing datasets then never touches git.
  # Called as a plain command so the script's errexit stays active inside the
  # callee. The explicit check also covers callers running without `set -e`.
  prepare_arc_submodules
  status=$?
  ((status == 0)) || return "$status"

  # Run in a subshell so the `cd` does not leak into the caller's working
  # directory.
  (
    cd "$TRM_DIR" &&
      python -m dataset.build_arc_dataset \
        --input-file-prefix "$HRM_DIR/dataset/raw-data/ARC-AGI/data" \
        --output-dir "$out_dir" \
        --subsets training evaluation concept \
        --test-set-name evaluation
  ) || status=$?

  if ((status != 0)); then
    # The directory's existence is the "already built" marker, so a partially
    # written one must not survive a failed build or later runs would skip it.
    # It did not exist on entry, so everything in it came from this run.
    rm -rf -- "${out_dir:?}"
    echo "error: arc1 dataset build failed (exit $status); removed partial output $out_dir" >&2
    return "$status"
  fi
}
#######################################
# Build the ARC-2 dataset into $DATA_DIR/arc2concept-aug-1000 by running
# `python -m dataset.build_arc_dataset` from $TRM_DIR against the raw data
# under $HRM_DIR/dataset/raw-data/ARC-AGI-2/data (subsets: training2,
# evaluation2, concept; test set: evaluation2). Does nothing if the output
# directory already exists.
# Globals:  DATA_DIR (read), TRM_DIR (read), HRM_DIR (read)
# Outputs:  "[skip] ..." on stdout when skipping; an error line on stderr on failure
# Returns:  0 on success or skip, otherwise the failing step's exit status
#######################################
build_arc2() {
  local out_dir="$DATA_DIR/arc2concept-aug-1000"
  local status=0

  if [[ -d "$out_dir" ]]; then
    echo "[skip] arc2concept-aug-1000 already exists"
    return 0
  fi

  # Raw data is only needed when actually building, so submodules are prepared
  # after the skip check; a re-run over existing datasets then never touches git.
  # Called as a plain command so the script's errexit stays active inside the
  # callee. The explicit check also covers callers running without `set -e`.
  prepare_arc_submodules
  status=$?
  ((status == 0)) || return "$status"

  # Run in a subshell so the `cd` does not leak into the caller's working
  # directory.
  (
    cd "$TRM_DIR" &&
      python -m dataset.build_arc_dataset \
        --input-file-prefix "$HRM_DIR/dataset/raw-data/ARC-AGI-2/data" \
        --output-dir "$out_dir" \
        --subsets training2 evaluation2 concept \
        --test-set-name evaluation2
  ) || status=$?

  if ((status != 0)); then
    # The directory's existence is the "already built" marker, so a partially
    # written one must not survive a failed build or later runs would skip it.
    # It did not exist on entry, so everything in it came from this run.
    rm -rf -- "${out_dir:?}"
    echo "error: arc2 dataset build failed (exit $status); removed partial output $out_dir" >&2
    return "$status"
  fi
}
# Run the builder(s) selected by $target. Builders are invoked as plain
# commands so that, under `set -e`, a failing build aborts the script with that
# build's exit status.
case "$target" in
  sudoku)
    build_sudoku
    ;;
  maze)
    build_maze
    ;;
  arc1)
    build_arc1
    ;;
  arc2)
    build_arc2
    ;;
  all)
    build_sudoku
    build_maze
    build_arc1
    build_arc2
    ;;
  *)
    # Diagnostics go to stderr so they are not mistaken for normal output.
    echo "unknown target: $target" >&2
    echo "usage: bash scripts/build_datasets.sh [sudoku|maze|arc1|arc2|all]" >&2
    exit 1
    ;;
esac

echo "==> datasets ready under $DATA_DIR"
ls -la "$DATA_DIR"
|