summaryrefslogtreecommitdiff
path: root/scripts/build_datasets.sh
blob: df22e8403f0aa75c0ad659ec0ac9f0c3b0da3734 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env bash
# Build the datasets shared by HRM and TRM and store them under rrm/data/.
# The TRM build scripts add a total_puzzles field to the metadata; HRM can
# still load it (Pydantic ignores extra fields).
# Usage: bash scripts/build_datasets.sh [sudoku|maze|arc1|arc2|all]
set -euo pipefail

# All paths are derived from the repository root (the parent of scripts/).
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
DATA_DIR="$REPO_ROOT/data"
TRM_DIR="$REPO_ROOT/trm"
HRM_DIR="$REPO_ROOT/hrm"

if ! command -v conda >/dev/null 2>&1; then
  echo "error: conda not found on PATH; cannot activate the 'rrm' environment" >&2
  exit 1
fi

# Resolve the conda base in its own assignment: a failing command
# substitution used directly as an argument to `source` is not caught by
# errexit and would silently turn the path into /etc/profile.d/conda.sh.
CONDA_BASE="$(conda info --base)"

# conda's shell hook and package activate.d scripts may read unset variables,
# which aborts the script under nounset; relax it only for the activation.
set +u
# shellcheck disable=SC1091
source "$CONDA_BASE/etc/profile.d/conda.sh"
conda activate rrm
set -u

mkdir -p "$DATA_DIR"

# Dataset to build; defaults to every dataset when no argument is given.
target="${1:-all}"

#######################################
# Build the sudoku-extreme-1k-aug-1000 dataset unless it already exists.
# Runs dataset/build_sudoku_dataset.py from TRM_DIR with
# --subsample-size 1000 --num-aug 1000.
# Globals:   DATA_DIR (read), TRM_DIR (read)
# Arguments: none
# Outputs:   "[skip] ..." on stdout when the output directory is present;
#            an error line on stderr when the build fails
# Returns:   0 on success or skip; the builder's exit status on failure
#######################################
build_sudoku() {
  local -r name="sudoku-extreme-1k-aug-1000"
  local -r out_dir="$DATA_DIR/$name"
  local rc=0

  if [[ -d "$out_dir" ]]; then
    echo "[skip] $name already exists"
    return 0
  fi

  # Subshell keeps the cd from leaking into the caller's working directory.
  (
    cd "$TRM_DIR" &&
      python dataset/build_sudoku_dataset.py \
        --output-dir "$out_dir" \
        --subsample-size 1000 --num-aug 1000
  ) || rc=$?

  if (( rc != 0 )); then
    # The directory did not exist before this run, so anything there now is
    # partial output; remove it so the next run rebuilds instead of skipping.
    rm -rf -- "${out_dir:?}"
    echo "[error] building $name failed (exit $rc); removed partial output" >&2
    return "$rc"
  fi
}

#######################################
# Build the maze-30x30-hard-1k dataset unless it already exists.
# Runs dataset/build_maze_dataset.py from TRM_DIR.
# Globals:   DATA_DIR (read), TRM_DIR (read)
# Arguments: none
# Outputs:   "[skip] ..." on stdout when the output directory is present;
#            an error line on stderr when the build fails
# Returns:   0 on success or skip; the builder's exit status on failure
#######################################
build_maze() {
  local -r name="maze-30x30-hard-1k"
  local -r out_dir="$DATA_DIR/$name"
  local rc=0

  if [[ -d "$out_dir" ]]; then
    echo "[skip] $name already exists"
    return 0
  fi

  # Subshell keeps the cd from leaking into the caller's working directory.
  (
    cd "$TRM_DIR" &&
      python dataset/build_maze_dataset.py --output-dir "$out_dir"
  ) || rc=$?

  if (( rc != 0 )); then
    # The directory did not exist before this run, so anything there now is
    # partial output; remove it so the next run rebuilds instead of skipping.
    rm -rf -- "${out_dir:?}"
    echo "[error] building $name failed (exit $rc); removed partial output" >&2
    return "$rc"
  fi
}

#######################################
# ARC builds need HRM's git submodules (ARC-AGI / ARC-AGI-2 / ConceptARC).
# Initializes all of HRM's submodules when any requested raw-data directory
# is missing or empty.
# Globals:   HRM_DIR (read)
# Arguments: names of directories under $HRM_DIR/dataset/raw-data to verify;
#            defaults to ARC-AGI when none are given
# Returns:   0 when every requested directory is populated; otherwise the
#            exit status of `git submodule update`
#######################################
prepare_arc_submodules() {
  local -r raw_dir="$HRM_DIR/dataset/raw-data"
  local -a required=(ARC-AGI)
  local sub

  (( $# == 0 )) || required=("$@")

  for sub in "${required[@]}"; do
    # ls errors are silenced on purpose: a missing directory is treated the
    # same as an empty one, i.e. the submodule has not been checked out.
    if [[ -z "$(ls -A "$raw_dir/$sub" 2>/dev/null)" ]]; then
      # One update initializes every submodule, so stop after the first hit.
      # git -C avoids changing the caller's working directory.
      git -C "$HRM_DIR" submodule update --init --recursive
      return
    fi
  done
  return 0
}

#######################################
# Build the arc1concept-aug-1000 dataset unless it already exists.
# Runs `python -m dataset.build_arc_dataset` from TRM_DIR against
# $HRM_DIR/dataset/raw-data/ARC-AGI/data with subsets
# "training evaluation concept" and test set "evaluation".
# Globals:   DATA_DIR (read), TRM_DIR (read), HRM_DIR (read)
# Arguments: none
# Outputs:   "[skip] ..." on stdout when the output directory is present;
#            an error line on stderr when the build fails
# Returns:   0 on success or skip; non-zero if submodule preparation or the
#            builder fails
#######################################
build_arc1() {
  local -r name="arc1concept-aug-1000"
  local -r out_dir="$DATA_DIR/$name"
  local rc=0

  # Check for existing output first so the skip path has no side effects
  # (submodule preparation may run git).
  if [[ -d "$out_dir" ]]; then
    echo "[skip] $name already exists"
    return 0
  fi

  # Name the raw-data directory this build reads; the argument is harmless
  # for a helper that takes none.
  prepare_arc_submodules ARC-AGI || return

  # Subshell keeps the cd from leaking into the caller's working directory.
  (
    cd "$TRM_DIR" &&
      python -m dataset.build_arc_dataset \
        --input-file-prefix "$HRM_DIR/dataset/raw-data/ARC-AGI/data" \
        --output-dir "$out_dir" \
        --subsets training evaluation concept \
        --test-set-name evaluation
  ) || rc=$?

  if (( rc != 0 )); then
    # The directory did not exist before this run, so anything there now is
    # partial output; remove it so the next run rebuilds instead of skipping.
    rm -rf -- "${out_dir:?}"
    echo "[error] building $name failed (exit $rc); removed partial output" >&2
    return "$rc"
  fi
}

#######################################
# Build the arc2concept-aug-1000 dataset unless it already exists.
# Runs `python -m dataset.build_arc_dataset` from TRM_DIR against
# $HRM_DIR/dataset/raw-data/ARC-AGI-2/data with subsets
# "training2 evaluation2 concept" and test set "evaluation2".
# Globals:   DATA_DIR (read), TRM_DIR (read), HRM_DIR (read)
# Arguments: none
# Outputs:   "[skip] ..." on stdout when the output directory is present;
#            an error line on stderr when the build fails
# Returns:   0 on success or skip; non-zero if submodule preparation or the
#            builder fails
#######################################
build_arc2() {
  local -r name="arc2concept-aug-1000"
  local -r out_dir="$DATA_DIR/$name"
  local rc=0

  # Check for existing output first so the skip path has no side effects
  # (submodule preparation may run git).
  if [[ -d "$out_dir" ]]; then
    echo "[skip] $name already exists"
    return 0
  fi

  # Name the raw-data directory this build reads (ARC-AGI-2, not ARC-AGI);
  # the argument is harmless for a helper that takes none.
  prepare_arc_submodules ARC-AGI-2 || return

  # Subshell keeps the cd from leaking into the caller's working directory.
  (
    cd "$TRM_DIR" &&
      python -m dataset.build_arc_dataset \
        --input-file-prefix "$HRM_DIR/dataset/raw-data/ARC-AGI-2/data" \
        --output-dir "$out_dir" \
        --subsets training2 evaluation2 concept \
        --test-set-name evaluation2
  ) || rc=$?

  if (( rc != 0 )); then
    # The directory did not exist before this run, so anything there now is
    # partial output; remove it so the next run rebuilds instead of skipping.
    rm -rf -- "${out_dir:?}"
    echo "[error] building $name failed (exit $rc); removed partial output" >&2
    return "$rc"
  fi
}

# Dispatch on the requested target; "all" builds every dataset in order and
# stops at the first failure (errexit is enabled at the top of the script).
case "$target" in
  sudoku) build_sudoku ;;
  maze) build_maze ;;
  arc1) build_arc1 ;;
  arc2) build_arc2 ;;
  all)
    build_sudoku
    build_maze
    build_arc1
    build_arc2
    ;;
  *)
    # Diagnostics go to stderr so stdout carries only build output.
    echo "unknown target: $target" >&2
    echo "usage: bash scripts/build_datasets.sh [sudoku|maze|arc1|arc2|all]" >&2
    exit 1
    ;;
esac

echo "==> datasets ready under $DATA_DIR"
ls -la "$DATA_DIR"