From 66e0d8b9fd4d0f7a2231d689c055e26fdf1cf04a Mon Sep 17 00:00:00 2001 From: YurenHao0426 Date: Sat, 13 Jun 2026 12:35:36 -0500 Subject: rrm workspace: TRM/HRM/SRM code, Maze dataset, dynamical-analysis pipeline Curated export for clone-and-run Maze training (2x A6000) + diagnostics. trm/hrm pretrain.py carry trajectory-augmentation code (backward-compatible). Heavy artifacts (checkpoints/wandb/npz) gitignored; see PROVENANCE.md. Co-Authored-By: Claude Fable 5 --- trm/dataset/build_arc_dataset.py | 341 ++++++++++++++++++++++++++++++++++++ trm/dataset/build_maze_dataset.py | 140 +++++++++++++++ trm/dataset/build_sudoku_dataset.py | 167 ++++++++++++++++++ trm/dataset/common.py | 49 ++++++ 4 files changed, 697 insertions(+) create mode 100644 trm/dataset/build_arc_dataset.py create mode 100644 trm/dataset/build_maze_dataset.py create mode 100644 trm/dataset/build_sudoku_dataset.py create mode 100644 trm/dataset/common.py (limited to 'trm/dataset') diff --git a/trm/dataset/build_arc_dataset.py b/trm/dataset/build_arc_dataset.py new file mode 100644 index 0000000..c144275 --- /dev/null +++ b/trm/dataset/build_arc_dataset.py @@ -0,0 +1,341 @@ +from typing import List, Tuple, Dict +from dataclasses import dataclass +import os +import json +import hashlib +import numpy as np + +from argdantic import ArgParser +from pydantic import BaseModel + +from dataset.common import PuzzleDatasetMetadata, dihedral_transform, inverse_dihedral_transform + + +cli = ArgParser() + + +class DataProcessConfig(BaseModel): + input_file_prefix: str + output_dir: str + subsets: List[str] + test_set_name: str + test_set_name2: str = "your_test_set" + seed: int = 42 + num_aug: int = 1000 + puzzle_identifiers_start: int = 1 # start > 1 to handle multiple datasets + +ARCMaxGridSize = 30 +ARCAugmentRetriesFactor = 5 + +PuzzleIdSeparator = "|||" + + +@dataclass +class ARCPuzzle: + id: str + examples: List[Tuple[np.ndarray, np.ndarray]] + + +def arc_grid_to_np(grid: List[List[int]]): + arr = 
np.array(grid) + + # Shape check + assert arr.ndim == 2 + assert arr.shape[0] <= ARCMaxGridSize and arr.shape[1] <= ARCMaxGridSize + # Element check + assert np.all((arr >= 0) & (arr <= 9)) + return arr.astype(np.uint8) + + +def np_grid_to_seq_translational_augment(inp: np.ndarray, out: np.ndarray, do_translation: bool): + # PAD: 0, : 1, digits: 2 ... 11 + # Compute random top-left pad + if do_translation: + pad_r = np.random.randint(0, ARCMaxGridSize - max(inp.shape[0], out.shape[0]) + 1) + pad_c = np.random.randint(0, ARCMaxGridSize - max(inp.shape[1], out.shape[1]) + 1) + else: + pad_r = pad_c = 0 + + # Pad grid + result = [] + for grid in [inp, out]: + nrow, ncol = grid.shape + grid = np.pad(grid + 2, ((pad_r, ARCMaxGridSize - pad_r - nrow), (pad_c, ARCMaxGridSize - pad_c - ncol)), constant_values=0) + + # Add + eos_row, eos_col = pad_r + nrow, pad_c + ncol + if eos_row < ARCMaxGridSize: + grid[eos_row, pad_c:eos_col] = 1 + if eos_col < ARCMaxGridSize: + grid[pad_r:eos_row, eos_col] = 1 + + result.append(grid.flatten()) + + return result + + +def grid_hash(grid: np.ndarray): + assert grid.ndim == 2 + assert grid.dtype == np.uint8 + + buffer = [x.to_bytes(1, byteorder='big') for x in grid.shape] + buffer.append(grid.tobytes()) + + return hashlib.sha256(b"".join(buffer)).hexdigest() + + +def puzzle_hash(puzzle: dict): + # Hash the puzzle for checking equivalence + hashes = [] + for example_type, example in puzzle.items(): + for input, label in example.examples: + hashes.append(f"{grid_hash(input)}|{grid_hash(label)}") + + hashes.sort() + return hashlib.sha256("|".join(hashes).encode()).hexdigest() + + +def aug(name: str): + # Augment plan + trans_id = np.random.randint(0, 8) + mapping = np.concatenate([np.arange(0, 1, dtype=np.uint8), np.random.permutation(np.arange(1, 10, dtype=np.uint8))]) # Permute colors, Excluding "0" (black) + + name_with_aug_repr = f"{name}{PuzzleIdSeparator}t{trans_id}{PuzzleIdSeparator}{''.join(str(x) for x in mapping)}" + + def 
_map_grid(grid: np.ndarray): + return dihedral_transform(mapping[grid], trans_id) + + return name_with_aug_repr, _map_grid + + +def inverse_aug(name: str): + # Inverse the "aug" function + if PuzzleIdSeparator not in name: + return name, lambda x: x + + trans_id, perm = name.split(PuzzleIdSeparator)[-2:] + trans_id = int(trans_id[1:]) # Remove "t" letter + inv_perm = np.argsort(list(perm)).astype(np.uint8) + + def _map_grid(grid: np.ndarray): + return inv_perm[inverse_dihedral_transform(grid, trans_id)] + + return name.split(PuzzleIdSeparator)[0], _map_grid + + +def convert_single_arc_puzzle(results: dict, name: str, puzzle: dict, aug_count: int, dest_mapping: Dict[str, Tuple[str, str]]): + # Convert + dests = set(dest_mapping.values()) + converted = {dest: ARCPuzzle(name, []) for dest in dests} + for example_type, examples in puzzle.items(): + # Map to target split + dest = dest_mapping[example_type] + converted[dest].examples.extend([(arc_grid_to_np(example["input"]), arc_grid_to_np(example["output"])) for example in examples]) + + group = [converted] + + # Augment + if aug_count > 0: + hashes = {puzzle_hash(converted)} + + for _trial in range(ARCAugmentRetriesFactor * aug_count): + aug_name, _map_grid = aug(name) + + # Check duplicate + augmented = {dest: ARCPuzzle(aug_name, [(_map_grid(input), _map_grid(label)) for (input, label) in puzzle.examples]) for dest, puzzle in converted.items()} + h = puzzle_hash(augmented) + if h not in hashes: + hashes.add(h) + group.append(augmented) + + if len(group) >= aug_count + 1: + break + + if len(group) < aug_count + 1: + print (f"[Puzzle {name}] augmentation not full, only {len(group)}") + + # Append + for dest in dests: + # Convert the examples + dest_split, dest_set = dest + + results.setdefault(dest_split, {}) + results[dest_split].setdefault(dest_set, []) + results[dest_split][dest_set].append([converted[dest] for converted in group]) + + +def load_puzzles_arcagi(config: DataProcessConfig): + train_examples_dest = 
("train", "all") + test_examples_map = { + config.test_set_name: [(1.0, ("test", "all"))], + config.test_set_name2: [(1.0, ("test", "all"))], + "_default": [(1.0, ("train", "all"))] + } + + test_puzzles = {} + results = {} + + total_puzzles = 0 + for subset_name in config.subsets: + # Load all puzzles in this subset + with open(f"{config.input_file_prefix}_{subset_name}_challenges.json", "r") as f: + puzzles = json.load(f) + + sols_filename = f"{config.input_file_prefix}_{subset_name}_solutions.json" + if os.path.isfile(sols_filename): + with open(sols_filename, "r") as f: + sols = json.load(f) + + for puzzle_id in puzzles.keys(): + for idx, sol_grid in enumerate(sols[puzzle_id]): + puzzles[puzzle_id]["test"][idx]["output"] = sol_grid + else: + # Fill with dummy + print (f"{subset_name} solutions not found, filling with dummy") + + for puzzle_id, puzzle in puzzles.items(): + for example in puzzle["test"]: + example.setdefault("output", [[0]]) + + # Shuffle puzzles + puzzles = list(puzzles.items()) + np.random.shuffle(puzzles) + + # Assign by fraction + for idx, (name, puzzle) in enumerate(puzzles): + fraction = idx / len(puzzles) + test_examples_dest = None + for f, dest in test_examples_map.get(subset_name, test_examples_map["_default"]): + if fraction < f: + test_examples_dest = dest + break + + assert test_examples_dest is not None + + if test_examples_dest[0] == "test": + test_puzzles[name] = puzzle + + convert_single_arc_puzzle(results, name, puzzle, config.num_aug, {"train": train_examples_dest, "test": test_examples_dest}) + total_puzzles += 1 + + print (f"Total puzzles: {total_puzzles}") + return results, test_puzzles + + +def convert_dataset(config: DataProcessConfig): + np.random.seed(config.seed) + + # Read dataset + data, test_puzzles = load_puzzles_arcagi(config) + + # Map global puzzle identifiers + num_identifiers = config.puzzle_identifiers_start # 0 is blank, start at 1 + identifier_map = {} + for split_name, split in data.items(): + for 
subset_name, subset in split.items(): + for group in subset: + for puzzle in group: + if puzzle.id not in identifier_map: + identifier_map[puzzle.id] = num_identifiers + num_identifiers += 1 + print (f"Total puzzle IDs (including ): {num_identifiers}") + + # Save + for split_name, split in data.items(): + os.makedirs(os.path.join(config.output_dir, split_name), exist_ok=True) + + # Translational augmentations + enable_translational_augment = split_name == "train" + + # Statistics + total_examples = 0 + total_puzzles = 0 + total_groups = 0 + + for subset_name, subset in split.items(): # "all" is the only subset + # Construct subset + results = {k: [] for k in ["inputs", "labels", "puzzle_identifiers", "puzzle_indices", "group_indices"]} + results["puzzle_indices"].append(0) + results["group_indices"].append(0) + + example_id = 0 + puzzle_id = 0 + + for group in subset: + for puzzle in group: + # Push puzzle + no_aug_id = np.random.randint(0, len(puzzle.examples)) + for _idx_ex, (inp, out) in enumerate(puzzle.examples): + inp, out = np_grid_to_seq_translational_augment(inp, out, do_translation=enable_translational_augment and _idx_ex != no_aug_id) + + results["inputs"].append(inp) + results["labels"].append(out) + example_id += 1 + + total_examples += 1 + + results["puzzle_indices"].append(example_id) + results["puzzle_identifiers"].append(identifier_map[puzzle.id]) + + puzzle_id += 1 + total_puzzles += 1 + + # Push group + results["group_indices"].append(puzzle_id) + total_groups += 1 + + for k, v in results.items(): + if k in {"inputs", "labels"}: + v = np.stack(v, 0) + else: + v = np.array(v, dtype=np.int32) + + np.save(os.path.join(config.output_dir, split_name, f"{subset_name}__{k}.npy"), v) + + # Metadata + metadata = PuzzleDatasetMetadata( + seq_len=ARCMaxGridSize * ARCMaxGridSize, + vocab_size=10 + 2, # PAD + EOS + "0" ... 
"9" + pad_id=0, + ignore_label_id=0, + blank_identifier_id=0, + num_puzzle_identifiers=num_identifiers, + total_groups=total_groups, + mean_puzzle_examples=total_examples / total_puzzles, + total_puzzles=total_puzzles, + sets=list(split.keys()) + ) + + # Save metadata as JSON. + with open(os.path.join(config.output_dir, split_name, "dataset.json"), "w") as f: + json.dump(metadata.model_dump(), f) + + # Save IDs mapping + with open(os.path.join(config.output_dir, "identifiers.json"), "w") as f: + ids_mapping = {v: k for k, v in identifier_map.items()} + json.dump([ids_mapping.get(i, "") for i in range(num_identifiers)], f) + + # Save Test Puzzles + with open(os.path.join(config.output_dir, "test_puzzles.json"), "w") as f: + json.dump(test_puzzles, f) + + +@cli.command(singleton=True) +def main(config: DataProcessConfig): + convert_dataset(config) + + +if __name__ == "__main__": + cli() + + + + + + + + + + + + diff --git a/trm/dataset/build_maze_dataset.py b/trm/dataset/build_maze_dataset.py new file mode 100644 index 0000000..7ebe267 --- /dev/null +++ b/trm/dataset/build_maze_dataset.py @@ -0,0 +1,140 @@ +from typing import Optional +import math +import os +import csv +import json +import numpy as np + +from argdantic import ArgParser +from pydantic import BaseModel +from tqdm import tqdm +from huggingface_hub import hf_hub_download + +from common import PuzzleDatasetMetadata, dihedral_transform + + +CHARSET = "# SGo" + + +cli = ArgParser() + + +class DataProcessConfig(BaseModel): + source_repo: str = "sapientinc/maze-30x30-hard-1k" + output_dir: str = "data/maze-30x30-hard-1k" + + subsample_size: Optional[int] = None + aug: bool = False + + +def convert_subset(set_name: str, config: DataProcessConfig): + # Read CSV + all_chars = set() + grid_size = None + inputs = [] + labels = [] + + with open(hf_hub_download(config.source_repo, f"{set_name}.csv", repo_type="dataset"), newline="") as csvfile: # type: ignore + reader = csv.reader(csvfile) + next(reader) # Skip 
header + for source, q, a, rating in reader: + all_chars.update(q) + all_chars.update(a) + + if grid_size is None: + n = int(len(q) ** 0.5) + grid_size = (n, n) + + inputs.append(np.frombuffer(q.encode(), dtype=np.uint8).reshape(grid_size)) + labels.append(np.frombuffer(a.encode(), dtype=np.uint8).reshape(grid_size)) + + # If subsample_size is specified for the training set, + # randomly sample the desired number of examples. + if set_name == "train" and config.subsample_size is not None: + total_samples = len(inputs) + if config.subsample_size < total_samples: + indices = np.random.choice(total_samples, size=config.subsample_size, replace=False) + inputs = [inputs[i] for i in indices] + labels = [labels[i] for i in indices] + + # Generate dataset + results = {k: [] for k in ["inputs", "labels", "puzzle_identifiers", "puzzle_indices", "group_indices"]} + puzzle_id = 0 + example_id = 0 + + results["puzzle_indices"].append(0) + results["group_indices"].append(0) + + for inp, out in zip(tqdm(inputs), labels): + # Dihedral transformations for augmentation + for aug_idx in range(8 if (set_name == "train" and config.aug) else 1): + results["inputs"].append(dihedral_transform(inp, aug_idx)) + results["labels"].append(dihedral_transform(out, aug_idx)) + example_id += 1 + puzzle_id += 1 + + results["puzzle_indices"].append(example_id) + results["puzzle_identifiers"].append(0) + + # Push group + results["group_indices"].append(puzzle_id) + + # Char mappings + assert len(all_chars - set(CHARSET)) == 0 + + char2id = np.zeros(256, np.uint8) + char2id[np.array(list(map(ord, CHARSET)))] = np.arange(len(CHARSET)) + 1 + + # To Numpy + def _seq_to_numpy(seq): + arr = np.vstack([char2id[s.reshape(-1)] for s in seq]) + + return arr + + results = { + "inputs": _seq_to_numpy(results["inputs"]), + "labels": _seq_to_numpy(results["labels"]), + + "group_indices": np.array(results["group_indices"], dtype=np.int32), + "puzzle_indices": np.array(results["puzzle_indices"], dtype=np.int32), + 
"puzzle_identifiers": np.array(results["puzzle_identifiers"], dtype=np.int32), + } + + # Metadata + metadata = PuzzleDatasetMetadata( + seq_len=int(math.prod(grid_size)), # type: ignore + vocab_size=len(CHARSET) + 1, # PAD + Charset + pad_id=0, + ignore_label_id=0, + blank_identifier_id=0, + num_puzzle_identifiers=1, + total_groups=len(results["group_indices"]) - 1, + mean_puzzle_examples=1, + total_puzzles=len(results["group_indices"]) - 1, + sets=["all"] + ) + + # Save metadata as JSON. + save_dir = os.path.join(config.output_dir, set_name) + os.makedirs(save_dir, exist_ok=True) + + with open(os.path.join(save_dir, "dataset.json"), "w") as f: + json.dump(metadata.model_dump(), f) + + # Save data + for k, v in results.items(): + np.save(os.path.join(save_dir, f"all__{k}.npy"), v) + + # Save IDs mapping (for visualization only) + with open(os.path.join(config.output_dir, "identifiers.json"), "w") as f: + json.dump([""], f) + + +@cli.command(singleton=True) +def preprocess_data(config: DataProcessConfig): + convert_subset("train", config) + convert_subset("test", config) + + +if __name__ == "__main__": + cli() diff --git a/trm/dataset/build_sudoku_dataset.py b/trm/dataset/build_sudoku_dataset.py new file mode 100644 index 0000000..796b128 --- /dev/null +++ b/trm/dataset/build_sudoku_dataset.py @@ -0,0 +1,167 @@ +from typing import Optional +import os +import csv +import json +import numpy as np + +from argdantic import ArgParser +from pydantic import BaseModel +from tqdm import tqdm +from huggingface_hub import hf_hub_download + +from common import PuzzleDatasetMetadata + + +cli = ArgParser() + + +class DataProcessConfig(BaseModel): + source_repo: str = "sapientinc/sudoku-extreme" + output_dir: str = "data/sudoku-extreme-full" + + subsample_size: Optional[int] = None + min_difficulty: Optional[int] = None + num_aug: int = 0 + + +def shuffle_sudoku(board: np.ndarray, solution: np.ndarray): + # Create a random digit mapping: a permutation of 1..9, with zero (blank) 
unchanged + digit_map = np.pad(np.random.permutation(np.arange(1, 10)), (1, 0)) + + # Randomly decide whether to transpose. + transpose_flag = np.random.rand() < 0.5 + + # Generate a valid row permutation: + # - Shuffle the 3 bands (each band = 3 rows) and for each band, shuffle its 3 rows. + bands = np.random.permutation(3) + row_perm = np.concatenate([b * 3 + np.random.permutation(3) for b in bands]) + + # Similarly for columns (stacks). + stacks = np.random.permutation(3) + col_perm = np.concatenate([s * 3 + np.random.permutation(3) for s in stacks]) + + # Build an 81->81 mapping. For each new cell at (i, j) + # (row index = i // 9, col index = i % 9), + # its value comes from old row = row_perm[i//9] and old col = col_perm[i%9]. + mapping = np.array([row_perm[i // 9] * 9 + col_perm[i % 9] for i in range(81)]) + + def apply_transformation(x: np.ndarray) -> np.ndarray: + # Apply transpose flag + if transpose_flag: + x = x.T + # Apply the position mapping. + new_board = x.flatten()[mapping].reshape(9, 9).copy() + # Apply digit mapping + return digit_map[new_board] + + return apply_transformation(board), apply_transformation(solution) + + +def convert_subset(set_name: str, config: DataProcessConfig): + # Read CSV + inputs = [] + labels = [] + + with open(hf_hub_download(config.source_repo, f"{set_name}.csv", repo_type="dataset"), newline="") as csvfile: + reader = csv.reader(csvfile) + next(reader) # Skip header + for source, q, a, rating in reader: + if (config.min_difficulty is None) or (int(rating) >= config.min_difficulty): + assert len(q) == 81 and len(a) == 81 + + inputs.append(np.frombuffer(q.replace('.', '0').encode(), dtype=np.uint8).reshape(9, 9) - ord('0')) + labels.append(np.frombuffer(a.encode(), dtype=np.uint8).reshape(9, 9) - ord('0')) + + # If subsample_size is specified for the training set, + # randomly sample the desired number of examples. 
+ if set_name == "train" and config.subsample_size is not None: + total_samples = len(inputs) + if config.subsample_size < total_samples: + indices = np.random.choice(total_samples, size=config.subsample_size, replace=False) + inputs = [inputs[i] for i in indices] + labels = [labels[i] for i in indices] + + # Generate dataset + num_augments = config.num_aug if set_name == "train" else 0 + + results = {k: [] for k in ["inputs", "labels", "puzzle_identifiers", "puzzle_indices", "group_indices"]} + puzzle_id = 0 + example_id = 0 + + results["puzzle_indices"].append(0) + results["group_indices"].append(0) + + for orig_inp, orig_out in zip(tqdm(inputs), labels): + for aug_idx in range(1 + num_augments): + # First index is not augmented + if aug_idx == 0: + inp, out = orig_inp, orig_out + else: + inp, out = shuffle_sudoku(orig_inp, orig_out) + + # Push puzzle (only single example) + results["inputs"].append(inp) + results["labels"].append(out) + example_id += 1 + puzzle_id += 1 + + results["puzzle_indices"].append(example_id) + results["puzzle_identifiers"].append(0) + + # Push group + results["group_indices"].append(puzzle_id) + + # To Numpy + def _seq_to_numpy(seq): + arr = np.concatenate(seq).reshape(len(seq), -1) + + assert np.all((arr >= 0) & (arr <= 9)) + return arr + 1 + + results = { + "inputs": _seq_to_numpy(results["inputs"]), + "labels": _seq_to_numpy(results["labels"]), + + "group_indices": np.array(results["group_indices"], dtype=np.int32), + "puzzle_indices": np.array(results["puzzle_indices"], dtype=np.int32), + "puzzle_identifiers": np.array(results["puzzle_identifiers"], dtype=np.int32), + } + + # Metadata + metadata = PuzzleDatasetMetadata( + seq_len=81, + vocab_size=10 + 1, # PAD + "0" ... "9" + pad_id=0, + ignore_label_id=0, + blank_identifier_id=0, + num_puzzle_identifiers=1, + total_groups=len(results["group_indices"]) - 1, + mean_puzzle_examples=1, + total_puzzles=len(results["group_indices"]) - 1, + sets=["all"] + ) + + # Save metadata as JSON. 
+ save_dir = os.path.join(config.output_dir, set_name) + os.makedirs(save_dir, exist_ok=True) + + with open(os.path.join(save_dir, "dataset.json"), "w") as f: + json.dump(metadata.model_dump(), f) + + # Save data + for k, v in results.items(): + np.save(os.path.join(save_dir, f"all__{k}.npy"), v) + + # Save IDs mapping (for visualization only) + with open(os.path.join(config.output_dir, "identifiers.json"), "w") as f: + json.dump([""], f) + + +@cli.command(singleton=True) +def preprocess_data(config: DataProcessConfig): + convert_subset("train", config) + convert_subset("test", config) + + +if __name__ == "__main__": + cli() diff --git a/trm/dataset/common.py b/trm/dataset/common.py new file mode 100644 index 0000000..5ff91bb --- /dev/null +++ b/trm/dataset/common.py @@ -0,0 +1,49 @@ +from typing import List, Optional + +import pydantic +import numpy as np + + +# Global list mapping each dihedral transform id to its inverse. +# Index corresponds to the original tid, and the value is its inverse. 
DIHEDRAL_INVERSE = [0, 3, 2, 1, 4, 5, 6, 7]


class PuzzleDatasetMetadata(pydantic.BaseModel):
    """Summary record describing one exported dataset split.

    The dataset builders in this package fill one instance per split and
    write ``model_dump()`` of it to ``dataset.json`` next to the ``.npy``
    arrays.  Field order is kept as-is because it determines the key order
    of that JSON file.
    """

    # Token id used for padding; every builder in this package passes 0.
    pad_id: int
    # Label id with special "ignore" meaning; the builders pass 0.
    # NOTE(review): presumably excluded from the training loss - confirm
    # against the data loader / loss code, which is not visible here.
    ignore_label_id: Optional[int]
    # Puzzle-identifier value reserved for "blank"; the builders pass 0.
    blank_identifier_id: int
    # Number of distinct token ids, PAD included (e.g. 10 + 2 for ARC).
    vocab_size: int
    # Length of one flattened example (e.g. 30 * 30 for ARC, 81 for Sudoku).
    seq_len: int
    # Size of the puzzle-identifier id space, counting the blank id.
    num_puzzle_identifiers: int
    # Number of groups written for the split.
    total_groups: int
    # Average number of examples per puzzle.
    mean_puzzle_examples: float
    # Number of puzzles written for the split.
    total_puzzles: int
    # Names of the subsets stored in the split (e.g. ["all"]).
    sets: List[str]


def dihedral_transform(arr: np.ndarray, tid: int) -> np.ndarray:
    """Apply one of the 8 dihedral symmetries (rotations and reflections).

    Args:
        arr: 2-D grid to transform.
        tid: Transform id.
            0 - identity;
            1, 2, 3 - ``np.rot90`` with ``k`` = 1, 2, 3;
            4 - left/right flip; 5 - up/down flip;
            6 - transpose (reflection along the main diagonal);
            7 - reflection along the anti-diagonal.

    Returns:
        The transformed grid.  For ``tid == 0`` and for any id outside
        0..7 the input object itself is returned unchanged.  The NumPy
        helpers used for the other ids return views, so copy the result
        before mutating it if ``arr`` must stay intact.
    """
    if tid == 0:
        return arr  # identity
    elif tid == 1:
        return np.rot90(arr, k=1)
    elif tid == 2:
        return np.rot90(arr, k=2)
    elif tid == 3:
        return np.rot90(arr, k=3)
    elif tid == 4:
        return np.fliplr(arr)  # horizontal flip
    elif tid == 5:
        return np.flipud(arr)  # vertical flip
    elif tid == 6:
        return arr.T  # transpose (reflection along main diagonal)
    elif tid == 7:
        return np.fliplr(np.rot90(arr, k=1))  # anti-diagonal reflection
    else:
        # Unknown ids are treated as "no transform".
        return arr


def inverse_dihedral_transform(arr: np.ndarray, tid: int) -> np.ndarray:
    """Undo ``dihedral_transform(arr, tid)``.

    Args:
        arr: Grid previously produced by ``dihedral_transform`` with ``tid``.
        tid: The transform id that was applied.

    Returns:
        The grid mapped back to its original orientation.

    Ids outside 0..7 are the identity in ``dihedral_transform``, so they
    are the identity here as well.  Indexing ``DIHEDRAL_INVERSE`` directly
    with such an id would either raise ``IndexError`` (``tid >= 8``) or,
    worse, wrap around through negative indexing (``tid == -1`` would pick
    entry 7) and silently apply a real transform.
    """
    if not 0 <= tid < len(DIHEDRAL_INVERSE):
        return arr
    return dihedral_transform(arr, DIHEDRAL_INVERSE[tid])