Launch snapshot

author: Will DePue <williamd@openai.com> 2026-03-18 09:32:01 -0700
committer: Will DePue <williamd@openai.com> 2026-03-18 09:32:01 -0700
commit: a15093adad328a650d421e53c078cbd2c45beb0e (patch)
tree: e054c4bde12b89e6d3b39d611d9caadabc7f7234 /data
4 files changed, 859 insertions, 0 deletions
diff --git a/data/README.md b/data/README.md
new file mode 100644
index 0000000..e1920ad
--- /dev/null
+++ b/data/README.md
@@ -0,0 +1,66 @@
+# Data Workflows
+
+This directory contains the dataset download helpers and export scripts used for the challenge.
+
+Canonical local layout:
+- `data/datasets/<dataset_name>/`
+- `data/tokenizers/`
+- `data/manifest.json`
+- `data/docs_selected.jsonl`
+- `data/docs_selected.source_manifest.json`
+
+## Downloading Published Data
+
+Download the cached FineWeb export for a tokenizer variant with:
+
+```bash
+python3 data/cached_challenge_fineweb.py --variant sp1024
+```
+
+This populates `./data/datasets/fineweb10B_sp1024/` and `./data/tokenizers/`.
+By default it downloads the full validation split and 8B training tokens (80 train shards).
+
+To fetch more training shards, pass `--train-shards`:
+
+```bash
+python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 180
+```
+
+The downloader is manifest-driven and can fetch only a prefix of train shards from a larger published export. With the current shard size of `100_000_000` tokens, `10B` retokenized training tokens is `100` train shards:
+
+```bash
+MATCHED_FINEWEB_REPO_ID=your-hf-username/your-dataset-repo \
+MATCHED_FINEWEB_REMOTE_ROOT_PREFIX=your_50B_export_root \
+python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 100
+```
+
+Validation is always downloaded in full from the fixed `fineweb_val_*` split. Training on the first `N` train shards means training on the prefix of the same frozen shuffled export, so the data order stays aligned with the baseline for that tokenizer family.
+
+The default published repo is `willdepueoai/parameter-golf`, with the export rooted under the repo subdirectory `datasets/`.
+
+## Rebuilding Tokenizers From Published Docs
+
+To retrain a tokenizer or re-export shards from exactly the same selected documents, run the standalone retokenizer against the published docs cache:
+
+```bash
+python3 data/download_hf_docs_and_tokenize.py \
+  --repo-id your-hf-username/your-dataset-repo \
+  --remote-root your_50B_export_root \
+  --output-root /tmp/my_custom_tokenizer_export \
+  --tokenizer-config ./data/tokenizer_specs.json
+```
+
+The sidecar `docs_selected.source_manifest.json` includes `docs_sha256`, so users can verify they are rebuilding from the exact same document list and order as the baseline export.
+
+## Useful Knobs
+
+For CPU-heavy exports, useful knobs are:
+
+```bash
+MATCHED_FINEWEB_SP_BATCH_SIZE=2048
+MATCHED_FINEWEB_TOKENIZER_THREADS=16
+MATCHED_FINEWEB_TIKTOKEN_THREADS=16
+MATCHED_FINEWEB_GPT2_DECODE_BATCH_SIZE=512
+```
+
+These control batched tokenizer encoding during shard export, tokenizer thread count, tiktoken thread count, and batched GPT-2 decode for the blobstore docs-cache path.
diff --git a/data/cached_challenge_fineweb.py b/data/cached_challenge_fineweb.py
new file mode 100644
index 0000000..fa8029b
--- /dev/null
+++ b/data/cached_challenge_fineweb.py
@@ -0,0 +1,157 @@
+import argparse
+import json
+import os
+import shutil
+from pathlib import Path
+
+from huggingface_hub import hf_hub_download
+
+
+REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", "willdepueoai/parameter-golf")
+REMOTE_ROOT_PREFIX = os.environ.get("MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", "datasets")
+ROOT = Path(__file__).resolve().parent
+DATASETS_DIR = ROOT / "datasets"
+TOKENIZERS_DIR = ROOT / "tokenizers"
+
+def dataset_dir_for_variant(name: str) -> str:
+    if name == "byte260":
+        return "fineweb10B_byte260"
+    if name.startswith("sp") and name[2:].isdigit():
+        return f"fineweb10B_{name}"
+    raise ValueError(f"unsupported variant {name!r}; expected byte260 or sp<VOCAB_SIZE>")
+
+
+def local_path_for_remote(relative_path: str) -> Path:
+    remote_path = Path(relative_path)
+    if REMOTE_ROOT_PREFIX and remote_path.parts[:1] == (REMOTE_ROOT_PREFIX,):
+        remote_path = remote_path.relative_to(REMOTE_ROOT_PREFIX)
+    if remote_path.parts[:1] == ("datasets",):
+        return DATASETS_DIR.joinpath(*remote_path.parts[1:])
+    if remote_path.parts[:1] == ("tokenizers",):
+        return TOKENIZERS_DIR.joinpath(*remote_path.parts[1:])
+    return ROOT / remote_path
+
+
+def get(relative_path: str) -> None:
+    destination = local_path_for_remote(relative_path)
+    if destination.exists():
+        return
+    if destination.is_symlink():
+        destination.unlink()
+
+    remote_path = Path(relative_path)
+    cached_path = Path(
+        hf_hub_download(
+            repo_id=REPO_ID,
+            filename=remote_path.name,
+            subfolder=remote_path.parent.as_posix() if remote_path.parent != Path(".") else None,
+            repo_type="dataset",
+        )
+    )
+    # HF cache entries may be snapshot symlinks. Resolve to the underlying blob so we
+    # always materialize a real file in data/, not a broken relative symlink.
+    cached_source = cached_path.resolve(strict=True)
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    try:
+        os.link(cached_source, destination)
+    except OSError:
+        shutil.copy2(cached_source, destination)
+
+
+def manifest_path() -> Path:
+    return local_path_for_remote(f"{REMOTE_ROOT_PREFIX}/manifest.json")
+
+
+def load_manifest(*, skip_manifest_download: bool) -> dict:
+    path = manifest_path()
+    if not path.is_file():
+        if skip_manifest_download:
+            raise FileNotFoundError(
+                f"manifest.json is required for manifest-driven shard counts but is not present locally at {path}"
+            )
+        get(f"{REMOTE_ROOT_PREFIX}/manifest.json")
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def artifact_paths_for_tokenizer(tokenizer_entry: dict) -> list[str]:
+    artifacts = []
+    for key in ("model_path", "vocab_path", "path"):
+        value = tokenizer_entry.get(key)
+        if value:
+            artifacts.append(str(value))
+    if not artifacts:
+        raise ValueError(f"tokenizer entry is missing downloadable artifacts: {tokenizer_entry}")
+    return artifacts
+
+
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(description="Download challenge FineWeb shards from Hugging Face")
+    parser.add_argument(
+        "train_shards_positional",
+        nargs="?",
+        type=int,
+        default=None,
+        help=argparse.SUPPRESS,
+    )
+    parser.add_argument(
+        "--train-shards",
+        type=int,
+        default=80,
+        help="Number of training shards to download for the selected variant. Defaults to 80.",
+    )
+    parser.add_argument(
+        "--variant",
+        default="sp1024",
+        help="Tokenizer family to download, for example sp1024, sp4096, or byte260.",
+    )
+    parser.add_argument(
+        "--skip-manifest",
+        action="store_true",
+        help="Skip downloading manifest.json.",
+    )
+    parser.add_argument(
+        "--with-docs",
+        action="store_true",
+        help="Also download docs_selected.jsonl and its sidecar for tokenizer retraining or dataset re-export.",
+    )
+    return parser
+
+
+def main() -> None:
+    args = build_parser().parse_args()
+    dataset_dir = dataset_dir_for_variant(args.variant)
+    train_shards = args.train_shards_positional if args.train_shards_positional is not None else args.train_shards
+    if train_shards < 0:
+        raise ValueError("train_shards must be non-negative")
+
+    manifest = load_manifest(skip_manifest_download=args.skip_manifest)
+    dataset_entry = next((x for x in manifest.get("datasets", []) if x.get("name") == dataset_dir), None)
+    if dataset_entry is None:
+        raise ValueError(f"dataset {dataset_dir} not found in {REMOTE_ROOT_PREFIX}/manifest.json")
+    max_train_shards = int((dataset_entry.get("stats") or {}).get("files_train"))
+    val_shards = int((dataset_entry.get("stats") or {}).get("files_val"))
+    if train_shards > max_train_shards:
+        raise ValueError(
+            f"{args.variant} only has {max_train_shards} training shards on {REPO_ID}, requested {train_shards}"
+        )
+    tokenizer_name = dataset_entry.get("tokenizer_name")
+    tokenizer_entry = next((x for x in manifest.get("tokenizers", []) if x.get("name") == tokenizer_name), None)
+    if tokenizer_entry is None:
+        raise ValueError(f"tokenizer {tokenizer_name} not found in {REMOTE_ROOT_PREFIX}/manifest.json")
+
+    if args.with_docs:
+        get(f"{REMOTE_ROOT_PREFIX}/docs_selected.jsonl")
+        get(f"{REMOTE_ROOT_PREFIX}/docs_selected.source_manifest.json")
+
+    dataset_prefix = f"{REMOTE_ROOT_PREFIX}/datasets/{dataset_dir}"
+    for i in range(val_shards):
+        get(f"{dataset_prefix}/fineweb_val_{i:06d}.bin")
+    for i in range(train_shards):
+        get(f"{dataset_prefix}/fineweb_train_{i:06d}.bin")
+
+    for artifact_path in artifact_paths_for_tokenizer(tokenizer_entry):
+        get(f"{REMOTE_ROOT_PREFIX}/{artifact_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/data/download_hf_docs_and_tokenize.py b/data/download_hf_docs_and_tokenize.py
new file mode 100644
index 0000000..dcabd40
--- /dev/null
+++ b/data/download_hf_docs_and_tokenize.py
@@ -0,0 +1,627 @@
+"""Download docs_selected.jsonl from Hugging Face and tokenize it locally.
+
+This script is standalone. It does not import any local exporter or tokenizer
+helpers. Tokenizer configs are JSON only and currently support the built-in
+pure-byte and SentencePiece tokenizer definitions in `data/tokenizer_specs.json`.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shutil
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import EntryNotFoundError
+
+
+DOCS_FILENAME = "docs_selected.jsonl"
+SIDECAR_FILENAME = "docs_selected.source_manifest.json"
+VERSION = "10B"
+NUM_VAL_DOCS = 50_000
+SHARD_SIZE = 10**8
+APPEND_EOS = False
+DATAFILE_MAGIC = 20240520
+DATAFILE_VERSION = 1
+DEFAULT_REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", "willdepueoai/parameter-golf")
+DEFAULT_REMOTE_ROOT = os.environ.get("MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", "datasets")
+DEFAULT_CONFIG = Path(__file__).with_name("tokenizer_specs.json")
+TOKENIZER_THREADS = max(1, int(os.environ.get("MATCHED_FINEWEB_TOKENIZER_THREADS", str(os.cpu_count() or 8))))
+SP_BATCH_SIZE = max(1, int(os.environ.get("MATCHED_FINEWEB_SP_BATCH_SIZE", "1024")))
+
+
+@dataclass(frozen=True)
+class PureByteTokenizer:
+    pad_id: int = 0
+    bos_id: int = 1
+    eos_id: int = 2
+    unk_id: int = 3
+    byte_offset: int = 4
+    byte_count: int = 256
+
+    @property
+    def vocab_size(self) -> int:
+        return self.byte_offset + self.byte_count
+
+    def encode(self, text: str) -> np.ndarray:
+        data = text.encode("utf-8", errors="replace")
+        return np.frombuffer(data, dtype=np.uint8).astype(np.uint16, copy=False) + self.byte_offset
+
+    def encode_batch(self, texts: list[str]) -> list[np.ndarray]:
+        return [self.encode(text) for text in texts]
+
+    def save_json(self, path: str | Path) -> None:
+        path = Path(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        payload = {
+            "tokenizer_type": "pure_byte",
+            "config": asdict(self),
+            "vocab_size": self.vocab_size,
+        }
+        path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+
+
+def default_pure_byte_tokenizer() -> PureByteTokenizer:
+    return PureByteTokenizer()
+
+
+def docs_sidecar_path(docs_jsonl: Path) -> Path:
+    return docs_jsonl.with_name(f"{docs_jsonl.stem}.source_manifest.json")
+
+
+def maybe_load_docs_sidecar_meta(docs_jsonl: Path) -> dict[str, Any] | None:
+    sidecar_path = docs_sidecar_path(docs_jsonl)
+    if not sidecar_path.is_file():
+        return None
+    payload = json.loads(sidecar_path.read_text(encoding="utf-8"))
+    if not isinstance(payload, dict):
+        raise ValueError(f"docs sidecar must be a JSON object: {sidecar_path}")
+    return payload
+
+
+def copy_from_hf_cache(*, repo_id: str, remote_root: str, filename: str, destination: Path) -> bool:
+    remote_path = Path(remote_root) / filename if remote_root else Path(filename)
+    try:
+        cached_path = Path(
+            hf_hub_download(
+                repo_id=repo_id,
+                filename=remote_path.name,
+                subfolder=remote_path.parent.as_posix() if remote_path.parent != Path(".") else None,
+                repo_type="dataset",
+            )
+        )
+    except EntryNotFoundError:
+        return False  # file is absent from the repo; the caller decides whether that is fatal
+
+    source = cached_path.resolve(strict=True)  # follow any cache symlink to the real file
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    if destination.exists() or destination.is_symlink():  # exists() is False for a dangling symlink
+        destination.unlink()
+    try:
+        os.link(source, destination)  # hard-link when possible
+    except OSError:
+        shutil.copy2(source, destination)  # otherwise fall back to a real copy
+    return True
+
+
+def iter_docs(path: Path):
+    with path.open("r", encoding="utf-8") as f:
+        for line in f:
+            yield json.loads(line)["text"]
+
+
+def count_docs(path: Path) -> int:
+    with path.open("r", encoding="utf-8") as f:
+        return sum(1 for _ in f)
+
+
+def batched_docs_jsonl(path: Path, batch_size: int):
+    batch: list[str] = []
+    for text in iter_docs(path):
+        batch.append(text)
+        if len(batch) == batch_size:
+            yield batch
+            batch = []
+    if batch:
+        yield batch
+
+
+def write_datafile(path: Path, toks: Any) -> None:
+    if len(toks) >= 2**31:
+        raise ValueError("token count too large")
+    header = np.zeros(256, dtype="<i4")
+    header[0] = DATAFILE_MAGIC
+    header[1] = DATAFILE_VERSION
+    header[2] = len(toks)
+    toks = np.asarray(toks)
+    if toks.dtype != np.uint16:
+        if not ((0 <= toks).all() and (toks < 2**16).all()):
+            raise ValueError("token dictionary too large for uint16")
+        toks = toks.astype("<u2", copy=False)
+    else:
+        toks = toks.astype("<u2", copy=False)
+    with path.open("wb") as f:
+        f.write(header.tobytes())
+        f.write(toks.tobytes())
+
+
+def relativize_manifest_paths(value: Any, root: Path) -> Any:
+    if isinstance(value, dict):
+        return {k: relativize_manifest_paths(v, root) for k, v in value.items()}
+    if isinstance(value, list):
+        return [relativize_manifest_paths(v, root) for v in value]
+    if isinstance(value, str):
+        path = Path(value)
+        if path.is_absolute():
+            try:
+                return path.relative_to(root).as_posix()
+            except ValueError:
+                return value
+    return value
+
+
+def parse_reuse_sp_models(values: list[str]) -> dict[int, Path]:
+    reuse_models: dict[int, Path] = {}  # vocab size -> resolved model path
+    for value in values:
+        vocab_size_str, model_path = value.split("=", 1)  # each value is "VOCAB=MODEL"
+        vocab_size = int(vocab_size_str)
+        if vocab_size in reuse_models:
+            raise ValueError(f"duplicate --reuse-sp-model for vocab_size={vocab_size}")
+        reuse_models[vocab_size] = Path(model_path).expanduser().resolve()
+    return reuse_models
+
+
+def load_specs(config_path: Path) -> list[dict[str, Any]]:
+    payload = json.loads(config_path.read_text(encoding="utf-8"))
+    if isinstance(payload, dict):
+        specs = payload.get("tokenizer_specs", payload.get("tokenizers"))
+    else:
+        specs = payload
+    if not isinstance(specs, list) or not specs:
+        raise ValueError("tokenizer_config must define a non-empty list")
+    if not all(isinstance(spec, dict) for spec in specs):
+        raise ValueError("each tokenizer spec must be a JSON object")
+    return [dict(spec) for spec in specs]
+
+
+def tokenizer_kind(spec: dict[str, Any]) -> str:
+    kind = spec.get("kind")
+    if kind in {"byte", "pure_byte"}:
+        return "byte"
+    if kind in {"sentencepiece_bpe", "sentencepiece"}:
+        return "sentencepiece_bpe"
+    builder = str(spec.get("builder", ""))
+    builder_name = builder.rsplit(":", 1)[-1]
+    if builder_name == "build_pure_byte_tokenizer":
+        return "byte"
+    if builder_name == "build_sentencepiece_tokenizer":
+        return "sentencepiece_bpe"
+    if spec.get("dataset_suffix") == "byte260":
+        return "byte"
+    if "vocab_size" in spec:
+        return "sentencepiece_bpe"
+    raise ValueError(
+        f"unsupported tokenizer spec {spec.get('name', '<unnamed>')!r}: "
+        "expected a built-in pure-byte or sentencepiece builder"
+    )
+
+
+def write_tokenizer_config_export(output_root: Path, selected_specs: list[dict[str, Any]]) -> Path:
+    path = output_root / "tokenizer_config.export.json"
+    path.write_text(json.dumps({"tokenizers": selected_specs}, indent=2) + "\n", encoding="utf-8")
+    return path
+
+
+def _iter_sentencepiece_text(docs_jsonl: Path, *, max_docs: int | None = None):
+    with docs_jsonl.open("r", encoding="utf-8") as f:
+        for i, line in enumerate(f):
+            if max_docs is not None and i >= max_docs:
+                break
+            text = json.loads(line)["text"].replace("\x00", " ").strip()
+            if text:
+                yield text
+
+
+def build_pure_byte_tokenizer(*, spec: dict[str, Any], docs_jsonl: Path, tokenizers_dir: Path) -> dict[str, Any]:
+    del docs_jsonl
+    tok = default_pure_byte_tokenizer()
+    path = tokenizers_dir / spec.get("filename", "fineweb_pure_byte_260.json")
+    tok.save_json(path)
+    return {
+        "name": spec.get("name", "pure_byte_260"),
+        "kind": "byte",
+        "dataset_suffix": spec.get("dataset_suffix", "byte260"),
+        "vocab_size": tok.vocab_size,
+        "bos_id": tok.bos_id,
+        "eos_id": tok.eos_id,
+        "encode": tok.encode,
+        "encode_batch": tok.encode_batch,
+        "manifest": {"path": str(path), "pad_id": tok.pad_id, "unk_id": tok.unk_id},
+    }
+
+
+def build_sentencepiece_tokenizer(*, spec: dict[str, Any], docs_jsonl: Path, tokenizers_dir: Path) -> dict[str, Any]:
+    try:
+        import sentencepiece as spm
+    except ImportError as exc:
+        raise RuntimeError("sentencepiece is required for SentencePiece tokenizer exports") from exc
+
+    vocab_size = int(spec["vocab_size"])
+    prefix = tokenizers_dir / spec.get("model_prefix", f"fineweb_{vocab_size}_bpe")
+    model_path = prefix.with_suffix(".model")
+    vocab_path = prefix.with_suffix(".vocab")
+    prefix.parent.mkdir(parents=True, exist_ok=True)
+    for artifact in (model_path, vocab_path):
+        if artifact.exists():
+            artifact.unlink()
+
+    reuse_model_path = spec.get("reuse_model_path")
+    if reuse_model_path is not None:
+        reuse_model_path = Path(reuse_model_path).expanduser().resolve()
+        if not reuse_model_path.is_file():
+            raise FileNotFoundError(reuse_model_path)
+        shutil.copy2(reuse_model_path, model_path)
+        reuse_vocab_path = reuse_model_path.with_suffix(".vocab")
+        if reuse_vocab_path.is_file():
+            shutil.copy2(reuse_vocab_path, vocab_path)
+    else:
+        kwargs = {
+            "sentence_iterator": _iter_sentencepiece_text(
+                docs_jsonl,
+                max_docs=None if spec.get("tokenizer_train_docs") is None else int(spec["tokenizer_train_docs"]),
+            ),
+            "model_prefix": str(prefix),
+            "model_type": "bpe",
+            "vocab_size": vocab_size,
+            "character_coverage": 0.999,
+            "byte_fallback": True,
+            "split_digits": True,
+            "normalization_rule_name": "nmt_nfkc",
+            "add_dummy_prefix": False,
+            "pad_id": 0,
+            "bos_id": 1,
+            "eos_id": 2,
+            "unk_id": 3,
+            "hard_vocab_limit": False,
+        }
+        kwargs.update(spec.get("trainer_overrides") or {})
+        spm.SentencePieceTrainer.train(**kwargs)
+
+    tok = spm.SentencePieceProcessor(model_file=str(model_path))
+    return {
+        "name": spec.get("name", f"sp_bpe_{vocab_size}"),
+        "kind": "sentencepiece_bpe",
+        "dataset_suffix": spec.get("dataset_suffix", f"sp{vocab_size}"),
+        "vocab_size": int(tok.vocab_size()),
+        "bos_id": int(tok.bos_id()),
+        "eos_id": int(tok.eos_id()),
+        "encode": lambda text, tok=tok: tok.encode(text, out_type=int),
+        "encode_batch": lambda texts, tok=tok: tok.encode(texts, out_type=int, num_threads=TOKENIZER_THREADS),
+        "manifest": {"model_path": str(model_path), "vocab_path": str(vocab_path)},
+    }
+
+
+def export_shards(
+    docs_jsonl: Path,
+    tok: dict[str, Any],
+    output_dir: Path,
+    *,
+    num_val_docs: int,
+    shard_size: int,
+    docs_total: int,
+) -> dict[str, int]:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    for pattern in ("fineweb_train_*.bin", "fineweb_val_*.bin"):
+        for stale in output_dir.glob(pattern):
+            stale.unlink()  # remove shards left over from a previous export
+
+    stats = {
+        "docs_total": 0,
+        "docs_val": 0,
+        "docs_train": 0,
+        "files_total": 0,
+        "files_val": 0,
+        "files_train": 0,
+        "tokens_total": 0,
+        "tokens_val": 0,
+        "tokens_train": 0,
+    }
+    buf = np.empty((shard_size,), dtype=np.uint16)  # reusable buffer holding one shard
+    fill = 0
+    split = "val"  # the first num_val_docs documents go to val, the rest to train
+    shards = {"val": 0, "train": 0}
+
+    def flush() -> None:
+        nonlocal fill
+        if fill == 0:
+            return
+        write_datafile(output_dir / f"fineweb_{split}_{shards[split]:06d}.bin", buf[:fill])
+        stats["files_total"] += 1
+        stats[f"files_{split}"] += 1
+        shards[split] += 1
+        fill = 0
+
+    vocab_size = int(tok["vocab_size"])
+    if vocab_size > 2**16:
+        raise ValueError(f"vocab_size={vocab_size} is too large for uint16 shard storage")
+
+    batch_encode = tok.get("encode_batch")
+    batch_size = SP_BATCH_SIZE if callable(batch_encode) else 1
+    for texts in batched_docs_jsonl(docs_jsonl, batch_size):
+        encoded_docs = batch_encode(texts) if callable(batch_encode) else [tok["encode"](text) for text in texts]
+        for text, encoded in zip(texts, encoded_docs, strict=True):
+            del text
+            split_for_doc = "val" if stats["docs_total"] < num_val_docs else "train"
+            if split_for_doc != split:
+                flush()  # close the partial shard so the two splits never share a file
+                split = split_for_doc
+
+            encoded_arr = np.asarray(encoded, dtype=np.int32)
+            toks = np.empty((encoded_arr.size + 1 + int(APPEND_EOS),), dtype=np.int32)
+            toks[0] = tok["bos_id"]
+            toks[1 : 1 + encoded_arr.size] = encoded_arr
+            if APPEND_EOS:
+                toks[-1] = tok["eos_id"]
+            if not ((0 <= toks).all() and (toks < vocab_size).all()):
+                bad = int(toks[(toks < 0) | (toks >= vocab_size)][0])
+                raise ValueError(f"token id {bad} outside declared vocab_size={vocab_size}")
+            toks = toks.astype("<u2", copy=False)
+
+            stats["docs_total"] += 1
+            stats[f"docs_{split}"] += 1
+            stats["tokens_total"] += len(toks)
+            stats[f"tokens_{split}"] += len(toks)
+
+            pos = 0
+            while pos < len(toks):  # a document may span several shards
+                take = min(shard_size - fill, len(toks) - pos)
+                buf[fill : fill + take] = toks[pos : pos + take]
+                fill += take
+                pos += take
+                if fill == shard_size:
+                    flush()
+
+        if stats["docs_total"] // 100_000 > (stats["docs_total"] - len(texts)) // 100_000:  # batch crossed a 100k mark
+            print(f"{output_dir.name}: {stats['docs_total']}/{docs_total} docs", flush=True)
+
+    flush()
+    if stats["docs_total"] != docs_total:
+        raise ValueError(f"expected {docs_total} docs, exported {stats['docs_total']}")
+    return stats
+
+
+def build_tokenizers(
+    *,
+    specs: list[dict[str, Any]],
+    docs_jsonl: Path,
+    tokenizers_dir: Path,
+    tokenizer_train_docs: int | None,
+    skip_byte: bool,
+    reuse_sp_models: dict[int, Path],
+) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
+    tokenizers: list[dict[str, Any]] = []
+    selected_specs: list[dict[str, Any]] = []
+    seen_names: set[str] = set()
+    seen_datasets: set[str] = set()
+
+    for raw_spec in specs:
+        spec = dict(raw_spec)
+        kind = tokenizer_kind(spec)
+        if skip_byte and kind == "byte":
+            continue
+        if kind == "sentencepiece_bpe":
+            if tokenizer_train_docs is not None:
+                spec["tokenizer_train_docs"] = int(tokenizer_train_docs)
+            vocab_size = int(spec["vocab_size"])
+            if vocab_size in reuse_sp_models:
+                spec["reuse_model_path"] = str(reuse_sp_models[vocab_size])
+
+        selected_specs.append(spec)
+        built = (
+            build_pure_byte_tokenizer(spec=spec, docs_jsonl=docs_jsonl, tokenizers_dir=tokenizers_dir)
+            if kind == "byte"
+            else build_sentencepiece_tokenizer(spec=spec, docs_jsonl=docs_jsonl, tokenizers_dir=tokenizers_dir)
+        )
+        name = str(built["name"])
+        dataset_suffix = built.get("dataset_suffix")
+        dataset_name = str(built.get("dataset_name", f"fineweb{VERSION}_{dataset_suffix}"))
+        if name in seen_names:
+            raise ValueError(f"duplicate tokenizer name: {name}")
+        if dataset_name in seen_datasets:
+            raise ValueError(f"duplicate dataset name: {dataset_name}")
+        seen_names.add(name)
+        seen_datasets.add(dataset_name)
+        vocab_size = int(built["vocab_size"])
+        recommended_bigram_vocab_size = int(
+            built.get("recommended_bigram_vocab_size", ((vocab_size + 127) // 128) * 128 * 5)
+        )
+        tokenizers.append(
+            {
+                "name": name,
+                "kind": str(built["kind"]),
+                "dataset_name": dataset_name,
+                "vocab_size": vocab_size,
+                "bos_id": int(built["bos_id"]),
+                "eos_id": int(built["eos_id"]),
+                "encode": built["encode"],
+                "encode_batch": built.get("encode_batch"),
+                "recommended_bigram_vocab_size": recommended_bigram_vocab_size,
+                "manifest": {
+                    "name": name,
+                    "kind": str(built["kind"]),
+                    "vocab_size": vocab_size,
+                    "bos_id": int(built["bos_id"]),
+                    "eos_id": int(built["eos_id"]),
+                    "recommended_bigram_vocab_size": recommended_bigram_vocab_size,
+                    "source_spec": spec,
+                    **(built.get("manifest") or {}),
+                },
+            }
+        )
+    if not tokenizers:
+        raise ValueError("tokenizer_config produced no tokenizers after filtering")
+    return tokenizers, selected_specs
+
+
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        description="Download docs_selected.jsonl from a Hugging Face dataset repo and tokenize it locally"
+    )
+    parser.add_argument(
+        "--repo-id",
+        default=DEFAULT_REPO_ID,
+        help="Hugging Face dataset repo id, for example user/dataset",
+    )
+    parser.add_argument(
+        "--remote-root",
+        default=DEFAULT_REMOTE_ROOT,
+        help="Optional subdirectory inside the dataset repo that contains docs_selected.jsonl",
+    )
+    parser.add_argument("--output-root", required=True, help="Directory where docs, tokenizers, shards, and manifest are written")
+    parser.add_argument(
+        "--tokenizer-config",
+        default=str(DEFAULT_CONFIG),
+        help="Local tokenizer config JSON. Defaults to data/tokenizer_specs.json.",
+    )
+    parser.add_argument(
+        "--num-val-docs",
+        type=int,
+        default=None,
+        help="Validation document count. Defaults to the downloaded sidecar when present, otherwise 50000.",
+    )
+    parser.add_argument("--chunk-tokens", type=int, default=SHARD_SIZE, help="Shard size in tokens.")
+    parser.add_argument(
+        "--tokenizer-train-docs",
+        type=int,
+        default=None,
+        help="Limit the number of docs used for tokenizer training.",
+    )
+    parser.add_argument("--skip-byte", action="store_true", help="Skip byte-tokenizer export.")
+    parser.add_argument(
+        "--reuse-sp-model",
+        action="append",
+        default=[],
+        metavar="VOCAB=MODEL",
+        help="Reuse an existing SentencePiece model for the given vocab size instead of retraining it.",
+    )
+    return parser
+
+
+def main() -> None:
+    args = build_parser().parse_args()
+    if args.chunk_tokens <= 0:
+        raise ValueError(f"--chunk_tokens must be positive, got {args.chunk_tokens}")
+
+    output_root = Path(args.output_root).expanduser().resolve()
+    output_root.mkdir(parents=True, exist_ok=True)
+    tokenizers_dir = output_root / "tokenizers"
+    datasets_dir = output_root / "datasets"
+    tokenizers_dir.mkdir(parents=True, exist_ok=True)
+    datasets_dir.mkdir(parents=True, exist_ok=True)
+
+    docs_jsonl = output_root / DOCS_FILENAME
+    sidecar = output_root / SIDECAR_FILENAME
+    if not copy_from_hf_cache(
+        repo_id=args.repo_id,
+        remote_root=args.remote_root,
+        filename=DOCS_FILENAME,
+        destination=docs_jsonl,
+    ):
+        remote = f"{args.remote_root}/{DOCS_FILENAME}" if args.remote_root else DOCS_FILENAME
+        raise FileNotFoundError(f"{remote} not found in Hugging Face dataset repo {args.repo_id}")
+    if not copy_from_hf_cache(
+        repo_id=args.repo_id,
+        remote_root=args.remote_root,
+        filename=SIDECAR_FILENAME,
+        destination=sidecar,
+    ):
+        sidecar.unlink(missing_ok=True)
+
+    docs_sidecar = maybe_load_docs_sidecar_meta(docs_jsonl)
+    docs_total = int(docs_sidecar["num_docs"]) if docs_sidecar is not None and docs_sidecar.get("num_docs") is not None else count_docs(docs_jsonl)
+    if args.num_val_docs is not None:
+        num_val_docs = int(args.num_val_docs)
+    elif docs_sidecar is not None and docs_sidecar.get("docs_val") is not None:
+        num_val_docs = int(docs_sidecar["docs_val"])
+    else:
+        num_val_docs = NUM_VAL_DOCS
+    if not (0 <= num_val_docs <= docs_total):
+        raise ValueError(f"num_val_docs must be in [0, {docs_total}], got {num_val_docs}")
+
+    specs = load_specs(Path(args.tokenizer_config).expanduser().resolve())
+    reuse_sp_models = parse_reuse_sp_models(args.reuse_sp_model)
+    tokenizers, selected_specs = build_tokenizers(
+        specs=specs,
+        docs_jsonl=docs_jsonl,
+        tokenizers_dir=tokenizers_dir,
+        tokenizer_train_docs=args.tokenizer_train_docs,
+        skip_byte=args.skip_byte,
+        reuse_sp_models=reuse_sp_models,
+    )
+    write_tokenizer_config_export(output_root, selected_specs)
+
+    docs_meta = {
+        "remote_repo_id": args.repo_id,
+        "remote_root": args.remote_root,
+        "num_docs": docs_total,
+        "docs_sha256": None if docs_sidecar is None else docs_sidecar.get("docs_sha256"),
+        "source_manifest": str(docs_sidecar_path(docs_jsonl)) if docs_sidecar is not None else None,
+    }
+    if docs_sidecar is not None:
+        docs_meta["source_sidecar"] = docs_sidecar
+
+    manifest = {
+        "version": VERSION,
+        "num_docs": docs_total,
+        "num_val_docs": num_val_docs,
+        "shuffle_seed": None if docs_sidecar is None else docs_sidecar.get("shuffle_seed"),
+        "shard_size": int(args.chunk_tokens),
+        "append_eos": APPEND_EOS,
+        "docs_jsonl": str(docs_jsonl),
+        "docs_meta": docs_meta,
+        "tokenizer_specs": selected_specs,
+        "tokenizers": [],
+        "datasets": [],
+    }
+
+    for tok in tokenizers:
+        output_dir = datasets_dir / tok["dataset_name"]
+        print(f"Exporting dataset: {tok['dataset_name']}", flush=True)
+        stats = export_shards(
+            docs_jsonl,
+            tok,
+            output_dir,
+            num_val_docs=num_val_docs,
+            shard_size=int(args.chunk_tokens),
+            docs_total=docs_total,
+        )
+        manifest["tokenizers"].append(tok["manifest"])
+        manifest["datasets"].append(
+            {
+                "name": tok["dataset_name"],
+                "tokenizer_name": tok["name"],
+                "tokenizer_kind": tok["kind"],
+                "path": str(output_dir),
+                "train_glob": str(output_dir / "fineweb_train_*.bin"),
+                "val_glob": str(output_dir / "fineweb_val_*.bin"),
+                "vocab_size": tok["vocab_size"],
+                "bos_id": tok["bos_id"],
+                "eos_id": tok["eos_id"],
+                "recommended_bigram_vocab_size": tok["recommended_bigram_vocab_size"],
+                "stats": stats,
+            }
+        )
+
+    manifest = relativize_manifest_paths(manifest, output_root)
+    manifest_path = output_root / "manifest.json"
+    manifest_path.write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8")
+    print(f"Done. Manifest: {manifest_path}", flush=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/data/tokenizer_specs.json b/data/tokenizer_specs.json
new file mode 100644
index 0000000..d7ad1ca
--- /dev/null
+++ b/data/tokenizer_specs.json
@@ -0,0 +1,9 @@
+{
+  "tokenizers": [
+    {
+      "name": "sp_bpe_1024",
+      "dataset_suffix": "sp1024",
+      "vocab_size": 1024
+    }
+  ]
+}
author	Will DePue <williamd@openai.com>	2026-03-18 09:32:01 -0700
committer	Will DePue <williamd@openai.com>	2026-03-18 09:32:01 -0700
commit	a15093adad328a650d421e53c078cbd2c45beb0e (patch)
tree	e054c4bde12b89e6d3b39d611d9caadabc7f7234 /data