summaryrefslogtreecommitdiff
path: root/data
diff options
context:
space:
mode:
authorWill DePue <williamd@openai.com>2026-03-18 09:32:01 -0700
committerWill DePue <williamd@openai.com>2026-03-18 09:32:01 -0700
commita15093adad328a650d421e53c078cbd2c45beb0e (patch)
treee054c4bde12b89e6d3b39d611d9caadabc7f7234 /data
Launch snapshot
Diffstat (limited to 'data')
-rw-r--r--data/README.md66
-rw-r--r--data/cached_challenge_fineweb.py157
-rw-r--r--data/download_hf_docs_and_tokenize.py627
-rw-r--r--data/tokenizer_specs.json9
4 files changed, 859 insertions, 0 deletions
diff --git a/data/README.md b/data/README.md
new file mode 100644
index 0000000..e1920ad
--- /dev/null
+++ b/data/README.md
@@ -0,0 +1,66 @@
+# Data Workflows
+
+This directory contains the dataset download helpers and export scripts used for the challenge.
+
+Canonical local layout:
+- `data/datasets/<dataset_name>/`
+- `data/tokenizers/`
+- `data/manifest.json`
+- `data/docs_selected.jsonl`
+- `data/docs_selected.source_manifest.json`
+
+## Downloading Published Data
+
+Download the cached FineWeb export for a tokenizer variant with:
+
+```bash
+python3 data/cached_challenge_fineweb.py --variant sp1024
+```
+
+This populates `./data/datasets/fineweb10B_sp1024/` and `./data/tokenizers/`.
+By default it downloads the full validation split and 8B training tokens (80 train shards).
+
+To fetch more training shards, pass `--train-shards`:
+
+```bash
+python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 180
+```
+
+The downloader is manifest-driven and can fetch only a prefix of train shards from a larger published export. With the current shard size of `100_000_000` tokens, `10B` retokenized training tokens is `100` train shards:
+
+```bash
+MATCHED_FINEWEB_REPO_ID=your-hf-username/your-dataset-repo \
+MATCHED_FINEWEB_REMOTE_ROOT_PREFIX=your_50B_export_root \
+python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 100
+```
+
+Validation is always downloaded in full from the fixed `fineweb_val_*` split. Training on the first `N` train shards means training on the prefix of the same frozen shuffled export, so the data order stays aligned with the baseline for that tokenizer family.
+
+The default published repo is `willdepueoai/parameter-golf`, with the export rooted under the repo subdirectory `datasets/`.
+
+## Rebuilding Tokenizers From Published Docs
+
+To retrain a tokenizer or re-export shards from exactly the same selected documents, run the standalone retokenizer against the published docs cache:
+
+```bash
+python3 data/download_hf_docs_and_tokenize.py \
+ --repo-id your-hf-username/your-dataset-repo \
+ --remote-root your_50B_export_root \
+ --output-root /tmp/my_custom_tokenizer_export \
+ --tokenizer-config ./data/tokenizer_specs.json
+```
+
+The sidecar `docs_selected.source_manifest.json` includes `docs_sha256`, so users can verify they are rebuilding from the exact same document list and order as the baseline export.
+
+## Useful Knobs
+
+For CPU-heavy exports, useful knobs are:
+
+```bash
+MATCHED_FINEWEB_SP_BATCH_SIZE=2048
+MATCHED_FINEWEB_TOKENIZER_THREADS=16
+MATCHED_FINEWEB_TIKTOKEN_THREADS=16
+MATCHED_FINEWEB_GPT2_DECODE_BATCH_SIZE=512
+```
+
+These control batched tokenizer encoding during shard export, tokenizer thread count, tiktoken thread count, and batched GPT-2 decode for the blobstore docs-cache path.
diff --git a/data/cached_challenge_fineweb.py b/data/cached_challenge_fineweb.py
new file mode 100644
index 0000000..fa8029b
--- /dev/null
+++ b/data/cached_challenge_fineweb.py
@@ -0,0 +1,157 @@
+import argparse
+import json
+import os
+import shutil
+from pathlib import Path
+
+from huggingface_hub import hf_hub_download
+
+
+REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", "willdepueoai/parameter-golf")
+REMOTE_ROOT_PREFIX = os.environ.get("MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", "datasets")
+ROOT = Path(__file__).resolve().parent
+DATASETS_DIR = ROOT / "datasets"
+TOKENIZERS_DIR = ROOT / "tokenizers"
+
+def dataset_dir_for_variant(name: str) -> str:
+ if name == "byte260":
+ return "fineweb10B_byte260"
+ if name.startswith("sp") and name[2:].isdigit():
+ return f"fineweb10B_{name}"
+ raise ValueError(f"unsupported variant {name!r}; expected byte260 or sp<VOCAB_SIZE>")
+
+
+def local_path_for_remote(relative_path: str) -> Path:
+ remote_path = Path(relative_path)
+ if REMOTE_ROOT_PREFIX and remote_path.parts[:1] == (REMOTE_ROOT_PREFIX,):
+ remote_path = remote_path.relative_to(REMOTE_ROOT_PREFIX)
+ if remote_path.parts[:1] == ("datasets",):
+ return DATASETS_DIR.joinpath(*remote_path.parts[1:])
+ if remote_path.parts[:1] == ("tokenizers",):
+ return TOKENIZERS_DIR.joinpath(*remote_path.parts[1:])
+ return ROOT / remote_path
+
+
+def get(relative_path: str) -> None:
+ destination = local_path_for_remote(relative_path)
+ if destination.exists():
+ return
+ if destination.is_symlink():
+ destination.unlink()
+
+ remote_path = Path(relative_path)
+ cached_path = Path(
+ hf_hub_download(
+ repo_id=REPO_ID,
+ filename=remote_path.name,
+ subfolder=remote_path.parent.as_posix() if remote_path.parent != Path(".") else None,
+ repo_type="dataset",
+ )
+ )
+ # HF cache entries may be snapshot symlinks. Resolve to the underlying blob so we
+ # always materialize a real file in data/, not a broken relative symlink.
+ cached_source = cached_path.resolve(strict=True)
+ destination.parent.mkdir(parents=True, exist_ok=True)
+ try:
+ os.link(cached_source, destination)
+ except OSError:
+ shutil.copy2(cached_source, destination)
+
+
+def manifest_path() -> Path:
+ return local_path_for_remote(f"{REMOTE_ROOT_PREFIX}/manifest.json")
+
+
+def load_manifest(*, skip_manifest_download: bool) -> dict:
+ path = manifest_path()
+ if not path.is_file():
+ if skip_manifest_download:
+ raise FileNotFoundError(
+ f"manifest.json is required for manifest-driven shard counts but is not present locally at {path}"
+ )
+ get(f"{REMOTE_ROOT_PREFIX}/manifest.json")
+ return json.loads(path.read_text(encoding="utf-8"))
+
+
+def artifact_paths_for_tokenizer(tokenizer_entry: dict) -> list[str]:
+ artifacts = []
+ for key in ("model_path", "vocab_path", "path"):
+ value = tokenizer_entry.get(key)
+ if value:
+ artifacts.append(str(value))
+ if not artifacts:
+ raise ValueError(f"tokenizer entry is missing downloadable artifacts: {tokenizer_entry}")
+ return artifacts
+
+
+def build_parser() -> argparse.ArgumentParser:
+ parser = argparse.ArgumentParser(description="Download challenge FineWeb shards from Hugging Face")
+ parser.add_argument(
+ "train_shards_positional",
+ nargs="?",
+ type=int,
+ default=None,
+ help=argparse.SUPPRESS,
+ )
+ parser.add_argument(
+ "--train-shards",
+ type=int,
+ default=80,
+ help="Number of training shards to download for the selected variant. Defaults to 80.",
+ )
+ parser.add_argument(
+ "--variant",
+ default="sp1024",
+ help="Tokenizer family to download, for example sp1024, sp4096, or byte260.",
+ )
+ parser.add_argument(
+ "--skip-manifest",
+ action="store_true",
+ help="Skip downloading manifest.json.",
+ )
+ parser.add_argument(
+ "--with-docs",
+ action="store_true",
+ help="Also download docs_selected.jsonl and its sidecar for tokenizer retraining or dataset re-export.",
+ )
+ return parser
+
+
+def main() -> None:
+ args = build_parser().parse_args()
+ dataset_dir = dataset_dir_for_variant(args.variant)
+ train_shards = args.train_shards_positional if args.train_shards_positional is not None else args.train_shards
+ if train_shards < 0:
+ raise ValueError("train_shards must be non-negative")
+
+ manifest = load_manifest(skip_manifest_download=args.skip_manifest)
+ dataset_entry = next((x for x in manifest.get("datasets", []) if x.get("name") == dataset_dir), None)
+ if dataset_entry is None:
+ raise ValueError(f"dataset {dataset_dir} not found in {REMOTE_ROOT_PREFIX}/manifest.json")
+ max_train_shards = int((dataset_entry.get("stats") or {}).get("files_train"))
+ val_shards = int((dataset_entry.get("stats") or {}).get("files_val"))
+ if train_shards > max_train_shards:
+ raise ValueError(
+ f"{args.variant} only has {max_train_shards} training shards on {REPO_ID}, requested {train_shards}"
+ )
+ tokenizer_name = dataset_entry.get("tokenizer_name")
+ tokenizer_entry = next((x for x in manifest.get("tokenizers", []) if x.get("name") == tokenizer_name), None)
+ if tokenizer_entry is None:
+ raise ValueError(f"tokenizer {tokenizer_name} not found in {REMOTE_ROOT_PREFIX}/manifest.json")
+
+ if args.with_docs:
+ get(f"{REMOTE_ROOT_PREFIX}/docs_selected.jsonl")
+ get(f"{REMOTE_ROOT_PREFIX}/docs_selected.source_manifest.json")
+
+ dataset_prefix = f"{REMOTE_ROOT_PREFIX}/datasets/{dataset_dir}"
+ for i in range(val_shards):
+ get(f"{dataset_prefix}/fineweb_val_{i:06d}.bin")
+ for i in range(train_shards):
+ get(f"{dataset_prefix}/fineweb_train_{i:06d}.bin")
+
+ for artifact_path in artifact_paths_for_tokenizer(tokenizer_entry):
+ get(f"{REMOTE_ROOT_PREFIX}/{artifact_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/data/download_hf_docs_and_tokenize.py b/data/download_hf_docs_and_tokenize.py
new file mode 100644
index 0000000..dcabd40
--- /dev/null
+++ b/data/download_hf_docs_and_tokenize.py
@@ -0,0 +1,627 @@
+"""Download docs_selected.jsonl from Hugging Face and tokenize it locally.
+
+This script is standalone. It does not import any local exporter or tokenizer
+helpers. Tokenizer configs are JSON only and currently support the built-in
+pure-byte and SentencePiece tokenizer definitions in `data/tokenizer_specs.json`.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shutil
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import EntryNotFoundError
+
+
+DOCS_FILENAME = "docs_selected.jsonl"
+SIDECAR_FILENAME = "docs_selected.source_manifest.json"
+VERSION = "10B"
+NUM_VAL_DOCS = 50_000
+SHARD_SIZE = 10**8
+APPEND_EOS = False
+DATAFILE_MAGIC = 20240520
+DATAFILE_VERSION = 1
+DEFAULT_REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", "willdepueoai/parameter-golf")
+DEFAULT_REMOTE_ROOT = os.environ.get("MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", "datasets")
+DEFAULT_CONFIG = Path(__file__).with_name("tokenizer_specs.json")
+TOKENIZER_THREADS = max(1, int(os.environ.get("MATCHED_FINEWEB_TOKENIZER_THREADS", str(os.cpu_count() or 8))))
+SP_BATCH_SIZE = max(1, int(os.environ.get("MATCHED_FINEWEB_SP_BATCH_SIZE", "1024")))
+
+
+@dataclass(frozen=True)
+class PureByteTokenizer:
+ pad_id: int = 0
+ bos_id: int = 1
+ eos_id: int = 2
+ unk_id: int = 3
+ byte_offset: int = 4
+ byte_count: int = 256
+
+ @property
+ def vocab_size(self) -> int:
+ return self.byte_offset + self.byte_count
+
+ def encode(self, text: str) -> np.ndarray:
+ data = text.encode("utf-8", errors="replace")
+ return np.frombuffer(data, dtype=np.uint8).astype(np.uint16, copy=False) + self.byte_offset
+
+ def encode_batch(self, texts: list[str]) -> list[np.ndarray]:
+ return [self.encode(text) for text in texts]
+
+ def save_json(self, path: str | Path) -> None:
+ path = Path(path)
+ path.parent.mkdir(parents=True, exist_ok=True)
+ payload = {
+ "tokenizer_type": "pure_byte",
+ "config": asdict(self),
+ "vocab_size": self.vocab_size,
+ }
+ path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+
+
+def default_pure_byte_tokenizer() -> PureByteTokenizer:
+ return PureByteTokenizer()
+
+
+def docs_sidecar_path(docs_jsonl: Path) -> Path:
+ return docs_jsonl.with_name(f"{docs_jsonl.stem}.source_manifest.json")
+
+
+def maybe_load_docs_sidecar_meta(docs_jsonl: Path) -> dict[str, Any] | None:
+ sidecar_path = docs_sidecar_path(docs_jsonl)
+ if not sidecar_path.is_file():
+ return None
+ payload = json.loads(sidecar_path.read_text(encoding="utf-8"))
+ if not isinstance(payload, dict):
+ raise ValueError(f"docs sidecar must be a JSON object: {sidecar_path}")
+ return payload
+
+
+def copy_from_hf_cache(*, repo_id: str, remote_root: str, filename: str, destination: Path) -> bool:
+ remote_path = Path(remote_root) / filename if remote_root else Path(filename)
+ try:
+ cached_path = Path(
+ hf_hub_download(
+ repo_id=repo_id,
+ filename=remote_path.name,
+ subfolder=remote_path.parent.as_posix() if remote_path.parent != Path(".") else None,
+ repo_type="dataset",
+ )
+ )
+ except EntryNotFoundError:
+ return False
+
+ source = cached_path.resolve(strict=True)
+ destination.parent.mkdir(parents=True, exist_ok=True)
+ if destination.exists():
+ destination.unlink()
+ try:
+ os.link(source, destination)
+ except OSError:
+ shutil.copy2(source, destination)
+ return True
+
+
+def iter_docs(path: Path):
+ with path.open("r", encoding="utf-8") as f:
+ for line in f:
+ yield json.loads(line)["text"]
+
+
+def count_docs(path: Path) -> int:
+ with path.open("r", encoding="utf-8") as f:
+ return sum(1 for _ in f)
+
+
+def batched_docs_jsonl(path: Path, batch_size: int):
+ batch: list[str] = []
+ for text in iter_docs(path):
+ batch.append(text)
+ if len(batch) == batch_size:
+ yield batch
+ batch = []
+ if batch:
+ yield batch
+
+
+def write_datafile(path: Path, toks: Any) -> None:
+ if len(toks) >= 2**31:
+ raise ValueError("token count too large")
+ header = np.zeros(256, dtype="<i4")
+ header[0] = DATAFILE_MAGIC
+ header[1] = DATAFILE_VERSION
+ header[2] = len(toks)
+ toks = np.asarray(toks)
+ if toks.dtype != np.uint16:
+ if not ((0 <= toks).all() and (toks < 2**16).all()):
+ raise ValueError("token dictionary too large for uint16")
+ toks = toks.astype("<u2", copy=False)
+ else:
+ toks = toks.astype("<u2", copy=False)
+ with path.open("wb") as f:
+ f.write(header.tobytes())
+ f.write(toks.tobytes())
+
+
+def relativize_manifest_paths(value: Any, root: Path) -> Any:
+ if isinstance(value, dict):
+ return {k: relativize_manifest_paths(v, root) for k, v in value.items()}
+ if isinstance(value, list):
+ return [relativize_manifest_paths(v, root) for v in value]
+ if isinstance(value, str):
+ path = Path(value)
+ if path.is_absolute():
+ try:
+ return path.relative_to(root).as_posix()
+ except ValueError:
+ return value
+ return value
+
+
+def parse_reuse_sp_models(values: list[str]) -> dict[int, Path]:
+ reuse_models: dict[int, Path] = {}
+ for value in values:
+ vocab_size_str, model_path = value.split("=", 1)
+ vocab_size = int(vocab_size_str)
+ if vocab_size in reuse_models:
+ raise ValueError(f"duplicate --reuse_sp_model for vocab_size={vocab_size}")
+ reuse_models[vocab_size] = Path(model_path).expanduser().resolve()
+ return reuse_models
+
+
+def load_specs(config_path: Path) -> list[dict[str, Any]]:
+ payload = json.loads(config_path.read_text(encoding="utf-8"))
+ if isinstance(payload, dict):
+ specs = payload.get("tokenizer_specs", payload.get("tokenizers"))
+ else:
+ specs = payload
+ if not isinstance(specs, list) or not specs:
+ raise ValueError("tokenizer_config must define a non-empty list")
+ if not all(isinstance(spec, dict) for spec in specs):
+ raise ValueError("each tokenizer spec must be a JSON object")
+ return [dict(spec) for spec in specs]
+
+
+def tokenizer_kind(spec: dict[str, Any]) -> str:
+ kind = spec.get("kind")
+ if kind in {"byte", "pure_byte"}:
+ return "byte"
+ if kind in {"sentencepiece_bpe", "sentencepiece"}:
+ return "sentencepiece_bpe"
+ builder = str(spec.get("builder", ""))
+ builder_name = builder.rsplit(":", 1)[-1]
+ if builder_name == "build_pure_byte_tokenizer":
+ return "byte"
+ if builder_name == "build_sentencepiece_tokenizer":
+ return "sentencepiece_bpe"
+ if spec.get("dataset_suffix") == "byte260":
+ return "byte"
+ if "vocab_size" in spec:
+ return "sentencepiece_bpe"
+ raise ValueError(
+ f"unsupported tokenizer spec {spec.get('name', '<unnamed>')!r}: "
+ "expected a built-in pure-byte or sentencepiece builder"
+ )
+
+
+def write_tokenizer_config_export(output_root: Path, selected_specs: list[dict[str, Any]]) -> Path:
+ path = output_root / "tokenizer_config.export.json"
+ path.write_text(json.dumps({"tokenizers": selected_specs}, indent=2) + "\n", encoding="utf-8")
+ return path
+
+
+def _iter_sentencepiece_text(docs_jsonl: Path, *, max_docs: int | None = None):
+ with docs_jsonl.open("r", encoding="utf-8") as f:
+ for i, line in enumerate(f):
+ if max_docs is not None and i >= max_docs:
+ break
+ text = json.loads(line)["text"].replace("\x00", " ").strip()
+ if text:
+ yield text
+
+
+def build_pure_byte_tokenizer(*, spec: dict[str, Any], docs_jsonl: Path, tokenizers_dir: Path) -> dict[str, Any]:
+ del docs_jsonl
+ tok = default_pure_byte_tokenizer()
+ path = tokenizers_dir / spec.get("filename", "fineweb_pure_byte_260.json")
+ tok.save_json(path)
+ return {
+ "name": spec.get("name", "pure_byte_260"),
+ "kind": "byte",
+ "dataset_suffix": spec.get("dataset_suffix", "byte260"),
+ "vocab_size": tok.vocab_size,
+ "bos_id": tok.bos_id,
+ "eos_id": tok.eos_id,
+ "encode": tok.encode,
+ "encode_batch": tok.encode_batch,
+ "manifest": {"path": str(path), "pad_id": tok.pad_id, "unk_id": tok.unk_id},
+ }
+
+
+def build_sentencepiece_tokenizer(*, spec: dict[str, Any], docs_jsonl: Path, tokenizers_dir: Path) -> dict[str, Any]:
+ try:
+ import sentencepiece as spm
+ except ImportError as exc:
+ raise RuntimeError("sentencepiece is required for SentencePiece tokenizer exports") from exc
+
+ vocab_size = int(spec["vocab_size"])
+ prefix = tokenizers_dir / spec.get("model_prefix", f"fineweb_{vocab_size}_bpe")
+ model_path = prefix.with_suffix(".model")
+ vocab_path = prefix.with_suffix(".vocab")
+ prefix.parent.mkdir(parents=True, exist_ok=True)
+ for artifact in (model_path, vocab_path):
+ if artifact.exists():
+ artifact.unlink()
+
+ reuse_model_path = spec.get("reuse_model_path")
+ if reuse_model_path is not None:
+ reuse_model_path = Path(reuse_model_path).expanduser().resolve()
+ if not reuse_model_path.is_file():
+ raise FileNotFoundError(reuse_model_path)
+ shutil.copy2(reuse_model_path, model_path)
+ reuse_vocab_path = reuse_model_path.with_suffix(".vocab")
+ if reuse_vocab_path.is_file():
+ shutil.copy2(reuse_vocab_path, vocab_path)
+ else:
+ kwargs = {
+ "sentence_iterator": _iter_sentencepiece_text(
+ docs_jsonl,
+ max_docs=None if spec.get("tokenizer_train_docs") is None else int(spec["tokenizer_train_docs"]),
+ ),
+ "model_prefix": str(prefix),
+ "model_type": "bpe",
+ "vocab_size": vocab_size,
+ "character_coverage": 0.999,
+ "byte_fallback": True,
+ "split_digits": True,
+ "normalization_rule_name": "nmt_nfkc",
+ "add_dummy_prefix": False,
+ "pad_id": 0,
+ "bos_id": 1,
+ "eos_id": 2,
+ "unk_id": 3,
+ "hard_vocab_limit": False,
+ }
+ kwargs.update(spec.get("trainer_overrides") or {})
+ spm.SentencePieceTrainer.train(**kwargs)
+
+ tok = spm.SentencePieceProcessor(model_file=str(model_path))
+ return {
+ "name": spec.get("name", f"sp_bpe_{vocab_size}"),
+ "kind": "sentencepiece_bpe",
+ "dataset_suffix": spec.get("dataset_suffix", f"sp{vocab_size}"),
+ "vocab_size": int(tok.vocab_size()),
+ "bos_id": int(tok.bos_id()),
+ "eos_id": int(tok.eos_id()),
+ "encode": lambda text, tok=tok: tok.encode(text, out_type=int),
+ "encode_batch": lambda texts, tok=tok: tok.encode(texts, out_type=int, num_threads=TOKENIZER_THREADS),
+ "manifest": {"model_path": str(model_path), "vocab_path": str(vocab_path)},
+ }
+
+
+def export_shards(
+ docs_jsonl: Path,
+ tok: dict[str, Any],
+ output_dir: Path,
+ *,
+ num_val_docs: int,
+ shard_size: int,
+ docs_total: int,
+) -> dict[str, int]:
+ output_dir.mkdir(parents=True, exist_ok=True)
+ for pattern in ("fineweb_train_*.bin", "fineweb_val_*.bin"):
+ for stale in output_dir.glob(pattern):
+ stale.unlink()
+
+ stats = {
+ "docs_total": 0,
+ "docs_val": 0,
+ "docs_train": 0,
+ "files_total": 0,
+ "files_val": 0,
+ "files_train": 0,
+ "tokens_total": 0,
+ "tokens_val": 0,
+ "tokens_train": 0,
+ }
+ buf = np.empty((shard_size,), dtype=np.uint16)
+ fill = 0
+ split = "val"
+ shards = {"val": 0, "train": 0}
+
+ def flush() -> None:
+ nonlocal fill
+ if fill == 0:
+ return
+ write_datafile(output_dir / f"fineweb_{split}_{shards[split]:06d}.bin", buf[:fill])
+ stats["files_total"] += 1
+ stats[f"files_{split}"] += 1
+ shards[split] += 1
+ fill = 0
+
+ vocab_size = int(tok["vocab_size"])
+ if vocab_size > 2**16:
+ raise ValueError(f"vocab_size={vocab_size} is too large for uint16 shard storage")
+
+ batch_encode = tok.get("encode_batch")
+ batch_size = SP_BATCH_SIZE if callable(batch_encode) else 1
+ for texts in batched_docs_jsonl(docs_jsonl, batch_size):
+ encoded_docs = batch_encode(texts) if callable(batch_encode) else [tok["encode"](text) for text in texts]
+ for text, encoded in zip(texts, encoded_docs, strict=True):
+ del text
+ split_for_doc = "val" if stats["docs_total"] < num_val_docs else "train"
+ if split_for_doc != split:
+ flush()
+ split = split_for_doc
+
+ encoded_arr = np.asarray(encoded, dtype=np.int32)
+ toks = np.empty((encoded_arr.size + 1 + int(APPEND_EOS),), dtype=np.int32)
+ toks[0] = tok["bos_id"]
+ toks[1 : 1 + encoded_arr.size] = encoded_arr
+ if APPEND_EOS:
+ toks[-1] = tok["eos_id"]
+ if not ((0 <= toks).all() and (toks < vocab_size).all()):
+ bad = int(toks[(toks < 0) | (toks >= vocab_size)][0])
+ raise ValueError(f"token id {bad} outside declared vocab_size={vocab_size}")
+ toks = toks.astype("<u2", copy=False)
+
+ stats["docs_total"] += 1
+ stats[f"docs_{split}"] += 1
+ stats["tokens_total"] += len(toks)
+ stats[f"tokens_{split}"] += len(toks)
+
+ pos = 0
+ while pos < len(toks):
+ take = min(shard_size - fill, len(toks) - pos)
+ buf[fill : fill + take] = toks[pos : pos + take]
+ fill += take
+ pos += take
+ if fill == shard_size:
+ flush()
+
+ if stats["docs_total"] and stats["docs_total"] % 100_000 == 0:
+ print(f"{output_dir.name}: {stats['docs_total']}/{docs_total} docs", flush=True)
+
+ flush()
+ if stats["docs_total"] != docs_total:
+ raise ValueError(f"expected {docs_total} docs, exported {stats['docs_total']}")
+ return stats
+
+
+def build_tokenizers(
+ *,
+ specs: list[dict[str, Any]],
+ docs_jsonl: Path,
+ tokenizers_dir: Path,
+ tokenizer_train_docs: int | None,
+ skip_byte: bool,
+ reuse_sp_models: dict[int, Path],
+) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
+ tokenizers: list[dict[str, Any]] = []
+ selected_specs: list[dict[str, Any]] = []
+ seen_names: set[str] = set()
+ seen_datasets: set[str] = set()
+
+ for raw_spec in specs:
+ spec = dict(raw_spec)
+ kind = tokenizer_kind(spec)
+ if skip_byte and kind == "byte":
+ continue
+ if kind == "sentencepiece_bpe":
+ if tokenizer_train_docs is not None:
+ spec["tokenizer_train_docs"] = int(tokenizer_train_docs)
+ vocab_size = int(spec["vocab_size"])
+ if vocab_size in reuse_sp_models:
+ spec["reuse_model_path"] = str(reuse_sp_models[vocab_size])
+
+ selected_specs.append(spec)
+ built = (
+ build_pure_byte_tokenizer(spec=spec, docs_jsonl=docs_jsonl, tokenizers_dir=tokenizers_dir)
+ if kind == "byte"
+ else build_sentencepiece_tokenizer(spec=spec, docs_jsonl=docs_jsonl, tokenizers_dir=tokenizers_dir)
+ )
+ name = str(built["name"])
+ dataset_suffix = built.get("dataset_suffix")
+ dataset_name = str(built.get("dataset_name", f"fineweb{VERSION}_{dataset_suffix}"))
+ if name in seen_names:
+ raise ValueError(f"duplicate tokenizer name: {name}")
+ if dataset_name in seen_datasets:
+ raise ValueError(f"duplicate dataset name: {dataset_name}")
+ seen_names.add(name)
+ seen_datasets.add(dataset_name)
+ vocab_size = int(built["vocab_size"])
+ recommended_bigram_vocab_size = int(
+ built.get("recommended_bigram_vocab_size", ((vocab_size + 127) // 128) * 128 * 5)
+ )
+ tokenizers.append(
+ {
+ "name": name,
+ "kind": str(built["kind"]),
+ "dataset_name": dataset_name,
+ "vocab_size": vocab_size,
+ "bos_id": int(built["bos_id"]),
+ "eos_id": int(built["eos_id"]),
+ "encode": built["encode"],
+ "encode_batch": built.get("encode_batch"),
+ "recommended_bigram_vocab_size": recommended_bigram_vocab_size,
+ "manifest": {
+ "name": name,
+ "kind": str(built["kind"]),
+ "vocab_size": vocab_size,
+ "bos_id": int(built["bos_id"]),
+ "eos_id": int(built["eos_id"]),
+ "recommended_bigram_vocab_size": recommended_bigram_vocab_size,
+ "source_spec": spec,
+ **(built.get("manifest") or {}),
+ },
+ }
+ )
+ if not tokenizers:
+ raise ValueError("tokenizer_config produced no tokenizers after filtering")
+ return tokenizers, selected_specs
+
+
+def build_parser() -> argparse.ArgumentParser:
+ parser = argparse.ArgumentParser(
+ description="Download docs_selected.jsonl from a Hugging Face dataset repo and tokenize it locally"
+ )
+ parser.add_argument(
+ "--repo-id",
+ default=DEFAULT_REPO_ID,
+ help="Hugging Face dataset repo id, for example user/dataset",
+ )
+ parser.add_argument(
+ "--remote-root",
+ default=DEFAULT_REMOTE_ROOT,
+ help="Optional subdirectory inside the dataset repo that contains docs_selected.jsonl",
+ )
+ parser.add_argument("--output-root", required=True, help="Directory where docs, tokenizers, shards, and manifest are written")
+ parser.add_argument(
+ "--tokenizer-config",
+ default=str(DEFAULT_CONFIG),
+ help="Local tokenizer config JSON. Defaults to data/tokenizer_specs.json.",
+ )
+ parser.add_argument(
+ "--num-val-docs",
+ type=int,
+ default=None,
+ help="Validation document count. Defaults to the downloaded sidecar when present, otherwise 50000.",
+ )
+ parser.add_argument("--chunk-tokens", type=int, default=SHARD_SIZE, help="Shard size in tokens.")
+ parser.add_argument(
+ "--tokenizer-train-docs",
+ type=int,
+ default=None,
+ help="Limit the number of docs used for tokenizer training.",
+ )
+ parser.add_argument("--skip-byte", action="store_true", help="Skip byte-tokenizer export.")
+ parser.add_argument(
+ "--reuse-sp-model",
+ action="append",
+ default=[],
+ metavar="VOCAB=MODEL",
+ help="Reuse an existing SentencePiece model for the given vocab size instead of retraining it.",
+ )
+ return parser
+
+
+def main() -> None:
+ args = build_parser().parse_args()
+ if args.chunk_tokens <= 0:
+ raise ValueError(f"--chunk_tokens must be positive, got {args.chunk_tokens}")
+
+ output_root = Path(args.output_root).expanduser().resolve()
+ output_root.mkdir(parents=True, exist_ok=True)
+ tokenizers_dir = output_root / "tokenizers"
+ datasets_dir = output_root / "datasets"
+ tokenizers_dir.mkdir(parents=True, exist_ok=True)
+ datasets_dir.mkdir(parents=True, exist_ok=True)
+
+ docs_jsonl = output_root / DOCS_FILENAME
+ sidecar = output_root / SIDECAR_FILENAME
+ if not copy_from_hf_cache(
+ repo_id=args.repo_id,
+ remote_root=args.remote_root,
+ filename=DOCS_FILENAME,
+ destination=docs_jsonl,
+ ):
+ remote = f"{args.remote_root}/{DOCS_FILENAME}" if args.remote_root else DOCS_FILENAME
+ raise FileNotFoundError(f"{remote} not found in Hugging Face dataset repo {args.repo_id}")
+ if not copy_from_hf_cache(
+ repo_id=args.repo_id,
+ remote_root=args.remote_root,
+ filename=SIDECAR_FILENAME,
+ destination=sidecar,
+ ):
+ sidecar.unlink(missing_ok=True)
+
+ docs_sidecar = maybe_load_docs_sidecar_meta(docs_jsonl)
+ docs_total = int(docs_sidecar["num_docs"]) if docs_sidecar is not None and docs_sidecar.get("num_docs") is not None else count_docs(docs_jsonl)
+ if args.num_val_docs is not None:
+ num_val_docs = int(args.num_val_docs)
+ elif docs_sidecar is not None and docs_sidecar.get("docs_val") is not None:
+ num_val_docs = int(docs_sidecar["docs_val"])
+ else:
+ num_val_docs = NUM_VAL_DOCS
+ if not (0 <= num_val_docs <= docs_total):
+ raise ValueError(f"num_val_docs must be in [0, {docs_total}], got {num_val_docs}")
+
+ specs = load_specs(Path(args.tokenizer_config).expanduser().resolve())
+ reuse_sp_models = parse_reuse_sp_models(args.reuse_sp_model)
+ tokenizers, selected_specs = build_tokenizers(
+ specs=specs,
+ docs_jsonl=docs_jsonl,
+ tokenizers_dir=tokenizers_dir,
+ tokenizer_train_docs=args.tokenizer_train_docs,
+ skip_byte=args.skip_byte,
+ reuse_sp_models=reuse_sp_models,
+ )
+ write_tokenizer_config_export(output_root, selected_specs)
+
+ docs_meta = {
+ "remote_repo_id": args.repo_id,
+ "remote_root": args.remote_root,
+ "num_docs": docs_total,
+ "docs_sha256": None if docs_sidecar is None else docs_sidecar.get("docs_sha256"),
+ "source_manifest": str(docs_sidecar_path(docs_jsonl)) if docs_sidecar is not None else None,
+ }
+ if docs_sidecar is not None:
+ docs_meta["source_sidecar"] = docs_sidecar
+
+ manifest = {
+ "version": VERSION,
+ "num_docs": docs_total,
+ "num_val_docs": num_val_docs,
+ "shuffle_seed": None if docs_sidecar is None else docs_sidecar.get("shuffle_seed"),
+ "shard_size": int(args.chunk_tokens),
+ "append_eos": APPEND_EOS,
+ "docs_jsonl": str(docs_jsonl),
+ "docs_meta": docs_meta,
+ "tokenizer_specs": selected_specs,
+ "tokenizers": [],
+ "datasets": [],
+ }
+
+ for tok in tokenizers:
+ output_dir = datasets_dir / tok["dataset_name"]
+ print(f"Exporting dataset: {tok['dataset_name']}", flush=True)
+ stats = export_shards(
+ docs_jsonl,
+ tok,
+ output_dir,
+ num_val_docs=num_val_docs,
+ shard_size=int(args.chunk_tokens),
+ docs_total=docs_total,
+ )
+ manifest["tokenizers"].append(tok["manifest"])
+ manifest["datasets"].append(
+ {
+ "name": tok["dataset_name"],
+ "tokenizer_name": tok["name"],
+ "tokenizer_kind": tok["kind"],
+ "path": str(output_dir),
+ "train_glob": str(output_dir / "fineweb_train_*.bin"),
+ "val_glob": str(output_dir / "fineweb_val_*.bin"),
+ "vocab_size": tok["vocab_size"],
+ "bos_id": tok["bos_id"],
+ "eos_id": tok["eos_id"],
+ "recommended_bigram_vocab_size": tok["recommended_bigram_vocab_size"],
+ "stats": stats,
+ }
+ )
+
+ manifest = relativize_manifest_paths(manifest, output_root)
+ manifest_path = output_root / "manifest.json"
+ manifest_path.write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8")
+ print(f"Done. Manifest: {manifest_path}", flush=True)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/data/tokenizer_specs.json b/data/tokenizer_specs.json
new file mode 100644
index 0000000..d7ad1ca
--- /dev/null
+++ b/data/tokenizer_specs.json
@@ -0,0 +1,9 @@
+{
+ "tokenizers": [
+ {
+ "name": "sp_bpe_1024",
+ "dataset_suffix": "sp1024",
+ "vocab_size": 1024
+ }
+ ]
+}