diff options
| author | Will DePue <williamd@openai.com> | 2026-03-18 09:32:01 -0700 |
|---|---|---|
| committer | Will DePue <williamd@openai.com> | 2026-03-18 09:32:01 -0700 |
| commit | a15093adad328a650d421e53c078cbd2c45beb0e (patch) | |
| tree | e054c4bde12b89e6d3b39d611d9caadabc7f7234 /data | |
Launch snapshot
Diffstat (limited to 'data')
| -rw-r--r-- | data/README.md | 66 | ||||
| -rw-r--r-- | data/cached_challenge_fineweb.py | 157 | ||||
| -rw-r--r-- | data/download_hf_docs_and_tokenize.py | 627 | ||||
| -rw-r--r-- | data/tokenizer_specs.json | 9 |
4 files changed, 859 insertions, 0 deletions
diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..e1920ad --- /dev/null +++ b/data/README.md @@ -0,0 +1,66 @@ +# Data Workflows + +This directory contains the dataset download helpers and export scripts used for the challenge. + +Canonical local layout: +- `data/datasets/<dataset_name>/` +- `data/tokenizers/` +- `data/manifest.json` +- `data/docs_selected.jsonl` +- `data/docs_selected.source_manifest.json` + +## Downloading Published Data + +Download the cached FineWeb export for a tokenizer variant with: + +```bash +python3 data/cached_challenge_fineweb.py --variant sp1024 +``` + +This populates `./data/datasets/fineweb10B_sp1024/` and `./data/tokenizers/`. +By default it downloads the full validation split and 8B training tokens (80 train shards). + +To fetch more training shards, pass `--train-shards`: + +```bash +python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 180 +``` + +The downloader is manifest-driven and can fetch only a prefix of train shards from a larger published export. With the current shard size of `100_000_000` tokens, `10B` retokenized training tokens is `100` train shards: + +```bash +MATCHED_FINEWEB_REPO_ID=your-hf-username/your-dataset-repo \ +MATCHED_FINEWEB_REMOTE_ROOT_PREFIX=your_50B_export_root \ +python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 100 +``` + +Validation is always downloaded in full from the fixed `fineweb_val_*` split. Training on the first `N` train shards means training on the prefix of the same frozen shuffled export, so the data order stays aligned with the baseline for that tokenizer family. + +The default published repo is `willdepueoai/parameter-golf`, with the export rooted under the repo subdirectory `datasets/`. 
+ +## Rebuilding Tokenizers From Published Docs + +To retrain a tokenizer or re-export shards from exactly the same selected documents, run the standalone retokenizer against the published docs cache: + +```bash +python3 data/download_hf_docs_and_tokenize.py \ + --repo-id your-hf-username/your-dataset-repo \ + --remote-root your_50B_export_root \ + --output-root /tmp/my_custom_tokenizer_export \ + --tokenizer-config ./data/tokenizer_specs.json +``` + +The sidecar `docs_selected.source_manifest.json` includes `docs_sha256`, so users can verify they are rebuilding from the exact same document list and order as the baseline export. + +## Useful Knobs + +For CPU-heavy exports, useful knobs are: + +```bash +MATCHED_FINEWEB_SP_BATCH_SIZE=2048 +MATCHED_FINEWEB_TOKENIZER_THREADS=16 +MATCHED_FINEWEB_TIKTOKEN_THREADS=16 +MATCHED_FINEWEB_GPT2_DECODE_BATCH_SIZE=512 +``` + +These control batched tokenizer encoding during shard export, tokenizer thread count, tiktoken thread count, and batched GPT-2 decode for the blobstore docs-cache path. 
"""Download published challenge FineWeb shards and tokenizer artifacts from Hugging Face.

Manifest-driven downloader: reads ``<remote root>/manifest.json`` from the dataset
repo, then fetches the full validation split, a prefix of training shards, and the
tokenizer artifacts for the requested variant into ``data/datasets/`` and
``data/tokenizers/``.
"""

import argparse
import json
import os
import shutil
from pathlib import Path


# Repo/layout knobs are overridable via environment so the same script can pull
# from a private or larger (e.g. 50B) export.
REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", "willdepueoai/parameter-golf")
REMOTE_ROOT_PREFIX = os.environ.get("MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", "datasets")
ROOT = Path(__file__).resolve().parent
DATASETS_DIR = ROOT / "datasets"
TOKENIZERS_DIR = ROOT / "tokenizers"


def dataset_dir_for_variant(name: str) -> str:
    """Map a tokenizer variant name to its local dataset directory name.

    Accepts ``byte260`` or ``sp<VOCAB_SIZE>`` (e.g. ``sp1024``); raises
    ``ValueError`` for anything else.
    """
    if name == "byte260":
        return "fineweb10B_byte260"
    if name.startswith("sp") and name[2:].isdigit():
        return f"fineweb10B_{name}"
    raise ValueError(f"unsupported variant {name!r}; expected byte260 or sp<VOCAB_SIZE>")


def local_path_for_remote(relative_path: str) -> Path:
    """Translate a repo-relative path into the canonical local path under data/.

    Strips the remote root prefix, then maps ``datasets/...`` and
    ``tokenizers/...`` into their local directories; anything else lands
    directly under ``data/``.
    """
    remote_path = Path(relative_path)
    if REMOTE_ROOT_PREFIX and remote_path.parts[:1] == (REMOTE_ROOT_PREFIX,):
        remote_path = remote_path.relative_to(REMOTE_ROOT_PREFIX)
    if remote_path.parts[:1] == ("datasets",):
        return DATASETS_DIR.joinpath(*remote_path.parts[1:])
    if remote_path.parts[:1] == ("tokenizers",):
        return TOKENIZERS_DIR.joinpath(*remote_path.parts[1:])
    return ROOT / remote_path


def get(relative_path: str) -> None:
    """Download one repo file to its local path unless it is already present.

    Idempotent: an existing destination is left untouched; a broken symlink
    (``exists()`` is False for those) is removed and re-fetched.
    """
    destination = local_path_for_remote(relative_path)
    if destination.exists():
        return
    if destination.is_symlink():
        # exists() follows symlinks, so reaching here means the link is broken.
        destination.unlink()

    # Imported lazily so the pure path helpers above remain importable and
    # testable without huggingface_hub installed.
    from huggingface_hub import hf_hub_download

    remote_path = Path(relative_path)
    cached_path = Path(
        hf_hub_download(
            repo_id=REPO_ID,
            filename=remote_path.name,
            subfolder=remote_path.parent.as_posix() if remote_path.parent != Path(".") else None,
            repo_type="dataset",
        )
    )
    # HF cache entries may be snapshot symlinks. Resolve to the underlying blob so we
    # always materialize a real file in data/, not a broken relative symlink.
    cached_source = cached_path.resolve(strict=True)
    destination.parent.mkdir(parents=True, exist_ok=True)
    try:
        # Hard-link to avoid duplicating multi-GB shards on disk.
        os.link(cached_source, destination)
    except OSError:
        # Cross-device (or unsupported) link: fall back to a plain copy.
        shutil.copy2(cached_source, destination)


def manifest_path() -> Path:
    """Return the local path where manifest.json is materialized."""
    return local_path_for_remote(f"{REMOTE_ROOT_PREFIX}/manifest.json")


def load_manifest(*, skip_manifest_download: bool) -> dict:
    """Load the export manifest, downloading it first unless skipping is requested.

    Raises ``FileNotFoundError`` when ``skip_manifest_download`` is set but no
    local copy exists — the manifest is required to resolve shard counts.
    """
    path = manifest_path()
    if not path.is_file():
        if skip_manifest_download:
            raise FileNotFoundError(
                f"manifest.json is required for manifest-driven shard counts but is not present locally at {path}"
            )
        get(f"{REMOTE_ROOT_PREFIX}/manifest.json")
    return json.loads(path.read_text(encoding="utf-8"))


def artifact_paths_for_tokenizer(tokenizer_entry: dict) -> list[str]:
    """Collect the downloadable artifact paths declared by a manifest tokenizer entry.

    Raises ``ValueError`` if the entry declares none of the known path keys.
    """
    artifacts = []
    for key in ("model_path", "vocab_path", "path"):
        value = tokenizer_entry.get(key)
        if value:
            artifacts.append(str(value))
    if not artifacts:
        raise ValueError(f"tokenizer entry is missing downloadable artifacts: {tokenizer_entry}")
    return artifacts


def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser (hidden positional kept for backward compatibility)."""
    parser = argparse.ArgumentParser(description="Download challenge FineWeb shards from Hugging Face")
    parser.add_argument(
        "train_shards_positional",
        nargs="?",
        type=int,
        default=None,
        help=argparse.SUPPRESS,
    )
    parser.add_argument(
        "--train-shards",
        type=int,
        default=80,
        help="Number of training shards to download for the selected variant. Defaults to 80.",
    )
    parser.add_argument(
        "--variant",
        default="sp1024",
        help="Tokenizer family to download, for example sp1024, sp4096, or byte260.",
    )
    parser.add_argument(
        "--skip-manifest",
        action="store_true",
        help="Skip downloading manifest.json.",
    )
    parser.add_argument(
        "--with-docs",
        action="store_true",
        help="Also download docs_selected.jsonl and its sidecar for tokenizer retraining or dataset re-export.",
    )
    return parser


def main() -> None:
    """Resolve the requested variant against the manifest and download its files."""
    args = build_parser().parse_args()
    dataset_dir = dataset_dir_for_variant(args.variant)
    # Legacy positional count wins over the --train-shards flag when both given.
    train_shards = args.train_shards_positional if args.train_shards_positional is not None else args.train_shards
    if train_shards < 0:
        raise ValueError("train_shards must be non-negative")

    manifest = load_manifest(skip_manifest_download=args.skip_manifest)
    dataset_entry = next((x for x in manifest.get("datasets", []) if x.get("name") == dataset_dir), None)
    if dataset_entry is None:
        raise ValueError(f"dataset {dataset_dir} not found in {REMOTE_ROOT_PREFIX}/manifest.json")
    stats = dataset_entry.get("stats") or {}
    # Fix: int(None) previously raised a bare TypeError when the manifest entry
    # lacked shard counts; fail with an actionable error instead.
    if stats.get("files_train") is None or stats.get("files_val") is None:
        raise ValueError(
            f"dataset {dataset_dir} manifest entry is missing stats.files_train/stats.files_val"
        )
    max_train_shards = int(stats["files_train"])
    val_shards = int(stats["files_val"])
    if train_shards > max_train_shards:
        raise ValueError(
            f"{args.variant} only has {max_train_shards} training shards on {REPO_ID}, requested {train_shards}"
        )
    tokenizer_name = dataset_entry.get("tokenizer_name")
    tokenizer_entry = next((x for x in manifest.get("tokenizers", []) if x.get("name") == tokenizer_name), None)
    if tokenizer_entry is None:
        raise ValueError(f"tokenizer {tokenizer_name} not found in {REMOTE_ROOT_PREFIX}/manifest.json")

    if args.with_docs:
        get(f"{REMOTE_ROOT_PREFIX}/docs_selected.jsonl")
        get(f"{REMOTE_ROOT_PREFIX}/docs_selected.source_manifest.json")

    dataset_prefix = f"{REMOTE_ROOT_PREFIX}/datasets/{dataset_dir}"
    # Validation is always fetched in full; training is a prefix of the frozen export.
    for i in range(val_shards):
        get(f"{dataset_prefix}/fineweb_val_{i:06d}.bin")
    for i in range(train_shards):
        get(f"{dataset_prefix}/fineweb_train_{i:06d}.bin")

    for artifact_path in artifact_paths_for_tokenizer(tokenizer_entry):
        get(f"{REMOTE_ROOT_PREFIX}/{artifact_path}")


if __name__ == "__main__":
    main()
"""Download docs_selected.jsonl from Hugging Face and tokenize it locally.

This script is standalone. It does not import any local exporter or tokenizer
helpers. Tokenizer configs are JSON only and currently support the built-in
pure-byte and SentencePiece tokenizer definitions in `data/tokenizer_specs.json`.
"""

from __future__ import annotations

import argparse
import json
import os
import shutil
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any

import numpy as np


DOCS_FILENAME = "docs_selected.jsonl"
SIDECAR_FILENAME = "docs_selected.source_manifest.json"
VERSION = "10B"
NUM_VAL_DOCS = 50_000
SHARD_SIZE = 10**8
APPEND_EOS = False
DATAFILE_MAGIC = 20240520
DATAFILE_VERSION = 1
DEFAULT_REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", "willdepueoai/parameter-golf")
DEFAULT_REMOTE_ROOT = os.environ.get("MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", "datasets")
DEFAULT_CONFIG = Path(__file__).with_name("tokenizer_specs.json")
TOKENIZER_THREADS = max(1, int(os.environ.get("MATCHED_FINEWEB_TOKENIZER_THREADS", str(os.cpu_count() or 8))))
SP_BATCH_SIZE = max(1, int(os.environ.get("MATCHED_FINEWEB_SP_BATCH_SIZE", "1024")))


@dataclass(frozen=True)
class PureByteTokenizer:
    """Byte-level tokenizer: token id = byte value + byte_offset, plus 4 specials."""

    pad_id: int = 0
    bos_id: int = 1
    eos_id: int = 2
    unk_id: int = 3
    byte_offset: int = 4
    byte_count: int = 256

    @property
    def vocab_size(self) -> int:
        # 4 special ids followed by one id per byte value.
        return self.byte_offset + self.byte_count

    def encode(self, text: str) -> np.ndarray:
        """Encode text as UTF-8 bytes shifted by byte_offset (uint16 ids)."""
        data = text.encode("utf-8", errors="replace")
        return np.frombuffer(data, dtype=np.uint8).astype(np.uint16, copy=False) + self.byte_offset

    def encode_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Encode a batch of texts; byte encoding has no cross-doc state."""
        return [self.encode(text) for text in texts]

    def save_json(self, path: str | Path) -> None:
        """Persist the tokenizer config as deterministic, sorted JSON."""
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "tokenizer_type": "pure_byte",
            "config": asdict(self),
            "vocab_size": self.vocab_size,
        }
        path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")


def default_pure_byte_tokenizer() -> PureByteTokenizer:
    """Return the canonical 260-token pure-byte tokenizer."""
    return PureByteTokenizer()


def docs_sidecar_path(docs_jsonl: Path) -> Path:
    """Return the sidecar manifest path next to a docs jsonl file."""
    return docs_jsonl.with_name(f"{docs_jsonl.stem}.source_manifest.json")


def maybe_load_docs_sidecar_meta(docs_jsonl: Path) -> dict[str, Any] | None:
    """Load the docs sidecar JSON object if present; None when it does not exist."""
    sidecar_path = docs_sidecar_path(docs_jsonl)
    if not sidecar_path.is_file():
        return None
    payload = json.loads(sidecar_path.read_text(encoding="utf-8"))
    if not isinstance(payload, dict):
        raise ValueError(f"docs sidecar must be a JSON object: {sidecar_path}")
    return payload


def copy_from_hf_cache(*, repo_id: str, remote_root: str, filename: str, destination: Path) -> bool:
    """Fetch one repo file into destination via the HF cache.

    Returns False when the entry does not exist in the repo. Hard-links out of
    the cache when possible, otherwise copies.
    """
    # Imported lazily so the pure helpers in this module remain usable and
    # testable without huggingface_hub installed.
    from huggingface_hub import hf_hub_download
    from huggingface_hub.utils import EntryNotFoundError

    remote_path = Path(remote_root) / filename if remote_root else Path(filename)
    try:
        cached_path = Path(
            hf_hub_download(
                repo_id=repo_id,
                filename=remote_path.name,
                subfolder=remote_path.parent.as_posix() if remote_path.parent != Path(".") else None,
                repo_type="dataset",
            )
        )
    except EntryNotFoundError:
        return False

    # Resolve snapshot symlinks to the underlying blob before linking/copying.
    source = cached_path.resolve(strict=True)
    destination.parent.mkdir(parents=True, exist_ok=True)
    if destination.exists():
        destination.unlink()
    try:
        os.link(source, destination)
    except OSError:
        shutil.copy2(source, destination)
    return True


def iter_docs(path: Path):
    """Yield the "text" field of each JSONL record, streaming line by line."""
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            yield json.loads(line)["text"]


def count_docs(path: Path) -> int:
    """Count lines (documents) in a JSONL file without parsing them."""
    with path.open("r", encoding="utf-8") as f:
        return sum(1 for _ in f)


def batched_docs_jsonl(path: Path, batch_size: int):
    """Yield lists of up to batch_size document texts from a JSONL file."""
    batch: list[str] = []
    for text in iter_docs(path):
        batch.append(text)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch


def write_datafile(path: Path, toks: Any) -> None:
    """Write tokens as a .bin shard: 256 x int32 LE header then uint16 LE tokens.

    Header layout: [magic, version, token_count, 0...]. Raises ValueError when
    the token count exceeds int32 or any id does not fit in uint16.
    """
    if len(toks) >= 2**31:
        raise ValueError("token count too large")
    header = np.zeros(256, dtype="<i4")
    header[0] = DATAFILE_MAGIC
    header[1] = DATAFILE_VERSION
    header[2] = len(toks)
    toks = np.asarray(toks)
    if toks.dtype != np.uint16:
        if not ((0 <= toks).all() and (toks < 2**16).all()):
            raise ValueError("token dictionary too large for uint16")
        toks = toks.astype("<u2", copy=False)
    else:
        # Already uint16; this only normalizes byte order on big-endian hosts.
        toks = toks.astype("<u2", copy=False)
    with path.open("wb") as f:
        f.write(header.tobytes())
        f.write(toks.tobytes())


def relativize_manifest_paths(value: Any, root: Path) -> Any:
    """Recursively rewrite absolute path strings under root as posix-relative.

    Non-path values pass through unchanged.
    """
    if isinstance(value, dict):
        return {k: relativize_manifest_paths(v, root) for k, v in value.items()}
    if isinstance(value, list):
        return [relativize_manifest_paths(v, root) for v in value]
    if isinstance(value, str):
        path = Path(value)
        if path.is_absolute():
            try:
                return path.relative_to(root).as_posix()
            except ValueError:
                # Absolute but outside root: leave untouched.
                return value
        return value
    # Fix: ints, bools, None, etc. previously fell off the end of the function
    # and were silently replaced by None, nulling manifest fields like
    # num_docs, shard_size, append_eos, and every stats counter.
    return value


def parse_reuse_sp_models(values: list[str]) -> dict[int, Path]:
    """Parse repeated VOCAB=MODEL options into {vocab_size: resolved model path}."""
    reuse_models: dict[int, Path] = {}
    for value in values:
        vocab_size_str, model_path = value.split("=", 1)
        vocab_size = int(vocab_size_str)
        if vocab_size in reuse_models:
            raise ValueError(f"duplicate --reuse_sp_model for vocab_size={vocab_size}")
        reuse_models[vocab_size] = Path(model_path).expanduser().resolve()
    return reuse_models


def load_specs(config_path: Path) -> list[dict[str, Any]]:
    """Load tokenizer specs from JSON: either a bare list or an object wrapper."""
    payload = json.loads(config_path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        specs = payload.get("tokenizer_specs", payload.get("tokenizers"))
    else:
        specs = payload
    if not isinstance(specs, list) or not specs:
        raise ValueError("tokenizer_config must define a non-empty list")
    if not all(isinstance(spec, dict) for spec in specs):
        raise ValueError("each tokenizer spec must be a JSON object")
    return [dict(spec) for spec in specs]


def tokenizer_kind(spec: dict[str, Any]) -> str:
    """Classify a spec as "byte" or "sentencepiece_bpe" from its fields.

    Checks, in order: an explicit "kind", a "builder" dotted path, the
    "dataset_suffix" convention, and finally presence of "vocab_size".
    """
    kind = spec.get("kind")
    if kind in {"byte", "pure_byte"}:
        return "byte"
    if kind in {"sentencepiece_bpe", "sentencepiece"}:
        return "sentencepiece_bpe"
    builder = str(spec.get("builder", ""))
    builder_name = builder.rsplit(":", 1)[-1]
    if builder_name == "build_pure_byte_tokenizer":
        return "byte"
    if builder_name == "build_sentencepiece_tokenizer":
        return "sentencepiece_bpe"
    if spec.get("dataset_suffix") == "byte260":
        return "byte"
    if "vocab_size" in spec:
        return "sentencepiece_bpe"
    raise ValueError(
        f"unsupported tokenizer spec {spec.get('name', '<unnamed>')!r}: "
        "expected a built-in pure-byte or sentencepiece builder"
    )


def write_tokenizer_config_export(output_root: Path, selected_specs: list[dict[str, Any]]) -> Path:
    """Write the effective (post-filter, post-override) tokenizer specs for reproducibility."""
    path = output_root / "tokenizer_config.export.json"
    path.write_text(json.dumps({"tokenizers": selected_specs}, indent=2) + "\n", encoding="utf-8")
    return path


def _iter_sentencepiece_text(docs_jsonl: Path, *, max_docs: int | None = None):
    """Yield cleaned, non-empty doc texts for SentencePiece training (NULs stripped)."""
    with docs_jsonl.open("r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if max_docs is not None and i >= max_docs:
                break
            text = json.loads(line)["text"].replace("\x00", " ").strip()
            if text:
                yield text


def build_pure_byte_tokenizer(*, spec: dict[str, Any], docs_jsonl: Path, tokenizers_dir: Path) -> dict[str, Any]:
    """Materialize the pure-byte tokenizer; docs are unused (no training needed)."""
    del docs_jsonl
    tok = default_pure_byte_tokenizer()
    path = tokenizers_dir / spec.get("filename", "fineweb_pure_byte_260.json")
    tok.save_json(path)
    return {
        "name": spec.get("name", "pure_byte_260"),
        "kind": "byte",
        "dataset_suffix": spec.get("dataset_suffix", "byte260"),
        "vocab_size": tok.vocab_size,
        "bos_id": tok.bos_id,
        "eos_id": tok.eos_id,
        "encode": tok.encode,
        "encode_batch": tok.encode_batch,
        "manifest": {"path": str(path), "pad_id": tok.pad_id, "unk_id": tok.unk_id},
    }


def build_sentencepiece_tokenizer(*, spec: dict[str, Any], docs_jsonl: Path, tokenizers_dir: Path) -> dict[str, Any]:
    """Train (or reuse) a SentencePiece BPE model and return its runtime handle."""
    try:
        import sentencepiece as spm
    except ImportError as exc:
        raise RuntimeError("sentencepiece is required for SentencePiece tokenizer exports") from exc

    vocab_size = int(spec["vocab_size"])
    prefix = tokenizers_dir / spec.get("model_prefix", f"fineweb_{vocab_size}_bpe")
    model_path = prefix.with_suffix(".model")
    vocab_path = prefix.with_suffix(".vocab")
    prefix.parent.mkdir(parents=True, exist_ok=True)
    # Clear stale artifacts so a failed run cannot leave a mismatched pair behind.
    for artifact in (model_path, vocab_path):
        if artifact.exists():
            artifact.unlink()

    reuse_model_path = spec.get("reuse_model_path")
    if reuse_model_path is not None:
        reuse_model_path = Path(reuse_model_path).expanduser().resolve()
        if not reuse_model_path.is_file():
            raise FileNotFoundError(reuse_model_path)
        shutil.copy2(reuse_model_path, model_path)
        # The .vocab sidecar is informational; copy it when available.
        reuse_vocab_path = reuse_model_path.with_suffix(".vocab")
        if reuse_vocab_path.is_file():
            shutil.copy2(reuse_vocab_path, vocab_path)
    else:
        kwargs = {
            "sentence_iterator": _iter_sentencepiece_text(
                docs_jsonl,
                max_docs=None if spec.get("tokenizer_train_docs") is None else int(spec["tokenizer_train_docs"]),
            ),
            "model_prefix": str(prefix),
            "model_type": "bpe",
            "vocab_size": vocab_size,
            "character_coverage": 0.999,
            "byte_fallback": True,
            "split_digits": True,
            "normalization_rule_name": "nmt_nfkc",
            "add_dummy_prefix": False,
            "pad_id": 0,
            "bos_id": 1,
            "eos_id": 2,
            "unk_id": 3,
            "hard_vocab_limit": False,
        }
        kwargs.update(spec.get("trainer_overrides") or {})
        spm.SentencePieceTrainer.train(**kwargs)

    tok = spm.SentencePieceProcessor(model_file=str(model_path))
    return {
        "name": spec.get("name", f"sp_bpe_{vocab_size}"),
        "kind": "sentencepiece_bpe",
        "dataset_suffix": spec.get("dataset_suffix", f"sp{vocab_size}"),
        "vocab_size": int(tok.vocab_size()),
        "bos_id": int(tok.bos_id()),
        "eos_id": int(tok.eos_id()),
        "encode": lambda text, tok=tok: tok.encode(text, out_type=int),
        "encode_batch": lambda texts, tok=tok: tok.encode(texts, out_type=int, num_threads=TOKENIZER_THREADS),
        "manifest": {"model_path": str(model_path), "vocab_path": str(vocab_path)},
    }


def export_shards(
    docs_jsonl: Path,
    tok: dict[str, Any],
    output_dir: Path,
    *,
    num_val_docs: int,
    shard_size: int,
    docs_total: int,
) -> dict[str, int]:
    """Tokenize all docs and write fixed-size uint16 .bin shards (val first, then train).

    Each document is emitted as BOS + token ids (+ EOS when APPEND_EOS). The
    first num_val_docs documents go to the val split; the rest to train.
    Returns the per-split doc/file/token counters.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    # Remove shards from any previous export so counts can never mix runs.
    for pattern in ("fineweb_train_*.bin", "fineweb_val_*.bin"):
        for stale in output_dir.glob(pattern):
            stale.unlink()

    stats = {
        "docs_total": 0,
        "docs_val": 0,
        "docs_train": 0,
        "files_total": 0,
        "files_val": 0,
        "files_train": 0,
        "tokens_total": 0,
        "tokens_val": 0,
        "tokens_train": 0,
    }
    buf = np.empty((shard_size,), dtype=np.uint16)
    fill = 0
    split = "val"
    shards = {"val": 0, "train": 0}

    def flush() -> None:
        # Write the partially filled buffer as the next shard of the current split.
        nonlocal fill
        if fill == 0:
            return
        write_datafile(output_dir / f"fineweb_{split}_{shards[split]:06d}.bin", buf[:fill])
        stats["files_total"] += 1
        stats[f"files_{split}"] += 1
        shards[split] += 1
        fill = 0

    vocab_size = int(tok["vocab_size"])
    if vocab_size > 2**16:
        raise ValueError(f"vocab_size={vocab_size} is too large for uint16 shard storage")

    batch_encode = tok.get("encode_batch")
    batch_size = SP_BATCH_SIZE if callable(batch_encode) else 1
    for texts in batched_docs_jsonl(docs_jsonl, batch_size):
        encoded_docs = batch_encode(texts) if callable(batch_encode) else [tok["encode"](text) for text in texts]
        for text, encoded in zip(texts, encoded_docs, strict=True):
            del text
            split_for_doc = "val" if stats["docs_total"] < num_val_docs else "train"
            if split_for_doc != split:
                # Split boundary: close the current shard so splits never share a file.
                flush()
                split = split_for_doc

            encoded_arr = np.asarray(encoded, dtype=np.int32)
            toks = np.empty((encoded_arr.size + 1 + int(APPEND_EOS),), dtype=np.int32)
            toks[0] = tok["bos_id"]
            toks[1 : 1 + encoded_arr.size] = encoded_arr
            if APPEND_EOS:
                toks[-1] = tok["eos_id"]
            if not ((0 <= toks).all() and (toks < vocab_size).all()):
                bad = int(toks[(toks < 0) | (toks >= vocab_size)][0])
                raise ValueError(f"token id {bad} outside declared vocab_size={vocab_size}")
            toks = toks.astype("<u2", copy=False)

            stats["docs_total"] += 1
            stats[f"docs_{split}"] += 1
            stats["tokens_total"] += len(toks)
            stats[f"tokens_{split}"] += len(toks)

            # Spill the document across shard boundaries as needed.
            pos = 0
            while pos < len(toks):
                take = min(shard_size - fill, len(toks) - pos)
                buf[fill : fill + take] = toks[pos : pos + take]
                fill += take
                pos += take
                if fill == shard_size:
                    flush()

            if stats["docs_total"] and stats["docs_total"] % 100_000 == 0:
                print(f"{output_dir.name}: {stats['docs_total']}/{docs_total} docs", flush=True)

    flush()
    if stats["docs_total"] != docs_total:
        raise ValueError(f"expected {docs_total} docs, exported {stats['docs_total']}")
    return stats


def build_tokenizers(
    *,
    specs: list[dict[str, Any]],
    docs_jsonl: Path,
    tokenizers_dir: Path,
    tokenizer_train_docs: int | None,
    skip_byte: bool,
    reuse_sp_models: dict[int, Path],
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Build every selected tokenizer and return (runtime handles, effective specs).

    Enforces unique tokenizer and dataset names; raises when filtering leaves
    nothing to build.
    """
    tokenizers: list[dict[str, Any]] = []
    selected_specs: list[dict[str, Any]] = []
    seen_names: set[str] = set()
    seen_datasets: set[str] = set()

    for raw_spec in specs:
        spec = dict(raw_spec)
        kind = tokenizer_kind(spec)
        if skip_byte and kind == "byte":
            continue
        if kind == "sentencepiece_bpe":
            if tokenizer_train_docs is not None:
                spec["tokenizer_train_docs"] = int(tokenizer_train_docs)
            vocab_size = int(spec["vocab_size"])
            if vocab_size in reuse_sp_models:
                spec["reuse_model_path"] = str(reuse_sp_models[vocab_size])

        selected_specs.append(spec)
        built = (
            build_pure_byte_tokenizer(spec=spec, docs_jsonl=docs_jsonl, tokenizers_dir=tokenizers_dir)
            if kind == "byte"
            else build_sentencepiece_tokenizer(spec=spec, docs_jsonl=docs_jsonl, tokenizers_dir=tokenizers_dir)
        )
        name = str(built["name"])
        dataset_suffix = built.get("dataset_suffix")
        dataset_name = str(built.get("dataset_name", f"fineweb{VERSION}_{dataset_suffix}"))
        if name in seen_names:
            raise ValueError(f"duplicate tokenizer name: {name}")
        if dataset_name in seen_datasets:
            raise ValueError(f"duplicate dataset name: {dataset_name}")
        seen_names.add(name)
        seen_datasets.add(dataset_name)
        vocab_size = int(built["vocab_size"])
        # Default bigram vocab heuristic: pad vocab to a multiple of 128, times 5.
        recommended_bigram_vocab_size = int(
            built.get("recommended_bigram_vocab_size", ((vocab_size + 127) // 128) * 128 * 5)
        )
        tokenizers.append(
            {
                "name": name,
                "kind": str(built["kind"]),
                "dataset_name": dataset_name,
                "vocab_size": vocab_size,
                "bos_id": int(built["bos_id"]),
                "eos_id": int(built["eos_id"]),
                "encode": built["encode"],
                "encode_batch": built.get("encode_batch"),
                "recommended_bigram_vocab_size": recommended_bigram_vocab_size,
                "manifest": {
                    "name": name,
                    "kind": str(built["kind"]),
                    "vocab_size": vocab_size,
                    "bos_id": int(built["bos_id"]),
                    "eos_id": int(built["eos_id"]),
                    "recommended_bigram_vocab_size": recommended_bigram_vocab_size,
                    "source_spec": spec,
                    **(built.get("manifest") or {}),
                },
            }
        )
    if not tokenizers:
        raise ValueError("tokenizer_config produced no tokenizers after filtering")
    return tokenizers, selected_specs


def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for the standalone download-and-retokenize workflow."""
    parser = argparse.ArgumentParser(
        description="Download docs_selected.jsonl from a Hugging Face dataset repo and tokenize it locally"
    )
    parser.add_argument(
        "--repo-id",
        default=DEFAULT_REPO_ID,
        help="Hugging Face dataset repo id, for example user/dataset",
    )
    parser.add_argument(
        "--remote-root",
        default=DEFAULT_REMOTE_ROOT,
        help="Optional subdirectory inside the dataset repo that contains docs_selected.jsonl",
    )
    parser.add_argument("--output-root", required=True, help="Directory where docs, tokenizers, shards, and manifest are written")
    parser.add_argument(
        "--tokenizer-config",
        default=str(DEFAULT_CONFIG),
        help="Local tokenizer config JSON. Defaults to data/tokenizer_specs.json.",
    )
    parser.add_argument(
        "--num-val-docs",
        type=int,
        default=None,
        help="Validation document count. Defaults to the downloaded sidecar when present, otherwise 50000.",
    )
    parser.add_argument("--chunk-tokens", type=int, default=SHARD_SIZE, help="Shard size in tokens.")
    parser.add_argument(
        "--tokenizer-train-docs",
        type=int,
        default=None,
        help="Limit the number of docs used for tokenizer training.",
    )
    parser.add_argument("--skip-byte", action="store_true", help="Skip byte-tokenizer export.")
    parser.add_argument(
        "--reuse-sp-model",
        action="append",
        default=[],
        metavar="VOCAB=MODEL",
        help="Reuse an existing SentencePiece model for the given vocab size instead of retraining it.",
    )
    return parser


def main() -> None:
    """Download the published docs, build tokenizers, export shards, write manifest."""
    args = build_parser().parse_args()
    if args.chunk_tokens <= 0:
        # Fix: the message previously said --chunk_tokens; the flag is --chunk-tokens.
        raise ValueError(f"--chunk-tokens must be positive, got {args.chunk_tokens}")

    output_root = Path(args.output_root).expanduser().resolve()
    output_root.mkdir(parents=True, exist_ok=True)
    tokenizers_dir = output_root / "tokenizers"
    datasets_dir = output_root / "datasets"
    tokenizers_dir.mkdir(parents=True, exist_ok=True)
    datasets_dir.mkdir(parents=True, exist_ok=True)

    docs_jsonl = output_root / DOCS_FILENAME
    sidecar = output_root / SIDECAR_FILENAME
    if not copy_from_hf_cache(
        repo_id=args.repo_id,
        remote_root=args.remote_root,
        filename=DOCS_FILENAME,
        destination=docs_jsonl,
    ):
        remote = f"{args.remote_root}/{DOCS_FILENAME}" if args.remote_root else DOCS_FILENAME
        raise FileNotFoundError(f"{remote} not found in Hugging Face dataset repo {args.repo_id}")
    if not copy_from_hf_cache(
        repo_id=args.repo_id,
        remote_root=args.remote_root,
        filename=SIDECAR_FILENAME,
        destination=sidecar,
    ):
        # Sidecar is optional; make sure a stale local copy cannot masquerade as fresh.
        sidecar.unlink(missing_ok=True)

    docs_sidecar = maybe_load_docs_sidecar_meta(docs_jsonl)
    docs_total = int(docs_sidecar["num_docs"]) if docs_sidecar is not None and docs_sidecar.get("num_docs") is not None else count_docs(docs_jsonl)
    if args.num_val_docs is not None:
        num_val_docs = int(args.num_val_docs)
    elif docs_sidecar is not None and docs_sidecar.get("docs_val") is not None:
        num_val_docs = int(docs_sidecar["docs_val"])
    else:
        num_val_docs = NUM_VAL_DOCS
    if not (0 <= num_val_docs <= docs_total):
        raise ValueError(f"num_val_docs must be in [0, {docs_total}], got {num_val_docs}")

    specs = load_specs(Path(args.tokenizer_config).expanduser().resolve())
    reuse_sp_models = parse_reuse_sp_models(args.reuse_sp_model)
    tokenizers, selected_specs = build_tokenizers(
        specs=specs,
        docs_jsonl=docs_jsonl,
        tokenizers_dir=tokenizers_dir,
        tokenizer_train_docs=args.tokenizer_train_docs,
        skip_byte=args.skip_byte,
        reuse_sp_models=reuse_sp_models,
    )
    write_tokenizer_config_export(output_root, selected_specs)

    docs_meta = {
        "remote_repo_id": args.repo_id,
        "remote_root": args.remote_root,
        "num_docs": docs_total,
        "docs_sha256": None if docs_sidecar is None else docs_sidecar.get("docs_sha256"),
        "source_manifest": str(docs_sidecar_path(docs_jsonl)) if docs_sidecar is not None else None,
    }
    if docs_sidecar is not None:
        docs_meta["source_sidecar"] = docs_sidecar

    manifest = {
        "version": VERSION,
        "num_docs": docs_total,
        "num_val_docs": num_val_docs,
        "shuffle_seed": None if docs_sidecar is None else docs_sidecar.get("shuffle_seed"),
        "shard_size": int(args.chunk_tokens),
        "append_eos": APPEND_EOS,
        "docs_jsonl": str(docs_jsonl),
        "docs_meta": docs_meta,
        "tokenizer_specs": selected_specs,
        "tokenizers": [],
        "datasets": [],
    }

    for tok in tokenizers:
        output_dir = datasets_dir / tok["dataset_name"]
        print(f"Exporting dataset: {tok['dataset_name']}", flush=True)
        stats = export_shards(
            docs_jsonl,
            tok,
            output_dir,
            num_val_docs=num_val_docs,
            shard_size=int(args.chunk_tokens),
            docs_total=docs_total,
        )
        manifest["tokenizers"].append(tok["manifest"])
        manifest["datasets"].append(
            {
                "name": tok["dataset_name"],
                "tokenizer_name": tok["name"],
                "tokenizer_kind": tok["kind"],
                "path": str(output_dir),
                "train_glob": str(output_dir / "fineweb_train_*.bin"),
                "val_glob": str(output_dir / "fineweb_val_*.bin"),
                "vocab_size": tok["vocab_size"],
                "bos_id": tok["bos_id"],
                "eos_id": tok["eos_id"],
                "recommended_bigram_vocab_size": tok["recommended_bigram_vocab_size"],
                "stats": stats,
            }
        )

    manifest = relativize_manifest_paths(manifest, output_root)
    manifest_path = output_root / "manifest.json"
    manifest_path.write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8")
    print(f"Done. Manifest: {manifest_path}", flush=True)


if __name__ == "__main__":
    main()
