summaryrefslogtreecommitdiff
path: root/data/download_hf_docs_and_tokenize.py
diff options
context:
space:
mode:
authorWill DePue <williamd@openai.com>2026-03-18 09:32:01 -0700
committerWill DePue <williamd@openai.com>2026-03-18 09:32:01 -0700
commita15093adad328a650d421e53c078cbd2c45beb0e (patch)
treee054c4bde12b89e6d3b39d611d9caadabc7f7234 /data/download_hf_docs_and_tokenize.py
Launch snapshot
Diffstat (limited to 'data/download_hf_docs_and_tokenize.py')
-rw-r--r--data/download_hf_docs_and_tokenize.py627
1 files changed, 627 insertions, 0 deletions
diff --git a/data/download_hf_docs_and_tokenize.py b/data/download_hf_docs_and_tokenize.py
new file mode 100644
index 0000000..dcabd40
--- /dev/null
+++ b/data/download_hf_docs_and_tokenize.py
@@ -0,0 +1,627 @@
+"""Download docs_selected.jsonl from Hugging Face and tokenize it locally.
+
+This script is standalone. It does not import any local exporter or tokenizer
+helpers. Tokenizer configs are JSON only and currently support the built-in
+pure-byte and SentencePiece tokenizer definitions in `data/tokenizer_specs.json`.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shutil
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import EntryNotFoundError
+
+
+DOCS_FILENAME = "docs_selected.jsonl"
+SIDECAR_FILENAME = "docs_selected.source_manifest.json"
+VERSION = "10B"
+NUM_VAL_DOCS = 50_000
+SHARD_SIZE = 10**8
+APPEND_EOS = False
+DATAFILE_MAGIC = 20240520
+DATAFILE_VERSION = 1
+DEFAULT_REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", "willdepueoai/parameter-golf")
+DEFAULT_REMOTE_ROOT = os.environ.get("MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", "datasets")
+DEFAULT_CONFIG = Path(__file__).with_name("tokenizer_specs.json")
+TOKENIZER_THREADS = max(1, int(os.environ.get("MATCHED_FINEWEB_TOKENIZER_THREADS", str(os.cpu_count() or 8))))
+SP_BATCH_SIZE = max(1, int(os.environ.get("MATCHED_FINEWEB_SP_BATCH_SIZE", "1024")))
+
+
+@dataclass(frozen=True)
+class PureByteTokenizer:
+ pad_id: int = 0
+ bos_id: int = 1
+ eos_id: int = 2
+ unk_id: int = 3
+ byte_offset: int = 4
+ byte_count: int = 256
+
+ @property
+ def vocab_size(self) -> int:
+ return self.byte_offset + self.byte_count
+
+ def encode(self, text: str) -> np.ndarray:
+ data = text.encode("utf-8", errors="replace")
+ return np.frombuffer(data, dtype=np.uint8).astype(np.uint16, copy=False) + self.byte_offset
+
+ def encode_batch(self, texts: list[str]) -> list[np.ndarray]:
+ return [self.encode(text) for text in texts]
+
+ def save_json(self, path: str | Path) -> None:
+ path = Path(path)
+ path.parent.mkdir(parents=True, exist_ok=True)
+ payload = {
+ "tokenizer_type": "pure_byte",
+ "config": asdict(self),
+ "vocab_size": self.vocab_size,
+ }
+ path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+
+
+def default_pure_byte_tokenizer() -> PureByteTokenizer:
+ return PureByteTokenizer()
+
+
+def docs_sidecar_path(docs_jsonl: Path) -> Path:
+ return docs_jsonl.with_name(f"{docs_jsonl.stem}.source_manifest.json")
+
+
+def maybe_load_docs_sidecar_meta(docs_jsonl: Path) -> dict[str, Any] | None:
+ sidecar_path = docs_sidecar_path(docs_jsonl)
+ if not sidecar_path.is_file():
+ return None
+ payload = json.loads(sidecar_path.read_text(encoding="utf-8"))
+ if not isinstance(payload, dict):
+ raise ValueError(f"docs sidecar must be a JSON object: {sidecar_path}")
+ return payload
+
+
+def copy_from_hf_cache(*, repo_id: str, remote_root: str, filename: str, destination: Path) -> bool:
+ remote_path = Path(remote_root) / filename if remote_root else Path(filename)
+ try:
+ cached_path = Path(
+ hf_hub_download(
+ repo_id=repo_id,
+ filename=remote_path.name,
+ subfolder=remote_path.parent.as_posix() if remote_path.parent != Path(".") else None,
+ repo_type="dataset",
+ )
+ )
+ except EntryNotFoundError:
+ return False
+
+ source = cached_path.resolve(strict=True)
+ destination.parent.mkdir(parents=True, exist_ok=True)
+ if destination.exists():
+ destination.unlink()
+ try:
+ os.link(source, destination)
+ except OSError:
+ shutil.copy2(source, destination)
+ return True
+
+
+def iter_docs(path: Path):
+ with path.open("r", encoding="utf-8") as f:
+ for line in f:
+ yield json.loads(line)["text"]
+
+
+def count_docs(path: Path) -> int:
+ with path.open("r", encoding="utf-8") as f:
+ return sum(1 for _ in f)
+
+
+def batched_docs_jsonl(path: Path, batch_size: int):
+ batch: list[str] = []
+ for text in iter_docs(path):
+ batch.append(text)
+ if len(batch) == batch_size:
+ yield batch
+ batch = []
+ if batch:
+ yield batch
+
+
+def write_datafile(path: Path, toks: Any) -> None:
+ if len(toks) >= 2**31:
+ raise ValueError("token count too large")
+ header = np.zeros(256, dtype="<i4")
+ header[0] = DATAFILE_MAGIC
+ header[1] = DATAFILE_VERSION
+ header[2] = len(toks)
+ toks = np.asarray(toks)
+ if toks.dtype != np.uint16:
+ if not ((0 <= toks).all() and (toks < 2**16).all()):
+ raise ValueError("token dictionary too large for uint16")
+ toks = toks.astype("<u2", copy=False)
+ else:
+ toks = toks.astype("<u2", copy=False)
+ with path.open("wb") as f:
+ f.write(header.tobytes())
+ f.write(toks.tobytes())
+
+
+def relativize_manifest_paths(value: Any, root: Path) -> Any:
+ if isinstance(value, dict):
+ return {k: relativize_manifest_paths(v, root) for k, v in value.items()}
+ if isinstance(value, list):
+ return [relativize_manifest_paths(v, root) for v in value]
+ if isinstance(value, str):
+ path = Path(value)
+ if path.is_absolute():
+ try:
+ return path.relative_to(root).as_posix()
+ except ValueError:
+ return value
+ return value
+
+
+def parse_reuse_sp_models(values: list[str]) -> dict[int, Path]:
+ reuse_models: dict[int, Path] = {}
+ for value in values:
+ vocab_size_str, model_path = value.split("=", 1)
+ vocab_size = int(vocab_size_str)
+ if vocab_size in reuse_models:
+ raise ValueError(f"duplicate --reuse_sp_model for vocab_size={vocab_size}")
+ reuse_models[vocab_size] = Path(model_path).expanduser().resolve()
+ return reuse_models
+
+
+def load_specs(config_path: Path) -> list[dict[str, Any]]:
+ payload = json.loads(config_path.read_text(encoding="utf-8"))
+ if isinstance(payload, dict):
+ specs = payload.get("tokenizer_specs", payload.get("tokenizers"))
+ else:
+ specs = payload
+ if not isinstance(specs, list) or not specs:
+ raise ValueError("tokenizer_config must define a non-empty list")
+ if not all(isinstance(spec, dict) for spec in specs):
+ raise ValueError("each tokenizer spec must be a JSON object")
+ return [dict(spec) for spec in specs]
+
+
+def tokenizer_kind(spec: dict[str, Any]) -> str:
+ kind = spec.get("kind")
+ if kind in {"byte", "pure_byte"}:
+ return "byte"
+ if kind in {"sentencepiece_bpe", "sentencepiece"}:
+ return "sentencepiece_bpe"
+ builder = str(spec.get("builder", ""))
+ builder_name = builder.rsplit(":", 1)[-1]
+ if builder_name == "build_pure_byte_tokenizer":
+ return "byte"
+ if builder_name == "build_sentencepiece_tokenizer":
+ return "sentencepiece_bpe"
+ if spec.get("dataset_suffix") == "byte260":
+ return "byte"
+ if "vocab_size" in spec:
+ return "sentencepiece_bpe"
+ raise ValueError(
+ f"unsupported tokenizer spec {spec.get('name', '<unnamed>')!r}: "
+ "expected a built-in pure-byte or sentencepiece builder"
+ )
+
+
+def write_tokenizer_config_export(output_root: Path, selected_specs: list[dict[str, Any]]) -> Path:
+ path = output_root / "tokenizer_config.export.json"
+ path.write_text(json.dumps({"tokenizers": selected_specs}, indent=2) + "\n", encoding="utf-8")
+ return path
+
+
+def _iter_sentencepiece_text(docs_jsonl: Path, *, max_docs: int | None = None):
+ with docs_jsonl.open("r", encoding="utf-8") as f:
+ for i, line in enumerate(f):
+ if max_docs is not None and i >= max_docs:
+ break
+ text = json.loads(line)["text"].replace("\x00", " ").strip()
+ if text:
+ yield text
+
+
+def build_pure_byte_tokenizer(*, spec: dict[str, Any], docs_jsonl: Path, tokenizers_dir: Path) -> dict[str, Any]:
+ del docs_jsonl
+ tok = default_pure_byte_tokenizer()
+ path = tokenizers_dir / spec.get("filename", "fineweb_pure_byte_260.json")
+ tok.save_json(path)
+ return {
+ "name": spec.get("name", "pure_byte_260"),
+ "kind": "byte",
+ "dataset_suffix": spec.get("dataset_suffix", "byte260"),
+ "vocab_size": tok.vocab_size,
+ "bos_id": tok.bos_id,
+ "eos_id": tok.eos_id,
+ "encode": tok.encode,
+ "encode_batch": tok.encode_batch,
+ "manifest": {"path": str(path), "pad_id": tok.pad_id, "unk_id": tok.unk_id},
+ }
+
+
+def build_sentencepiece_tokenizer(*, spec: dict[str, Any], docs_jsonl: Path, tokenizers_dir: Path) -> dict[str, Any]:
+ try:
+ import sentencepiece as spm
+ except ImportError as exc:
+ raise RuntimeError("sentencepiece is required for SentencePiece tokenizer exports") from exc
+
+ vocab_size = int(spec["vocab_size"])
+ prefix = tokenizers_dir / spec.get("model_prefix", f"fineweb_{vocab_size}_bpe")
+ model_path = prefix.with_suffix(".model")
+ vocab_path = prefix.with_suffix(".vocab")
+ prefix.parent.mkdir(parents=True, exist_ok=True)
+ for artifact in (model_path, vocab_path):
+ if artifact.exists():
+ artifact.unlink()
+
+ reuse_model_path = spec.get("reuse_model_path")
+ if reuse_model_path is not None:
+ reuse_model_path = Path(reuse_model_path).expanduser().resolve()
+ if not reuse_model_path.is_file():
+ raise FileNotFoundError(reuse_model_path)
+ shutil.copy2(reuse_model_path, model_path)
+ reuse_vocab_path = reuse_model_path.with_suffix(".vocab")
+ if reuse_vocab_path.is_file():
+ shutil.copy2(reuse_vocab_path, vocab_path)
+ else:
+ kwargs = {
+ "sentence_iterator": _iter_sentencepiece_text(
+ docs_jsonl,
+ max_docs=None if spec.get("tokenizer_train_docs") is None else int(spec["tokenizer_train_docs"]),
+ ),
+ "model_prefix": str(prefix),
+ "model_type": "bpe",
+ "vocab_size": vocab_size,
+ "character_coverage": 0.999,
+ "byte_fallback": True,
+ "split_digits": True,
+ "normalization_rule_name": "nmt_nfkc",
+ "add_dummy_prefix": False,
+ "pad_id": 0,
+ "bos_id": 1,
+ "eos_id": 2,
+ "unk_id": 3,
+ "hard_vocab_limit": False,
+ }
+ kwargs.update(spec.get("trainer_overrides") or {})
+ spm.SentencePieceTrainer.train(**kwargs)
+
+ tok = spm.SentencePieceProcessor(model_file=str(model_path))
+ return {
+ "name": spec.get("name", f"sp_bpe_{vocab_size}"),
+ "kind": "sentencepiece_bpe",
+ "dataset_suffix": spec.get("dataset_suffix", f"sp{vocab_size}"),
+ "vocab_size": int(tok.vocab_size()),
+ "bos_id": int(tok.bos_id()),
+ "eos_id": int(tok.eos_id()),
+ "encode": lambda text, tok=tok: tok.encode(text, out_type=int),
+ "encode_batch": lambda texts, tok=tok: tok.encode(texts, out_type=int, num_threads=TOKENIZER_THREADS),
+ "manifest": {"model_path": str(model_path), "vocab_path": str(vocab_path)},
+ }
+
+
+def export_shards(
+ docs_jsonl: Path,
+ tok: dict[str, Any],
+ output_dir: Path,
+ *,
+ num_val_docs: int,
+ shard_size: int,
+ docs_total: int,
+) -> dict[str, int]:
+ output_dir.mkdir(parents=True, exist_ok=True)
+ for pattern in ("fineweb_train_*.bin", "fineweb_val_*.bin"):
+ for stale in output_dir.glob(pattern):
+ stale.unlink()
+
+ stats = {
+ "docs_total": 0,
+ "docs_val": 0,
+ "docs_train": 0,
+ "files_total": 0,
+ "files_val": 0,
+ "files_train": 0,
+ "tokens_total": 0,
+ "tokens_val": 0,
+ "tokens_train": 0,
+ }
+ buf = np.empty((shard_size,), dtype=np.uint16)
+ fill = 0
+ split = "val"
+ shards = {"val": 0, "train": 0}
+
+ def flush() -> None:
+ nonlocal fill
+ if fill == 0:
+ return
+ write_datafile(output_dir / f"fineweb_{split}_{shards[split]:06d}.bin", buf[:fill])
+ stats["files_total"] += 1
+ stats[f"files_{split}"] += 1
+ shards[split] += 1
+ fill = 0
+
+ vocab_size = int(tok["vocab_size"])
+ if vocab_size > 2**16:
+ raise ValueError(f"vocab_size={vocab_size} is too large for uint16 shard storage")
+
+ batch_encode = tok.get("encode_batch")
+ batch_size = SP_BATCH_SIZE if callable(batch_encode) else 1
+ for texts in batched_docs_jsonl(docs_jsonl, batch_size):
+ encoded_docs = batch_encode(texts) if callable(batch_encode) else [tok["encode"](text) for text in texts]
+ for text, encoded in zip(texts, encoded_docs, strict=True):
+ del text
+ split_for_doc = "val" if stats["docs_total"] < num_val_docs else "train"
+ if split_for_doc != split:
+ flush()
+ split = split_for_doc
+
+ encoded_arr = np.asarray(encoded, dtype=np.int32)
+ toks = np.empty((encoded_arr.size + 1 + int(APPEND_EOS),), dtype=np.int32)
+ toks[0] = tok["bos_id"]
+ toks[1 : 1 + encoded_arr.size] = encoded_arr
+ if APPEND_EOS:
+ toks[-1] = tok["eos_id"]
+ if not ((0 <= toks).all() and (toks < vocab_size).all()):
+ bad = int(toks[(toks < 0) | (toks >= vocab_size)][0])
+ raise ValueError(f"token id {bad} outside declared vocab_size={vocab_size}")
+ toks = toks.astype("<u2", copy=False)
+
+ stats["docs_total"] += 1
+ stats[f"docs_{split}"] += 1
+ stats["tokens_total"] += len(toks)
+ stats[f"tokens_{split}"] += len(toks)
+
+ pos = 0
+ while pos < len(toks):
+ take = min(shard_size - fill, len(toks) - pos)
+ buf[fill : fill + take] = toks[pos : pos + take]
+ fill += take
+ pos += take
+ if fill == shard_size:
+ flush()
+
+ if stats["docs_total"] and stats["docs_total"] % 100_000 == 0:
+ print(f"{output_dir.name}: {stats['docs_total']}/{docs_total} docs", flush=True)
+
+ flush()
+ if stats["docs_total"] != docs_total:
+ raise ValueError(f"expected {docs_total} docs, exported {stats['docs_total']}")
+ return stats
+
+
+def build_tokenizers(
+ *,
+ specs: list[dict[str, Any]],
+ docs_jsonl: Path,
+ tokenizers_dir: Path,
+ tokenizer_train_docs: int | None,
+ skip_byte: bool,
+ reuse_sp_models: dict[int, Path],
+) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
+ tokenizers: list[dict[str, Any]] = []
+ selected_specs: list[dict[str, Any]] = []
+ seen_names: set[str] = set()
+ seen_datasets: set[str] = set()
+
+ for raw_spec in specs:
+ spec = dict(raw_spec)
+ kind = tokenizer_kind(spec)
+ if skip_byte and kind == "byte":
+ continue
+ if kind == "sentencepiece_bpe":
+ if tokenizer_train_docs is not None:
+ spec["tokenizer_train_docs"] = int(tokenizer_train_docs)
+ vocab_size = int(spec["vocab_size"])
+ if vocab_size in reuse_sp_models:
+ spec["reuse_model_path"] = str(reuse_sp_models[vocab_size])
+
+ selected_specs.append(spec)
+ built = (
+ build_pure_byte_tokenizer(spec=spec, docs_jsonl=docs_jsonl, tokenizers_dir=tokenizers_dir)
+ if kind == "byte"
+ else build_sentencepiece_tokenizer(spec=spec, docs_jsonl=docs_jsonl, tokenizers_dir=tokenizers_dir)
+ )
+ name = str(built["name"])
+ dataset_suffix = built.get("dataset_suffix")
+ dataset_name = str(built.get("dataset_name", f"fineweb{VERSION}_{dataset_suffix}"))
+ if name in seen_names:
+ raise ValueError(f"duplicate tokenizer name: {name}")
+ if dataset_name in seen_datasets:
+ raise ValueError(f"duplicate dataset name: {dataset_name}")
+ seen_names.add(name)
+ seen_datasets.add(dataset_name)
+ vocab_size = int(built["vocab_size"])
+ recommended_bigram_vocab_size = int(
+ built.get("recommended_bigram_vocab_size", ((vocab_size + 127) // 128) * 128 * 5)
+ )
+ tokenizers.append(
+ {
+ "name": name,
+ "kind": str(built["kind"]),
+ "dataset_name": dataset_name,
+ "vocab_size": vocab_size,
+ "bos_id": int(built["bos_id"]),
+ "eos_id": int(built["eos_id"]),
+ "encode": built["encode"],
+ "encode_batch": built.get("encode_batch"),
+ "recommended_bigram_vocab_size": recommended_bigram_vocab_size,
+ "manifest": {
+ "name": name,
+ "kind": str(built["kind"]),
+ "vocab_size": vocab_size,
+ "bos_id": int(built["bos_id"]),
+ "eos_id": int(built["eos_id"]),
+ "recommended_bigram_vocab_size": recommended_bigram_vocab_size,
+ "source_spec": spec,
+ **(built.get("manifest") or {}),
+ },
+ }
+ )
+ if not tokenizers:
+ raise ValueError("tokenizer_config produced no tokenizers after filtering")
+ return tokenizers, selected_specs
+
+
+def build_parser() -> argparse.ArgumentParser:
+ parser = argparse.ArgumentParser(
+ description="Download docs_selected.jsonl from a Hugging Face dataset repo and tokenize it locally"
+ )
+ parser.add_argument(
+ "--repo-id",
+ default=DEFAULT_REPO_ID,
+ help="Hugging Face dataset repo id, for example user/dataset",
+ )
+ parser.add_argument(
+ "--remote-root",
+ default=DEFAULT_REMOTE_ROOT,
+ help="Optional subdirectory inside the dataset repo that contains docs_selected.jsonl",
+ )
+ parser.add_argument("--output-root", required=True, help="Directory where docs, tokenizers, shards, and manifest are written")
+ parser.add_argument(
+ "--tokenizer-config",
+ default=str(DEFAULT_CONFIG),
+ help="Local tokenizer config JSON. Defaults to data/tokenizer_specs.json.",
+ )
+ parser.add_argument(
+ "--num-val-docs",
+ type=int,
+ default=None,
+ help="Validation document count. Defaults to the downloaded sidecar when present, otherwise 50000.",
+ )
+ parser.add_argument("--chunk-tokens", type=int, default=SHARD_SIZE, help="Shard size in tokens.")
+ parser.add_argument(
+ "--tokenizer-train-docs",
+ type=int,
+ default=None,
+ help="Limit the number of docs used for tokenizer training.",
+ )
+ parser.add_argument("--skip-byte", action="store_true", help="Skip byte-tokenizer export.")
+ parser.add_argument(
+ "--reuse-sp-model",
+ action="append",
+ default=[],
+ metavar="VOCAB=MODEL",
+ help="Reuse an existing SentencePiece model for the given vocab size instead of retraining it.",
+ )
+ return parser
+
+
+def main() -> None:
+ args = build_parser().parse_args()
+ if args.chunk_tokens <= 0:
+ raise ValueError(f"--chunk_tokens must be positive, got {args.chunk_tokens}")
+
+ output_root = Path(args.output_root).expanduser().resolve()
+ output_root.mkdir(parents=True, exist_ok=True)
+ tokenizers_dir = output_root / "tokenizers"
+ datasets_dir = output_root / "datasets"
+ tokenizers_dir.mkdir(parents=True, exist_ok=True)
+ datasets_dir.mkdir(parents=True, exist_ok=True)
+
+ docs_jsonl = output_root / DOCS_FILENAME
+ sidecar = output_root / SIDECAR_FILENAME
+ if not copy_from_hf_cache(
+ repo_id=args.repo_id,
+ remote_root=args.remote_root,
+ filename=DOCS_FILENAME,
+ destination=docs_jsonl,
+ ):
+ remote = f"{args.remote_root}/{DOCS_FILENAME}" if args.remote_root else DOCS_FILENAME
+ raise FileNotFoundError(f"{remote} not found in Hugging Face dataset repo {args.repo_id}")
+ if not copy_from_hf_cache(
+ repo_id=args.repo_id,
+ remote_root=args.remote_root,
+ filename=SIDECAR_FILENAME,
+ destination=sidecar,
+ ):
+ sidecar.unlink(missing_ok=True)
+
+ docs_sidecar = maybe_load_docs_sidecar_meta(docs_jsonl)
+ docs_total = int(docs_sidecar["num_docs"]) if docs_sidecar is not None and docs_sidecar.get("num_docs") is not None else count_docs(docs_jsonl)
+ if args.num_val_docs is not None:
+ num_val_docs = int(args.num_val_docs)
+ elif docs_sidecar is not None and docs_sidecar.get("docs_val") is not None:
+ num_val_docs = int(docs_sidecar["docs_val"])
+ else:
+ num_val_docs = NUM_VAL_DOCS
+ if not (0 <= num_val_docs <= docs_total):
+ raise ValueError(f"num_val_docs must be in [0, {docs_total}], got {num_val_docs}")
+
+ specs = load_specs(Path(args.tokenizer_config).expanduser().resolve())
+ reuse_sp_models = parse_reuse_sp_models(args.reuse_sp_model)
+ tokenizers, selected_specs = build_tokenizers(
+ specs=specs,
+ docs_jsonl=docs_jsonl,
+ tokenizers_dir=tokenizers_dir,
+ tokenizer_train_docs=args.tokenizer_train_docs,
+ skip_byte=args.skip_byte,
+ reuse_sp_models=reuse_sp_models,
+ )
+ write_tokenizer_config_export(output_root, selected_specs)
+
+ docs_meta = {
+ "remote_repo_id": args.repo_id,
+ "remote_root": args.remote_root,
+ "num_docs": docs_total,
+ "docs_sha256": None if docs_sidecar is None else docs_sidecar.get("docs_sha256"),
+ "source_manifest": str(docs_sidecar_path(docs_jsonl)) if docs_sidecar is not None else None,
+ }
+ if docs_sidecar is not None:
+ docs_meta["source_sidecar"] = docs_sidecar
+
+ manifest = {
+ "version": VERSION,
+ "num_docs": docs_total,
+ "num_val_docs": num_val_docs,
+ "shuffle_seed": None if docs_sidecar is None else docs_sidecar.get("shuffle_seed"),
+ "shard_size": int(args.chunk_tokens),
+ "append_eos": APPEND_EOS,
+ "docs_jsonl": str(docs_jsonl),
+ "docs_meta": docs_meta,
+ "tokenizer_specs": selected_specs,
+ "tokenizers": [],
+ "datasets": [],
+ }
+
+ for tok in tokenizers:
+ output_dir = datasets_dir / tok["dataset_name"]
+ print(f"Exporting dataset: {tok['dataset_name']}", flush=True)
+ stats = export_shards(
+ docs_jsonl,
+ tok,
+ output_dir,
+ num_val_docs=num_val_docs,
+ shard_size=int(args.chunk_tokens),
+ docs_total=docs_total,
+ )
+ manifest["tokenizers"].append(tok["manifest"])
+ manifest["datasets"].append(
+ {
+ "name": tok["dataset_name"],
+ "tokenizer_name": tok["name"],
+ "tokenizer_kind": tok["kind"],
+ "path": str(output_dir),
+ "train_glob": str(output_dir / "fineweb_train_*.bin"),
+ "val_glob": str(output_dir / "fineweb_val_*.bin"),
+ "vocab_size": tok["vocab_size"],
+ "bos_id": tok["bos_id"],
+ "eos_id": tok["eos_id"],
+ "recommended_bigram_vocab_size": tok["recommended_bigram_vocab_size"],
+ "stats": stats,
+ }
+ )
+
+ manifest = relativize_manifest_paths(manifest, output_root)
+ manifest_path = output_root / "manifest.json"
+ manifest_path.write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8")
+ print(f"Done. Manifest: {manifest_path}", flush=True)
+
+
+if __name__ == "__main__":
+ main()