# Provenance: reconstructed from a git patch (cgit v1.2.3 export).
#   Commit:   b83947778e2c776f757a07d4719b7ce961d7ed55
#   Author:   Yuren Hao
#   Date:     Fri, 3 Jul 2026 05:56:50 -0500
#   Subject:  Initial commit: ept — backprop-free equilibrium transformer (EP)
#   Message:  Code (ep_run/), organized docs
#             (docs/{method,campaign,hardware,outreach,paper}), analysis
#             scripts (scripts/), ONBOARDING.md entry point. Large
#             data/checkpoints git-ignored (share separately).
#   Co-Authored-By: Claude Opus 4.8
#   Claude-Session: https://claude.ai/code/session_014FAPDWQ49M5Ye3NpTndTpn
#   File:     ep_run/prepare_tinystories_bpe.py (new file, blob 9b03a83,
#             49 lines added)
"""TinyStories -> 4k BPE -> train.bin/val.bin (uint16) + meta.pkl + tokenizer.json.

Same bin format as the char pipeline so lt_ep_train consumes it via --data.

Running this script:
  1. trains a byte-level BPE tokenizer on ``SRC / 'train.txt'``,
  2. saves it to ``D / 'tokenizer.json'``,
  3. encodes ``train.txt`` and ``valid.txt`` into flat uint16 id files
     ``train.bin`` and ``val.bin``,
  4. writes ``meta.pkl`` containing ``{'vocab_size': ...}``.
"""
import pickle
from pathlib import Path

import numpy as np
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDec

# Input directory (expects train.txt and valid.txt) and output directory.
SRC = Path('/home/yurenh2/ept/ep_run/data/tsrc')
D = Path('/home/yurenh2/ept/ep_run/data/tinystories_bpe')
D.mkdir(parents=True, exist_ok=True)
# Target vocabulary size handed to the BPE trainer.
VOCAB = 4096

# Byte-level BPE with no unknown token and no special tokens.
tok = Tokenizer(BPE(unk_token=None))
tok.pre_tokenizer = ByteLevel(add_prefix_space=False)
tok.decoder = ByteLevelDec()
trainer = BpeTrainer(vocab_size=VOCAB, special_tokens=[], show_progress=True)
tok.train([str(SRC / 'train.txt')], trainer)
tok.save(str(D / 'tokenizer.json'))
print(f"trained BPE vocab={tok.get_vocab_size()}", flush=True)

# The .bin files store every id as uint16, so the vocabulary must fit in
# 16 bits. Fail loudly here instead of producing a corrupt dataset.
if tok.get_vocab_size() > np.iinfo(np.uint16).max + 1:
    raise ValueError(
        f"vocab size {tok.get_vocab_size()} does not fit in uint16 token ids"
    )


def _encode_chunk(lines, out):
    """Tokenize *lines* as one joined string and append the ids to *out*.

    Args:
        lines: list of text lines (newlines included) forming one chunk.
        out: binary file object opened for writing.

    Returns:
        Number of token ids written for this chunk.
    """
    ids = [i for e in tok.encode_batch([''.join(lines)]) for i in e.ids]
    np.array(ids, dtype=np.uint16).tofile(out)
    return len(ids)


def enc_file(src, dst, lines_per_chunk=20000):
    """Encode the text file *src* into a flat uint16 token-id file *dst*.

    The source is read as UTF-8 with undecodable bytes replaced, buffered in
    chunks of ``lines_per_chunk`` lines, and each chunk is tokenized with the
    module-level ``tok`` independently of the others (so no token spans a
    chunk boundary). Ids are appended to *dst* as raw uint16 values with no
    header.

    Args:
        src: path of the UTF-8 text file to encode.
        dst: path of the binary file to create (overwritten if it exists).
        lines_per_chunk: number of lines tokenized per call; bounds memory use.

    Returns:
        Total number of token ids written to *dst*.
    """
    n = 0
    buf = []
    # Open the source first so a missing input does not truncate an existing
    # output; both handles are closed even if encoding or writing raises.
    with open(src, encoding='utf-8', errors='replace') as f, \
            open(dst, 'wb') as out:
        for line in f:
            buf.append(line)
            if len(buf) >= lines_per_chunk:
                n += _encode_chunk(buf, out)
                buf = []
        # Flush the final partial chunk, if any.
        if buf:
            n += _encode_chunk(buf, out)
    return n


nt = enc_file(SRC / 'train.txt', D / 'train.bin')
nv = enc_file(SRC / 'valid.txt', D / 'val.bin')
# Write the metadata through a context manager so the file is flushed and
# closed deterministically.
with open(D / 'meta.pkl', 'wb') as meta_f:
    pickle.dump({'vocab_size': tok.get_vocab_size()}, meta_f)
print(f"vocab={tok.get_vocab_size()} train_tokens={nt/1e6:.1f}M val_tokens={nv/1e6:.1f}M", flush=True)