From b83947778e2c776f757a07d4719b7ce961d7ed55 Mon Sep 17 00:00:00 2001
From: Yuren Hao <yurenh2@illinois.edu>
Date: Fri, 3 Jul 2026 05:56:50 -0500
Subject: =?UTF-8?q?Initial=20commit:=20ept=20=E2=80=94=20backprop-free=20e?=
 =?UTF-8?q?quilibrium=20transformer=20(EP)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Code (ep_run/), organized docs (docs/{method,campaign,hardware,outreach,paper}),
analysis scripts (scripts/), ONBOARDING.md entry point. Large data/checkpoints
git-ignored (share separately).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_014FAPDWQ49M5Ye3NpTndTpn
---
 ep_run/prepare_tinystories.py | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 ep_run/prepare_tinystories.py

(limited to 'ep_run/prepare_tinystories.py')

diff --git a/ep_run/prepare_tinystories.py b/ep_run/prepare_tinystories.py
new file mode 100644
index 0000000..d7305a3
--- /dev/null
+++ b/ep_run/prepare_tinystories.py
@@ -0,0 +1,40 @@
+"""Char-level TinyStories -> train.bin/val.bin (uint16) + meta.pkl, same format as
+shakespeare_char so lt_ep_train.py consumes it via --data. Top-127 chars by train-set
+frequency; everything else maps to '?' (keeps the vocab clean of rare unicode)."""
+import collections, pickle
+import numpy as np
+from pathlib import Path
+
+D = Path('/tmp/lt_ep/data/tinystories')
+cnt = collections.Counter()  # char -> frequency over the training split only
+with open(D / 'train.txt', encoding='utf-8', errors='replace') as f:
+    for chunk in iter(lambda: f.read(1 << 24), ''):  # stream 16M-char chunks until EOF
+        cnt.update(chunk)
+keep = [c for c, _ in cnt.most_common(127)]
+if '?' not in keep:  # the out-of-vocab symbol must own a real id, never alias id 0
+    keep[126:] = ['?']  # replaces the rarest kept char, or appends when < 127 kept
+keep.sort()
+stoi = {c: i for i, c in enumerate(keep)}
+UNK = stoi['?']
+table = {ord(c): i for c, i in stoi.items()}
+
+
+def enc_file(src, dst):
+    """Encode UTF-8 text file `src` as uint16 ids into `dst`; return the id count."""
+    n = 0
+    # One `with` for both handles so `dst` is closed even if encoding raises midway.
+    with open(src, encoding='utf-8', errors='replace') as f, open(dst, 'wb') as out:
+        while True:
+            chunk = f.read(1 << 24)
+            if not chunk:
+                break
+            arr = np.fromiter((table.get(ord(c), UNK) for c in chunk), dtype=np.uint16, count=len(chunk))
+            arr.tofile(out)
+            n += len(arr)
+    return n
+
+
+nt = enc_file(D / 'train.txt', D / 'train.bin')
+nv = enc_file(D / 'valid.txt', D / 'val.bin')
+(D / 'meta.pkl').write_bytes(pickle.dumps({'vocab_size': len(stoi), 'stoi': stoi}))  # write_bytes closes the handle
+print(f"vocab={len(stoi)} train_tokens={nt / 1e6:.1f}M val_tokens={nv / 1e6:.1f}M", flush=True)
-- 
cgit v1.2.3