diff options
| author | Yuren Hao <yurenh2@illinois.edu> | 2026-07-03 06:06:09 -0500 |
|---|---|---|
| committer | Yuren Hao <yurenh2@illinois.edu> | 2026-07-03 06:06:09 -0500 |
| commit | 8626ac5d6cf6f548157cb349ea99b8b603b268ce (patch) | |
| tree | 04afed3ed84652154944a735272939980d888f53 /pull_assets.py | |
| parent | b7fab6a524c4c5cd29aaf9933fb150e7b7902a3f (diff) | |
Add pull_assets.py one-click restore (HF ept-assets) + ONBOARDING restore note
git clone + python pull_assets.py = full working tree: pulls TinyStories-BPE
data (~697M) and the 5 key checkpoints from the private HF dataset repo
blackhao0426/ept-assets straight into ep_run/{data,runs}.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_014FAPDWQ49M5Ye3NpTndTpn
Diffstat (limited to 'pull_assets.py')
| -rw-r--r-- | pull_assets.py | 38 |
1 files changed, 38 insertions, 0 deletions
#!/usr/bin/env python3
"""One-click restore of the git-ignored large assets from the private HF dataset repo.

`git clone` + `python pull_assets.py` = a full working tree: the TinyStories-BPE data and
the key checkpoints are downloaded straight into ep_run/data/ and ep_run/runs/.

Prereqs:
    pip install -U huggingface_hub
    huggingface-cli login      # must have access to the private repo below (ask Yuren)

What it restores:
    ep_run/data/tinystories_bpe/   (train.bin / val.bin / tokenizer.json / meta.pkl)
    ep_run/runs/redx_traj/s2000.pt (the warm-start operator, §5 of ONBOARDING)
    ep_run/runs/{ep_rr_ajr, ep_resreg_scratch, ep_fast_adaptive, bptt_clean}.pt (key result checkpoints)

Exit status: 0 when both asset directories exist after the download, 1 otherwise.
"""
import importlib
import os
import subprocess
import sys

REPO = "blackhao0426/ept-assets"   # private HF dataset repo (mirrors ep_run/ layout)
HERE = os.path.dirname(os.path.abspath(__file__))
DEST = os.path.join(HERE, "ep_run")

# Directories (relative to DEST) that must exist once the snapshot has been pulled.
_EXPECTED_DIRS = ("data/tinystories_bpe", "runs")


def _missing_assets(dest):
    """Print a status line per expected directory under *dest*; return the absent ones.

    Args:
        dest: Root directory the snapshot was downloaded into.

    Returns:
        A list of the relative paths from ``_EXPECTED_DIRS`` that are not
        directories under *dest* (empty when everything is in place).
    """
    missing = []
    for rel in _EXPECTED_DIRS:
        full = os.path.join(dest, rel)
        if os.path.isdir(full):
            print(f" ok ep_run/{rel}/ ({len(os.listdir(full))} items)", flush=True)
        else:
            print(f" MISSING ep_run/{rel}/", file=sys.stderr, flush=True)
            missing.append(rel)
    return missing


def main():
    """Download the asset snapshot into ``DEST`` and verify the expected layout.

    If ``huggingface_hub`` is not importable it is installed with pip first,
    so the script stays one-click on a fresh environment.

    Returns:
        0 if every directory in ``_EXPECTED_DIRS`` exists after the download,
        1 if any of them is missing.

    Raises:
        subprocess.CalledProcessError: if the fallback ``pip install`` fails.
    """
    try:
        from huggingface_hub import snapshot_download
    except ImportError:
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-q", "huggingface_hub"],
            check=True,
        )
        # The package was installed while this interpreter is running; the
        # import system's finder caches must be refreshed before retrying.
        importlib.invalidate_caches()
        from huggingface_hub import snapshot_download

    print(f"restoring {REPO} -> {DEST}/{{data,runs}} ...", flush=True)
    snapshot_download(repo_id=REPO, repo_type="dataset", local_dir=DEST)

    missing = _missing_assets(DEST)
    if missing:
        # Do not claim success when the download left the tree incomplete.
        print(
            f"restore incomplete — missing under {DEST}: {', '.join(missing)}",
            file=sys.stderr,
            flush=True,
        )
        return 1
    print("done — working tree restored.", flush=True)
    return 0


if __name__ == "__main__":
    sys.exit(main())
