summaryrefslogtreecommitdiff
path: root/pull_assets.py
blob: e9b37778ccc0f3126c664bc05d13c777b343006b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/env python3
"""One-click restore of the git-ignored large assets (TinyStories-BPE data + key checkpoints) from the
private HF dataset repo, into the correct paths. So:  `git clone` + `python pull_assets.py`  =  a full
working tree (ep_run/data/ + ep_run/runs/ reconstructed in place).

Prereqs:
  pip install -U huggingface_hub # optional: the script pip-installs it itself if the import fails
  huggingface-cli login          # must have access to the private repo below (ask Yuren)

What it restores:
  ep_run/data/tinystories_bpe/   (train.bin / val.bin / tokenizer.json / meta.pkl)
  ep_run/runs/redx_traj/s2000.pt (the warm-start operator, §5 of ONBOARDING)
  ep_run/runs/{ep_rr_ajr, ep_resreg_scratch, ep_fast_adaptive, bptt_clean}.pt  (key result checkpoints)
"""
import os, sys, subprocess

# Private HF dataset repo; its layout mirrors ep_run/.
REPO = "blackhao0426/ept-assets"
# Absolute path of the directory that contains this script.
HERE = os.path.split(os.path.abspath(__file__))[0]
# Download target: the ep_run/ directory sitting next to this script.
DEST = os.path.join(HERE, "ep_run")


def main():
    """Download the HF dataset snapshot into ``ep_run/`` and report what landed.

    Steps:
      1. Import ``huggingface_hub``; if the import fails, pip-install the package with the
         running interpreter and retry the import.
      2. Call ``snapshot_download`` for the dataset repo ``REPO`` with ``DEST`` as the local dir.
      3. For each expected sub-directory of ``DEST``, print its item count if it exists, or a
         warning on stderr if it does not.

    Raises:
        subprocess.CalledProcessError: if the fallback ``pip install`` exits non-zero.
        Exceptions raised by ``snapshot_download`` are not caught here and propagate.
    """
    try:
        from huggingface_hub import snapshot_download
    except ImportError:
        import importlib

        subprocess.run([sys.executable, "-m", "pip", "install", "-q", "huggingface_hub"], check=True)
        # The package was created after the interpreter started; clear the import finders'
        # caches so the retry below reliably sees the freshly installed module.
        importlib.invalidate_caches()
        from huggingface_hub import snapshot_download

    print(f"restoring  {REPO}  ->  {DEST}/{{data,runs}} ...", flush=True)
    snapshot_download(repo_id=REPO, repo_type="dataset", local_dir=DEST)

    missing = []
    for p in ("data/tinystories_bpe", "runs"):
        fp = os.path.join(DEST, p)
        if os.path.isdir(fp):
            print(f"  ok  ep_run/{p}/  ({len(os.listdir(fp))} items)", flush=True)
        else:
            # Do not claim success for a directory that never arrived.
            missing.append(p)
            print(f"  MISSING  ep_run/{p}/", file=sys.stderr, flush=True)

    if missing:
        print("done — but some expected directories are missing (see above).",
              file=sys.stderr, flush=True)
    else:
        print("done — working tree restored.", flush=True)


# Run the restore only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()