summaryrefslogtreecommitdiff
path: root/ep_run/bench_gpu2.py
diff options
context:
space:
mode:
author Yuren Hao <yurenh2@illinois.edu> 2026-07-03 05:56:50 -0500
committer Yuren Hao <yurenh2@illinois.edu> 2026-07-03 05:56:50 -0500
commit b83947778e2c776f757a07d4719b7ce961d7ed55 (patch)
tree b9cc01d7adda691d9156d9d04f4fb2f644674e96 /ep_run/bench_gpu2.py
Initial commit: ept — backprop-free equilibrium transformer (EP)
Code (ep_run/), organized docs (docs/{method,campaign,hardware,outreach,paper}), analysis scripts (scripts/), ONBOARDING.md entry point. Large data/checkpoints git-ignored (share separately). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_014FAPDWQ49M5Ye3NpTndTpn
Diffstat (limited to 'ep_run/bench_gpu2.py')
-rw-r--r-- ep_run/bench_gpu2.py 33
1 files changed, 33 insertions, 0 deletions
diff --git a/ep_run/bench_gpu2.py b/ep_run/bench_gpu2.py
new file mode 100644
index 0000000..ef66ef4
--- /dev/null
+++ b/ep_run/bench_gpu2.py
@@ -0,0 +1,33 @@
+"""Safe GPU-2 benchmark wrapper for the a-select speed test.
+
+Runs test_aselect_deepdive.py's main() on GPU 2 (shared with japardi2's NV-Embed
+server) with a HARD allocator cap + a start-time free-memory guard so we can never
+OOM the neighbour. Forwards all CLI args to the underlying test.
+
+Usage:
+ CUDA_VISIBLE_DEVICES=2 MEMFRAC=0.010 python3 bench_gpu2.py --B 1 --T2 80 --T1 2
+"""
+import os, sys, torch, runpy
+
+torch.cuda.init()
+free0, total = torch.cuda.mem_get_info()  # (free, total) bytes on the current CUDA device
+f0, tot = free0 / 1024**2, total / 1024**2  # bytes -> MiB
+print(f"[guard] GPU free at start = {f0:.0f} MiB (of {tot:.0f})", flush=True)
+
+MIN_FREE = float(os.environ.get("MINFREE", "1100"))  # MiB that must be free before we start (env MINFREE)
+if f0 < MIN_FREE:
+    sys.exit(f"[guard] ABORT: free {f0:.0f} < {MIN_FREE:.0f} MiB — too risky for the neighbour, back off.")
+
+frac = float(os.environ.get("MEMFRAC", "0.010"))  # allocator cap as a fraction of total memory (env MEMFRAC)
+torch.cuda.set_per_process_memory_fraction(frac)
+cap = frac * tot
+print(f"[guard] allocator hard-capped at {cap:.0f} MiB (frac={frac}); leaving >= {f0-cap-700:.0f} MiB headroom after ~700 MiB ctx", flush=True)
+
+# Forward remaining argv to the test's main(); run the test beside this wrapper, else the original absolute path.
+sibling = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_aselect_deepdive.py")
+target = sibling if os.path.isfile(sibling) else "/home/yurenh2/ept/ep_run/test_aselect_deepdive.py"
+sys.argv = ["test_aselect_deepdive.py"] + sys.argv[1:]
+try:
+    runpy.run_path(target, run_name="__main__")
+finally:
+    print(f"[guard] GPU free at end = {torch.cuda.mem_get_info()[0]/1024**2:.0f} MiB; my peak reserved = {torch.cuda.max_memory_reserved()/1024**2:.0f} MiB", flush=True)