"""Safe GPU-2 benchmark wrapper for the a-select speed test.

Runs test_aselect_deepdive.py's main() on GPU 2 (shared with japardi2's NV-Embed
server) with a HARD allocator cap + a start-time free-memory guard so we can never
OOM the neighbour. Forwards all CLI args to the underlying test.

Usage:
  CUDA_VISIBLE_DEVICES=2 MEMFRAC=0.010 python3 bench_gpu2.py --B 1 --T2 80 --T1 2
"""
import os, sys, torch, runpy

# --- Start-time guard --------------------------------------------------------
# Query device memory before this process allocates anything, so the decision
# to run is based on what the other tenants of the GPU currently hold.
torch.cuda.init()
free0, total = torch.cuda.mem_get_info()  # (free, total) in bytes, current device
f0 = free0 / 1024**2   # free memory, MiB
tot = total / 1024**2  # total device memory, MiB
print(f"[guard] GPU free at start = {f0:.0f} MiB (of {tot:.0f})", flush=True)

# Refuse to start when the device is already nearly full.
# MINFREE (MiB) overrides the default threshold.
MIN_FREE = float(os.environ.get("MINFREE", "1100"))
if f0 < MIN_FREE:
    sys.exit(f"[guard] ABORT: free {f0:.0f} < {MIN_FREE:.0f} MiB — too risky for the neighbour, back off.")

# --- Allocator cap -----------------------------------------------------------
# MEMFRAC is a fraction of *total* device memory, not of free memory, so the
# resulting cap has to be compared against what is actually free.
frac = float(os.environ.get("MEMFRAC", "0.010"))
cap = frac * tot  # allocator limit, MiB
if cap > f0:
    # A cap above the free amount would let this process legally claim memory
    # the neighbour still needs, i.e. the cap would protect nothing.
    sys.exit(f"[guard] ABORT: allocator cap {cap:.0f} MiB (frac={frac}) > free {f0:.0f} MiB — cap would not protect the neighbour, lower MEMFRAC.")
torch.cuda.set_per_process_memory_fraction(frac)

# Allowance for memory used outside the capped allocator; ~700 MiB is this
# script's own estimate for the CUDA context (not measured here).
CTX_MIB = 700
headroom = f0 - cap - CTX_MIB
print(f"[guard] allocator hard-capped at {cap:.0f} MiB (frac={frac}); leaving >= {headroom:.0f} MiB headroom after ~{CTX_MIB} MiB ctx", flush=True)
if headroom < 0:
    # The cap itself fits, but cap + estimated context does not; the estimate
    # is rough, so warn instead of aborting.
    print(f"[guard] WARNING: cap + ~{CTX_MIB} MiB ctx exceeds free memory by {-headroom:.0f} MiB; consider lowering MEMFRAC.", flush=True)

# Forward the remaining argv to the test script, which is executed as __main__.
TEST_SCRIPT = "/home/yurenh2/ept/ep_run/test_aselect_deepdive.py"
sys.argv = ["test_aselect_deepdive.py"] + sys.argv[1:]

# runpy.run_path() leaves sys.path untouched when given a plain script file,
# whereas running the script directly would put its directory first on
# sys.path. Mirror that so imports of modules living next to the test script
# resolve even when this wrapper is launched from another directory.
script_dir = os.path.dirname(TEST_SCRIPT)
if script_dir not in sys.path:
    sys.path.insert(0, script_dir)

try:
    runpy.run_path(TEST_SCRIPT, run_name="__main__")
finally:
    # Always report the end state, including when the test raises or calls
    # sys.exit(), so the effect on the shared GPU is visible in the log.
    free1, _ = torch.cuda.mem_get_info()
    print(f"[guard] GPU free at end = {free1/1024**2:.0f} MiB; my peak reserved = {torch.cuda.max_memory_reserved()/1024**2:.0f} MiB", flush=True)