| author | Yuren Hao <yurenh2@timan108.cs.illinois.edu> | 2025-09-04 23:46:09 -0500 |
|---|---|---|
| committer | Yuren Hao <yurenh2@timan108.cs.illinois.edu> | 2025-09-04 23:46:09 -0500 |
| commit | 4cf7e11cbf4dd60a358f8dd452488b3158f32ad7 (patch) | |
| tree | f31fc3ba44bb2ea9029608698badb642f9f6eb44 | |
| parent | 07872bdf87383026e9880303ca80313387d03875 (diff) | |
some check scripts
| -rw-r--r-- | README.md | 87 |
| -rw-r--r-- | configs/accelerate/default_config.yaml | 6 |
| -rw-r--r-- | docs/hardware.md | 27 |
| -rwxr-xr-x | scripts/self_check.py | 397 |
4 files changed, 517 insertions, 0 deletions
@@ -58,6 +58,93 @@ bash sh/eval_all_math.sh
 
 ---
 
+### Caching (Hugging Face)
+
+To avoid repeated downloads across runs, we persist Hugging Face caches in the user cache directory. When activating the `one-shot-em` conda environment, the following environment variables are set:
+
+```bash
+HF_HOME="$HOME/.cache/huggingface"
+HF_DATASETS_CACHE="$HF_HOME/datasets"
+HF_HUB_CACHE="$HF_HOME/hub"
+TRANSFORMERS_CACHE="$HF_HUB_CACHE"
+```
+
+You can change these by editing the conda env activation hook under:
+
+```
+$CONDA_PREFIX/etc/conda/activate.d/98-hf-cache.sh
+```
+
+Models and tokenizers are cached under `~/.cache/huggingface/hub` and will be reused automatically.
+
+---
+
+### Weights & Tokenizer Prefetch (Qwen2.5-7B-Instruct)
+
+To pre-download the text-only Instruct variant (not long-context/multimodal) and its tokenizer into the cache:
+
+```bash
+conda activate one-shot-em
+python - <<'PY'
+from huggingface_hub import snapshot_download
+repo = "Qwen/Qwen2.5-7B-Instruct"
+# First grab tokenizer-related small files (fast verification)
+snapshot_download(repo_id=repo, allow_patterns=[
+    "tokenizer*","vocab*","merges*",
+    "special_tokens_map.json","tokenizer.json",
+    "tokenizer_config.json","tokenizer.model",
+], resume_download=True)
+# Then optionally grab the full snapshot (large download; resumes automatically)
+snapshot_download(repo_id=repo, resume_download=True)
+PY
+```
+
+---
+
+### Accelerate Configuration
+
+We keep a default Accelerate config at:
+
+```
+configs/accelerate/default_config.yaml
+```
+
+This is a placeholder you can modify with `accelerate config` for multi-GPU runs later.
+
+---
+
+### Weights & Biases (W&B)
+
+By default, W&B is disabled in the `one-shot-em` environment. To enable it, unset `WANDB_DISABLED` (or set it to `false`) and ensure your API key is set, for example:
+
+```bash
+export WANDB_DISABLED=false
+export WANDB_API_KEY=... # your key
+```
+
+If you wish to keep it off (default), no action is required.
+
+---
+
+### One-click Self-check
+
+Run a comprehensive environment check (cache, model/tokenizer, W&B, Accelerate, GPU) and write a hardware snapshot to `docs/hardware.md`:
+
+```bash
+conda activate one-shot-em
+python scripts/self_check.py
+```
+
+To avoid writing the hardware snapshot (stdout only):
+
+```bash
+python scripts/self_check.py --no-write
+```
+
+The script will also verify that `Qwen/Qwen2.5-7B-Instruct` (text-only Instruct) is cached and loadable locally.
+
+---
+
 ### Acknowledgements
 
 Our dataset references and builds upon the following open-source contributions:
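Editor's note: the activation hook referenced in the Caching section is not included in this commit. A minimal sketch of what `$CONDA_PREFIX/etc/conda/activate.d/98-hf-cache.sh` might contain, assuming it simply exports the variables listed in the README above, could look like this (the `mkdir -p` line is an extra convenience, not confirmed by the commit):

```bash
# Assumed contents of 98-hf-cache.sh (not shown in this commit); it exports
# the Hugging Face cache variables documented in the README section above.
export HF_HOME="$HOME/.cache/huggingface"
export HF_DATASETS_CACHE="$HF_HOME/datasets"
export HF_HUB_CACHE="$HF_HOME/hub"
export TRANSFORMERS_CACHE="$HF_HUB_CACHE"
# Optionally create the cache directories up front (assumption, not part of the commit).
mkdir -p "$HF_DATASETS_CACHE" "$HF_HUB_CACHE"
```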
diff --git a/configs/accelerate/default_config.yaml b/configs/accelerate/default_config.yaml
new file mode 100644
index 0000000..f290654
--- /dev/null
+++ b/configs/accelerate/default_config.yaml
@@ -0,0 +1,6 @@
+# Accelerate default config placeholder (customize as needed)
+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+mixed_precision: no
+num_processes: 1
+use_cpu: false
diff --git a/docs/hardware.md b/docs/hardware.md
new file mode 100644
index 0000000..896c200
--- /dev/null
+++ b/docs/hardware.md
@@ -0,0 +1,27 @@
+# Hardware Snapshot
+
+- Timestamp (UTC): 2025-09-05T04:45:31Z
+- nvidia-smi -L:
+
+```
+GPU 0: NVIDIA RTX A5000 (UUID: GPU-3e3f0520-a6f9-4849-77df-b0f62f198605)
+GPU 1: NVIDIA RTX A5000 (UUID: GPU-836f796a-6a49-de71-9dd9-7e12e5480326)
+GPU 2: NVIDIA RTX A5000 (UUID: GPU-1b14b672-1b6e-538a-c468-88cbba83d439)
+GPU 3: NVIDIA RTX A5000 (UUID: GPU-7637f928-3232-8d26-7ed9-2647e8cbd83c)
+```
+
+- GPU name, driver, memory (from nvidia-smi):
+
+```
+NVIDIA RTX A5000, 535.183.01, 24564 MiB
+NVIDIA RTX A5000, 535.183.01, 24564 MiB
+NVIDIA RTX A5000, 535.183.01, 24564 MiB
+NVIDIA RTX A5000, 535.183.01, 24564 MiB
+```
+
+- nvcc --version (if available):
+
+```
+nvidia-smi header: Thu Sep 4 23:45:24 2025
+torch.version.cuda: 12.6
+```
diff --git a/scripts/self_check.py b/scripts/self_check.py
new file mode 100755
index 0000000..898a4a7
--- /dev/null
+++ b/scripts/self_check.py
@@ -0,0 +1,397 @@
+#!/usr/bin/env python3
+"""
+One-click environment self-check for the One-shot EM project.
+
+This script validates:
+1) Hugging Face cache variables and directories
+2) Presence of Qwen/Qwen2.5-7B-Instruct model and tokenizer in local cache
+3) Weights & Biases disablement via environment variables
+4) Accelerate configuration placeholder existence
+5) GPU visibility via nvidia-smi and PyTorch
+
+It also writes a concise hardware snapshot to docs/hardware.md and prints a
+human-readable report to stdout.
+
+Note: All code/comments are kept in English to follow project policy.
+""" + +from __future__ import annotations + +import dataclasses +import datetime as dt +import json +import os +import pathlib +import shutil +import subprocess +import sys +from typing import Dict, List, Optional, Tuple + + +REPO_ROOT = pathlib.Path(__file__).resolve().parents[1] +DOCS_DIR = REPO_ROOT / "docs" +HARDWARE_MD = DOCS_DIR / "hardware.md" +ACCELERATE_CFG = REPO_ROOT / "configs" / "accelerate" / "default_config.yaml" +MODEL_ID = "Qwen/Qwen2.5-7B-Instruct" + + +def run_cmd(cmd: List[str]) -> Tuple[int, str, str]: + """Run a command and return (code, stdout, stderr).""" + try: + proc = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + text=True, + ) + return proc.returncode, proc.stdout.strip(), proc.stderr.strip() + except FileNotFoundError as exc: + return 127, "", str(exc) + + +def check_env_vars() -> Dict[str, Optional[str]]: + keys = [ + "HF_HOME", + "HF_DATASETS_CACHE", + "HF_HUB_CACHE", + "TRANSFORMERS_CACHE", + "WANDB_DISABLED", + "WANDB_MODE", + "WANDB_SILENT", + "CONDA_PREFIX", + ] + return {k: os.environ.get(k) for k in keys} + + +@dataclasses.dataclass +class HFCaches: + hf_home: Optional[str] + datasets_cache: Optional[str] + hub_cache: Optional[str] + transformers_cache: Optional[str] + + +def ensure_dirs(paths: List[str]) -> List[Tuple[str, bool]]: + results: List[Tuple[str, bool]] = [] + for p in paths: + if not p: + results.append((p, False)) + continue + try: + pathlib.Path(p).mkdir(parents=True, exist_ok=True) + results.append((p, True)) + except Exception: + results.append((p, False)) + return results + + +def check_hf_caches() -> Tuple[HFCaches, List[Tuple[str, bool]]]: + env = check_env_vars() + caches = HFCaches( + hf_home=env.get("HF_HOME"), + datasets_cache=env.get("HF_DATASETS_CACHE"), + hub_cache=env.get("HF_HUB_CACHE"), + transformers_cache=env.get("TRANSFORMERS_CACHE"), + ) + ensured = ensure_dirs( + [ + caches.hf_home or "", + caches.datasets_cache or "", + caches.hub_cache or "", + ] + ) + return caches, ensured + + +@dataclasses.dataclass +class ModelCheck: + tokenizer_cached: bool + model_cached: bool + tokenizer_loadable: bool + model_loadable: bool + snapshot_dir: Optional[str] + error: Optional[str] + + +def check_model_and_tokenizer(model_id: str = MODEL_ID) -> ModelCheck: + tokenizer_cached = False + model_cached = False + tokenizer_loadable = False + model_loadable = False + snapshot_dir: Optional[str] = None + error: Optional[str] = None + + try: + from huggingface_hub import snapshot_download # type: ignore + # Tokenizer presence (local only) + try: + snapshot_download( + repo_id=model_id, + allow_patterns=[ + "tokenizer*", + "vocab*", + "merges*", + "special_tokens_map.json", + "tokenizer.json", + "tokenizer_config.json", + "tokenizer.model", + ], + local_files_only=True, + ) + tokenizer_cached = True + except Exception: + tokenizer_cached = False + + # Full snapshot presence (local only) + try: + snapshot_dir = snapshot_download( + repo_id=model_id, + local_files_only=True, + ) + model_cached = True + except Exception: + model_cached = False + + # Loadability via transformers (local only) + try: + from transformers import AutoTokenizer, AutoModelForCausalLM # type: ignore + try: + _ = AutoTokenizer.from_pretrained(model_id, local_files_only=True) + tokenizer_loadable = True + except Exception: + tokenizer_loadable = False + + try: + _ = AutoModelForCausalLM.from_pretrained(model_id, local_files_only=True) + model_loadable = True + except Exception: + model_loadable = False + except Exception as 
exc: + # transformers not available or other error + error = f"transformers check failed: {exc}" + except Exception as exc: + error = f"huggingface_hub check failed: {exc}" + + return ModelCheck( + tokenizer_cached=tokenizer_cached, + model_cached=model_cached, + tokenizer_loadable=tokenizer_loadable, + model_loadable=model_loadable, + snapshot_dir=snapshot_dir, + error=error, + ) + + +@dataclasses.dataclass +class AccelerateCheck: + config_exists: bool + cli_available: bool + + +def check_accelerate() -> AccelerateCheck: + cfg_exists = ACCELERATE_CFG.exists() + code, _, _ = run_cmd(["bash", "-lc", "command -v accelerate >/dev/null 2>&1 && echo OK || true"]) + cli_available = (code == 0) + return AccelerateCheck(config_exists=cfg_exists, cli_available=cli_available) + + +@dataclasses.dataclass +class WandbCheck: + disabled: bool + mode_offline: bool + silent: bool + + +def check_wandb() -> WandbCheck: + env = check_env_vars() + disabled = str(env.get("WANDB_DISABLED", "")).lower() in {"1", "true", "yes"} + mode_offline = str(env.get("WANDB_MODE", "")).lower() == "offline" + silent = str(env.get("WANDB_SILENT", "")).lower() in {"1", "true", "yes"} + return WandbCheck(disabled=disabled, mode_offline=mode_offline, silent=silent) + + +@dataclasses.dataclass +class GpuCheck: + nvidia_smi_L: str + nvidia_smi_query: str + nvcc_version: str + torch_cuda_available: Optional[bool] + torch_num_devices: Optional[int] + torch_device0_name: Optional[str] + + +def _detect_cuda_info() -> str: + """Return a multiline string describing CUDA toolchain versions. + + Tries in order: + - nvcc --version / -V + - /usr/local/cuda/version.txt + - nvidia-smi header line (CUDA Version: X.Y) + - torch.version.cuda + """ + parts: List[str] = [] + + # nvcc --version + for cmd in [ + "nvcc --version", + "nvcc -V", + ]: + code, out, _ = run_cmd(["bash", "-lc", f"{cmd} 2>/dev/null || true"]) + if out: + parts.append(f"{cmd}:\n{out}") + break + + # /usr/local/cuda/version.txt + try: + p = pathlib.Path("/usr/local/cuda/version.txt") + if p.exists(): + txt = p.read_text().strip() + if txt: + parts.append(f"/usr/local/cuda/version.txt: {txt}") + except Exception: + pass + + # nvidia-smi header + _, smi_head, _ = run_cmd(["bash", "-lc", "nvidia-smi 2>/dev/null | head -n 1 || true"]) + if smi_head: + parts.append(f"nvidia-smi header: {smi_head}") + + # torch.version.cuda + try: + import torch # type: ignore + + if getattr(torch, "version", None) is not None: + cuda_v = getattr(torch.version, "cuda", None) + if cuda_v: + parts.append(f"torch.version.cuda: {cuda_v}") + except Exception: + pass + + return "\n".join(parts).strip() + + +def check_gpu() -> GpuCheck: + _, smi_L, _ = run_cmd(["bash", "-lc", "nvidia-smi -L 2>/dev/null || true"]) + _, smi_Q, _ = run_cmd([ + "bash", + "-lc", + "nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader 2>/dev/null || true", + ]) + nvcc_v = _detect_cuda_info() + + torch_available = None + torch_devices = None + torch_dev0 = None + try: + import torch # type: ignore + + torch_available = bool(torch.cuda.is_available()) + torch_devices = int(torch.cuda.device_count()) + if torch_available and torch_devices and torch_devices > 0: + torch_dev0 = torch.cuda.get_device_name(0) + except Exception: + pass + + return GpuCheck( + nvidia_smi_L=smi_L, + nvidia_smi_query=smi_Q, + nvcc_version=nvcc_v, + torch_cuda_available=torch_available, + torch_num_devices=torch_devices, + torch_device0_name=torch_dev0, + ) + + +def write_hardware_md(gpu: GpuCheck) -> None: + 
DOCS_DIR.mkdir(parents=True, exist_ok=True) + ts = dt.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") + content = ( + "# Hardware Snapshot\n\n" + f"- Timestamp (UTC): {ts}\n" + "- nvidia-smi -L:\n\n" + "```\n" + f"{gpu.nvidia_smi_L}\n" + "```\n\n" + "- GPU name, driver, memory (from nvidia-smi):\n\n" + "```\n" + f"{gpu.nvidia_smi_query}\n" + "```\n\n" + "- nvcc --version (if available):\n\n" + "```\n" + f"{gpu.nvcc_version}\n" + "```\n" + ) + HARDWARE_MD.write_text(content) + + +def build_report(env: Dict[str, Optional[str]], caches: HFCaches, ensured: List[Tuple[str, bool]], + model: ModelCheck, acc: AccelerateCheck, wb: WandbCheck, gpu: GpuCheck) -> str: + lines: List[str] = [] + lines.append("=== Self-check Report ===") + lines.append("") + lines.append("[Environment]") + lines.append(f"CONDA_PREFIX: {env.get('CONDA_PREFIX')}") + lines.append("") + lines.append("[Hugging Face Cache]") + lines.append(f"HF_HOME: {caches.hf_home}") + lines.append(f"HF_DATASETS_CACHE: {caches.datasets_cache}") + lines.append(f"HF_HUB_CACHE: {caches.hub_cache}") + lines.append(f"TRANSFORMERS_CACHE: {caches.transformers_cache}") + for path, ok in ensured: + lines.append(f"ensure_dir {path!r}: {'OK' if ok else 'FAIL'}") + lines.append("") + lines.append("[Model/Tokenizer: Qwen/Qwen2.5-7B-Instruct]") + lines.append(f"tokenizer_cached: {model.tokenizer_cached}") + lines.append(f"model_cached: {model.model_cached}") + lines.append(f"tokenizer_loadable: {model.tokenizer_loadable}") + lines.append(f"model_loadable: {model.model_loadable}") + lines.append(f"snapshot_dir: {model.snapshot_dir}") + lines.append(f"error: {model.error}") + lines.append("") + lines.append("[Accelerate]") + lines.append(f"config_exists: {acc.config_exists} path={ACCELERATE_CFG}") + lines.append(f"cli_available: {acc.cli_available}") + lines.append("") + lines.append("[Weights & Biases]") + lines.append(f"WANDB_DISABLED: {wb.disabled}") + lines.append(f"WANDB_MODE_offline: {wb.mode_offline}") + lines.append(f"WANDB_SILENT: {wb.silent}") + lines.append("") + lines.append("[GPU]") + lines.append(f"nvidia-smi -L:\n{gpu.nvidia_smi_L}") + lines.append(f"query (name,driver,mem):\n{gpu.nvidia_smi_query}") + lines.append(f"nvcc: {gpu.nvcc_version}") + lines.append( + f"torch cuda_available={gpu.torch_cuda_available} num_devices={gpu.torch_num_devices} dev0={gpu.torch_device0_name}" + ) + return "\n".join(lines) + + +def main(argv: List[str]) -> int: + write_doc = True + if "--no-write" in argv: + write_doc = False + + env = check_env_vars() + caches, ensured = check_hf_caches() + model = check_model_and_tokenizer(MODEL_ID) + acc = check_accelerate() + wb = check_wandb() + gpu = check_gpu() + + if write_doc: + try: + write_hardware_md(gpu) + except Exception as exc: + print(f"Failed to write {HARDWARE_MD}: {exc}", file=sys.stderr) + + report = build_report(env, caches, ensured, model, acc, wb, gpu) + print(report) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) + + |
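Editor's note: taken together, a typical first-run sequence with the pieces added in this commit might look like the sketch below. The commands mirror the README sections above; the `train.py` entry point in the last line is a hypothetical placeholder, not a script added by this commit.

```bash
# Sketch of an end-to-end first run (commands follow the README sections above).
conda activate one-shot-em

# Prefetch the Qwen2.5-7B-Instruct tokenizer and weights into ~/.cache/huggingface/hub.
python - <<'PY'
from huggingface_hub import snapshot_download
snapshot_download(repo_id="Qwen/Qwen2.5-7B-Instruct", resume_download=True)
PY

# Verify caches, model/tokenizer, W&B, Accelerate, and GPUs; writes docs/hardware.md.
python scripts/self_check.py

# Later, a multi-GPU run could reuse the placeholder Accelerate config
# ("train.py" is a hypothetical entry point, not part of this commit).
accelerate launch --config_file configs/accelerate/default_config.yaml train.py
```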
