| author | Yuren Hao <yurenh2@timan108.cs.illinois.edu> | 2025-09-04 23:46:09 -0500 |
|---|---|---|
| committer | Yuren Hao <yurenh2@timan108.cs.illinois.edu> | 2025-09-04 23:46:09 -0500 |
| commit | 4cf7e11cbf4dd60a358f8dd452488b3158f32ad7 (patch) | |
| tree | f31fc3ba44bb2ea9029608698badb642f9f6eb44 /scripts/self_check.py | |
| parent | 07872bdf87383026e9880303ca80313387d03875 (diff) | |
some check scripts
Diffstat (limited to 'scripts/self_check.py')
| mode | path | lines added |
|---|---|---|
| -rwxr-xr-x | scripts/self_check.py | 397 |

1 file changed, 397 insertions, 0 deletions

```diff
diff --git a/scripts/self_check.py b/scripts/self_check.py
new file mode 100755
index 0000000..898a4a7
--- /dev/null
+++ b/scripts/self_check.py
@@ -0,0 +1,397 @@
+#!/usr/bin/env python3
+"""
+One-click environment self-check for the One-shot EM project.
+
+This script validates:
+1) Hugging Face cache variables and directories
+2) Presence of Qwen/Qwen2.5-7B-Instruct model and tokenizer in local cache
+3) Weights & Biases disablement via environment variables
+4) Accelerate configuration placeholder existence
+5) GPU visibility via nvidia-smi and PyTorch
+
+It also writes a concise hardware snapshot to docs/hardware.md and prints a
+human-readable report to stdout.
+
+Note: All code/comments are kept in English to follow project policy.
+"""
+
+from __future__ import annotations
+
+import dataclasses
+import datetime as dt
+import json
+import os
+import pathlib
+import shutil
+import subprocess
+import sys
+from typing import Dict, List, Optional, Tuple
+
+
+REPO_ROOT = pathlib.Path(__file__).resolve().parents[1]
+DOCS_DIR = REPO_ROOT / "docs"
+HARDWARE_MD = DOCS_DIR / "hardware.md"
+ACCELERATE_CFG = REPO_ROOT / "configs" / "accelerate" / "default_config.yaml"
+MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
+
+
+def run_cmd(cmd: List[str]) -> Tuple[int, str, str]:
+    """Run a command and return (code, stdout, stderr)."""
+    try:
+        proc = subprocess.run(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            check=False,
+            text=True,
+        )
+        return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
+    except FileNotFoundError as exc:
+        return 127, "", str(exc)
+
+
+def check_env_vars() -> Dict[str, Optional[str]]:
+    keys = [
+        "HF_HOME",
+        "HF_DATASETS_CACHE",
+        "HF_HUB_CACHE",
+        "TRANSFORMERS_CACHE",
+        "WANDB_DISABLED",
+        "WANDB_MODE",
+        "WANDB_SILENT",
+        "CONDA_PREFIX",
+    ]
+    return {k: os.environ.get(k) for k in keys}
+
+
+@dataclasses.dataclass
+class HFCaches:
+    hf_home: Optional[str]
+    datasets_cache: Optional[str]
+    hub_cache: Optional[str]
+    transformers_cache: Optional[str]
+
+
+def ensure_dirs(paths: List[str]) -> List[Tuple[str, bool]]:
+    results: List[Tuple[str, bool]] = []
+    for p in paths:
+        if not p:
+            results.append((p, False))
+            continue
+        try:
+            pathlib.Path(p).mkdir(parents=True, exist_ok=True)
+            results.append((p, True))
+        except Exception:
+            results.append((p, False))
+    return results
+
+
+def check_hf_caches() -> Tuple[HFCaches, List[Tuple[str, bool]]]:
+    env = check_env_vars()
+    caches = HFCaches(
+        hf_home=env.get("HF_HOME"),
+        datasets_cache=env.get("HF_DATASETS_CACHE"),
+        hub_cache=env.get("HF_HUB_CACHE"),
+        transformers_cache=env.get("TRANSFORMERS_CACHE"),
+    )
+    ensured = ensure_dirs(
+        [
+            caches.hf_home or "",
+            caches.datasets_cache or "",
+            caches.hub_cache or "",
+        ]
+    )
+    return caches, ensured
+
+
+@dataclasses.dataclass
+class ModelCheck:
+    tokenizer_cached: bool
+    model_cached: bool
+    tokenizer_loadable: bool
+    model_loadable: bool
+    snapshot_dir: Optional[str]
+    error: Optional[str]
+
+
+def check_model_and_tokenizer(model_id: str = MODEL_ID) -> ModelCheck:
+    tokenizer_cached = False
+    model_cached = False
+    tokenizer_loadable = False
+    model_loadable = False
+    snapshot_dir: Optional[str] = None
+    error: Optional[str] = None
+
+    try:
+        from huggingface_hub import snapshot_download  # type: ignore
+        # Tokenizer presence (local only)
+        try:
+            snapshot_download(
+                repo_id=model_id,
+                allow_patterns=[
+                    "tokenizer*",
+                    "vocab*",
+                    "merges*",
+                    "special_tokens_map.json",
+                    "tokenizer.json",
+                    "tokenizer_config.json",
+                    "tokenizer.model",
+                ],
+                local_files_only=True,
+            )
+            tokenizer_cached = True
+        except Exception:
+            tokenizer_cached = False
+
+        # Full snapshot presence (local only)
+        try:
+            snapshot_dir = snapshot_download(
+                repo_id=model_id,
+                local_files_only=True,
+            )
+            model_cached = True
+        except Exception:
+            model_cached = False
+
+        # Loadability via transformers (local only)
+        try:
+            from transformers import AutoTokenizer, AutoModelForCausalLM  # type: ignore
+            try:
+                _ = AutoTokenizer.from_pretrained(model_id, local_files_only=True)
+                tokenizer_loadable = True
+            except Exception:
+                tokenizer_loadable = False
+
+            try:
+                _ = AutoModelForCausalLM.from_pretrained(model_id, local_files_only=True)
+                model_loadable = True
+            except Exception:
+                model_loadable = False
+        except Exception as exc:
+            # transformers not available or other error
+            error = f"transformers check failed: {exc}"
+    except Exception as exc:
+        error = f"huggingface_hub check failed: {exc}"
+
+    return ModelCheck(
+        tokenizer_cached=tokenizer_cached,
+        model_cached=model_cached,
+        tokenizer_loadable=tokenizer_loadable,
+        model_loadable=model_loadable,
+        snapshot_dir=snapshot_dir,
+        error=error,
+    )
+
+
+@dataclasses.dataclass
+class AccelerateCheck:
+    config_exists: bool
+    cli_available: bool
+
+
+def check_accelerate() -> AccelerateCheck:
+    cfg_exists = ACCELERATE_CFG.exists()
+    code, _, _ = run_cmd(["bash", "-lc", "command -v accelerate >/dev/null 2>&1 && echo OK || true"])
+    cli_available = (code == 0)
+    return AccelerateCheck(config_exists=cfg_exists, cli_available=cli_available)
+
+
+@dataclasses.dataclass
+class WandbCheck:
+    disabled: bool
+    mode_offline: bool
+    silent: bool
+
+
+def check_wandb() -> WandbCheck:
+    env = check_env_vars()
+    disabled = str(env.get("WANDB_DISABLED", "")).lower() in {"1", "true", "yes"}
+    mode_offline = str(env.get("WANDB_MODE", "")).lower() == "offline"
+    silent = str(env.get("WANDB_SILENT", "")).lower() in {"1", "true", "yes"}
+    return WandbCheck(disabled=disabled, mode_offline=mode_offline, silent=silent)
+
+
+@dataclasses.dataclass
+class GpuCheck:
+    nvidia_smi_L: str
+    nvidia_smi_query: str
+    nvcc_version: str
+    torch_cuda_available: Optional[bool]
+    torch_num_devices: Optional[int]
+    torch_device0_name: Optional[str]
+
+
+def _detect_cuda_info() -> str:
+    """Return a multiline string describing CUDA toolchain versions.
+
+    Tries in order:
+    - nvcc --version / -V
+    - /usr/local/cuda/version.txt
+    - nvidia-smi header line (CUDA Version: X.Y)
+    - torch.version.cuda
+    """
+    parts: List[str] = []
+
+    # nvcc --version
+    for cmd in [
+        "nvcc --version",
+        "nvcc -V",
+    ]:
+        code, out, _ = run_cmd(["bash", "-lc", f"{cmd} 2>/dev/null || true"])
+        if out:
+            parts.append(f"{cmd}:\n{out}")
+            break
+
+    # /usr/local/cuda/version.txt
+    try:
+        p = pathlib.Path("/usr/local/cuda/version.txt")
+        if p.exists():
+            txt = p.read_text().strip()
+            if txt:
+                parts.append(f"/usr/local/cuda/version.txt: {txt}")
+    except Exception:
+        pass
+
+    # nvidia-smi header
+    _, smi_head, _ = run_cmd(["bash", "-lc", "nvidia-smi 2>/dev/null | head -n 1 || true"])
+    if smi_head:
+        parts.append(f"nvidia-smi header: {smi_head}")
+
+    # torch.version.cuda
+    try:
+        import torch  # type: ignore
+
+        if getattr(torch, "version", None) is not None:
+            cuda_v = getattr(torch.version, "cuda", None)
+            if cuda_v:
+                parts.append(f"torch.version.cuda: {cuda_v}")
+    except Exception:
+        pass
+
+    return "\n".join(parts).strip()
+
+
+def check_gpu() -> GpuCheck:
+    _, smi_L, _ = run_cmd(["bash", "-lc", "nvidia-smi -L 2>/dev/null || true"])
+    _, smi_Q, _ = run_cmd([
+        "bash",
+        "-lc",
+        "nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader 2>/dev/null || true",
+    ])
+    nvcc_v = _detect_cuda_info()
+
+    torch_available = None
+    torch_devices = None
+    torch_dev0 = None
+    try:
+        import torch  # type: ignore
+
+        torch_available = bool(torch.cuda.is_available())
+        torch_devices = int(torch.cuda.device_count())
+        if torch_available and torch_devices and torch_devices > 0:
+            torch_dev0 = torch.cuda.get_device_name(0)
+    except Exception:
+        pass
+
+    return GpuCheck(
+        nvidia_smi_L=smi_L,
+        nvidia_smi_query=smi_Q,
+        nvcc_version=nvcc_v,
+        torch_cuda_available=torch_available,
+        torch_num_devices=torch_devices,
+        torch_device0_name=torch_dev0,
+    )
+
+
+def write_hardware_md(gpu: GpuCheck) -> None:
+    DOCS_DIR.mkdir(parents=True, exist_ok=True)
+    ts = dt.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
+    content = (
+        "# Hardware Snapshot\n\n"
+        f"- Timestamp (UTC): {ts}\n"
+        "- nvidia-smi -L:\n\n"
+        "```\n"
+        f"{gpu.nvidia_smi_L}\n"
+        "```\n\n"
+        "- GPU name, driver, memory (from nvidia-smi):\n\n"
+        "```\n"
+        f"{gpu.nvidia_smi_query}\n"
+        "```\n\n"
+        "- nvcc --version (if available):\n\n"
+        "```\n"
+        f"{gpu.nvcc_version}\n"
+        "```\n"
+    )
+    HARDWARE_MD.write_text(content)
+
+
+def build_report(env: Dict[str, Optional[str]], caches: HFCaches, ensured: List[Tuple[str, bool]],
+                 model: ModelCheck, acc: AccelerateCheck, wb: WandbCheck, gpu: GpuCheck) -> str:
+    lines: List[str] = []
+    lines.append("=== Self-check Report ===")
+    lines.append("")
+    lines.append("[Environment]")
+    lines.append(f"CONDA_PREFIX: {env.get('CONDA_PREFIX')}")
+    lines.append("")
+    lines.append("[Hugging Face Cache]")
+    lines.append(f"HF_HOME: {caches.hf_home}")
+    lines.append(f"HF_DATASETS_CACHE: {caches.datasets_cache}")
+    lines.append(f"HF_HUB_CACHE: {caches.hub_cache}")
+    lines.append(f"TRANSFORMERS_CACHE: {caches.transformers_cache}")
+    for path, ok in ensured:
+        lines.append(f"ensure_dir {path!r}: {'OK' if ok else 'FAIL'}")
+    lines.append("")
+    lines.append("[Model/Tokenizer: Qwen/Qwen2.5-7B-Instruct]")
+    lines.append(f"tokenizer_cached: {model.tokenizer_cached}")
+    lines.append(f"model_cached: {model.model_cached}")
+    lines.append(f"tokenizer_loadable: {model.tokenizer_loadable}")
+    lines.append(f"model_loadable: {model.model_loadable}")
+    lines.append(f"snapshot_dir: {model.snapshot_dir}")
+    lines.append(f"error: {model.error}")
+    lines.append("")
+    lines.append("[Accelerate]")
+    lines.append(f"config_exists: {acc.config_exists} path={ACCELERATE_CFG}")
+    lines.append(f"cli_available: {acc.cli_available}")
+    lines.append("")
+    lines.append("[Weights & Biases]")
+    lines.append(f"WANDB_DISABLED: {wb.disabled}")
+    lines.append(f"WANDB_MODE_offline: {wb.mode_offline}")
+    lines.append(f"WANDB_SILENT: {wb.silent}")
+    lines.append("")
+    lines.append("[GPU]")
+    lines.append(f"nvidia-smi -L:\n{gpu.nvidia_smi_L}")
+    lines.append(f"query (name,driver,mem):\n{gpu.nvidia_smi_query}")
+    lines.append(f"nvcc: {gpu.nvcc_version}")
+    lines.append(
+        f"torch cuda_available={gpu.torch_cuda_available} num_devices={gpu.torch_num_devices} dev0={gpu.torch_device0_name}"
+    )
+    return "\n".join(lines)
+
+
+def main(argv: List[str]) -> int:
+    write_doc = True
+    if "--no-write" in argv:
+        write_doc = False
+
+    env = check_env_vars()
+    caches, ensured = check_hf_caches()
+    model = check_model_and_tokenizer(MODEL_ID)
+    acc = check_accelerate()
+    wb = check_wandb()
+    gpu = check_gpu()
+
+    if write_doc:
+        try:
+            write_hardware_md(gpu)
+        except Exception as exc:
+            print(f"Failed to write {HARDWARE_MD}: {exc}", file=sys.stderr)
+
+    report = build_report(env, caches, ensured, model, acc, wb, gpu)
+    print(report)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))
```
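
Usage note (not part of the commit): a minimal sketch of driving individual checks from Python, assuming the working directory is the repository root so that `scripts/self_check.py` is importable; the `sys.path` tweak and the chosen subset of checks are illustrative only.

```python
# Minimal sketch (assumption: run from the repo root so scripts/ is importable).
import sys

sys.path.insert(0, "scripts")  # hypothetical path tweak, not from the commit

import self_check

wb = self_check.check_wandb()   # reads the WANDB_* environment variables
gpu = self_check.check_gpu()    # shells out to nvidia-smi and probes torch
print(f"wandb disabled={wb.disabled} offline={wb.mode_offline}")
print(f"cuda available={gpu.torch_cuda_available} devices={gpu.torch_num_devices}")
```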

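For a full run, a hedged sketch of setting the environment variables the script inspects and invoking `main()` in-process with `--no-write`; the cache paths below are placeholders, not values taken from this commit.

```python
# Hedged sketch: placeholder paths, not values from the commit.
import os
import sys

os.environ.setdefault("WANDB_DISABLED", "true")            # checked by check_wandb()
os.environ.setdefault("WANDB_MODE", "offline")
os.environ.setdefault("HF_HOME", "/tmp/hf_home")           # placeholder cache root
os.environ.setdefault("HF_HUB_CACHE", "/tmp/hf_home/hub")  # placeholder hub cache

sys.path.insert(0, "scripts")  # hypothetical: make self_check importable
import self_check

raise SystemExit(self_check.main(["--no-write"]))  # print report, skip docs/hardware.md
```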