Diffstat (limited to 'scripts/self_check.py')
-rwxr-xr-x  scripts/self_check.py  397
1 file changed, 397 insertions, 0 deletions
diff --git a/scripts/self_check.py b/scripts/self_check.py
new file mode 100755
index 0000000..898a4a7
--- /dev/null
+++ b/scripts/self_check.py
@@ -0,0 +1,397 @@
+#!/usr/bin/env python3
+"""
+One-click environment self-check for the One-shot EM project.
+
+This script validates:
+1) Hugging Face cache variables and directories
+2) Presence of Qwen/Qwen2.5-7B-Instruct model and tokenizer in local cache
+3) Weights & Biases disablement via environment variables
+4) Accelerate configuration placeholder existence
+5) GPU visibility via nvidia-smi and PyTorch
+
+It also writes a concise hardware snapshot to docs/hardware.md and prints a
+human-readable report to stdout.
+
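+Usage:
+    python scripts/self_check.py [--no-write]
+
+The optional --no-write flag skips writing docs/hardware.md.
+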
+Note: All code/comments are kept in English to follow project policy.
+"""
+
+from __future__ import annotations
+
+import dataclasses
+import datetime as dt
+import json
+import os
+import pathlib
+import shutil
+import subprocess
+import sys
+from typing import Dict, List, Optional, Tuple
+
+
+REPO_ROOT = pathlib.Path(__file__).resolve().parents[1]
+DOCS_DIR = REPO_ROOT / "docs"
+HARDWARE_MD = DOCS_DIR / "hardware.md"
+ACCELERATE_CFG = REPO_ROOT / "configs" / "accelerate" / "default_config.yaml"
+MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
+
+
+def run_cmd(cmd: List[str]) -> Tuple[int, str, str]:
+ """Run a command and return (code, stdout, stderr)."""
+ try:
+ proc = subprocess.run(
+ cmd,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ check=False,
+ text=True,
+ )
+ return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
+ except FileNotFoundError as exc:
+ return 127, "", str(exc)
+
+
+def check_env_vars() -> Dict[str, Optional[str]]:
+ keys = [
+ "HF_HOME",
+ "HF_DATASETS_CACHE",
+ "HF_HUB_CACHE",
+ "TRANSFORMERS_CACHE",
+ "WANDB_DISABLED",
+ "WANDB_MODE",
+ "WANDB_SILENT",
+ "CONDA_PREFIX",
+ ]
+ return {k: os.environ.get(k) for k in keys}
+
+
+@dataclasses.dataclass
+class HFCaches:
+ hf_home: Optional[str]
+ datasets_cache: Optional[str]
+ hub_cache: Optional[str]
+ transformers_cache: Optional[str]
+
+
+def ensure_dirs(paths: List[str]) -> List[Tuple[str, bool]]:
+ results: List[Tuple[str, bool]] = []
+ for p in paths:
+ if not p:
+ results.append((p, False))
+ continue
+ try:
+ pathlib.Path(p).mkdir(parents=True, exist_ok=True)
+ results.append((p, True))
+ except Exception:
+ results.append((p, False))
+ return results
+
+
+def check_hf_caches() -> Tuple[HFCaches, List[Tuple[str, bool]]]:
+ env = check_env_vars()
+ caches = HFCaches(
+ hf_home=env.get("HF_HOME"),
+ datasets_cache=env.get("HF_DATASETS_CACHE"),
+ hub_cache=env.get("HF_HUB_CACHE"),
+ transformers_cache=env.get("TRANSFORMERS_CACHE"),
+ )
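+    # TRANSFORMERS_CACHE is reported but not auto-created; recent transformers
+    # releases deprecate it in favour of HF_HOME.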
+ ensured = ensure_dirs(
+ [
+ caches.hf_home or "",
+ caches.datasets_cache or "",
+ caches.hub_cache or "",
+ ]
+ )
+ return caches, ensured
+
+
+@dataclasses.dataclass
+class ModelCheck:
+ tokenizer_cached: bool
+ model_cached: bool
+ tokenizer_loadable: bool
+ model_loadable: bool
+ snapshot_dir: Optional[str]
+ error: Optional[str]
+
+
+def check_model_and_tokenizer(model_id: str = MODEL_ID) -> ModelCheck:
+ tokenizer_cached = False
+ model_cached = False
+ tokenizer_loadable = False
+ model_loadable = False
+ snapshot_dir: Optional[str] = None
+ error: Optional[str] = None
+
+ try:
+ from huggingface_hub import snapshot_download # type: ignore
+ # Tokenizer presence (local only)
+ try:
+ snapshot_download(
+ repo_id=model_id,
+ allow_patterns=[
+ "tokenizer*",
+ "vocab*",
+ "merges*",
+ "special_tokens_map.json",
+ "tokenizer.json",
+ "tokenizer_config.json",
+ "tokenizer.model",
+ ],
+ local_files_only=True,
+ )
+ tokenizer_cached = True
+ except Exception:
+ tokenizer_cached = False
+
+ # Full snapshot presence (local only)
+ try:
+ snapshot_dir = snapshot_download(
+ repo_id=model_id,
+ local_files_only=True,
+ )
+ model_cached = True
+ except Exception:
+ model_cached = False
+
+ # Loadability via transformers (local only)
+ try:
+ from transformers import AutoTokenizer, AutoModelForCausalLM # type: ignore
+ try:
+ _ = AutoTokenizer.from_pretrained(model_id, local_files_only=True)
+ tokenizer_loadable = True
+ except Exception:
+ tokenizer_loadable = False
+
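+            # Note: loading the full checkpoint pulls all weights into host
+            # memory (tens of GB at float32 for a 7B model), so this step can
+            # be slow on machines with limited RAM.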
+ try:
+ _ = AutoModelForCausalLM.from_pretrained(model_id, local_files_only=True)
+ model_loadable = True
+ except Exception:
+ model_loadable = False
+ except Exception as exc:
+ # transformers not available or other error
+ error = f"transformers check failed: {exc}"
+ except Exception as exc:
+ error = f"huggingface_hub check failed: {exc}"
+
+ return ModelCheck(
+ tokenizer_cached=tokenizer_cached,
+ model_cached=model_cached,
+ tokenizer_loadable=tokenizer_loadable,
+ model_loadable=model_loadable,
+ snapshot_dir=snapshot_dir,
+ error=error,
+ )
+
+
+@dataclasses.dataclass
+class AccelerateCheck:
+ config_exists: bool
+ cli_available: bool
+
+
+def check_accelerate() -> AccelerateCheck:
+ cfg_exists = ACCELERATE_CFG.exists()
+    # shutil.which returns None when the accelerate CLI is not on PATH.
+    cli_available = shutil.which("accelerate") is not None
+ return AccelerateCheck(config_exists=cfg_exists, cli_available=cli_available)
+
+
+@dataclasses.dataclass
+class WandbCheck:
+ disabled: bool
+ mode_offline: bool
+ silent: bool
+
+
+def check_wandb() -> WandbCheck:
+ env = check_env_vars()
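+    # WANDB_DISABLED or WANDB_MODE=offline keeps runs off the wandb servers;
+    # WANDB_SILENT only suppresses console output.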
+ disabled = str(env.get("WANDB_DISABLED", "")).lower() in {"1", "true", "yes"}
+ mode_offline = str(env.get("WANDB_MODE", "")).lower() == "offline"
+ silent = str(env.get("WANDB_SILENT", "")).lower() in {"1", "true", "yes"}
+ return WandbCheck(disabled=disabled, mode_offline=mode_offline, silent=silent)
+
+
+@dataclasses.dataclass
+class GpuCheck:
+ nvidia_smi_L: str
+ nvidia_smi_query: str
+ nvcc_version: str
+ torch_cuda_available: Optional[bool]
+ torch_num_devices: Optional[int]
+ torch_device0_name: Optional[str]
+
+
+def _detect_cuda_info() -> str:
+ """Return a multiline string describing CUDA toolchain versions.
+
+ Tries in order:
+ - nvcc --version / -V
+ - /usr/local/cuda/version.txt
+ - nvidia-smi header line (CUDA Version: X.Y)
+ - torch.version.cuda
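+
+    Note that the CUDA version reported in the nvidia-smi header is the
+    maximum runtime version supported by the driver, not necessarily an
+    installed toolkit.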
+ """
+ parts: List[str] = []
+
+ # nvcc --version
+ for cmd in [
+ "nvcc --version",
+ "nvcc -V",
+ ]:
+ code, out, _ = run_cmd(["bash", "-lc", f"{cmd} 2>/dev/null || true"])
+ if out:
+ parts.append(f"{cmd}:\n{out}")
+ break
+
+ # /usr/local/cuda/version.txt
+ try:
+ p = pathlib.Path("/usr/local/cuda/version.txt")
+ if p.exists():
+ txt = p.read_text().strip()
+ if txt:
+ parts.append(f"/usr/local/cuda/version.txt: {txt}")
+ except Exception:
+ pass
+
+ # nvidia-smi header
+ _, smi_head, _ = run_cmd(["bash", "-lc", "nvidia-smi 2>/dev/null | head -n 1 || true"])
+ if smi_head:
+ parts.append(f"nvidia-smi header: {smi_head}")
+
+ # torch.version.cuda
+ try:
+ import torch # type: ignore
+
+ if getattr(torch, "version", None) is not None:
+ cuda_v = getattr(torch.version, "cuda", None)
+ if cuda_v:
+ parts.append(f"torch.version.cuda: {cuda_v}")
+ except Exception:
+ pass
+
+ return "\n".join(parts).strip()
+
+
+def check_gpu() -> GpuCheck:
+ _, smi_L, _ = run_cmd(["bash", "-lc", "nvidia-smi -L 2>/dev/null || true"])
+ _, smi_Q, _ = run_cmd([
+ "bash",
+ "-lc",
+ "nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader 2>/dev/null || true",
+ ])
+ nvcc_v = _detect_cuda_info()
+
+ torch_available = None
+ torch_devices = None
+ torch_dev0 = None
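+    # torch is optional here: if it is missing or CUDA initialisation fails,
+    # the torch_* fields simply remain None.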
+ try:
+ import torch # type: ignore
+
+ torch_available = bool(torch.cuda.is_available())
+ torch_devices = int(torch.cuda.device_count())
+ if torch_available and torch_devices and torch_devices > 0:
+ torch_dev0 = torch.cuda.get_device_name(0)
+ except Exception:
+ pass
+
+ return GpuCheck(
+ nvidia_smi_L=smi_L,
+ nvidia_smi_query=smi_Q,
+ nvcc_version=nvcc_v,
+ torch_cuda_available=torch_available,
+ torch_num_devices=torch_devices,
+ torch_device0_name=torch_dev0,
+ )
+
+
+def write_hardware_md(gpu: GpuCheck) -> None:
+ DOCS_DIR.mkdir(parents=True, exist_ok=True)
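+    # The snapshot is overwritten on every run; it records the current state.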
+    # Use a timezone-aware timestamp (datetime.utcnow is deprecated in 3.12+).
+    ts = dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+ content = (
+ "# Hardware Snapshot\n\n"
+ f"- Timestamp (UTC): {ts}\n"
+ "- nvidia-smi -L:\n\n"
+ "```\n"
+ f"{gpu.nvidia_smi_L}\n"
+ "```\n\n"
+ "- GPU name, driver, memory (from nvidia-smi):\n\n"
+ "```\n"
+ f"{gpu.nvidia_smi_query}\n"
+ "```\n\n"
+ "- nvcc --version (if available):\n\n"
+ "```\n"
+ f"{gpu.nvcc_version}\n"
+ "```\n"
+ )
+ HARDWARE_MD.write_text(content)
+
+
+def build_report(env: Dict[str, Optional[str]], caches: HFCaches, ensured: List[Tuple[str, bool]],
+ model: ModelCheck, acc: AccelerateCheck, wb: WandbCheck, gpu: GpuCheck) -> str:
+ lines: List[str] = []
+ lines.append("=== Self-check Report ===")
+ lines.append("")
+ lines.append("[Environment]")
+ lines.append(f"CONDA_PREFIX: {env.get('CONDA_PREFIX')}")
+ lines.append("")
+ lines.append("[Hugging Face Cache]")
+ lines.append(f"HF_HOME: {caches.hf_home}")
+ lines.append(f"HF_DATASETS_CACHE: {caches.datasets_cache}")
+ lines.append(f"HF_HUB_CACHE: {caches.hub_cache}")
+ lines.append(f"TRANSFORMERS_CACHE: {caches.transformers_cache}")
+ for path, ok in ensured:
+ lines.append(f"ensure_dir {path!r}: {'OK' if ok else 'FAIL'}")
+ lines.append("")
+ lines.append("[Model/Tokenizer: Qwen/Qwen2.5-7B-Instruct]")
+ lines.append(f"tokenizer_cached: {model.tokenizer_cached}")
+ lines.append(f"model_cached: {model.model_cached}")
+ lines.append(f"tokenizer_loadable: {model.tokenizer_loadable}")
+ lines.append(f"model_loadable: {model.model_loadable}")
+ lines.append(f"snapshot_dir: {model.snapshot_dir}")
+ lines.append(f"error: {model.error}")
+ lines.append("")
+ lines.append("[Accelerate]")
+ lines.append(f"config_exists: {acc.config_exists} path={ACCELERATE_CFG}")
+ lines.append(f"cli_available: {acc.cli_available}")
+ lines.append("")
+ lines.append("[Weights & Biases]")
+ lines.append(f"WANDB_DISABLED: {wb.disabled}")
+ lines.append(f"WANDB_MODE_offline: {wb.mode_offline}")
+ lines.append(f"WANDB_SILENT: {wb.silent}")
+ lines.append("")
+ lines.append("[GPU]")
+ lines.append(f"nvidia-smi -L:\n{gpu.nvidia_smi_L}")
+ lines.append(f"query (name,driver,mem):\n{gpu.nvidia_smi_query}")
+ lines.append(f"nvcc: {gpu.nvcc_version}")
+ lines.append(
+ f"torch cuda_available={gpu.torch_cuda_available} num_devices={gpu.torch_num_devices} dev0={gpu.torch_device0_name}"
+ )
+ return "\n".join(lines)
+
+
+def main(argv: List[str]) -> int:
+ write_doc = True
+ if "--no-write" in argv:
+ write_doc = False
+
+ env = check_env_vars()
+ caches, ensured = check_hf_caches()
+ model = check_model_and_tokenizer(MODEL_ID)
+ acc = check_accelerate()
+ wb = check_wandb()
+ gpu = check_gpu()
+
+ if write_doc:
+ try:
+ write_hardware_md(gpu)
+ except Exception as exc:
+ print(f"Failed to write {HARDWARE_MD}: {exc}", file=sys.stderr)
+
+ report = build_report(env, caches, ensured, model, acc, wb, gpu)
+ print(report)
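+    # Always exit 0: the printed report surfaces individual check failures.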
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main(sys.argv[1:]))
+
+