#!/usr/bin/env python3
"""
One-click environment self-check for the One-shot EM project.

This script validates:
1) Hugging Face cache variables and directories
2) Presence of the Qwen/Qwen2.5-7B-Instruct model and tokenizer in the local cache
3) Weights & Biases disablement via environment variables
4) Accelerate configuration placeholder existence
5) GPU visibility via nvidia-smi and PyTorch

It also writes a concise hardware snapshot to docs/hardware.md and prints a
human-readable report to stdout.

Note: All code/comments are kept in English to follow project policy.
"""

from __future__ import annotations

import dataclasses
import datetime as dt
import os
import pathlib
import shutil
import subprocess
import sys
from typing import Dict, List, Optional, Tuple

REPO_ROOT = pathlib.Path(__file__).resolve().parents[1]
DOCS_DIR = REPO_ROOT / "docs"
HARDWARE_MD = DOCS_DIR / "hardware.md"
ACCELERATE_CFG = REPO_ROOT / "configs" / "accelerate" / "default_config.yaml"
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"


def run_cmd(cmd: List[str]) -> Tuple[int, str, str]:
    """Run a command and return (code, stdout, stderr)."""
    try:
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
            text=True,
        )
        return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
    except FileNotFoundError as exc:
        return 127, "", str(exc)


def check_env_vars() -> Dict[str, Optional[str]]:
    keys = [
        "HF_HOME",
        "HF_DATASETS_CACHE",
        "HF_HUB_CACHE",
        "TRANSFORMERS_CACHE",
        "WANDB_DISABLED",
        "WANDB_MODE",
        "WANDB_SILENT",
        "CONDA_PREFIX",
    ]
    return {k: os.environ.get(k) for k in keys}


@dataclasses.dataclass
class HFCaches:
    hf_home: Optional[str]
    datasets_cache: Optional[str]
    hub_cache: Optional[str]
    transformers_cache: Optional[str]


def ensure_dirs(paths: List[str]) -> List[Tuple[str, bool]]:
    results: List[Tuple[str, bool]] = []
    for p in paths:
        if not p:
            results.append((p, False))
            continue
        try:
            pathlib.Path(p).mkdir(parents=True, exist_ok=True)
            results.append((p, True))
        except Exception:
            results.append((p, False))
    return results


def check_hf_caches() -> Tuple[HFCaches, List[Tuple[str, bool]]]:
    env = check_env_vars()
    caches = HFCaches(
        hf_home=env.get("HF_HOME"),
        datasets_cache=env.get("HF_DATASETS_CACHE"),
        hub_cache=env.get("HF_HUB_CACHE"),
        transformers_cache=env.get("TRANSFORMERS_CACHE"),
    )
    ensured = ensure_dirs(
        [
            caches.hf_home or "",
            caches.datasets_cache or "",
            caches.hub_cache or "",
        ]
    )
    return caches, ensured


@dataclasses.dataclass
class ModelCheck:
    tokenizer_cached: bool
    model_cached: bool
    tokenizer_loadable: bool
    model_loadable: bool
    snapshot_dir: Optional[str]
    error: Optional[str]


def check_model_and_tokenizer(model_id: str = MODEL_ID) -> ModelCheck:
    tokenizer_cached = False
    model_cached = False
    tokenizer_loadable = False
    model_loadable = False
    snapshot_dir: Optional[str] = None
    error: Optional[str] = None
    try:
        from huggingface_hub import snapshot_download  # type: ignore

        # Tokenizer presence (local only)
        try:
            snapshot_download(
                repo_id=model_id,
                allow_patterns=[
                    "tokenizer*",
                    "vocab*",
                    "merges*",
                    "special_tokens_map.json",
                    "tokenizer.json",
                    "tokenizer_config.json",
                    "tokenizer.model",
                ],
                local_files_only=True,
            )
            tokenizer_cached = True
        except Exception:
            tokenizer_cached = False

        # Full snapshot presence (local only)
        try:
            snapshot_dir = snapshot_download(
                repo_id=model_id,
                local_files_only=True,
            )
            model_cached = True
        except Exception:
            model_cached = False

        # Loadability via transformers (local only)
        try:
            from transformers import AutoTokenizer, AutoModelForCausalLM  # type: ignore

            try:
                _ = AutoTokenizer.from_pretrained(model_id, local_files_only=True)
                tokenizer_loadable = True
            except Exception:
                tokenizer_loadable = False
            try:
                _ = AutoModelForCausalLM.from_pretrained(model_id, local_files_only=True)
                model_loadable = True
            except Exception:
                model_loadable = False
        except Exception as exc:  # transformers not available or other error
            error = f"transformers check failed: {exc}"
    except Exception as exc:
        error = f"huggingface_hub check failed: {exc}"

    return ModelCheck(
        tokenizer_cached=tokenizer_cached,
        model_cached=model_cached,
        tokenizer_loadable=tokenizer_loadable,
        model_loadable=model_loadable,
        snapshot_dir=snapshot_dir,
        error=error,
    )


@dataclasses.dataclass
class AccelerateCheck:
    config_exists: bool
    cli_available: bool


def check_accelerate() -> AccelerateCheck:
    cfg_exists = ACCELERATE_CFG.exists()
    # Detect the accelerate CLI on PATH; shutil.which returns None when the
    # executable is not found.
    cli_available = shutil.which("accelerate") is not None
    return AccelerateCheck(config_exists=cfg_exists, cli_available=cli_available)


@dataclasses.dataclass
class WandbCheck:
    disabled: bool
    mode_offline: bool
    silent: bool


def check_wandb() -> WandbCheck:
    env = check_env_vars()
    disabled = str(env.get("WANDB_DISABLED", "")).lower() in {"1", "true", "yes"}
    mode_offline = str(env.get("WANDB_MODE", "")).lower() == "offline"
    silent = str(env.get("WANDB_SILENT", "")).lower() in {"1", "true", "yes"}
    return WandbCheck(disabled=disabled, mode_offline=mode_offline, silent=silent)


@dataclasses.dataclass
class GpuCheck:
    nvidia_smi_L: str
    nvidia_smi_query: str
    nvcc_version: str
    torch_cuda_available: Optional[bool]
    torch_num_devices: Optional[int]
    torch_device0_name: Optional[str]


def _detect_cuda_info() -> str:
    """Return a multiline string describing CUDA toolchain versions.

    Tries in order:
    - nvcc --version / -V
    - /usr/local/cuda/version.txt
    - nvidia-smi header line (CUDA Version: X.Y)
    - torch.version.cuda
    """
    parts: List[str] = []

    # nvcc --version
    for cmd in [
        "nvcc --version",
        "nvcc -V",
    ]:
        code, out, _ = run_cmd(["bash", "-lc", f"{cmd} 2>/dev/null || true"])
        if out:
            parts.append(f"{cmd}:\n{out}")
            break

    # /usr/local/cuda/version.txt
    try:
        p = pathlib.Path("/usr/local/cuda/version.txt")
        if p.exists():
            txt = p.read_text().strip()
            if txt:
                parts.append(f"/usr/local/cuda/version.txt: {txt}")
    except Exception:
        pass

    # nvidia-smi header
    _, smi_head, _ = run_cmd(["bash", "-lc", "nvidia-smi 2>/dev/null | head -n 1 || true"])
    if smi_head:
        parts.append(f"nvidia-smi header: {smi_head}")

    # torch.version.cuda
    try:
        import torch  # type: ignore

        if getattr(torch, "version", None) is not None:
            cuda_v = getattr(torch.version, "cuda", None)
            if cuda_v:
                parts.append(f"torch.version.cuda: {cuda_v}")
    except Exception:
        pass

    return "\n".join(parts).strip()


def check_gpu() -> GpuCheck:
    _, smi_L, _ = run_cmd(["bash", "-lc", "nvidia-smi -L 2>/dev/null || true"])
    _, smi_Q, _ = run_cmd([
        "bash",
        "-lc",
        "nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader 2>/dev/null || true",
    ])
    nvcc_v = _detect_cuda_info()

    torch_available = None
    torch_devices = None
    torch_dev0 = None
    try:
        import torch  # type: ignore

        torch_available = bool(torch.cuda.is_available())
        torch_devices = int(torch.cuda.device_count())
        if torch_available and torch_devices and torch_devices > 0:
            torch_dev0 = torch.cuda.get_device_name(0)
    except Exception:
        pass

    return GpuCheck(
        nvidia_smi_L=smi_L,
        nvidia_smi_query=smi_Q,
        nvcc_version=nvcc_v,
        torch_cuda_available=torch_available,
        torch_num_devices=torch_devices,
        torch_device0_name=torch_dev0,
    )

def write_hardware_md(gpu: GpuCheck) -> None:
    DOCS_DIR.mkdir(parents=True, exist_ok=True)
    ts = dt.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
    content = (
        "# Hardware Snapshot\n\n"
        f"- Timestamp (UTC): {ts}\n"
        "- nvidia-smi -L:\n\n"
        "```\n"
        f"{gpu.nvidia_smi_L}\n"
        "```\n\n"
        "- GPU name, driver, memory (from nvidia-smi):\n\n"
        "```\n"
        f"{gpu.nvidia_smi_query}\n"
        "```\n\n"
        "- nvcc --version (if available):\n\n"
        "```\n"
        f"{gpu.nvcc_version}\n"
        "```\n"
    )
    HARDWARE_MD.write_text(content)


def build_report(
    env: Dict[str, Optional[str]],
    caches: HFCaches,
    ensured: List[Tuple[str, bool]],
    model: ModelCheck,
    acc: AccelerateCheck,
    wb: WandbCheck,
    gpu: GpuCheck,
) -> str:
    lines: List[str] = []
    lines.append("=== Self-check Report ===")
    lines.append("")
    lines.append("[Environment]")
    lines.append(f"CONDA_PREFIX: {env.get('CONDA_PREFIX')}")
    lines.append("")
    lines.append("[Hugging Face Cache]")
    lines.append(f"HF_HOME: {caches.hf_home}")
    lines.append(f"HF_DATASETS_CACHE: {caches.datasets_cache}")
    lines.append(f"HF_HUB_CACHE: {caches.hub_cache}")
    lines.append(f"TRANSFORMERS_CACHE: {caches.transformers_cache}")
    for path, ok in ensured:
        lines.append(f"ensure_dir {path!r}: {'OK' if ok else 'FAIL'}")
    lines.append("")
    lines.append("[Model/Tokenizer: Qwen/Qwen2.5-7B-Instruct]")
    lines.append(f"tokenizer_cached: {model.tokenizer_cached}")
    lines.append(f"model_cached: {model.model_cached}")
    lines.append(f"tokenizer_loadable: {model.tokenizer_loadable}")
    lines.append(f"model_loadable: {model.model_loadable}")
    lines.append(f"snapshot_dir: {model.snapshot_dir}")
    lines.append(f"error: {model.error}")
    lines.append("")
    lines.append("[Accelerate]")
    lines.append(f"config_exists: {acc.config_exists} path={ACCELERATE_CFG}")
    lines.append(f"cli_available: {acc.cli_available}")
    lines.append("")
    lines.append("[Weights & Biases]")
    lines.append(f"WANDB_DISABLED: {wb.disabled}")
    lines.append(f"WANDB_MODE_offline: {wb.mode_offline}")
    lines.append(f"WANDB_SILENT: {wb.silent}")
    lines.append("")
    lines.append("[GPU]")
    lines.append(f"nvidia-smi -L:\n{gpu.nvidia_smi_L}")
    lines.append(f"query (name,driver,mem):\n{gpu.nvidia_smi_query}")
    lines.append(f"nvcc: {gpu.nvcc_version}")
    lines.append(
        f"torch cuda_available={gpu.torch_cuda_available} num_devices={gpu.torch_num_devices} dev0={gpu.torch_device0_name}"
    )
    return "\n".join(lines)


def main(argv: List[str]) -> int:
    write_doc = True
    if "--no-write" in argv:
        write_doc = False

    env = check_env_vars()
    caches, ensured = check_hf_caches()
    model = check_model_and_tokenizer(MODEL_ID)
    acc = check_accelerate()
    wb = check_wandb()
    gpu = check_gpu()

    if write_doc:
        try:
            write_hardware_md(gpu)
        except Exception as exc:
            print(f"Failed to write {HARDWARE_MD}: {exc}", file=sys.stderr)

    report = build_report(env, caches, ensured, model, acc, wb, gpu)
    print(report)
    return 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))
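
# Minimal usage sketch. It assumes this file sits one level below the
# repository root (e.g. as scripts/selfcheck.py); the filename and the example
# cache path below are illustrative placeholders, not fixed by this script.
#
#   export HF_HOME=/data/hf_cache              # example cache location
#   export WANDB_DISABLED=true
#   python scripts/selfcheck.py                # run all checks, write docs/hardware.md
#   python scripts/selfcheck.py --no-write     # run all checks, skip docs/hardware.md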