"""Diagnostic script: logs RAM/GPU memory while loading the local Qwen3 embedding model and running a small forward pass."""

import gc
import sys

import psutil
import torch
import yaml
from transformers import AutoModel, AutoTokenizer


def log_mem(msg):
    # Report process RSS and, if available, allocated/reserved GPU memory.
    mem = psutil.Process().memory_info().rss / (1024**3)
    if torch.cuda.is_available():
        gpu = torch.cuda.memory_allocated() / (1024**3)
        gpu_res = torch.cuda.memory_reserved() / (1024**3)
        print(f"[{msg}] RAM: {mem:.2f}GB | GPU Alloc: {gpu:.2f}GB | GPU Res: {gpu_res:.2f}GB")
    else:
        print(f"[{msg}] RAM: {mem:.2f}GB | GPU: N/A")
    sys.stdout.flush()


def main():
    print("--- Diagnostic Script ---")
    log_mem("Start")

    model_path = "models/qwen3-embedding-8b"
    print(f"Model path: {model_path}")

    # Check config
    try:
        with open("configs/local_models.yaml", "r") as f:
            cfg = yaml.safe_load(f)
        print("Config loaded from local_models.yaml:")
        print(cfg["models"]["embedding"]["qwen3"])
    except Exception as e:
        print(f"Could not load config: {e}")

    # Explicit garbage collection before loading
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    log_mem("Pre-Load")

    print("Loading Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=False)
    log_mem("Tokenizer Loaded")

    print("Loading Model (trust_remote_code=False)...")
    try:
        # low_cpu_mem_usage=True is passed explicitly, although device_map loading usually enables it anyway
        model = AutoModel.from_pretrained(
            model_path,
            device_map="cuda:0",
            torch_dtype=torch.bfloat16,
            trust_remote_code=False,
            low_cpu_mem_usage=True,
        )
        print("Model loaded successfully.")
    except Exception as e:
        print(f"Model load failed: {e}")
        return

    log_mem("Model Loaded")

    print("Testing forward pass with small input...")
    input_text = "Hello world"
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda:0")
    try:
        with torch.no_grad():
            outputs = model(**inputs)
        print("Forward pass success.")
        print(f"Output shape: {outputs.last_hidden_state.shape}")
    except Exception as e:
        print(f"Forward pass failed: {e}")

    log_mem("End")


if __name__ == "__main__":
    main()