#!/usr/bin/env python3
"""
Evaluation script for the PersonaMem task: Base vs. Ours (User Vector).
Metric: accuracy (top-1 correct option).
"""
import os
import sys

import numpy as np
from tqdm import tqdm

# Add src to sys.path so the personalization package resolves.
sys.path.append(os.path.join(os.path.dirname(__file__), "../src"))

from personalization.config.settings import load_local_models_config
from personalization.data.personamem_loader import load_personamem_questions_32k
from personalization.models.embedding.qwen3_8b import Qwen3Embedding8B
from personalization.retrieval.preference_store.schemas import MemoryCard
from personalization.user_model.features import ItemProjection


def cosine_sim_matrix(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Cosine similarity between a ([d] or [N, d]) and b ([M, d]).

    Returns [M] when a is 1-D, otherwise [M, N].
    """
    norm_a = np.linalg.norm(a, axis=-1, keepdims=True)
    norm_b = np.linalg.norm(b, axis=-1, keepdims=True)
    # np.dot handles both cases: b [M, d] @ a [d] -> [M];
    # b [M, d] @ a.T [d, N] -> [M, N].
    dot = np.dot(b, a.T)
    denom = np.dot(norm_b, norm_a.T) + 1e-8
    sim = dot / denom
    # Collapse a trailing singleton axis so callers always get a flat
    # score vector for a single query.
    if sim.ndim == 2 and sim.shape[1] == 1:
        return sim.flatten()
    return sim


def dense_retrieval(
    e_q: np.ndarray, memory_embeddings: np.ndarray, topk: int = 5
) -> np.ndarray:
    """Return the indices of the topk memories most similar to the query."""
    if memory_embeddings.shape[0] == 0:
        return np.array([], dtype=int)
    sims = cosine_sim_matrix(e_q, memory_embeddings)
    k = min(topk, len(sims))
    return np.argsort(sims)[-k:][::-1]


def policy_retrieval(
    e_q: np.ndarray,
    memory_embeddings: np.ndarray,
    item_vectors: np.ndarray,
    z_user: np.ndarray,
    topk_dense: int = 20,
    topk_final: int = 5,
    beta: float = 0.2,
) -> np.ndarray:
    """Simulates retrieve_with_policy:

    1. Dense retrieval of topk_dense candidates.
    2. Policy scoring: s = s_base + beta * (z . v_m).
    3. Select topk_final.
    """
    if memory_embeddings.shape[0] == 0:
        return np.array([], dtype=int)

    # 1. Dense candidate generation.
    dense_idx = dense_retrieval(e_q, memory_embeddings, topk=topk_dense)
    if len(dense_idx) == 0:
        return np.array([], dtype=int)
    candidates_e = memory_embeddings[dense_idx]
    candidates_v = item_vectors[dense_idx]

    # 2. Base scores: Sim(q, m).
    base_scores = cosine_sim_matrix(e_q, candidates_e)

    # 3. Policy bonus: z [k] . v_m, batched as [K, k] @ [k] -> [K].
    bonus = np.dot(candidates_v, z_user)
    total_scores = base_scores + beta * bonus

    # 4. Final selection, mapped back to global memory indices.
    k = min(topk_final, len(total_scores))
    local_top_idx = np.argsort(total_scores)[-k:][::-1]
    return dense_idx[local_top_idx]
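
# A minimal sanity check for the two retrieval paths above (illustrative only,
# not part of the evaluation pipeline; the helper name, toy dimensions, and
# seed are our assumptions). With beta=0 the policy score equals the dense
# similarity, so policy_retrieval should return the same top-k as
# dense_retrieval whenever topk_dense >= topk_final; with beta > 0 the order
# may change but results stay within the dense candidate pool.
def _retrieval_sanity_check() -> None:
    rng = np.random.default_rng(0)
    d, k, n_mem = 8, 4, 50  # toy embedding dim, item dim, memory count
    e_q = rng.normal(size=d).astype(np.float32)
    mem_E = rng.normal(size=(n_mem, d)).astype(np.float32)
    mem_V = rng.normal(size=(n_mem, k)).astype(np.float32)
    z = rng.normal(size=k).astype(np.float32)

    dense_top = dense_retrieval(e_q, mem_E, topk=5)
    policy_top = policy_retrieval(
        e_q, mem_E, mem_V, z, topk_dense=20, topk_final=5, beta=0.0
    )
    assert np.array_equal(dense_top, policy_top), "beta=0 should reduce to dense"

    # With a nonzero bonus, outputs are a re-ranking of the dense pool.
    policy_beta = policy_retrieval(
        e_q, mem_E, mem_V, z, topk_dense=20, topk_final=5, beta=1.0
    )
    dense_pool = set(dense_retrieval(e_q, mem_E, topk=20).tolist())
    assert set(policy_beta.tolist()) <= dense_pool
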
def score_rag(
    e_q: np.ndarray, e_opts: np.ndarray, retrieved_embeddings: np.ndarray
) -> np.ndarray:
    """Score each option by query match AND memory match.

    Score = Sim(Q, Opt) + Max_over_memories(Sim(Memory, Opt))
    """
    # Query-option similarity: e_q [d] vs e_opts [4, d] -> [4].
    s_q_opt = cosine_sim_matrix(e_q, e_opts)
    if len(retrieved_embeddings) == 0:
        return s_q_opt

    # Memory-option similarity. cosine_sim_matrix(a, b) computes dot(b, a.T),
    # so with a=retrieved [K, d] and b=e_opts [4, d] the result is [4, K].
    s_mem_opt = cosine_sim_matrix(retrieved_embeddings, e_opts)
    if s_mem_opt.ndim == 1:
        # K == 1: cosine_sim_matrix collapses the trailing axis to [4].
        s_rag = s_mem_opt
    else:
        # Max pooling over memories ("does any memory support this option?")
        # tends to work better here than mean pooling.
        s_rag = np.max(s_mem_opt, axis=1)  # [4]

    return s_q_opt + s_rag
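
# A small worked example of the option-scoring rule above (illustrative only;
# the helper name and toy vectors are our assumptions). The query alone ties
# two options; a single retrieved memory breaks the tie.
def _score_rag_example() -> None:
    # Four orthogonal unit options in a 4-d toy space.
    e_opts = np.eye(4, dtype=np.float32)
    # Query is ambiguous between options 0 and 1 (cos = 1/sqrt(2) each).
    e_q = np.array([1.0, 1.0, 0.0, 0.0], dtype=np.float32)
    # One retrieved memory aligned with option 1.
    memories = np.array([[0.0, 1.0, 0.0, 0.0]], dtype=np.float32)

    scores = score_rag(e_q, e_opts, memories)
    # The memory term adds ~1.0 to option 1, so it wins the argmax.
    assert int(np.argmax(scores)) == 1
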
def main():
    # Paths
    q_path = "data/raw_datasets/personamem/questions_32k.csv"
    vec_path = "data/personamem/user_vectors.npz"
    cards_path = "data/personamem/memory_cards.jsonl"  # generated by the builder
    item_proj_path = "data/corpora/item_projection.npz"

    if not (os.path.exists(q_path) and os.path.exists(vec_path) and os.path.exists(cards_path)):
        print("Data missing. Run personamem_build_user_vectors.py first.")
        sys.exit(1)

    print("Loading resources...")
    cfg = load_local_models_config()
    embed_model = Qwen3Embedding8B.from_config(cfg)
    proj_data = np.load(item_proj_path)
    item_proj = ItemProjection(P=proj_data["P"], mean=proj_data["mean"])

    # Load user vectors.
    uv_data = np.load(vec_path, allow_pickle=True)
    user_ids = uv_data["user_ids"]
    Z = uv_data["Z"]
    user_vector_map = {uid: Z[i] for i, uid in enumerate(user_ids)}
    print(f"Loaded {len(user_vector_map)} user vectors.")

    # Load memory cards and embeddings, grouped per user. All cards are read
    # into memory to build per-user indices; the builder's small-sample log
    # reported only 321 extracted cards, so this is assumed to fit.
    print("Loading memory store...")
    cards_by_user = {}
    embs_by_user = {}
    vecs_by_user = {}  # projected item vectors v
    with open(cards_path, "r") as f:
        for line in f:
            card = MemoryCard.model_validate_json(line)
            uid = card.user_id
            if uid not in cards_by_user:
                cards_by_user[uid] = []
                embs_by_user[uid] = []
            cards_by_user[uid].append(card)
            embs_by_user[uid].append(card.embedding_e)

    # Convert lists to numpy arrays. The builder does not save the item
    # vectors V in a separate file, so project them from the embeddings here.
    for uid in embs_by_user:
        E = np.array(embs_by_user[uid], dtype=np.float32)
        embs_by_user[uid] = E
        vecs_by_user[uid] = item_proj.transform_embeddings(E)
    print(f"Loaded memories for {len(cards_by_user)} users.")

    # Load questions.
    questions = load_personamem_questions_32k(q_path)
    print(f"Loaded {len(questions)} questions.")

    # Hyperparameters. beta=0.0 is a sanity check: re-ranking by the base
    # score alone preserves the dense ordering, so with topk_dense >=
    # topk_final the retrieved set equals a plain dense top-5. Any gain at
    # beta > 0 is therefore attributable to the personalization bonus. The
    # no-RAG baseline is computed separately per question below.
    betas = [0.0, 1.0]

    for beta in betas:
        print(f"\nEvaluating with RAG (beta={beta})...")
        correct_base = []
        correct_ours = []
        case_count = 0

        for q in tqdm(questions):
            target_id = q.shared_context_id
            # Skip questions without a user vector or memories.
            if target_id not in user_vector_map or target_id not in embs_by_user:
                continue
            z_user = user_vector_map[target_id]
            mem_E = embs_by_user[target_id]
            mem_V = vecs_by_user[target_id]

            # Embed query.
            e_q = embed_model.encode([q.user_question_or_message], return_tensor=False)[0]
            e_q = np.array(e_q, dtype=np.float32)

            # Embed options.
            if not q.all_options:
                continue
            e_opts = embed_model.encode(q.all_options, return_tensor=False)
            e_opts = np.array(e_opts, dtype=np.float32)

            # --- BASE (no RAG): query-option similarity only. ---
            s_base = score_rag(e_q, e_opts, np.array([]))

            # --- OURS (personalized RAG). ---
            ours_idx = policy_retrieval(
                e_q, mem_E, mem_V, z_user,
                topk_dense=20, topk_final=5, beta=beta,
            )
            ours_mem_E = mem_E[ours_idx]
            s_ours = score_rag(e_q, e_opts, ours_mem_E)

            pred_base = int(np.argmax(s_base))
            pred_ours = int(np.argmax(s_ours))
            base_correct = (pred_base == q.correct_index)
            ours_correct = (pred_ours == q.correct_index)

            # Detailed case analysis (Task A): print only for beta > 0 (to
            # avoid duplicate logs) and only when the two predictions differ.
            if beta > 0.0 and pred_base != pred_ours and case_count < 5:
                case_count += 1
                # ours_idx indexes mem_E, which follows the list order of
                # cards_by_user[target_id], so the note text can be recovered
                # without an extra lookup map.
                user_cards = cards_by_user[target_id]
                retrieved_notes = [user_cards[i].note_text for i in ours_idx]
                print("\n" + "=" * 60)
                print(f"[CASE ANALYSIS] QID: {q.question_id}")
                print(f"User Question: {q.user_question_or_message}")
                print(f"Correct Option ({q.correct_index}): {q.all_options[q.correct_index]}")
                print("-" * 30)
                print(f"Base Pred ({pred_base}): {q.all_options[pred_base]} "
                      f"[{'CORRECT' if base_correct else 'WRONG'}]")
                print(f"Ours Pred ({pred_ours}): {q.all_options[pred_ours]} "
                      f"[{'CORRECT' if ours_correct else 'WRONG'}]")
                print("-" * 30)
                print(f"Retrieved Memories (Top 3 of {len(retrieved_notes)}):")
                for note in retrieved_notes[:3]:
                    print(f"  - {note}")
                print("-" * 30)
                print(f"Scores Base: {s_base}")
                print(f"Scores Ours: {s_ours}")
                print("=" * 60 + "\n")

            if q.correct_index != -1:
                correct_base.append(1 if base_correct else 0)
                correct_ours.append(1 if ours_correct else 0)

        if not correct_base:
            print("No valid evaluation samples processed.")
            continue

        acc_base = np.mean(correct_base)
        acc_ours = np.mean(correct_ours)
        # Win rate: fraction of samples where Ours is correct and Base is not.
        wins = [
            1 if (c_o == 1 and c_b == 0) else 0
            for c_o, c_b in zip(correct_ours, correct_base)
        ]
        win_rate = np.mean(wins)

        print(f"\n--- Results (Beta={beta}) ---")
        print(f"Total Samples: {len(correct_base)}")
        print(f"Accuracy (Base No-RAG): {acc_base:.4f}")
        print(f"Accuracy (Ours RAG): {acc_ours:.4f}")
        print(f"Win Rate: {win_rate:.4f}")


if __name__ == "__main__":
    main()