#!/usr/bin/env python3
"""
Evaluation script for the PersonaMem task: Base vs. Ours (User Vector).
Metric: accuracy (top-1 correct option).
"""
import os
import sys

import numpy as np
from tqdm import tqdm

# Add src to sys.path so the personalization package resolves.
sys.path.append(os.path.join(os.path.dirname(__file__), "../src"))

from personalization.config.settings import load_local_models_config
from personalization.data.personamem_loader import load_personamem_questions_32k
from personalization.models.embedding.qwen3_8b import Qwen3Embedding8B
from personalization.retrieval.preference_store.schemas import MemoryCard
from personalization.user_model.features import ItemProjection


def cosine_sim_matrix(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Cosine similarity between a ([d] or [N, d]) and b ([M, d]).

    Returns [M] when a is 1-D, otherwise [M, N].
    """
    norm_a = np.linalg.norm(a, axis=-1, keepdims=True)
    norm_b = np.linalg.norm(b, axis=-1, keepdims=True)
    # np.dot handles both cases: b [M, d] @ a [d] -> [M];
    # b [M, d] @ a.T [d, N] -> [M, N].
    dot = np.dot(b, a.T)
    denom = np.dot(norm_b, norm_a.T) + 1e-8
    sim = dot / denom
    # Collapse a trailing singleton axis so callers always get a flat
    # score vector for a single query.
    if sim.ndim == 2 and sim.shape[1] == 1:
        return sim.flatten()
    return sim


def dense_retrieval(
    e_q: np.ndarray, memory_embeddings: np.ndarray, topk: int = 5
) -> np.ndarray:
    """Return the indices of the topk memories most similar to the query."""
    if memory_embeddings.shape[0] == 0:
        return np.array([], dtype=int)
    sims = cosine_sim_matrix(e_q, memory_embeddings)
    k = min(topk, len(sims))
    return np.argsort(sims)[-k:][::-1]


def policy_retrieval(
    e_q: np.ndarray,
    memory_embeddings: np.ndarray,
    item_vectors: np.ndarray,
    z_user: np.ndarray,
    topk_dense: int = 20,
    topk_final: int = 5,
    beta: float = 0.2,
) -> np.ndarray:
    """Simulates retrieve_with_policy:

    1. Dense retrieval of topk_dense candidates.
    2. Policy scoring: s = s_base + beta * (z . v_m).
    3. Select topk_final.
    """
    if memory_embeddings.shape[0] == 0:
        return np.array([], dtype=int)

    # 1. Dense candidate generation.
    dense_idx = dense_retrieval(e_q, memory_embeddings, topk=topk_dense)
    if len(dense_idx) == 0:
        return np.array([], dtype=int)
    candidates_e = memory_embeddings[dense_idx]
    candidates_v = item_vectors[dense_idx]

    # 2. Base scores: Sim(q, m).
    base_scores = cosine_sim_matrix(e_q, candidates_e)

    # 3. Policy bonus: z [k] . v_m, batched as [K, k] @ [k] -> [K].
    bonus = np.dot(candidates_v, z_user)
    total_scores = base_scores + beta * bonus

    # 4. Final selection, mapped back to global memory indices.
    k = min(topk_final, len(total_scores))
    local_top_idx = np.argsort(total_scores)[-k:][::-1]
    return dense_idx[local_top_idx]
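
# A minimal sanity check for the two retrieval paths above (illustrative only,
# not part of the evaluation pipeline; the helper name, toy dimensions, and
# seed are our assumptions). With beta=0 the policy score equals the dense
# similarity, so policy_retrieval should return the same top-k as
# dense_retrieval whenever topk_dense >= topk_final; with beta > 0 the order
# may change but results stay within the dense candidate pool.
def _retrieval_sanity_check() -> None:
    rng = np.random.default_rng(0)
    d, k, n_mem = 8, 4, 50  # toy embedding dim, item dim, memory count
    e_q = rng.normal(size=d).astype(np.float32)
    mem_E = rng.normal(size=(n_mem, d)).astype(np.float32)
    mem_V = rng.normal(size=(n_mem, k)).astype(np.float32)
    z = rng.normal(size=k).astype(np.float32)

    dense_top = dense_retrieval(e_q, mem_E, topk=5)
    policy_top = policy_retrieval(
        e_q, mem_E, mem_V, z, topk_dense=20, topk_final=5, beta=0.0
    )
    assert np.array_equal(dense_top, policy_top), "beta=0 should reduce to dense"

    # With a nonzero bonus, outputs are a re-ranking of the dense pool.
    policy_beta = policy_retrieval(
        e_q, mem_E, mem_V, z, topk_dense=20, topk_final=5, beta=1.0
    )
    dense_pool = set(dense_retrieval(e_q, mem_E, topk=20).tolist())
    assert set(policy_beta.tolist()) <= dense_pool
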
def score_rag(
    e_q: np.ndarray, e_opts: np.ndarray, retrieved_embeddings: np.ndarray
) -> np.ndarray:
    """Score each option by query match AND memory match.

    Score = Sim(Q, Opt) + Max_over_memories(Sim(Memory, Opt))
    """
    # Query-option similarity: e_q [d] vs e_opts [4, d] -> [4].
    s_q_opt = cosine_sim_matrix(e_q, e_opts)
    if len(retrieved_embeddings) == 0:
        return s_q_opt

    # Memory-option similarity. cosine_sim_matrix(a, b) computes dot(b, a.T),
    # so with a=retrieved [K, d] and b=e_opts [4, d] the result is [4, K].
    s_mem_opt = cosine_sim_matrix(retrieved_embeddings, e_opts)
    if s_mem_opt.ndim == 1:
        # K == 1: cosine_sim_matrix collapses the trailing axis to [4].
        s_rag = s_mem_opt
    else:
        # Max pooling over memories ("does any memory support this option?")
        # tends to work better here than mean pooling.
        s_rag = np.max(s_mem_opt, axis=1)  # [4]

    return s_q_opt + s_rag
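
# A small worked example of the option-scoring rule above (illustrative only;
# the helper name and toy vectors are our assumptions). The query alone ties
# two options; a single retrieved memory breaks the tie.
def _score_rag_example() -> None:
    # Four orthogonal unit options in a 4-d toy space.
    e_opts = np.eye(4, dtype=np.float32)
    # Query is ambiguous between options 0 and 1 (cos = 1/sqrt(2) each).
    e_q = np.array([1.0, 1.0, 0.0, 0.0], dtype=np.float32)
    # One retrieved memory aligned with option 1.
    memories = np.array([[0.0, 1.0, 0.0, 0.0]], dtype=np.float32)

    scores = score_rag(e_q, e_opts, memories)
    # The memory term adds ~1.0 to option 1, so it wins the argmax.
    assert int(np.argmax(scores)) == 1
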
def main():
    # Paths
    q_path = "data/raw_datasets/personamem/questions_32k.csv"
    vec_path = "data/personamem/user_vectors.npz"
    cards_path = "data/personamem/memory_cards.jsonl"  # generated by the builder
    item_proj_path = "data/corpora/item_projection.npz"

    if not (os.path.exists(q_path) and os.path.exists(vec_path) and os.path.exists(cards_path)):
        print("Data missing. Run personamem_build_user_vectors.py first.")
        sys.exit(1)

    print("Loading resources...")
    cfg = load_local_models_config()
    embed_model = Qwen3Embedding8B.from_config(cfg)
    proj_data = np.load(item_proj_path)
    item_proj = ItemProjection(P=proj_data["P"], mean=proj_data["mean"])

    # Load user vectors.
    uv_data = np.load(vec_path, allow_pickle=True)
    user_ids = uv_data["user_ids"]
    Z = uv_data["Z"]
    user_vector_map = {uid: Z[i] for i, uid in enumerate(user_ids)}
    print(f"Loaded {len(user_vector_map)} user vectors.")

    # Load memory cards and embeddings, grouped per user. All cards are read
    # into memory to build per-user indices; the builder's small-sample log
    # reported only 321 extracted cards, so this is assumed to fit.
    print("Loading memory store...")
    cards_by_user = {}
    embs_by_user = {}
    vecs_by_user = {}  # projected item vectors v
    with open(cards_path, "r") as f:
        for line in f:
            card = MemoryCard.model_validate_json(line)
            uid = card.user_id
            if uid not in cards_by_user:
                cards_by_user[uid] = []
                embs_by_user[uid] = []
            cards_by_user[uid].append(card)
            embs_by_user[uid].append(card.embedding_e)

    # Convert lists to numpy arrays. The builder does not save the item
    # vectors V in a separate file, so project them from the embeddings here.
    for uid in embs_by_user:
        E = np.array(embs_by_user[uid], dtype=np.float32)
        embs_by_user[uid] = E
        vecs_by_user[uid] = item_proj.transform_embeddings(E)
    print(f"Loaded memories for {len(cards_by_user)} users.")

    # Load questions.
    questions = load_personamem_questions_32k(q_path)
    print(f"Loaded {len(questions)} questions.")

    # Hyperparameters. beta=0.0 is a sanity check: re-ranking by the base
    # score alone preserves the dense ordering, so with topk_dense >=
    # topk_final the retrieved set equals a plain dense top-5. Any gain at
    # beta > 0 is therefore attributable to the personalization bonus. The
    # no-RAG baseline is computed separately per question below.
    betas = [0.0, 1.0]

    for beta in betas:
        print(f"\nEvaluating with RAG (beta={beta})...")
        correct_base = []
        correct_ours = []
        case_count = 0

        for q in tqdm(questions):
            target_id = q.shared_context_id
            # Skip questions without a user vector or memories.
            if target_id not in user_vector_map or target_id not in embs_by_user:
                continue
            z_user = user_vector_map[target_id]
            mem_E = embs_by_user[target_id]
            mem_V = vecs_by_user[target_id]

            # Embed query.
            e_q = embed_model.encode([q.user_question_or_message], return_tensor=False)[0]
            e_q = np.array(e_q, dtype=np.float32)

            # Embed options.
            if not q.all_options:
                continue
            e_opts = embed_model.encode(q.all_options, return_tensor=False)
            e_opts = np.array(e_opts, dtype=np.float32)

            # --- BASE (no RAG): query-option similarity only. ---
            s_base = score_rag(e_q, e_opts, np.array([]))

            # --- OURS (personalized RAG). ---
            ours_idx = policy_retrieval(
                e_q, mem_E, mem_V, z_user,
                topk_dense=20, topk_final=5, beta=beta,
            )
            ours_mem_E = mem_E[ours_idx]
            s_ours = score_rag(e_q, e_opts, ours_mem_E)

            pred_base = int(np.argmax(s_base))
            pred_ours = int(np.argmax(s_ours))
            base_correct = (pred_base == q.correct_index)
            ours_correct = (pred_ours == q.correct_index)

            # Detailed case analysis (Task A): print only for beta > 0 (to
            # avoid duplicate logs) and only when the two predictions differ.
            if beta > 0.0 and pred_base != pred_ours and case_count < 5:
                case_count += 1
                # ours_idx indexes mem_E, which follows the list order of
                # cards_by_user[target_id], so the note text can be recovered
                # without an extra lookup map.
                user_cards = cards_by_user[target_id]
                retrieved_notes = [user_cards[i].note_text for i in ours_idx]
                print("\n" + "=" * 60)
                print(f"[CASE ANALYSIS] QID: {q.question_id}")
                print(f"User Question: {q.user_question_or_message}")
                print(f"Correct Option ({q.correct_index}): {q.all_options[q.correct_index]}")
                print("-" * 30)
                print(f"Base Pred ({pred_base}): {q.all_options[pred_base]} "
                      f"[{'CORRECT' if base_correct else 'WRONG'}]")
                print(f"Ours Pred ({pred_ours}): {q.all_options[pred_ours]} "
                      f"[{'CORRECT' if ours_correct else 'WRONG'}]")
                print("-" * 30)
                print(f"Retrieved Memories (Top 3 of {len(retrieved_notes)}):")
                for note in retrieved_notes[:3]:
                    print(f"  - {note}")
                print("-" * 30)
                print(f"Scores Base: {s_base}")
                print(f"Scores Ours: {s_ours}")
                print("=" * 60 + "\n")

            if q.correct_index != -1:
                correct_base.append(1 if base_correct else 0)
                correct_ours.append(1 if ours_correct else 0)

        if not correct_base:
            print("No valid evaluation samples processed.")
            continue

        acc_base = np.mean(correct_base)
        acc_ours = np.mean(correct_ours)
        # Win rate: fraction of samples where Ours is correct and Base is not.
        wins = [
            1 if (c_o == 1 and c_b == 0) else 0
            for c_o, c_b in zip(correct_ours, correct_base)
        ]
        win_rate = np.mean(wins)

        print(f"\n--- Results (Beta={beta}) ---")
        print(f"Total Samples: {len(correct_base)}")
        print(f"Accuracy (Base No-RAG): {acc_base:.4f}")
        print(f"Accuracy (Ours RAG): {acc_ours:.4f}")
        print(f"Win Rate: {win_rate:.4f}")


if __name__ == "__main__":
    main()