scripts/personamem_eval_base_vs_ours.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299

#!/usr/bin/env python3
"""
Evaluation script for PersonaMem task: Base vs Ours (User Vector).
Metric: Accuracy (Top-1 correct option)
"""

import sys
import os
import numpy as np
from tqdm import tqdm
from typing import List

# Add src to sys.path
sys.path.append(os.path.join(os.path.dirname(__file__), "../src"))

from personalization.config.settings import load_local_models_config
from personalization.models.embedding.qwen3_8b import Qwen3Embedding8B
from personalization.data.personamem_loader import load_personamem_questions_32k
from personalization.user_model.features import ItemProjection
from personalization.retrieval.preference_store.schemas import MemoryCard
from personalization.user_model.tensor_store import UserState

def cosine_sim_matrix(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    # a: [d] or [N, d], b: [d] or [M, d]
    norm_a = np.linalg.norm(a, axis=-1, keepdims=True)
    norm_b = np.linalg.norm(b, axis=-1, keepdims=True)
    
    # Ensure correct shapes for broadcasting
    # If a is 1D [d], dot gives 1D. If a is 2D [N, d], dot gives 2D.
    # b is typically [M, d] (memories or options)
    
    dot = np.dot(b, a.T)
    denom = np.dot(norm_b, norm_a.T) + 1e-8
    
    # Flatten if needed
    sim = dot / denom
    if sim.ndim == 2 and sim.shape[1] == 1:
        return sim.flatten()
    return sim

def dense_retrieval(
    e_q: np.ndarray, 
    memory_embeddings: np.ndarray, 
    topk: int = 5
) -> np.ndarray:
    """Returns topk indices of memories most similar to query."""
    if memory_embeddings.shape[0] == 0:
        return np.array([], dtype=int)
        
    sims = cosine_sim_matrix(e_q, memory_embeddings)
    # Get topk
    k = min(topk, len(sims))
    idx = np.argsort(sims)[-k:][::-1]
    return idx

def policy_retrieval(
    e_q: np.ndarray,
    memory_embeddings: np.ndarray,
    item_vectors: np.ndarray,
    z_user: np.ndarray,
    item_proj: ItemProjection,
    topk_dense: int = 20,
    topk_final: int = 5,
    beta: float = 0.2
) -> np.ndarray:
    """
    Simulates retrieve_with_policy:
    1. Dense retrieval (topk_dense)
    2. Policy Scoring: s = s_base + beta * (z . v_m)
    3. Select topk_final
    """
    if memory_embeddings.shape[0] == 0:
        return np.array([], dtype=int)

    # 1. Dense Candidate Generation
    dense_idx = dense_retrieval(e_q, memory_embeddings, topk=topk_dense)
    if len(dense_idx) == 0:
        return np.array([], dtype=int)
        
    candidates_e = memory_embeddings[dense_idx]
    candidates_v = item_vectors[dense_idx]
    
    # 2. Base Scores (Sim(q, m))
    base_scores = cosine_sim_matrix(e_q, candidates_e)
    
    # 3. Policy Bonus
    # z: [k], v: [K, k]
    bonus = np.dot(candidates_v, z_user)
    
    total_scores = base_scores + beta * bonus
    
    # 4. Final Selection
    k = min(topk_final, len(total_scores))
    local_top_idx = np.argsort(total_scores)[-k:][::-1]
    
    # Map back to global indices
    return dense_idx[local_top_idx]

def score_rag(
    e_q: np.ndarray,
    e_opts: np.ndarray,
    retrieved_embeddings: np.ndarray
) -> np.ndarray:
    """
    Computes score for each option based on query match AND memory match.
    Score = Sim(Q, Opt) + Mean(Sim(Memories, Opt))
    """
    # Base: Query-Option similarity
    # e_q: [d], e_opts: [4, d] -> [4]
    s_q_opt = cosine_sim_matrix(e_q, e_opts)
    
    if len(retrieved_embeddings) == 0:
        return s_q_opt
        
    # Memory-Option similarity
    # retrieved: [K, d], e_opts: [4, d]
    # We want for each option, the average similarity to retrieved memories.
    # sim_matrix: [4, K] (options x memories) - check implementation of cosine_sim_matrix
    # cosine_sim_matrix(a, b) does dot(b, a.T). 
    # Let a=retrieved [K, d], b=e_opts [4, d]. Result: [4, K]
    
    s_mem_opt = cosine_sim_matrix(retrieved_embeddings, e_opts)
    
    # Max pooling or Mean pooling over memories
    # Usually max is better for "if any memory supports this option"
    s_rag = np.max(s_mem_opt, axis=1) # [4]
    
    # Combine
    return s_q_opt + s_rag

def main():
    # Paths
    q_path = "data/raw_datasets/personamem/questions_32k.csv"
    vec_path = "data/personamem/user_vectors.npz"
    cards_path = "data/personamem/memory_cards.jsonl" # Generated by builder
    item_proj_path = "data/corpora/item_projection.npz"
    
    if not os.path.exists(q_path) or not os.path.exists(vec_path) or not os.path.exists(cards_path):
        print("Data missing. Run personamem_build_user_vectors.py first.")
        sys.exit(1)
        
    print("Loading resources...")
    cfg = load_local_models_config()
    embed_model = Qwen3Embedding8B.from_config(cfg)
    
    proj_data = np.load(item_proj_path)
    item_proj = ItemProjection(P=proj_data["P"], mean=proj_data["mean"])
    
    # Load User Vectors
    uv_data = np.load(vec_path, allow_pickle=True)
    user_ids = uv_data["user_ids"]
    Z = uv_data["Z"]
    user_vector_map = {uid: Z[i] for i, uid in enumerate(user_ids)}
    print(f"Loaded {len(user_vector_map)} user vectors.")
    
    # Load Memory Cards & Embeddings
    print("Loading memory store...")
    cards_by_user = {}
    embs_by_user = {}
    vecs_by_user = {} # v vectors
    
    # We need to load all cards to build per-user indices
    # This might be slow if file is huge, but 32k dataset usually produces ~100k cards?
    # Builder output: "Extracted 321 memory cards" (from small sample log). 
    # Let's assume it fits in memory.
    
    with open(cards_path, "r") as f:
        for line in f:
            card = MemoryCard.model_validate_json(line)
            uid = card.user_id
            if uid not in cards_by_user:
                cards_by_user[uid] = []
                embs_by_user[uid] = []
                
            cards_by_user[uid].append(card)
            embs_by_user[uid].append(card.embedding_e)
            
    # Convert lists to numpy arrays
    for uid in embs_by_user:
        E = np.array(embs_by_user[uid], dtype=np.float32)
        embs_by_user[uid] = E
        # Compute V on the fly or load if saved (builder didn't save V in separate file, but we can project)
        vecs_by_user[uid] = item_proj.transform_embeddings(E)

    print(f"Loaded memories for {len(cards_by_user)} users.")
    
    # Load Questions
    questions = load_personamem_questions_32k(q_path)
    print(f"Loaded {len(questions)} questions.")
    
    correct_base = []
    correct_ours = []
    
    # Hyperparams
    betas = [0.0, 1.0] # Sanity check: 0.0 should match Base RAG (if Ours RAG logic aligns when beta=0, wait, Ours RAG uses Policy Retrieval. Beta=0 in Policy Retrieval means Dense Retrieval order. So Ours RAG (beta=0) == Base RAG? Ideally yes, if topk_dense is large enough to contain base_topk)
    
    # Actually, Ours RAG pipeline:
    # 1. Dense (top20) -> 2. Re-rank (Base + beta*Bonus) -> 3. Top5
    # Base RAG pipeline:
    # 1. Dense (top5)
    
    # If beta=0, Ours RAG re-ranking is based on Base Score (Sim(q,m)). 
    # Since Dense Retrieval already sorts by Sim(q,m), re-ranking by Sim(q,m) keeps order.
    # So if topk_dense >= topk_final, Ours (beta=0) should pick same top-5 as Base RAG.
    
    for beta in betas:
        print(f"\nEvaluating with RAG (beta={beta})...")
        
        correct_base = []
        correct_ours = []
        
        case_count = 0
        
        for q in tqdm(questions):
            target_id = q.shared_context_id
            
            # Skip if no vector or no memories
            if target_id not in user_vector_map or target_id not in embs_by_user:
                continue
                
            z_user = user_vector_map[target_id]
            mem_E = embs_by_user[target_id]
            mem_V = vecs_by_user[target_id]
            
            # Embed Query
            e_q = embed_model.encode([q.user_question_or_message], return_tensor=False)[0]
            e_q = np.array(e_q, dtype=np.float32)
            
            # Embed Options
            if not q.all_options:
                continue
            e_opts = embed_model.encode(q.all_options, return_tensor=False)
            e_opts = np.array(e_opts, dtype=np.float32)
            
            # --- BASE (No RAG) ---
            s_base = score_rag(e_q, e_opts, np.array([]))
            
            # --- OURS (Personalized RAG) ---
            ours_idx = policy_retrieval(e_q, mem_E, mem_V, z_user, item_proj, topk_dense=20, topk_final=5, beta=beta)
            ours_mem_E = mem_E[ours_idx]
            s_ours = score_rag(e_q, e_opts, ours_mem_E)
            
            pred_base = int(np.argmax(s_base))
            pred_ours = int(np.argmax(s_ours))
            
            is_correct = (pred_ours == q.correct_index)
            base_correct = (pred_base == q.correct_index)
            
            # Detailed Case Print (Task A)
            # Print only when Beta > 0 (to avoid duplicate logs) and when they disagree
            if beta > 0.0 and pred_base != pred_ours and case_count < 5:
                case_count += 1
                
                # Reconstruct memory text (need to find card in list)
                # Optimization: Create a map or just linear search in cards_by_user[target_id]
                user_cards = cards_by_user[target_id]
                # ours_idx are indices into mem_E which corresponds to cards_by_user list order
                retrieved_notes = [user_cards[i].note_text for i in ours_idx]
                
                print(f"\n" + "="*60)
                print(f"[CASE ANALYSIS] QID: {q.question_id}")
                print(f"User Question: {q.user_question_or_message}")
                print(f"Correct Option ({q.correct_index}): {q.all_options[q.correct_index]}")
                print("-" * 30)
                print(f"Base Pred ({pred_base}): {q.all_options[pred_base]} [{'CORRECT' if base_correct else 'WRONG'}]")
                print(f"Ours Pred ({pred_ours}): {q.all_options[pred_ours]} [{'CORRECT' if is_correct else 'WRONG'}]")
                print("-" * 30)
                print(f"Retrieved Memories (Top 3 of {len(retrieved_notes)}):")
                for note in retrieved_notes[:3]:
                    print(f"  - {note}")
                print("-" * 30)
                print(f"Scores Base: {s_base}")
                print(f"Scores Ours: {s_ours}")
                print("="*60 + "\n")

            if q.correct_index != -1:
                correct_base.append(1 if pred_base == q.correct_index else 0)
                correct_ours.append(1 if pred_ours == q.correct_index else 0)
                
        if not correct_base:
            print("No valid evaluation samples processed.")
            continue
            
        acc_base = np.mean(correct_base)
        acc_ours = np.mean(correct_ours)
        
        # Win rate
        wins = [1 if (c_o == 1 and c_b == 0) else 0 for c_o, c_b in zip(correct_ours, correct_base)]
        win_rate = np.mean(wins)
        
        print(f"\n--- Results (Beta={beta}) ---")
        print(f"Total Samples: {len(correct_base)}")
        print(f"Accuracy (Base No-RAG): {acc_base:.4f}")
        print(f"Accuracy (Ours RAG):    {acc_ours:.4f}")
        print(f"Win Rate: {win_rate:.4f}")

if __name__ == "__main__":
    main()