Diffstat (limited to 'src')
-rw-r--r--  src/personalization/__init__.py | 0
-rw-r--r--  src/personalization/config/__init__.py | 0
-rw-r--r--  src/personalization/config/registry.py | 131
-rw-r--r--  src/personalization/config/settings.py | 73
-rw-r--r--  src/personalization/data/personamem_loader.py | 84
-rw-r--r--  src/personalization/evaluation/__init__.py | 0
-rw-r--r--  src/personalization/evaluation/compare_pairs.py | 0
-rw-r--r--  src/personalization/evaluation/metrics.py | 0
-rw-r--r--  src/personalization/feedback/__init__.py | 0
-rw-r--r--  src/personalization/feedback/gating.py | 72
-rw-r--r--  src/personalization/feedback/handlers.py | 50
-rw-r--r--  src/personalization/feedback/online_update.py | 0
-rw-r--r--  src/personalization/feedback/reward_model.py | 64
-rw-r--r--  src/personalization/feedback/sampler.py | 109
-rw-r--r--  src/personalization/feedback/schemas.py | 23
-rw-r--r--  src/personalization/retrieval/__init__.py | 0
-rw-r--r--  src/personalization/retrieval/chunking/__init__.py | 0
-rw-r--r--  src/personalization/retrieval/chunking/rules.py | 0
-rw-r--r--  src/personalization/retrieval/pipeline.py | 250
-rw-r--r--  src/personalization/retrieval/preference_store/__init__.py | 0
-rw-r--r--  src/personalization/retrieval/preference_store/base.py | 0
-rw-r--r--  src/personalization/retrieval/preference_store/schemas.py | 47
-rw-r--r--  src/personalization/retrieval/preference_store/vector_kv.py | 0
-rw-r--r--  src/personalization/retrieval/rerank.py | 0
-rw-r--r--  src/personalization/retrieval/store/__init__.py | 0
-rw-r--r--  src/personalization/retrieval/store/base.py | 0
-rw-r--r--  src/personalization/retrieval/store/faiss_store.py | 0
-rw-r--r--  src/personalization/retrieval/store/pgvector_store.py | 0
-rw-r--r--  src/personalization/serving/__init__.py | 22
-rw-r--r--  src/personalization/serving/api/__init__.py | 0
-rw-r--r--  src/personalization/serving/api/main.py | 0
-rw-r--r--  src/personalization/serving/api/routes/__init__.py | 0
-rw-r--r--  src/personalization/serving/api/routes/feedback.py | 0
-rw-r--r--  src/personalization/serving/api/routes/query.py | 0
-rw-r--r--  src/personalization/serving/api/routes/users.py | 0
-rw-r--r--  src/personalization/serving/api/schemas.py | 0
-rw-r--r--  src/personalization/serving/personalized_llm.py | 837
-rw-r--r--  src/personalization/types.py | 4
-rw-r--r--  src/personalization/user_model/__init__.py | 0
-rw-r--r--  src/personalization/user_model/features.py | 49
-rw-r--r--  src/personalization/user_model/policy/__init__.py | 0
-rw-r--r--  src/personalization/user_model/policy/optimizer.py | 0
-rw-r--r--  src/personalization/user_model/policy/reinforce.py | 104
-rw-r--r--  src/personalization/user_model/scoring.py | 25
-rw-r--r--  src/personalization/user_model/session_state.py | 19
-rw-r--r--  src/personalization/user_model/tensor_store.py | 80
-rw-r--r--  src/personalization/utils/__init__.py | 0
-rw-r--r--  src/personalization/utils/ids.py | 0
-rw-r--r--  src/personalization/utils/io.py | 0
-rw-r--r--  src/personalization/utils/logging.py | 0
-rw-r--r--  src/personalization/utils/timing.py | 0
51 files changed, 2043 insertions, 0 deletions
diff --git a/src/personalization/__init__.py b/src/personalization/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/__init__.py
diff --git a/src/personalization/config/__init__.py b/src/personalization/config/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/config/__init__.py
diff --git a/src/personalization/config/registry.py b/src/personalization/config/registry.py
new file mode 100644
index 0000000..d825ad3
--- /dev/null
+++ b/src/personalization/config/registry.py
@@ -0,0 +1,131 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict, Optional
+import torch
+import yaml
+
+from personalization.config import settings
+
+# Avoid circular imports by NOT importing extractors here at top level
+# from personalization.models.preference_extractor.base import PreferenceExtractorBase
+# from personalization.models.preference_extractor.rule_extractor import QwenRuleExtractor
+# from personalization.models.preference_extractor.gpt4o_extractor import GPT4OExtractor
+# from personalization.models.preference_extractor.llm_extractor import PreferenceExtractorLLM
+
+_DTYPE_MAP: Dict[str, torch.dtype] = {
+ "bfloat16": torch.bfloat16,
+ "float16": torch.float16,
+ "float32": torch.float32,
+}
+
+def choose_dtype(preferred: Optional[str] = None) -> torch.dtype:
+ if preferred and preferred.lower() in _DTYPE_MAP:
+ dt = _DTYPE_MAP[preferred.lower()]
+ else:
+ dt = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+ if dt is torch.bfloat16 and not torch.cuda.is_available():
+ return torch.float32
+ return dt
+
+def choose_device_map(spec: Optional[str] = "auto") -> Any:
+ return spec or "auto"
+
+def ensure_local_path(path_str: str) -> str:
+ path = Path(path_str)
+ if not path.exists():
+ path.mkdir(parents=True, exist_ok=True)
+ return str(path)
+
+# --- Chat Model Factory ---
+def get_chat_model(name: str, device_override: Optional[str] = None):
+ """
+ Get a chat model by name.
+
+ Args:
+ name: Model name (e.g., "qwen_1_5b", "llama_8b")
+ device_override: Optional device override (e.g., "cuda:2"). If None, uses config default.
+ """
+ from personalization.models.llm.base import ChatModel
+ from personalization.models.llm.qwen_instruct import QwenInstruct
+ from personalization.models.llm.llama_instruct import LlamaChatModel
+
+ cfg = settings.load_local_models_config()
+
+ # Try to load raw config to support multi-backend map
+ with open("configs/local_models.yaml", "r") as f:
+ raw_cfg = yaml.safe_load(f)
+
+ models = raw_cfg.get("models", {}).get("llm", {})
+
+ # If models['llm'] is a dict of configs (new style)
+ if isinstance(models, dict) and "backend" in models.get(name, {}):
+ spec = models[name]
+ backend = spec.get("backend", "qwen")
+ path = spec["path"]
+ device = device_override or spec.get("device", "cuda") # Use override if provided
+ dtype = spec.get("dtype", "bfloat16")
+ max_len = spec.get("max_context_length", 4096)
+
+ if backend == "qwen":
+ return QwenInstruct(
+ model_path=path,
+ device=device,
+ dtype=choose_dtype(dtype), # Converts string to torch.dtype
+ max_context_length=max_len
+ )
+ elif backend == "llama":
+ return LlamaChatModel(
+ model_path=path,
+ device=device,
+ dtype=choose_dtype(dtype), # Converts string to torch.dtype
+ max_context_length=max_len
+ )
+
+ # Fallback to legacy single config
+ return QwenInstruct.from_config(cfg)
+
+def get_preference_extractor(name: Optional[str] = None):
+ # Deferred imports to break circular dependency
+ from personalization.models.preference_extractor.rule_extractor import QwenRuleExtractor
+ from personalization.models.preference_extractor.gpt4o_extractor import GPT4OExtractor
+ from personalization.models.preference_extractor.llm_extractor import PreferenceExtractorLLM
+
+ cfg = settings.load_local_models_config()
+ pref_cfg = cfg.preference_extractor
+
+ if name is None:
+ if isinstance(pref_cfg, dict) and "qwen3_0_6b_sft" in pref_cfg:
+ name = "qwen3_0_6b_sft"
+ else:
+ name = "rule"
+
+ if isinstance(pref_cfg, dict) and name in pref_cfg:
+ spec = pref_cfg[name]
+ if name == "qwen3_0_6b_sft":
+ # Use QwenRuleExtractor which we have updated for SFT End-to-End logic
+ return QwenRuleExtractor(
+ model_path=spec["path"],
+ device_map=spec.get("device", "auto"),
+ dtype=choose_dtype(spec.get("dtype", "bfloat16")),
+ )
+        # TODO: handle name == "default" explicitly here (map it to the rule/GPT
+        # extractor); for now it falls through to the branches below.
+
+ if name == "gpt4o":
+ return GPT4OExtractor.from_config(cfg)
+ elif name == "rule":
+ if isinstance(pref_cfg, dict):
+ if "default" in pref_cfg:
+ # Manually construct to bypass ModelSpec mismatch if needed
+ spec_dict = pref_cfg["default"]
+ return QwenRuleExtractor(
+ model_path=spec_dict["local_path"],
+ dtype=choose_dtype(spec_dict.get("dtype")),
+ device_map=choose_device_map(spec_dict.get("device_map"))
+ )
+ else:
+ return QwenRuleExtractor.from_config(cfg)
+
+ raise ValueError(f"Could not load preference extractor: {name}")
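+
+# Illustrative usage of the factories above (a sketch; "qwen_1_5b" and
+# "qwen3_0_6b_sft" must exist as entries in configs/local_models.yaml, and the
+# device is only an example):
+#
+#     chat_model = get_chat_model("qwen_1_5b", device_override="cuda:0")
+#     extractor = get_preference_extractor("qwen3_0_6b_sft")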
diff --git a/src/personalization/config/settings.py b/src/personalization/config/settings.py
new file mode 100644
index 0000000..1bb1bbe
--- /dev/null
+++ b/src/personalization/config/settings.py
@@ -0,0 +1,73 @@
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from typing import Optional, Any, Dict
+
+import yaml
+from pydantic import BaseModel, Field
+
+
+class ModelSpec(BaseModel):
+ hf_id: str = Field(..., description="Hugging Face repository id")
+ local_path: str = Field(..., description="Local directory for model weights")
+ dtype: Optional[str] = Field(
+ default="bfloat16", description="Preferred torch dtype: bfloat16|float16|float32"
+ )
+ device_map: Optional[str] = Field(default="auto", description="Device map policy")
+
+
+class EmbeddingModelsConfig(BaseModel):
+ qwen3: Optional[ModelSpec] = None
+ nemotron: Optional[ModelSpec] = None
+
+
+class RerankerModelsConfig(BaseModel):
+ qwen3_8b: Optional[ModelSpec] = None
+
+
+class LocalModelsConfig(BaseModel):
+ llm: ModelSpec
+ preference_extractor: Any # Allow flexible dict or ModelSpec for now to support map
+ embedding: Optional[EmbeddingModelsConfig] = None
+ reranker: Optional[RerankerModelsConfig] = None
+
+
+def _resolve_config_path(env_key: str, default_rel: str) -> Path:
+ value = os.getenv(env_key)
+ if value:
+ return Path(value).expanduser().resolve()
+ return (Path.cwd() / default_rel).resolve()
+
+
+def load_local_models_config(path: Optional[str] = None) -> LocalModelsConfig:
+ config_path = Path(path) if path else _resolve_config_path(
+ "LOCAL_MODELS_CONFIG", "configs/local_models.yaml"
+ )
+ with open(config_path, "r", encoding="utf-8") as f:
+ raw = yaml.safe_load(f) or {}
+ models = raw.get("models", {})
+ embedding_cfg = None
+ if "embedding" in models:
+ emb = models["embedding"] or {}
+ # dtype/device_map are not necessary for embedders; ModelSpec still accepts them
+ embedding_cfg = EmbeddingModelsConfig(
+ qwen3=ModelSpec(**emb["qwen3"]) if "qwen3" in emb else None,
+ nemotron=ModelSpec(**emb["nemotron"]) if "nemotron" in emb else None,
+ )
+
+ reranker_cfg = None
+ if "reranker" in models:
+ rer = models["reranker"] or {}
+ reranker_cfg = RerankerModelsConfig(
+ qwen3_8b=ModelSpec(**rer["qwen3_8b"]) if "qwen3_8b" in rer else None
+ )
+
+ return LocalModelsConfig(
+ llm=ModelSpec(**models["llm"]),
+ preference_extractor=models["preference_extractor"], # Pass raw dict/value
+ embedding=embedding_cfg,
+ reranker=reranker_cfg,
+ )
+
+
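+# Illustrative shape of the parsed configs/local_models.yaml this loader expects
+# (a sketch; all values are placeholders, not the project's actual paths):
+#
+#     {
+#         "models": {
+#             "llm": {"hf_id": "org/chat-model", "local_path": "/models/llm"},
+#             "preference_extractor": {"default": {"local_path": "/models/extractor"}},
+#             "embedding": {"qwen3": {"hf_id": "org/embed-model", "local_path": "/models/embed"}},
+#             "reranker": {"qwen3_8b": {"hf_id": "org/reranker", "local_path": "/models/rerank"}},
+#         }
+#     }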
diff --git a/src/personalization/data/personamem_loader.py b/src/personalization/data/personamem_loader.py
new file mode 100644
index 0000000..3b516ad
--- /dev/null
+++ b/src/personalization/data/personamem_loader.py
@@ -0,0 +1,84 @@
+from __future__ import annotations
+
+import csv
+import json
+from dataclasses import dataclass
+from typing import Dict, List
+
+@dataclass
+class PersonaMemQuestion:
+ persona_id: str
+ question_id: str
+ question_type: str
+ topic: str
+ user_question_or_message: str
+ all_options: List[str] # 4 options
+ correct_index: int # 0..3
+ shared_context_id: str
+ end_index_in_shared_context: int
+
+@dataclass
+class PersonaMemContext:
+ shared_context_id: str
+ messages: List[dict] # raw dicts with "role"/"content" etc
+
+def load_personamem_questions_32k(path_csv: str) -> List[PersonaMemQuestion]:
+ questions = []
+ with open(path_csv, "r", encoding="utf-8") as f:
+ reader = csv.DictReader(f)
+ for row in reader:
+            # Expected columns (per the fields read below): question_id, persona_id,
+            # question_type, topic, user_question_or_message (or question), all_options,
+            # correct_answer, shared_context_id, end_index_in_shared_context.
+            try:
+                options_str = row.get("all_options", "[]")  # expected to be a JSON list
+                try:
+                    options = json.loads(options_str)
+                except (ValueError, TypeError):
+                    # Not valid JSON (e.g. a Python repr string); fall back to no options.
+                    options = []
+
+ # Handle raw answer format (e.g. "(c)" or "c")
+ raw_ans = row.get("correct_answer", "").strip()
+ # Remove parens if present
+ if raw_ans.startswith("(") and raw_ans.endswith(")"):
+ raw_ans = raw_ans[1:-1]
+
+ # Parse correct index
+ # If correct_answer is 'A','B','C','D' -> 0,1,2,3
+ ans_map = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'a': 0, 'b': 1, 'c': 2, 'd': 3}
+ correct_idx = ans_map.get(raw_ans, -1)
+
+ q = PersonaMemQuestion(
+ persona_id=row["persona_id"],
+ question_id=row["question_id"],
+ question_type=row.get("question_type", "unknown"),
+ topic=row.get("topic", "unknown"),
+ user_question_or_message=row.get("user_question_or_message", row.get("question", "")),
+ all_options=options,
+ correct_index=correct_idx,
+ shared_context_id=row["shared_context_id"],
+ end_index_in_shared_context=int(row.get("end_index_in_shared_context", -1))
+ )
+ questions.append(q)
+            except KeyError:
+                # Skip rows missing required fields (persona_id, question_id,
+                # shared_context_id).
+                continue
+ return questions
+
+def load_personamem_contexts_32k(path_jsonl: str) -> Dict[str, PersonaMemContext]:
+ contexts = {}
+ with open(path_jsonl, "r", encoding="utf-8") as f:
+ for line in f:
+ data = json.loads(line)
+ # Format: {"hash_id": [messages...]}
+ for cid, msgs in data.items():
+ contexts[cid] = PersonaMemContext(
+ shared_context_id=cid,
+ messages=msgs
+ )
+ return contexts
+
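+# Illustrative usage (file paths are hypothetical, and slicing by
+# end_index_in_shared_context assumes that field marks how much of the shared
+# context precedes the question):
+#
+#     questions = load_personamem_questions_32k("data/personamem/questions_32k.csv")
+#     contexts = load_personamem_contexts_32k("data/personamem/shared_contexts_32k.jsonl")
+#     q = questions[0]
+#     history = contexts[q.shared_context_id].messages[: q.end_index_in_shared_context]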
diff --git a/src/personalization/evaluation/__init__.py b/src/personalization/evaluation/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/evaluation/__init__.py
diff --git a/src/personalization/evaluation/compare_pairs.py b/src/personalization/evaluation/compare_pairs.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/evaluation/compare_pairs.py
diff --git a/src/personalization/evaluation/metrics.py b/src/personalization/evaluation/metrics.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/evaluation/metrics.py
diff --git a/src/personalization/feedback/__init__.py b/src/personalization/feedback/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/feedback/__init__.py
diff --git a/src/personalization/feedback/gating.py b/src/personalization/feedback/gating.py
new file mode 100644
index 0000000..d741874
--- /dev/null
+++ b/src/personalization/feedback/gating.py
@@ -0,0 +1,72 @@
+import numpy as np
+from personalization.feedback.schemas import TurnSample
+
+def cosine_sim_batch(matrix: np.ndarray, vector: np.ndarray) -> np.ndarray:
+ # matrix: [N, d], vector: [d]
+ # return: [N]
+ norm_m = np.linalg.norm(matrix, axis=1)
+ norm_v = np.linalg.norm(vector)
+
+ # Avoid div by zero
+ den = norm_m * norm_v
+ den[den == 0] = 1e-9
+
+ return np.dot(matrix, vector) / den
+
+def estimate_retrieval_gating(sample: TurnSample, reward_hat: float) -> float:
+ """
+ Return g_t in [0,1], representing how much the reward is due to retrieval.
+ """
+ e_q = sample.query_embedding_t
+ e_q1 = sample.query_embedding_t1
+
+ if e_q is None or e_q1 is None or not sample.memories:
+ return 0.5 # Neutral
+
+    # Gating needs the memory embeddings. If they were not precomputed and passed in
+    # sample.memory_embeddings, fall back to the embedding_e stored on each MemoryCard.
+    if sample.memory_embeddings is None:
+        try:
+            mem_embs = np.array([m.embedding_e for m in sample.memories])
+            if mem_embs.ndim != 2 or mem_embs.shape[1] == 0:  # missing or ragged embeddings
+                return 0.5
+        except Exception:
+            return 0.5
+ else:
+ mem_embs = sample.memory_embeddings
+
+ # Compute similarities
+ # shape: [K]
+ sims_q = cosine_sim_batch(mem_embs, e_q)
+ sims_q1 = cosine_sim_batch(mem_embs, e_q1)
+
+ s_q_max = sims_q.max() if len(sims_q) > 0 else 0
+ s_q1_max = sims_q1.max() if len(sims_q1) > 0 else 0
+
+ g = 0.5
+
+ # Heuristics
+
+ # Case A: Retrieval clearly irrelevant + bad reward
+ # q_t / q_{t+1} have low similarity to memories -> likely retrieval failure (or no relevant memories)
+ if reward_hat < -0.5 and s_q_max < 0.2 and s_q1_max < 0.2:
+ g = 0.9 # Blame retrieval (for failing to find anything, or nothing exists)
+
+ # Case B: Retrieval looks good but reward is bad
+ # Memories are relevant to query, but user still unhappy -> LLM didn't use them well?
+ elif reward_hat < -0.5 and s_q_max > 0.5:
+ g = 0.2 # Likely LLM fault
+
+ # Case C: Good reward
+ # If reward is high, we assume both did okay.
+ elif reward_hat > 0.5:
+ if s_q_max > 0.4:
+ g = 0.6 # Retrieval helped
+ else:
+ g = 0.3 # LLM handled it without strong retrieval help
+
+ return float(g)
+
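+# Worked example of the heuristic above (numbers are illustrative): with
+# reward_hat = -0.8 and max memory similarities s_q_max = 0.1, s_q1_max = 0.15,
+# Case A fires and g = 0.9 (blame retrieval); with reward_hat = 0.8 and
+# s_q_max = 0.6, Case C yields g = 0.6 (retrieval helped).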
diff --git a/src/personalization/feedback/handlers.py b/src/personalization/feedback/handlers.py
new file mode 100644
index 0000000..60a8d17
--- /dev/null
+++ b/src/personalization/feedback/handlers.py
@@ -0,0 +1,50 @@
+from typing import Tuple, List, Optional
+import numpy as np
+
+from personalization.retrieval.preference_store.schemas import MemoryCard
+from personalization.feedback.schemas import TurnSample
+from personalization.feedback.reward_model import estimate_reward
+from personalization.feedback.gating import estimate_retrieval_gating
+
+def eval_step(
+ q_t: str,
+ answer_t: str,
+ q_t1: str,
+ memories_t: List[MemoryCard],
+ query_embedding_t: Optional[np.ndarray] = None,
+ query_embedding_t1: Optional[np.ndarray] = None,
+) -> Tuple[float, float]:
+ """
+ Unified evaluation interface.
+ Given (q_t, a_t, q_{t+1}, memories), returns (reward_hat, gating_hat).
+ """
+
+    # Construct a lightweight TurnSample. Gating needs query and memory embeddings;
+    # when they are not provided it falls back to a neutral value.
+
+    # Collect memory embeddings for gating, if available.
+    mem_embs = None
+    if memories_t and memories_t[0].embedding_e:
+        try:
+            mem_embs = np.array([m.embedding_e for m in memories_t])
+        except Exception:
+            pass
+
+ sample = TurnSample(
+ user_id="", # Not needed for simple eval
+ session_id="",
+ turn_id=0,
+ query_t=q_t,
+ answer_t=answer_t,
+ query_t1=q_t1,
+ memories=memories_t,
+ query_embedding_t=query_embedding_t,
+ query_embedding_t1=query_embedding_t1,
+ memory_embeddings=mem_embs
+ )
+
+ r_hat = estimate_reward(sample)
+ g_hat = estimate_retrieval_gating(sample, r_hat)
+
+ return r_hat, g_hat
+
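+# Illustrative usage (a sketch; strings are placeholders and the 4096-dim random
+# vectors stand in for real query embeddings):
+#
+#     import numpy as np
+#     r_hat, g_hat = eval_step(
+#         q_t="Plan a vegetarian dinner for me",
+#         answer_t="Here is a tofu stir-fry recipe...",
+#         q_t1="Thanks, can you give an example shopping list?",
+#         memories_t=[],
+#         query_embedding_t=np.random.rand(4096),
+#         query_embedding_t1=np.random.rand(4096),
+#     )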
diff --git a/src/personalization/feedback/online_update.py b/src/personalization/feedback/online_update.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/feedback/online_update.py
diff --git a/src/personalization/feedback/reward_model.py b/src/personalization/feedback/reward_model.py
new file mode 100644
index 0000000..3584b43
--- /dev/null
+++ b/src/personalization/feedback/reward_model.py
@@ -0,0 +1,64 @@
+import numpy as np
+from personalization.feedback.schemas import TurnSample
+
+def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
+ norm_a = np.linalg.norm(a)
+ norm_b = np.linalg.norm(b)
+ if norm_a == 0 or norm_b == 0:
+ return 0.0
+ return float(np.dot(a, b) / (norm_a * norm_b))
+
+def estimate_reward(sample: TurnSample) -> float:
+ """
+ Return a scalar reward_hat, indicating if the previous answer was helpful.
+ Range: [-1.0, 1.0] (approx)
+ """
+
+ # 1. Language/Topic Coherence
+ if sample.query_embedding_t is None or sample.query_embedding_t1 is None:
+ topic_sim = 0.5
+ else:
+ topic_sim = cosine_sim(sample.query_embedding_t, sample.query_embedding_t1)
+
+ # 2. Negative Keywords (Complaint/Correction)
+ negative_keywords = [
+ "you didn't", "that's not", "incorrect", "redo", "again", "explain more",
+ "doesn't help", "wrong", "no", "not what i asked",
+ "你没", "不是", "这不是", "重来", "重新", "不对", "错了", "没说清楚"
+ ]
+
+ # 3. Positive Keywords (Follow-up/Elaboration)
+ positive_keywords = [
+ "can you elaborate", "give an example", "continue", "what if", "based on that",
+ "thanks", "good", "great", "cool",
+ "能不能详细一点", "举个例子", "再继续", "那如果", "接下来", "在这个基础上", "谢谢", "不错"
+ ]
+
+ q1_lower = sample.query_t1.lower()
+
+ has_negative = any(kw in q1_lower for kw in negative_keywords)
+ has_positive = any(kw in q1_lower for kw in positive_keywords)
+
+ reward = 0.0
+
+ if has_negative:
+ reward -= 1.0
+
+ if has_positive:
+        # A positive keyword alone (e.g. "thanks") earns partial credit; award the rest
+        # only if the follow-up stays on topic, since an off-topic "thanks" may simply
+        # close the session.
+        reward += 0.5
+        if topic_sim > 0.3:
+            reward += 0.5
+
+    if topic_sim < 0.2:
+        # Topic shift: the previous task was either finished (neutral/positive) or
+        # abandoned (negative). We cannot tell which, so dampen the reward towards 0.
+        reward *= 0.5
+
+ # Clip
+ return max(-1.0, min(1.0, reward))
+
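+# Worked example (values illustrative): a follow-up of "can you elaborate" with
+# topic_sim = 0.8 scores +0.5 (positive keyword) + 0.5 (on topic) = 1.0, while
+# "that's not what I asked" scores -1.0. Note that keywords are matched as
+# substrings, so short ones like "no" also fire inside words such as "know".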
diff --git a/src/personalization/feedback/sampler.py b/src/personalization/feedback/sampler.py
new file mode 100644
index 0000000..9e26912
--- /dev/null
+++ b/src/personalization/feedback/sampler.py
@@ -0,0 +1,109 @@
+from typing import Iterable, List, Optional
+import numpy as np
+from tqdm import tqdm
+
+from personalization.retrieval.preference_store.schemas import ChatTurn, MemoryCard
+from personalization.feedback.schemas import TurnSample
+from personalization.retrieval.pipeline import retrieve_with_rerank
+from personalization.models.llm.qwen_instruct import QwenInstruct
+from personalization.models.embedding.base import EmbeddingModel
+from personalization.models.reranker.base import Reranker
+from personalization.user_model.tensor_store import UserTensorStore
+
+def build_turn_samples_from_sessions(
+ sessions: Iterable[List[ChatTurn]],
+ embed_model: EmbeddingModel,
+ llm: QwenInstruct,
+ reranker: Reranker,
+ memory_cards: List[MemoryCard],
+ memory_embeddings: np.ndarray,
+ user_store: UserTensorStore,
+ item_vectors: np.ndarray,
+ max_samples: Optional[int] = None,
+ topk_dense: int = 64,
+ topk_rerank: int = 3,
+) -> List[TurnSample]:
+ samples = []
+
+ for turns in tqdm(sessions, desc="Building TurnSamples"):
+ if max_samples and len(samples) >= max_samples:
+ break
+
+ # Ensure sorted by turn_id
+ sorted_turns = sorted(turns, key=lambda x: x.turn_id)
+
+ # Iterate to find (q_t, a_t, q_{t+1})
+ for i in range(len(sorted_turns)):
+ if max_samples and len(samples) >= max_samples:
+ break
+
+ q_t = sorted_turns[i]
+ if q_t.role != "user":
+ continue
+
+ # Find next user turn
+ # Also try to find assistant response in between
+ a_t_text = ""
+ q_t1 = None
+
+ # Look ahead
+ for j in range(i + 1, len(sorted_turns)):
+ next_turn = sorted_turns[j]
+ if next_turn.role == "assistant" and not a_t_text:
+ a_t_text = next_turn.text
+ elif next_turn.role == "user":
+ q_t1 = next_turn
+ break
+
+ if not q_t1:
+ # End of session or no subsequent user query
+ continue
+
+            # We now have q_t, q_{t+1}, and (ideally) the assistant answer a_t in between.
+            # The reward model works best with a_t, but the sample is kept even when it
+            # is missing rather than dropped.
+            if not a_t_text:
+                # Offline sampling prefers existing assistant answers (OASST1-style data
+                # includes them); generating one with the LLM here is possible but not done.
+                pass
+
+            # Retrieve memories for q_t
+ memories_t = retrieve_with_rerank(
+ user_id=q_t.user_id,
+ query=q_t.text,
+ embed_model=embed_model,
+ reranker=reranker,
+ memory_cards=memory_cards,
+ memory_embeddings=memory_embeddings,
+ user_store=user_store,
+ item_vectors=item_vectors,
+ topk_dense=topk_dense,
+ topk_rerank=topk_rerank,
+ beta_long=0.0,
+ beta_short=0.0,
+ only_own_memories=True # Assume we want user specific memories
+ )
+
+            # Precompute query embeddings (done per sample here; batching them would
+            # be more efficient)
+ e_q_t = embed_model.encode([q_t.text], return_tensor=False)[0]
+ e_q_t1 = embed_model.encode([q_t1.text], return_tensor=False)[0]
+
+ sample = TurnSample(
+ user_id=q_t.user_id,
+ session_id=q_t.session_id,
+ turn_id=q_t.turn_id,
+ query_t=q_t.text,
+ answer_t=a_t_text,
+ query_t1=q_t1.text,
+ memories=memories_t,
+ query_embedding_t=np.array(e_q_t),
+ query_embedding_t1=np.array(e_q_t1)
+ )
+ samples.append(sample)
+
+ return samples
+
diff --git a/src/personalization/feedback/schemas.py b/src/personalization/feedback/schemas.py
new file mode 100644
index 0000000..b15db80
--- /dev/null
+++ b/src/personalization/feedback/schemas.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import List, Optional, Any
+import numpy as np
+
+from personalization.retrieval.preference_store.schemas import MemoryCard
+
+@dataclass
+class TurnSample:
+ user_id: str
+ session_id: str
+ turn_id: int # index of q_t within the session
+ query_t: str # q_t
+ answer_t: str # a_t
+ query_t1: str # q_{t+1}
+ memories: List[MemoryCard] # A_t
+
+ # Optional pre-computed vectors and features
+ query_embedding_t: Optional[np.ndarray] = None
+ query_embedding_t1: Optional[np.ndarray] = None
+ memory_embeddings: Optional[np.ndarray] = None # corresponding e_m or v_m for memories
+
diff --git a/src/personalization/retrieval/__init__.py b/src/personalization/retrieval/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/retrieval/__init__.py
diff --git a/src/personalization/retrieval/chunking/__init__.py b/src/personalization/retrieval/chunking/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/retrieval/chunking/__init__.py
diff --git a/src/personalization/retrieval/chunking/rules.py b/src/personalization/retrieval/chunking/rules.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/retrieval/chunking/rules.py
diff --git a/src/personalization/retrieval/pipeline.py b/src/personalization/retrieval/pipeline.py
new file mode 100644
index 0000000..3d3eeb7
--- /dev/null
+++ b/src/personalization/retrieval/pipeline.py
@@ -0,0 +1,250 @@
+import os
+from typing import List, Tuple
+
+import numpy as np
+
+from personalization.models.embedding.base import EmbeddingModel
+from personalization.models.reranker.base import Reranker
+from personalization.retrieval.preference_store.schemas import MemoryCard
+from personalization.user_model.tensor_store import UserTensorStore, UserState
+from personalization.user_model.scoring import score_with_user
+from personalization.user_model.policy.reinforce import compute_policy_scores
+
+def cosine_similarity_matrix(E: np.ndarray, e_q: np.ndarray) -> np.ndarray:
+    # E: [M, d], e_q: [d] -> similarities: [M]
+    # Assumes embeddings are L2-normalized, so the dot product equals cosine similarity.
+    return np.dot(E, e_q)
+
+def dense_topk_indices(
+ query: str,
+ embed_model: EmbeddingModel,
+ memory_embeddings: np.ndarray,
+ valid_indices: List[int] = None,
+ topk: int = 64
+) -> List[int]:
+ """
+ Return indices of topk memories based on dense embedding similarity.
+ If valid_indices is provided, only search within that subset.
+ """
+ if valid_indices is not None and len(valid_indices) == 0:
+ return []
+
+ e_q_list = embed_model.encode([query], normalize=True, return_tensor=False)
+ e_q = np.array(e_q_list[0], dtype=np.float32)
+
+ # Select subset of embeddings if restricted
+ if valid_indices is not None:
+ # subset_embeddings = memory_embeddings[valid_indices]
+ # But valid_indices might be arbitrary.
+ # Efficient way: only dot product with subset
+ # E_sub: [M_sub, d]
+ E_sub = memory_embeddings[valid_indices]
+ sims_sub = np.dot(E_sub, e_q)
+
+ # Topk within subset
+ k = min(topk, len(sims_sub))
+ if k == 0:
+ return []
+
+ # argsort gives indices relative to E_sub (0..M_sub-1)
+ # We need to map back to original indices
+ idx_sub = np.argsort(sims_sub)[-k:][::-1]
+
+ return [valid_indices[i] for i in idx_sub]
+
+ # Global search
+ sims = np.dot(memory_embeddings, e_q)
+ k = min(topk, len(memory_embeddings))
+ if k == 0:
+ return []
+
+ idx = np.argsort(sims)[-k:][::-1]
+ return idx.tolist()
+
+def retrieve_with_policy(
+ user_id: str,
+ query: str,
+ embed_model: EmbeddingModel,
+ reranker: Reranker,
+ memory_cards: List[MemoryCard],
+ memory_embeddings: np.ndarray, # shape: [M, d]
+ user_store: UserTensorStore,
+ item_vectors: np.ndarray, # shape: [M, k], v_m
+ topk_dense: int = 64,
+ topk_rerank: int = 8,
+ beta_long: float = 0.0,
+ beta_short: float = 0.0,
+ tau: float = 1.0,
+ only_own_memories: bool = False,
+ sample: bool = False,
+) -> Tuple[List[MemoryCard], np.ndarray, np.ndarray, List[int], np.ndarray]:
+ """
+ Returns extended info for policy update:
+ (candidates, candidate_item_vectors, base_scores, chosen_indices, policy_probs)
+
+ Args:
+ sample: If True, use stochastic sampling from policy distribution (for training/exploration).
+ If False, use deterministic top-k by policy scores (for evaluation).
+ """
+ # 0. Filter indices if needed
+ valid_indices = None
+ if only_own_memories:
+ valid_indices = [i for i, card in enumerate(memory_cards) if card.user_id == user_id]
+ if not valid_indices:
+ return [], np.array([]), np.array([]), [], np.array([])
+
+ # 1. Dense retrieval
+ dense_idx = dense_topk_indices(
+ query,
+ embed_model,
+ memory_embeddings,
+ valid_indices=valid_indices,
+ topk=topk_dense
+ )
+    # Debug: sanity-check the dense indices (duplicates / out-of-bounds)
+    if dense_idx and os.getenv("RETRIEVAL_DEBUG") == "1":
+        print(f"  [Pipeline] Dense Indices (Top {len(dense_idx)}): {dense_idx[:10]}...")
+        print(f"  [Pipeline] Max Index: {max(dense_idx)} | Memory Size: {len(memory_cards)}")
+
+ if not dense_idx:
+ return [], np.array([]), np.array([]), [], np.array([])
+
+ candidates = [memory_cards[i] for i in dense_idx]
+ candidate_docs = [c.note_text for c in candidates]
+
+ # 2. Rerank base score (P(yes|q,m))
+ base_scores = np.array(reranker.score(query, candidate_docs))
+
+ # 3. Policy Scoring (Softmax)
+ user_state: UserState = user_store.get_state(user_id)
+ candidate_vectors = item_vectors[dense_idx] # [K, k]
+
+ policy_out = compute_policy_scores(
+ base_scores=base_scores,
+ user_state=user_state,
+ item_vectors=candidate_vectors,
+ beta_long=beta_long,
+ beta_short=beta_short,
+ tau=tau
+ )
+
+ # 4. Selection: Greedy (eval) or Stochastic (training)
+ k = min(topk_rerank, len(policy_out.scores))
+
+ if sample:
+ # Stochastic sampling from policy distribution (for training/exploration)
+ # Sample k indices without replacement, weighted by policy probs
+ probs = policy_out.probs
+ # Normalize to ensure sum to 1 (handle numerical issues)
+ probs = probs / (probs.sum() + 1e-10)
+ # Sample without replacement
+ chosen_indices = np.random.choice(
+ len(probs), size=k, replace=False, p=probs
+ ).tolist()
+ else:
+ # Deterministic top-k by policy scores (for evaluation)
+ top_indices_local = policy_out.scores.argsort()[-k:][::-1]
+ chosen_indices = top_indices_local.tolist()
+
+ if os.getenv("RETRIEVAL_DEBUG") == "1":
+ print(f" [Pipeline] Candidates: {len(candidates)} | Chosen Indices: {chosen_indices} | Sample: {sample}")
+
+ return candidates, candidate_vectors, base_scores, chosen_indices, policy_out.probs
+
+def retrieve_no_policy(
+ user_id: str,
+ query: str,
+ embed_model: EmbeddingModel,
+ reranker: Reranker,
+ memory_cards: List[MemoryCard],
+ memory_embeddings: np.ndarray, # shape: [M, d]
+ topk_dense: int = 64,
+ topk_rerank: int = 8,
+ only_own_memories: bool = False,
+) -> Tuple[List[MemoryCard], np.ndarray, np.ndarray, List[int], np.ndarray]:
+ """
+ Deterministic retrieval baseline (NoPersonal mode):
+ - Dense retrieval -> Rerank -> Top-K (no policy sampling, no user vector influence)
+
+ Returns same structure as retrieve_with_policy for compatibility:
+ (candidates, candidate_item_vectors, base_scores, chosen_indices, rerank_scores_for_chosen)
+
+ Note: candidate_item_vectors is empty array (not used in NoPersonal mode)
+ The last return value is rerank scores instead of policy probs
+ """
+ # 0. Filter indices if needed
+ valid_indices = None
+ if only_own_memories:
+ valid_indices = [i for i, card in enumerate(memory_cards) if card.user_id == user_id]
+ if not valid_indices:
+ return [], np.array([]), np.array([]), [], np.array([])
+
+ # 1. Dense retrieval
+ dense_idx = dense_topk_indices(
+ query,
+ embed_model,
+ memory_embeddings,
+ valid_indices=valid_indices,
+ topk=topk_dense
+ )
+
+ if not dense_idx:
+ return [], np.array([]), np.array([]), [], np.array([])
+
+ candidates = [memory_cards[i] for i in dense_idx]
+ candidate_docs = [c.note_text for c in candidates]
+
+ # 2. Rerank base score (P(yes|q,m))
+ base_scores = np.array(reranker.score(query, candidate_docs))
+
+ # 3. Deterministic Top-K selection based on rerank scores ONLY (no policy)
+ k = min(topk_rerank, len(base_scores))
+ top_indices_local = base_scores.argsort()[-k:][::-1]
+ chosen_indices = top_indices_local.tolist()
+
+ # Get scores for chosen items (for logging compatibility)
+ chosen_scores = base_scores[top_indices_local]
+
+ # Return empty item vectors (not used in NoPersonal mode)
+ # Return rerank scores as the "probs" field for logging compatibility
+ return candidates, np.array([]), base_scores, chosen_indices, chosen_scores
+
+
+def retrieve_with_rerank(
+ user_id: str,
+ query: str,
+ embed_model: EmbeddingModel,
+ reranker: Reranker,
+ memory_cards: List[MemoryCard],
+ memory_embeddings: np.ndarray, # shape: [M, d]
+ user_store: UserTensorStore,
+ item_vectors: np.ndarray, # shape: [M, k], v_m
+ topk_dense: int = 64,
+ topk_rerank: int = 8,
+ beta_long: float = 0.0,
+ beta_short: float = 0.0,
+ only_own_memories: bool = False,
+) -> List[MemoryCard]:
+ """
+ Wrapper around retrieve_with_policy for standard inference.
+ """
+ candidates, _, _, chosen_indices, _ = retrieve_with_policy(
+ user_id=user_id,
+ query=query,
+ embed_model=embed_model,
+ reranker=reranker,
+ memory_cards=memory_cards,
+ memory_embeddings=memory_embeddings,
+ user_store=user_store,
+ item_vectors=item_vectors,
+ topk_dense=topk_dense,
+ topk_rerank=topk_rerank,
+ beta_long=beta_long,
+ beta_short=beta_short,
+ tau=1.0, # Default tau
+ only_own_memories=only_own_memories
+ )
+
+ return [candidates[i] for i in chosen_indices]
+
+
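+# Illustrative call (a sketch; the models, stores, and arrays are assumed to be
+# loaded elsewhere, e.g. by PersonalizedLLM, and the names are placeholders):
+#
+#     memories = retrieve_with_rerank(
+#         user_id="user_123",
+#         query="What should I cook tonight?",
+#         embed_model=embed_model,
+#         reranker=reranker,
+#         memory_cards=memory_cards,
+#         memory_embeddings=memory_embeddings,  # [M, d]
+#         user_store=user_store,
+#         item_vectors=item_vectors,            # [M, k]
+#         topk_dense=64,
+#         topk_rerank=3,
+#         only_own_memories=True,
+#     )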
diff --git a/src/personalization/retrieval/preference_store/__init__.py b/src/personalization/retrieval/preference_store/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/retrieval/preference_store/__init__.py
diff --git a/src/personalization/retrieval/preference_store/base.py b/src/personalization/retrieval/preference_store/base.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/retrieval/preference_store/base.py
diff --git a/src/personalization/retrieval/preference_store/schemas.py b/src/personalization/retrieval/preference_store/schemas.py
new file mode 100644
index 0000000..eb82558
--- /dev/null
+++ b/src/personalization/retrieval/preference_store/schemas.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from typing import List, Literal, Optional, Dict, Any
+
+from pydantic import BaseModel, Field, confloat
+
+
+class Preference(BaseModel):
+ condition: str = Field(
+ ..., min_length=1, max_length=128, description="When the rule applies"
+ )
+ action: str = Field(
+ ..., min_length=1, max_length=256, description="What to do in that case"
+ )
+ confidence: confloat(ge=0.0, le=1.0) = Field(
+ ..., description="Confidence the rule is correct"
+ )
+
+
+class PreferenceList(BaseModel):
+ preferences: List[Preference] = Field(default_factory=list)
+
+
+def preference_list_json_schema() -> dict:
+ return PreferenceList.model_json_schema()
+
+
+class ChatTurn(BaseModel):
+ user_id: str
+ session_id: str
+ turn_id: int
+ role: Literal["user", "assistant"]
+ text: str
+ timestamp: Optional[float] = None
+ meta: Dict[str, Any] = Field(default_factory=dict)
+
+
+class MemoryCard(BaseModel):
+ card_id: str
+ user_id: str
+ source_session_id: str
+ source_turn_ids: List[int]
+ raw_queries: List[str] # The original user utterances
+ preference_list: PreferenceList
+ note_text: str # Summarized "condition: action" text
+ embedding_e: List[float] # The embedding vector
+ kind: Literal["pref", "fact"] = "pref"
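+
+
+# Illustrative MemoryCard (a sketch; field values are placeholders, and the
+# 4096-dim zero vector stands in for a real embedding):
+#
+#     card = MemoryCard(
+#         card_id="c-001",
+#         user_id="user_123",
+#         source_session_id="s-001",
+#         source_turn_ids=[0],
+#         raw_queries=["I'm vegetarian, please keep recipes meat-free."],
+#         preference_list=PreferenceList(preferences=[
+#             Preference(condition="suggesting recipes", action="avoid meat", confidence=0.9),
+#         ]),
+#         note_text="When suggesting recipes, avoid meat.",
+#         embedding_e=[0.0] * 4096,
+#         kind="pref",
+#     )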
diff --git a/src/personalization/retrieval/preference_store/vector_kv.py b/src/personalization/retrieval/preference_store/vector_kv.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/retrieval/preference_store/vector_kv.py
diff --git a/src/personalization/retrieval/rerank.py b/src/personalization/retrieval/rerank.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/retrieval/rerank.py
diff --git a/src/personalization/retrieval/store/__init__.py b/src/personalization/retrieval/store/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/retrieval/store/__init__.py
diff --git a/src/personalization/retrieval/store/base.py b/src/personalization/retrieval/store/base.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/retrieval/store/base.py
diff --git a/src/personalization/retrieval/store/faiss_store.py b/src/personalization/retrieval/store/faiss_store.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/retrieval/store/faiss_store.py
diff --git a/src/personalization/retrieval/store/pgvector_store.py b/src/personalization/retrieval/store/pgvector_store.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/retrieval/store/pgvector_store.py
diff --git a/src/personalization/serving/__init__.py b/src/personalization/serving/__init__.py
new file mode 100644
index 0000000..11adcf8
--- /dev/null
+++ b/src/personalization/serving/__init__.py
@@ -0,0 +1,22 @@
+# Personalization Serving Module
+#
+# This module provides the interface layer for the personalization system.
+
+from personalization.serving.personalized_llm import (
+ PersonalizedLLM,
+ AssistantResponse,
+ UsageStats,
+ DebugInfo,
+ Feedback,
+ create_personalized_llm,
+)
+
+__all__ = [
+ "PersonalizedLLM",
+ "AssistantResponse",
+ "UsageStats",
+ "DebugInfo",
+ "Feedback",
+ "create_personalized_llm",
+]
+
diff --git a/src/personalization/serving/api/__init__.py b/src/personalization/serving/api/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/serving/api/__init__.py
diff --git a/src/personalization/serving/api/main.py b/src/personalization/serving/api/main.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/serving/api/main.py
diff --git a/src/personalization/serving/api/routes/__init__.py b/src/personalization/serving/api/routes/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/serving/api/routes/__init__.py
diff --git a/src/personalization/serving/api/routes/feedback.py b/src/personalization/serving/api/routes/feedback.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/serving/api/routes/feedback.py
diff --git a/src/personalization/serving/api/routes/query.py b/src/personalization/serving/api/routes/query.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/serving/api/routes/query.py
diff --git a/src/personalization/serving/api/routes/users.py b/src/personalization/serving/api/routes/users.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/serving/api/routes/users.py
diff --git a/src/personalization/serving/api/schemas.py b/src/personalization/serving/api/schemas.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/serving/api/schemas.py
diff --git a/src/personalization/serving/personalized_llm.py b/src/personalization/serving/personalized_llm.py
new file mode 100644
index 0000000..2c4d5a8
--- /dev/null
+++ b/src/personalization/serving/personalized_llm.py
@@ -0,0 +1,837 @@
+#!/usr/bin/env python3
+"""
+Personalized LLM Interface for Evaluation.
+
+This module provides the `PersonalizedLLM` class that wraps the entire
+personalization system into a clean interface for evaluation frameworks
+and user simulators.
+
+Interface contract:
+- chat(user_id, query) -> AssistantResponse: Main online interface
+- reset_session(user_id): Clear session history and short-term state
+- reset_user(user_id): Completely reset user (long-term, short-term, memories)
+- apply_feedback(feedback): Apply external feedback for RL updates
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+import uuid
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+import yaml
+
+# Ensure src is in path for standalone usage
+_src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+if _src_path not in sys.path:
+ sys.path.insert(0, _src_path)
+
+from personalization.config.settings import load_local_models_config
+from personalization.config.registry import get_preference_extractor, get_chat_model
+from personalization.models.embedding.qwen3_8b import Qwen3Embedding8B
+from personalization.models.reranker.qwen3_reranker import Qwen3Reranker
+from personalization.user_model.tensor_store import UserTensorStore, UserState
+from personalization.user_model.session_state import OnlineSessionState
+from personalization.user_model.features import ItemProjection
+from personalization.retrieval.preference_store.schemas import (
+ MemoryCard, ChatTurn, PreferenceList, Preference
+)
+from personalization.retrieval.pipeline import retrieve_with_policy, retrieve_no_policy
+from personalization.feedback.handlers import eval_step
+from personalization.user_model.policy.reinforce import reinforce_update_user_state
+
+
+# =============================================================================
+# Data Classes for Interface
+# =============================================================================
+
+@dataclass
+class UsageStats:
+ """Token usage statistics from a chat completion."""
+ prompt_tokens: int
+ completion_tokens: int
+ total_tokens: int
+ model: str
+
+
+@dataclass
+class DebugInfo:
+ """
+ Debug information for analysis and ablation studies.
+ All fields are optional - fill what you have, leave empty what you don't.
+ """
+ selected_memory_ids: List[str] = field(default_factory=list)
+ selected_memory_notes: List[str] = field(default_factory=list)
+ selected_memory_scores: List[float] = field(default_factory=list)
+ user_vector_before: Optional[List[float]] = None
+ user_vector_after: Optional[List[float]] = None
+ extracted_preferences: List[Dict[str, Any]] = field(default_factory=list)
+ extra: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class AssistantResponse:
+ """Response from the personalized LLM chat interface."""
+ answer: str
+ usage: UsageStats
+ debug: Optional[DebugInfo] = None
+
+
+@dataclass
+class Feedback:
+ """
+ Feedback data structure for RL updates from user simulator or judge.
+
+ Attributes:
+ user_id: The user this feedback is for.
+ turn_id: The turn this feedback refers to (from the previous turn).
+ reward: Reward scalar computed by user simulator / judge.
+ gating: Gating flag (1=valid learning signal, 0=skip update).
+ meta: Additional metadata for training/analysis.
+ """
+ user_id: str
+ turn_id: int
+ reward: float
+ gating: float # Can be 0.0 or 1.0, or continuous
+ meta: Dict[str, Any] = field(default_factory=dict)
+
+
+# =============================================================================
+# Internal Session State Extended
+# =============================================================================
+
+@dataclass
+class _SessionContext:
+ """Extended session context for evaluation tracking."""
+ session_state: OnlineSessionState
+ turn_counter: int = 0
+ # Store info needed for apply_feedback
+ pending_rl_update: Optional[Dict[str, Any]] = None
+
+
+# =============================================================================
+# PersonalizedLLM Class
+# =============================================================================
+
+class PersonalizedLLM:
+ """
+ Personalized LLM wrapper for evaluation frameworks.
+
+ This class provides a clean interface that accepts only (user_id, query)
+ for the main chat function, while internally managing:
+ - User state vectors (z_long, z_short)
+ - Session history
+ - Memory retrieval and policy
+ - Preference extraction and storage
+ - RL updates
+
+ Example usage:
+ llm = PersonalizedLLM()
+
+ # Reset user for fresh experiment
+ llm.reset_user("user_123")
+
+ # Start a session
+ llm.reset_session("user_123")
+
+ # Chat
+ response = llm.chat("user_123", "What's a good recipe for dinner?")
+ print(response.answer)
+
+ # Apply feedback from previous turn (from turn 2 onwards)
+ llm.apply_feedback(Feedback(
+ user_id="user_123",
+ turn_id=0,
+ reward=0.8,
+ gating=1.0
+ ))
+ """
+
+ def __init__(
+ self,
+ config_path: Optional[str] = None,
+ user_store_path: str = "data/users/user_store_eval.npz",
+ memory_cards_path: str = "data/corpora/memory_cards.jsonl",
+ memory_embeddings_path: str = "data/corpora/memory_embeddings.npy",
+ item_projection_path: str = "data/corpora/item_projection.npz",
+ only_own_memories: bool = True,
+ enable_preference_extraction: bool = True,
+ enable_rl_updates: bool = True,
+ mode: str = "full", # "full", "nopersonal", or "vanilla"
+ eval_mode: bool = True, # True = greedy selection, False = stochastic sampling
+ device_assignment: Optional[Dict[str, str]] = None, # Multi-GPU support
+ ):
+ """
+ Initialize the PersonalizedLLM.
+
+ Args:
+ config_path: Path to config file. If None, uses default locations.
+ user_store_path: Path to persist user state vectors.
+ memory_cards_path: Path to memory cards JSONL file.
+ memory_embeddings_path: Path to memory embeddings numpy file.
+ item_projection_path: Path to item projection (PCA) file.
+ only_own_memories: If True, only retrieve user's own memories (strict privacy).
+ enable_preference_extraction: If True, extract preferences from user turns.
+ enable_rl_updates: If True, apply RL updates via apply_feedback.
+ mode: "full" for full personalization, "nopersonal" for baseline (no user vector influence),
+ "vanilla" for pure LLM without any memory retrieval or preference extraction.
+ eval_mode: If True, use greedy/deterministic selection (for evaluation).
+ If False, use stochastic sampling (for training/exploration).
+ device_assignment: Optional dict to assign models to specific GPUs.
+ Example: {"embed": "cuda:0", "reranker": "cuda:1", "chat": "cuda:2", "extractor": "cuda:3"}
+ If None, uses "auto" for all models.
+ """
+ self.only_own_memories = only_own_memories
+ self.enable_preference_extraction = enable_preference_extraction
+ self.enable_rl_updates = enable_rl_updates
+        self.mode = mode  # "full", "nopersonal", or "vanilla"
+ self.eval_mode = eval_mode # True = greedy, False = sample
+
+ # Multi-GPU device assignment
+ self._device_assignment = device_assignment or {
+ "embed": "auto",
+ "reranker": "auto",
+ "chat": "auto",
+ "extractor": "auto",
+ }
+
+ # Paths
+ self._memory_cards_path = memory_cards_path
+ self._memory_embeddings_path = memory_embeddings_path
+ self._item_projection_path = item_projection_path
+
+ # RL Configuration
+ # Note: beta/eta increased for more significant z_u updates
+ self._rl_cfg = {
+ "item_dim": 256,
+ "beta_long": 2.0, # Increased from 0.1 for stronger personalization
+ "beta_short": 5.0, # Increased from 0.3
+ "tau": 1.0,
+ "eta_long": 0.01, # Increased from 1e-3 for faster learning
+ "eta_short": 0.05, # Increased from 5e-3
+ "ema_alpha": 0.05,
+ "short_decay": 0.1,
+ "dense_topk": 64,
+ "rerank_topk": 3,
+ "max_new_tokens": 512,
+ }
+
+ # Load config and override RL params if available
+ self._load_config(config_path)
+
+ # Load models
+ print("[PersonalizedLLM] Loading models...")
+ self._load_models()
+
+ # Load memory store
+ print("[PersonalizedLLM] Loading memory store...")
+ self._load_memory_store()
+
+ # Initialize user store
+ self._user_store = UserTensorStore(
+ k=self._rl_cfg["item_dim"],
+ path=user_store_path,
+ )
+
+ # Session contexts per user (in-memory)
+ self._sessions: Dict[str, _SessionContext] = {}
+
+ print("[PersonalizedLLM] Initialization complete.")
+
+ def _load_config(self, config_path: Optional[str]):
+ """Load configuration from yaml files."""
+ self._cfg = load_local_models_config()
+
+ # Try to load user_model.yaml for RL params
+ if config_path is None:
+ config_path = "configs/user_model.yaml"
+
+ self._llm_name = "qwen_1_5b" # Default
+
+ try:
+ if os.path.exists(config_path):
+ with open(config_path, "r") as f:
+ user_cfg = yaml.safe_load(f)
+ if user_cfg:
+ # Override RL params if present
+ for key in self._rl_cfg:
+ if key in user_cfg:
+ self._rl_cfg[key] = user_cfg[key]
+ # LLM name
+ if "llm_name" in user_cfg:
+ self._llm_name = user_cfg["llm_name"]
+ except Exception as e:
+ print(f"[PersonalizedLLM] Warning: Failed to load config: {e}")
+
+ def _load_models(self):
+ """Load all ML models with optional multi-GPU assignment."""
+ import torch
+
+ # Report GPU availability
+ num_gpus = torch.cuda.device_count()
+ print(f"[PersonalizedLLM] Available GPUs: {num_gpus}")
+ for i in range(num_gpus):
+ mem = torch.cuda.get_device_properties(i).total_memory / 1e9
+ print(f" GPU {i}: {torch.cuda.get_device_name(i)} ({mem:.1f}GB)")
+
+ embed_device = self._device_assignment.get("embed", "auto")
+ reranker_device = self._device_assignment.get("reranker", "auto")
+ chat_device = self._device_assignment.get("chat", "auto")
+ extractor_device = self._device_assignment.get("extractor", "auto")
+
+ # Embedding model
+ print(f"[PersonalizedLLM] Loading Embedding model on {embed_device}...")
+ self._embed_model = Qwen3Embedding8B(
+ model_path=self._cfg.embedding.qwen3.local_path,
+ dtype=torch.bfloat16,
+ device_map=embed_device,
+ )
+
+ # Reranker
+ print(f"[PersonalizedLLM] Loading Reranker on {reranker_device}...")
+ self._reranker = Qwen3Reranker(
+ model_path=self._cfg.reranker.qwen3_8b.local_path,
+ device_map=reranker_device,
+ dtype=torch.bfloat16,
+ )
+
+ # Chat model (via registry for backend switching)
+ print(f"[PersonalizedLLM] Loading ChatModel: {self._llm_name} on {chat_device}...")
+ # Pass device override if specified (not "auto")
+ device_for_chat = chat_device if chat_device != "auto" else None
+ self._chat_model = get_chat_model(self._llm_name, device_override=device_for_chat)
+
+ # Preference extractor
+ if self.enable_preference_extraction:
+ extractor_name = "qwen3_0_6b_sft"
+ print(f"[PersonalizedLLM] Loading extractor: {extractor_name} on {extractor_device}...")
+ try:
+ self._extractor = get_preference_extractor(extractor_name)
+ except Exception as e:
+ print(f"[PersonalizedLLM] Warning: Failed to load {extractor_name}: {e}. Using rule-based.")
+ self._extractor = get_preference_extractor("rule")
+ else:
+ print("[PersonalizedLLM] Preference extraction disabled, using rule-based extractor.")
+ self._extractor = get_preference_extractor("rule")
+
+ def _load_memory_store(self):
+ """Load memory cards and embeddings."""
+ if not os.path.exists(self._memory_cards_path):
+ print(f"[PersonalizedLLM] Warning: Memory cards not found at {self._memory_cards_path}")
+ self._memory_cards: List[MemoryCard] = []
+ self._memory_embeddings = np.zeros((0, 4096), dtype=np.float32)
+ self._item_vectors = np.zeros((0, self._rl_cfg["item_dim"]), dtype=np.float32)
+ self._projection = None
+ return
+
+ # Load cards
+ self._memory_cards = []
+ with open(self._memory_cards_path, "r") as f:
+ for line in f:
+ line = line.strip()
+ if line:
+ self._memory_cards.append(MemoryCard.model_validate_json(line))
+
+ # Load embeddings
+ if os.path.exists(self._memory_embeddings_path):
+ self._memory_embeddings = np.load(self._memory_embeddings_path)
+ else:
+ self._memory_embeddings = np.zeros((len(self._memory_cards), 4096), dtype=np.float32)
+
+ # Load projection
+ if os.path.exists(self._item_projection_path):
+ proj_data = np.load(self._item_projection_path)
+ self._projection = ItemProjection(P=proj_data["P"], mean=proj_data["mean"])
+ self._item_vectors = proj_data["V"]
+ else:
+ self._projection = None
+ self._item_vectors = np.zeros((len(self._memory_cards), self._rl_cfg["item_dim"]), dtype=np.float32)
+
+ print(f"[PersonalizedLLM] Loaded {len(self._memory_cards)} memory cards.")
+
+ def _get_or_create_session(self, user_id: str) -> _SessionContext:
+ """Get or create session context for a user."""
+ if user_id not in self._sessions:
+ self._sessions[user_id] = _SessionContext(
+ session_state=OnlineSessionState(user_id=user_id),
+ turn_counter=0,
+ )
+ return self._sessions[user_id]
+
+ def _build_chat_turn(self, user_id: str, text: str, role: str, turn_id: int) -> ChatTurn:
+ """Build a ChatTurn object."""
+ return ChatTurn(
+ user_id=user_id,
+ session_id=f"eval_session_{user_id}",
+ turn_id=turn_id,
+ role=role,
+ text=text,
+ meta={"source": "eval"}
+ )
+
+ def _count_tokens(self, text: str) -> int:
+ """Estimate token count using the tokenizer."""
+ try:
+ # Use the chat model's tokenizer if available
+ if hasattr(self._chat_model, 'tokenizer'):
+ return len(self._chat_model.tokenizer.encode(text))
+ else:
+ # Rough estimate: ~4 chars per token
+ return len(text) // 4
+ except Exception:
+ return len(text) // 4
+
+ def _add_preferences_as_memory(
+ self,
+ prefs: PreferenceList,
+ query: str,
+ user_id: str,
+ turn_id: int,
+ ) -> List[Dict[str, Any]]:
+ """
+ Add extracted preferences as new memory cards.
+ Returns list of preference dicts for debug info.
+ """
+ extracted = []
+
+ if not prefs.preferences or self._projection is None:
+ return extracted
+
+ # Compute embedding for the query
+ e_q = self._embed_model.encode([query], return_tensor=False)[0]
+ v_q = self._projection.transform_vector(np.array(e_q))
+
+ for pref in prefs.preferences:
+ note_text = f"When {pref.condition}, {pref.action}."
+
+ # Record for debug
+ extracted.append({
+ "condition": pref.condition,
+ "action": pref.action,
+ "confidence": pref.confidence,
+ })
+
+ # Deduplication check
+ is_duplicate = any(
+ card.user_id == user_id and card.note_text == note_text
+ for card in self._memory_cards
+ )
+
+ if is_duplicate:
+ continue
+
+ # Create new memory card
+ card = MemoryCard(
+ card_id=str(uuid.uuid4()),
+ user_id=user_id,
+ source_session_id=f"eval_session_{user_id}",
+ source_turn_ids=[turn_id],
+ raw_queries=[query],
+ preference_list=PreferenceList(preferences=[pref]),
+ note_text=note_text,
+ embedding_e=list(e_q),
+ kind="pref",
+ )
+
+ # Add to memory store
+ self._memory_cards.append(card)
+ self._memory_embeddings = np.vstack([self._memory_embeddings, np.array([e_q])])
+ self._item_vectors = np.vstack([self._item_vectors, np.array([v_q])])
+
+ return extracted
+
+ # =========================================================================
+ # Public Interface
+ # =========================================================================
+
+ def chat(self, user_id: str, query: str) -> AssistantResponse:
+ """
+ Main online chat interface.
+
+ Args:
+ user_id: Unique identifier for the user.
+ query: Current user query/message.
+
+ Returns:
+ AssistantResponse containing the answer, usage stats, and debug info.
+
+ Notes:
+ - Internally manages user state, session history, memory retrieval
+ - After this call, you can call apply_feedback() with the turn's feedback
+ """
+ ctx = self._get_or_create_session(user_id)
+ session = ctx.session_state
+ user_state = self._user_store.get_state(user_id)
+
+ # Record user vector before for debug
+ z_long_before = user_state.z_long.copy().tolist()
+ z_short_before = user_state.z_short.copy().tolist()
+
+ # Compute query embedding
+ e_q_t = np.array(self._embed_model.encode([query], return_tensor=False)[0])
+
+ # Store pending RL update info from last turn (for apply_feedback)
+ if session.last_query is not None and self.enable_rl_updates:
+ ctx.pending_rl_update = {
+ "last_query": session.last_query,
+ "last_answer": session.last_answer,
+ "last_memories": session.last_memories,
+ "last_query_embedding": session.last_query_embedding,
+ "current_query_embedding": e_q_t,
+ "last_candidate_item_vectors": session.last_candidate_item_vectors,
+ "last_policy_probs": session.last_policy_probs,
+ "last_chosen_indices": session.last_chosen_indices,
+ }
+
+ # Add user turn to history
+ user_turn = self._build_chat_turn(user_id, query, "user", ctx.turn_counter)
+ session.history.append(user_turn)
+
+ # Vanilla mode: pure LLM without any memory or preference extraction
+ if self.mode == "vanilla":
+ # Skip preference extraction and memory retrieval entirely
+ extracted_prefs = []
+ candidates = []
+ cand_item_vecs = np.array([])
+ base_scores = np.array([])
+ chosen_indices = []
+ probs = np.array([])
+ memories_t = []
+ memory_notes = []
+ else:
+ # Extract preferences from conversation (if enabled)
+ extracted_prefs = []
+ if self.enable_preference_extraction:
+ prefs = self._extractor.extract_turn(session.history)
+ extracted_prefs = self._add_preferences_as_memory(
+ prefs, query, user_id, ctx.turn_counter
+ )
+
+ # Retrieve memories
+ # In "nopersonal" mode: deterministic retrieval (dense + rerank + topk), no policy/user vector
+ # In "full" mode: policy-based retrieval with user vector influence
+ if self.mode == "nopersonal":
+ candidates, cand_item_vecs, base_scores, chosen_indices, probs = retrieve_no_policy(
+ user_id=user_id,
+ query=query,
+ embed_model=self._embed_model,
+ reranker=self._reranker,
+ memory_cards=self._memory_cards,
+ memory_embeddings=self._memory_embeddings,
+ topk_dense=self._rl_cfg["dense_topk"],
+ topk_rerank=self._rl_cfg["rerank_topk"],
+ only_own_memories=self.only_own_memories,
+ )
+ else:
+ beta_long = self._rl_cfg["beta_long"]
+ beta_short = self._rl_cfg["beta_short"]
+ # eval_mode=True -> sample=False (greedy/deterministic)
+ # eval_mode=False -> sample=True (stochastic/exploration)
+ candidates, cand_item_vecs, base_scores, chosen_indices, probs = retrieve_with_policy(
+ user_id=user_id,
+ query=query,
+ embed_model=self._embed_model,
+ reranker=self._reranker,
+ memory_cards=self._memory_cards,
+ memory_embeddings=self._memory_embeddings,
+ user_store=self._user_store,
+ item_vectors=self._item_vectors,
+ topk_dense=self._rl_cfg["dense_topk"],
+ topk_rerank=self._rl_cfg["rerank_topk"],
+ beta_long=beta_long,
+ beta_short=beta_short,
+ tau=self._rl_cfg["tau"],
+ only_own_memories=self.only_own_memories,
+ sample=not self.eval_mode,
+ )
+
+ # Get selected memories
+            memories_t = [candidates[int(i)] for i in chosen_indices] if len(chosen_indices) > 0 else []
+ memory_notes = [m.note_text for m in memories_t]
+
+ # Build prompt and count tokens
+ prompt_tokens = self._count_tokens(query)
+ for turn in session.history:
+ prompt_tokens += self._count_tokens(turn.text)
+ for note in memory_notes:
+ prompt_tokens += self._count_tokens(note)
+
+ # Generate answer
+ answer_t = self._chat_model.answer(
+ history=session.history,
+ memory_notes=memory_notes,
+ max_new_tokens=self._rl_cfg["max_new_tokens"],
+ )
+
+ completion_tokens = self._count_tokens(answer_t)
+
+ # Add assistant turn to history
+ assist_turn = self._build_chat_turn(user_id, answer_t, "assistant", ctx.turn_counter)
+ session.history.append(assist_turn)
+
+ # Update session state for next turn
+ session.last_query = query
+ session.last_answer = answer_t
+ session.last_memories = memories_t
+ session.last_query_embedding = e_q_t
+ session.last_candidate_item_vectors = cand_item_vecs
+ session.last_policy_probs = probs
+ session.last_chosen_indices = list(chosen_indices) if len(chosen_indices) > 0 else []
+
+ ctx.turn_counter += 1
+
+ # Build debug info
+ debug = DebugInfo(
+ selected_memory_ids=[m.card_id for m in memories_t],
+ selected_memory_notes=[m.note_text for m in memories_t],
+            selected_memory_scores=[
+                float(probs[int(i)]) if int(i) < len(probs) else 0.0
+                for i in chosen_indices
+            ] if len(chosen_indices) > 0 else [],
+ user_vector_before=z_long_before + z_short_before, # Concatenated for simplicity
+ user_vector_after=user_state.z_long.tolist() + user_state.z_short.tolist(),
+ extracted_preferences=extracted_prefs,
+ extra={
+ "num_candidates": len(candidates),
+ "num_total_memories": len(self._memory_cards),
+ "z_long_norm": float(np.linalg.norm(user_state.z_long)),
+ "z_short_norm": float(np.linalg.norm(user_state.z_short)),
+ }
+ )
+
+ # Build usage stats
+ usage = UsageStats(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=prompt_tokens + completion_tokens,
+ model=self._llm_name,
+ )
+
+ return AssistantResponse(
+ answer=answer_t,
+ usage=usage,
+ debug=debug,
+ )
+
+ def reset_session(self, user_id: str) -> None:
+ """
+ Reset session for a user (new chat window).
+
+ This clears:
+ - Session conversation history
+ - Short-term user vector (z_short)
+ - Pending RL update info
+
+ This preserves:
+ - Long-term user vector (z_long)
+ - User's memory cards
+
+ Args:
+ user_id: The user whose session to reset.
+ """
+ # Clear session context
+ if user_id in self._sessions:
+ del self._sessions[user_id]
+
+ # Create fresh session
+ self._sessions[user_id] = _SessionContext(
+ session_state=OnlineSessionState(user_id=user_id),
+ turn_counter=0,
+ )
+
+ # Reset short-term vector but keep long-term
+ user_state = self._user_store.get_state(user_id)
+ user_state.z_short = np.zeros(self._rl_cfg["item_dim"], dtype=np.float32)
+ self._user_store.save_state(user_state)
+
+ def reset_user(self, user_id: str) -> None:
+ """
+ Completely reset a user (new "life").
+
+ This clears:
+ - Long-term user vector (z_long)
+ - Short-term user vector (z_short)
+ - User's memory cards
+ - Session history
+ - All cached state
+
+ Args:
+ user_id: The user to reset.
+ """
+ # Clear session
+ if user_id in self._sessions:
+ del self._sessions[user_id]
+
+ # Reset user state vectors
+ user_state = self._user_store.get_state(user_id)
+ user_state.z_long = self._user_store.global_init_z.copy()
+ user_state.z_short = np.zeros(self._rl_cfg["item_dim"], dtype=np.float32)
+ user_state.reward_ma = 0.0
+ self._user_store.save_state(user_state)
+
+ # Find indices to KEEP (cards NOT belonging to this user)
+ # Must do this BEFORE modifying _memory_cards
+ keep_indices = [
+ i for i, card in enumerate(self._memory_cards)
+ if card.user_id != user_id
+ ]
+
+ # Filter memory cards
+ self._memory_cards = [self._memory_cards[i] for i in keep_indices]
+
+ # Filter embeddings and item vectors to match
+ if len(keep_indices) > 0 and len(self._memory_embeddings) > 0:
+ self._memory_embeddings = self._memory_embeddings[keep_indices]
+ self._item_vectors = self._item_vectors[keep_indices]
+ else:
+            # No cards left or no embeddings: keep empty arrays with a
+            # consistent width (4096 is the fallback embedding dimension).
+            embed_dim = self._memory_embeddings.shape[1] if len(self._memory_embeddings) > 0 else 4096
+ self._memory_embeddings = np.zeros((0, embed_dim), dtype=np.float32)
+ self._item_vectors = np.zeros((0, self._rl_cfg["item_dim"]), dtype=np.float32)
+
+ def apply_feedback(self, feedback: Feedback) -> None:
+ """
+ Apply feedback from user simulator or judge.
+
+ This performs the REINFORCE update to user vectors based on
+ the reward signal from the previous turn.
+
+ Args:
+ feedback: Feedback object containing reward, gating, and metadata.
+
+ Notes:
+ - Should be called AFTER chat() but BEFORE the next chat() call
+ - Uses the stored context from the previous turn
+            - If enable_rl_updates is False, this is a no-op
+            - If mode is "nopersonal" or "vanilla", this is a no-op (baseline comparison)
+ """
+ if not self.enable_rl_updates:
+ return
+
+ # In "nopersonal" or "vanilla" mode, skip RL updates entirely (baseline)
+ if self.mode in ("nopersonal", "vanilla"):
+ return
+
+ user_id = feedback.user_id
+ ctx = self._sessions.get(user_id)
+
+ if ctx is None or ctx.pending_rl_update is None:
+ return
+
+ pending = ctx.pending_rl_update
+ user_state = self._user_store.get_state(user_id)
+
+ # Check if we have the necessary data for RL update
+ if (pending.get("last_candidate_item_vectors") is not None and
+ pending.get("last_policy_probs") is not None and
+ pending.get("last_chosen_indices") is not None and
+ len(pending["last_chosen_indices"]) > 0):
+
+ # Extract chosen vectors
+ chosen_indices = pending["last_chosen_indices"]
+ candidate_vectors = pending["last_candidate_item_vectors"]
+
+ if len(candidate_vectors) > 0:
+ # REINFORCE expects:
+ # - item_vectors: ALL candidate vectors [K, k]
+ # - chosen_indices: indices into those candidates
+ # - policy_probs: probabilities over all K candidates [K]
+ updated = reinforce_update_user_state(
+ user_state=user_state,
+ item_vectors=candidate_vectors, # All candidates, not just chosen
+ chosen_indices=chosen_indices, # Original indices into candidates
+ policy_probs=pending["last_policy_probs"],
+ reward_hat=feedback.reward,
+ gating=feedback.gating,
+ tau=self._rl_cfg["tau"],
+ eta_long=self._rl_cfg["eta_long"],
+ eta_short=self._rl_cfg["eta_short"],
+ ema_alpha=self._rl_cfg["ema_alpha"],
+ short_decay=self._rl_cfg["short_decay"],
+ )
+
+ if updated:
+ self._user_store.save_state(user_state)
+
+ # Clear pending update
+ ctx.pending_rl_update = None
+
+ def get_user_state_summary(self, user_id: str) -> Dict[str, Any]:
+ """
+ Get a summary of the user's current state (for debugging/analysis).
+
+ Args:
+ user_id: The user to query.
+
+ Returns:
+ Dictionary with user state information.
+ """
+ user_state = self._user_store.get_state(user_id)
+ ctx = self._sessions.get(user_id)
+
+ user_memory_count = sum(
+ 1 for card in self._memory_cards if card.user_id == user_id
+ )
+
+ return {
+ "user_id": user_id,
+ "z_long_norm": float(np.linalg.norm(user_state.z_long)),
+ "z_short_norm": float(np.linalg.norm(user_state.z_short)),
+ "reward_ma": user_state.reward_ma,
+ "session_history_length": len(ctx.session_state.history) if ctx else 0,
+ "turn_counter": ctx.turn_counter if ctx else 0,
+ "user_memory_count": user_memory_count,
+ "total_memory_count": len(self._memory_cards),
+ }
+
+ def persist(self) -> None:
+ """
+ Persist all state to disk.
+
+ Call this at the end of an evaluation run to save:
+ - User state vectors
+ - Memory cards
+ """
+ # Save user store
+ self._user_store.persist()
+
+ # Save memory cards
+ with open(self._memory_cards_path, "w", encoding="utf-8") as f:
+ for card in self._memory_cards:
+ f.write(card.model_dump_json() + "\n")
+
+ # Save embeddings
+ np.save(self._memory_embeddings_path, self._memory_embeddings)
+
+ # Save item projection with updated vectors
+ if self._projection is not None:
+ np.savez(
+ self._item_projection_path,
+ P=self._projection.P,
+ mean=self._projection.mean,
+ V=self._item_vectors,
+ )
+
+ print("[PersonalizedLLM] State persisted to disk.")
+
+
+# =============================================================================
+# Convenience Factory
+# =============================================================================
+
+def create_personalized_llm(
+ config_path: Optional[str] = None,
+ **kwargs
+) -> PersonalizedLLM:
+ """
+ Factory function to create a PersonalizedLLM instance.
+
+ Args:
+ config_path: Optional path to configuration file.
+ **kwargs: Additional arguments passed to PersonalizedLLM constructor.
+
+ Returns:
+ Configured PersonalizedLLM instance.
+ """
+ return PersonalizedLLM(config_path=config_path, **kwargs)
+
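+# =============================================================================
+# Usage sketch (illustrative only; not executed anywhere in this module). The
+# config path is a placeholder, and the Feedback constructor is assumed to
+# accept the user_id / reward / gating fields consumed by apply_feedback above.
+#
+#   llm = create_personalized_llm(config_path="configs/personalization.yaml")
+#   resp = llm.chat(user_id="u1", query="Plan a weekend trip to Kyoto")
+#   print(resp.answer, resp.usage.total_tokens)
+#   # Reward and gating would normally come from a judge or user simulator.
+#   llm.apply_feedback(Feedback(user_id="u1", reward=0.8, gating=1.0))
+#   llm.reset_session("u1")   # new chat window: keeps z_long, clears z_short
+#   llm.persist()             # write user vectors, memory cards, embeddings
+# =============================================================================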
diff --git a/src/personalization/types.py b/src/personalization/types.py
new file mode 100644
index 0000000..a25b560
--- /dev/null
+++ b/src/personalization/types.py
@@ -0,0 +1,4 @@
+from personalization.retrieval.preference_store.schemas import ChatTurn
+
+__all__ = ["ChatTurn"]
+
diff --git a/src/personalization/user_model/__init__.py b/src/personalization/user_model/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/user_model/__init__.py
diff --git a/src/personalization/user_model/features.py b/src/personalization/user_model/features.py
new file mode 100644
index 0000000..a4508b4
--- /dev/null
+++ b/src/personalization/user_model/features.py
@@ -0,0 +1,49 @@
+import numpy as np
+from dataclasses import dataclass
+from sklearn.decomposition import PCA
+
+@dataclass
+class ItemProjection:
+ P: np.ndarray # [k, d]
+ mean: np.ndarray # [d]
+
+ @classmethod
+ def from_pca(cls, embeddings: np.ndarray, k: int) -> "ItemProjection":
+ """
+ embeddings: [M, d]
+ """
+ mean = embeddings.mean(axis=0)
+ centered = embeddings - mean
+
+ # Ensure k is not larger than min(n_samples, n_features)
+ n_samples, n_features = embeddings.shape
+ actual_k = min(k, n_samples, n_features)
+
+ pca = PCA(n_components=actual_k)
+ pca.fit(centered)
+
+ # pca.components_: [k, d]
+ P = pca.components_ # Each row is a principal component vector
+
+        # PCA yields at most min(n_samples, n_features) components, so actual_k
+        # may be smaller than the requested k. The rest of the system assumes a
+        # fixed projection dimension of k, so pad any missing components with
+        # zero rows below.
+ if actual_k < k:
+ padding = np.zeros((k - actual_k, n_features), dtype=P.dtype)
+ P = np.vstack([P, padding])
+
+ return cls(P=P, mean=mean)
+
+ def transform_embeddings(self, E: np.ndarray) -> np.ndarray:
+ """
+ E: [N, d] -> [N, k]
+ """
+ return (E - self.mean) @ self.P.T
+
+ def transform_vector(self, e: np.ndarray) -> np.ndarray:
+ """
+ e: [d] -> [k]
+ """
+ return self.P @ (e - self.mean)
+
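+# Illustrative shape check (not part of the pipeline; toy sizes chosen here):
+#
+#   E = np.random.randn(32, 384).astype(np.float32)   # [M, d] memory embeddings
+#   proj = ItemProjection.from_pca(E, k=16)
+#   V = proj.transform_embeddings(E)                   # [32, 16]
+#   v = proj.transform_vector(E[0])                    # [16]
+#   assert V.shape == (32, 16) and v.shape == (16,)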
diff --git a/src/personalization/user_model/policy/__init__.py b/src/personalization/user_model/policy/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/user_model/policy/__init__.py
diff --git a/src/personalization/user_model/policy/optimizer.py b/src/personalization/user_model/policy/optimizer.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/user_model/policy/optimizer.py
diff --git a/src/personalization/user_model/policy/reinforce.py b/src/personalization/user_model/policy/reinforce.py
new file mode 100644
index 0000000..adfaef7
--- /dev/null
+++ b/src/personalization/user_model/policy/reinforce.py
@@ -0,0 +1,104 @@
+from typing import Sequence, List
+from dataclasses import dataclass
+import numpy as np
+
+from personalization.user_model.tensor_store import UserState
+
+@dataclass
+class PolicyScores:
+ scores: np.ndarray # [K] s(q_t, m; u)
+ probs: np.ndarray # [K] π_z(m|q_t)
+
+def compute_policy_scores(
+ base_scores: np.ndarray, # [K], from reranker
+ user_state: UserState,
+ item_vectors: np.ndarray, # [K, k], v_m for the K candidates
+ beta_long: float,
+ beta_short: float,
+ tau: float,
+) -> PolicyScores:
+ """
+ Compute personalized scores and softmax probabilities.
+ s(q_t, m; u) = s_0(q_t,m) + z_t^{(eff)}.T @ v_m
+ z_t^{(eff)} = beta_long * z_long + beta_short * z_short
+ """
+ if len(item_vectors) == 0:
+ return PolicyScores(scores=np.array([]), probs=np.array([]))
+
+ z_eff = beta_long * user_state.z_long + beta_short * user_state.z_short
+
+ # Calculate personalized term
+ # item_vectors: [K, k]
+ # z_eff: [k]
+ # term: [K]
+ personalization_term = np.dot(item_vectors, z_eff)
+
+ # Total scores
+ scores = base_scores + personalization_term
+
+ # Softmax
+ # Use exp(score/tau)
+ # Subtract max for stability
+ scaled_scores = scores / tau
+ exp_scores = np.exp(scaled_scores - np.max(scaled_scores))
+ probs = exp_scores / np.sum(exp_scores)
+
+ return PolicyScores(scores=scores, probs=probs)
+
+def reinforce_update_user_state(
+ user_state: UserState,
+ item_vectors: np.ndarray, # [K, k] for candidates
+ chosen_indices: Sequence[int], # indices of A_t in 0..K-1
+ policy_probs: np.ndarray, # [K] π_z(m|q_t)
+ reward_hat: float, # \hat r_t
+ gating: float, # g_t
+ tau: float,
+ eta_long: float,
+ eta_short: float,
+ ema_alpha: float,
+ short_decay: float,
+) -> bool:
+ """
+ In-place update user_state.z_long / z_short / reward_ma via REINFORCE.
+ Returns True if update occurred, False otherwise.
+ """
+ if len(chosen_indices) == 0:
+ return False
+
+ # 1. Baseline Advantage
+ advantage = gating * (reward_hat - user_state.reward_ma)
+
+ # Optimization: skip if advantage is negligible
+ if abs(advantage) < 1e-6:
+ return False
+
+ # 2. Chosen Vector Average (v_{chosen,t})
+ chosen_mask = np.zeros(len(item_vectors), dtype=np.float32)
+ for idx in chosen_indices:
+ idx_int = int(idx)
+ if 0 <= idx_int < len(item_vectors):
+ chosen_mask[idx_int] = 1.0
+
+ if chosen_mask.sum() == 0:
+ return False
+
+ chosen_mask /= chosen_mask.sum() # Normalize to average
+ v_chosen = np.dot(chosen_mask, item_vectors) # [k]
+
+ # 3. Expected Vector (\mu_t(z))
+ # policy_probs: [K]
+ # item_vectors: [K, k]
+ v_expect = np.dot(policy_probs, item_vectors) # [k]
+
+ # 4. Gradient Direction
+ grad = (advantage / tau) * (v_chosen - v_expect)
+
+ # 5. Update Vectors
+ user_state.z_long += eta_long * grad
+ user_state.z_short = (1.0 - short_decay) * user_state.z_short + eta_short * grad
+
+ # 6. Update Reward Baseline (EMA)
+ user_state.reward_ma = (1.0 - ema_alpha) * user_state.reward_ma + ema_alpha * reward_hat
+
+ return True
+
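+# Minimal sketch of one policy step with toy numbers (illustrative only; the
+# hyperparameters below are placeholders, not the values from the RL config):
+#
+#   k, K = 8, 4                                       # item dim, #candidates
+#   state = UserState("u1", np.zeros(k, np.float32), np.zeros(k, np.float32), 0.0)
+#   V = np.random.randn(K, k).astype(np.float32)      # candidate item vectors
+#   base = np.random.randn(K).astype(np.float32)      # reranker scores
+#   out = compute_policy_scores(base, state, V, beta_long=1.0, beta_short=0.5, tau=0.7)
+#   # Suppose candidates 0 and 2 were injected and the judge returned reward 0.9:
+#   reinforce_update_user_state(state, V, chosen_indices=[0, 2],
+#                               policy_probs=out.probs, reward_hat=0.9, gating=1.0,
+#                               tau=0.7, eta_long=0.05, eta_short=0.2,
+#                               ema_alpha=0.1, short_decay=0.1)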
diff --git a/src/personalization/user_model/scoring.py b/src/personalization/user_model/scoring.py
new file mode 100644
index 0000000..75ffc84
--- /dev/null
+++ b/src/personalization/user_model/scoring.py
@@ -0,0 +1,25 @@
+import numpy as np
+from .tensor_store import UserState
+
+def score_with_user(
+ base_score: float,
+ user_state: UserState,
+ v_m: np.ndarray, # [k]
+ beta_long: float,
+ beta_short: float,
+) -> float:
+ """
+ Personalized scoring:
+ s = base_score + (beta_long * z_long + beta_short * z_short) . v_m
+ Day2: beta_long = beta_short = 0 -> s == base_score
+ """
+ z_eff = beta_long * user_state.z_long + beta_short * user_state.z_short
+    # Guard against a dimension mismatch between v_m and z_eff and fall back
+    # to the unpersonalized base score in that case.
+    if v_m.shape != z_eff.shape:
+        return float(base_score)
+
+ term = np.dot(z_eff, v_m)
+ return float(base_score + term)
+
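+# Quick illustrative check (not a test): with beta_long = beta_short = 0 the
+# personalized score reduces to the base score, matching the Day2 note above.
+#
+#   st = UserState("u1", np.ones(4, np.float32), np.ones(4, np.float32), 0.0)
+#   score_with_user(0.3, st, np.ones(4, np.float32), 0.0, 0.0)   # -> 0.3
+#   score_with_user(0.3, st, np.ones(4, np.float32), 1.0, 0.0)   # -> 4.3 (dot product adds 4.0)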
diff --git a/src/personalization/user_model/session_state.py b/src/personalization/user_model/session_state.py
new file mode 100644
index 0000000..5cd2243
--- /dev/null
+++ b/src/personalization/user_model/session_state.py
@@ -0,0 +1,19 @@
+from dataclasses import dataclass, field
+from typing import List, Optional
+import numpy as np
+
+from personalization.retrieval.preference_store.schemas import ChatTurn, MemoryCard
+
+@dataclass
+class OnlineSessionState:
+ user_id: str
+ history: List[ChatTurn] = field(default_factory=list)
+ last_query: Optional[str] = None
+ last_answer: Optional[str] = None
+ last_memories: List[MemoryCard] = field(default_factory=list)
+ last_query_embedding: Optional[np.ndarray] = None
+ last_candidate_item_vectors: Optional[np.ndarray] = None # [K, k]
+ last_policy_probs: Optional[np.ndarray] = None # [K]
+ last_chosen_indices: List[int] = field(default_factory=list)
+
+
diff --git a/src/personalization/user_model/tensor_store.py b/src/personalization/user_model/tensor_store.py
new file mode 100644
index 0000000..42dbf4e
--- /dev/null
+++ b/src/personalization/user_model/tensor_store.py
@@ -0,0 +1,80 @@
+import numpy as np
+from dataclasses import dataclass
+from typing import Dict, Optional
+import os
+
+@dataclass
+class UserState:
+ user_id: str
+ z_long: np.ndarray # [k]
+ z_short: np.ndarray # [k]
+ reward_ma: float # baseline for reward, init 0.0
+
+class UserTensorStore:
+ def __init__(self, k: int, path: str):
+ self.k = k
+ self.path = path
+ self._states: Dict[str, UserState] = {}
+ self._load()
+
+ # Calculate global mean for initialization
+ if self._states:
+ z_all = np.stack([st.z_long for st in self._states.values()])
+ self.global_init_z = np.mean(z_all, axis=0)
+ else:
+ self.global_init_z = np.zeros(self.k, dtype=np.float32)
+
+ def _load(self):
+ if os.path.exists(self.path):
+ try:
+ data = np.load(self.path, allow_pickle=True)
+                # npz schema (mirrors _save): for each user id `uid` the archive
+                # holds "{uid}_long", "{uid}_short", and "{uid}_meta", where
+                # meta is a single-element array containing reward_ma.
+ for key in data.files:
+ if key.endswith("_long"):
+ uid = key[:-5]
+ z_long = data[key]
+                        z_short = data.get(f"{uid}_short", np.zeros(self.k, dtype=np.float32))
+ meta = data.get(f"{uid}_meta", np.array([0.0]))
+ self._states[uid] = UserState(uid, z_long, z_short, float(meta[0]))
+ except Exception as e:
+ print(f"Warning: Failed to load UserStore from {self.path}: {e}")
+
+ def _save(self):
+ # Save to npz
+ save_dict = {}
+ for uid, state in self._states.items():
+ save_dict[f"{uid}_long"] = state.z_long
+ save_dict[f"{uid}_short"] = state.z_short
+ save_dict[f"{uid}_meta"] = np.array([state.reward_ma])
+ np.savez(self.path, **save_dict)
+
+ def get_state(self, user_id: str) -> UserState:
+ if user_id not in self._states:
+ # Lazy init with global mean for new users
+ state = UserState(
+ user_id=user_id,
+ z_long=self.global_init_z.copy(),
+ z_short=np.zeros(self.k, dtype=np.float32),
+ reward_ma=0.0,
+ )
+ self._states[user_id] = state
+ return self._states[user_id]
+
+ def save_state(self, state: UserState) -> None:
+ self._states[state.user_id] = state
+
+ def persist(self):
+ """Public method to force save to disk."""
+ self._save()
+
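+# Illustrative persist/reload round trip (the path below is a placeholder and
+# its directory is assumed to exist):
+#
+#   store = UserTensorStore(k=16, path="artifacts/user_store.npz")
+#   st = store.get_state("u1")          # lazily initialized from the global mean
+#   st.z_long += 0.1
+#   store.save_state(st)
+#   store.persist()                     # writes "{uid}_long" / "_short" / "_meta"
+#   reloaded = UserTensorStore(k=16, path="artifacts/user_store.npz")
+#   assert reloaded.get_state("u1").z_long.shape == (16,)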
diff --git a/src/personalization/utils/__init__.py b/src/personalization/utils/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/utils/__init__.py
diff --git a/src/personalization/utils/ids.py b/src/personalization/utils/ids.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/utils/ids.py
diff --git a/src/personalization/utils/io.py b/src/personalization/utils/io.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/utils/io.py
diff --git a/src/personalization/utils/logging.py b/src/personalization/utils/logging.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/utils/logging.py
diff --git a/src/personalization/utils/timing.py b/src/personalization/utils/timing.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/personalization/utils/timing.py