From e43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 Mon Sep 17 00:00:00 2001
From: YurenHao0426
Date: Wed, 17 Dec 2025 04:29:37 -0600
Subject: Initial commit (clean history)

---
 scripts/eval_interface_example.py | 154 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 scripts/eval_interface_example.py

diff --git a/scripts/eval_interface_example.py b/scripts/eval_interface_example.py
new file mode 100644
index 0000000..d5dc6cd
--- /dev/null
+++ b/scripts/eval_interface_example.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""
+Example: Using the PersonalizedLLM Interface for Evaluation.
+
+This script demonstrates the evaluation interface that can be used
+by a user simulator or evaluation framework.
+
+Call sequence per evaluation run:
+1. reset_user(user_id) - Start fresh for this user's "life"
+2. For each session (s=1..S):
+   a. reset_session(user_id) - New chat window
+   b. For each turn (t=1..T):
+      i. [Turn 2+] apply_feedback() for previous turn
+      ii. resp = chat(user_id, query)
+      iii. [Simulator computes reward from response]
+3. persist() - Save state at end
+"""
+
+import sys
+import os
+
+# Add src to sys.path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../src"))
+
+from personalization.serving import (
+    PersonalizedLLM,
+    AssistantResponse,
+    Feedback,
+)
+
+
+def main():
+    print("=" * 60)
+    print("PersonalizedLLM Evaluation Interface Demo")
+    print("=" * 60)
+
+    # Initialize the system
+    # Note: This will load models, which takes time and GPU memory
+    print("\n[1] Initializing PersonalizedLLM...")
+
+    llm = PersonalizedLLM(
+        user_store_path="data/users/user_store_eval_demo.npz",
+        only_own_memories=True,
+        enable_preference_extraction=True,
+        enable_rl_updates=True,
+    )
+
+    # Define test user
+    user_id = "eval_demo_user"
+
+    # Reset user for clean experiment
+    print(f"\n[2] Resetting user: {user_id}")
+    llm.reset_user(user_id)
+
+    # Check initial state
+    print("\n[3] Initial user state:")
+    print(f"    {llm.get_user_state_summary(user_id)}")
+
+    # Simulate multiple sessions
+    num_sessions = 2
+    queries_per_session = [
+        # Session 1: Food preferences
+        [
+            "What's a good recipe for dinner tonight?",
+            "I prefer vegetarian food with Asian flavors.",
+            "Can you suggest something spicy?",
+        ],
+        # Session 2: Test personalization retention
+        [
+            "What should I cook for lunch?",
+            "Give me a quick meal idea.",
+        ],
+    ]
+
+    all_responses = []
+
+    for session_idx, session_queries in enumerate(queries_per_session):
+        print(f"\n{'=' * 60}")
+        print(f"SESSION {session_idx + 1}")
+        print("=" * 60)
+
+        # Reset session (new chat window)
+        llm.reset_session(user_id)
+        print(f"[Session {session_idx + 1}] Started new session")
+
+        session_responses = []
+
+        for turn_idx, query in enumerate(session_queries):
+            print(f"\n--- Turn {turn_idx + 1} ---")
+
+            # Apply feedback for previous turn (from turn 2 onwards)
+            if turn_idx > 0:
+                # Simulated feedback - in real eval, this comes from user simulator
+                simulated_reward = 0.7 + 0.1 * (turn_idx % 2)  # Varies by turn
+                simulated_gating = 1.0  # Always 1.0 here: this branch only runs from turn 2 onwards
+
+                feedback = Feedback(
+                    user_id=user_id,
+                    turn_id=turn_idx - 1,
+                    reward=simulated_reward,
+                    gating=simulated_gating,
+                    meta={"source": "demo_simulator"}
+                )
+
+                print(f"[Feedback] Applying: reward={simulated_reward:.2f}, gating={simulated_gating:.1f}")
+                llm.apply_feedback(feedback)
+
+            # Main chat call
+            print(f"User: {query}")
+            response: AssistantResponse = llm.chat(user_id, query)
+
+            print(f"Assistant: {response.answer[:200]}..." if len(response.answer) > 200 else f"Assistant: {response.answer}")
+            print(f"[Usage] prompt={response.usage.prompt_tokens}, completion={response.usage.completion_tokens}, model={response.usage.model}")
+
+            if response.debug:
+                print(f"[Debug] memories={len(response.debug.selected_memory_ids)}, z_long_norm={response.debug.extra.get('z_long_norm', 0):.4f}")
+                if response.debug.extracted_preferences:
+                    print(f"[Debug] Extracted {len(response.debug.extracted_preferences)} preferences")
+
+            session_responses.append(response)
+
+        all_responses.append(session_responses)
+
+        # Show user state after session
+        print(f"\n[Session {session_idx + 1}] Final state:")
+        print(f"    {llm.get_user_state_summary(user_id)}")
+
+    # Summary
+    print(f"\n{'=' * 60}")
+    print("EVALUATION SUMMARY")
+    print("=" * 60)
+
+    total_tokens = sum(
+        r.usage.total_tokens
+        for session in all_responses
+        for r in session
+    )
+    total_turns = sum(len(s) for s in all_responses)
+
+    print(f"Total sessions: {len(all_responses)}")
+    print(f"Total turns: {total_turns}")
+    print(f"Total tokens: {total_tokens}")
+    print(f"Final user state: {llm.get_user_state_summary(user_id)}")
+
+    # Persist (optional, for saving state between runs)
+    # llm.persist()
+    # print("\nState persisted to disk.")
+
+    print("\nDemo complete!")
+
+
+if __name__ == "__main__":
+    main()
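
For reference, below is a minimal sketch of the same call sequence written as a reusable driver rather than a demo, which is how the module docstring suggests a user simulator would invoke the interface. Only reset_user, reset_session, chat, apply_feedback, persist, and the Feedback fields come from the patch above; the simulator object and its score(response) hook are hypothetical stand-ins for whatever the evaluation framework provides.

from personalization.serving import PersonalizedLLM, Feedback


def run_eval(llm: PersonalizedLLM, simulator, user_id: str, sessions):
    """Drive one evaluation run following the docstring's call sequence."""
    llm.reset_user(user_id)  # 1. start fresh for this user's "life"
    for session_queries in sessions:  # 2. for each session s=1..S
        llm.reset_session(user_id)  # 2a. new chat window
        prev_reward = None
        for turn_idx, query in enumerate(session_queries):  # 2b. turns t=1..T
            if prev_reward is not None:  # 2b-i. feedback for the previous turn
                llm.apply_feedback(Feedback(
                    user_id=user_id,
                    turn_id=turn_idx - 1,
                    reward=prev_reward,
                    gating=1.0,
                    meta={"source": "simulator"},
                ))
            response = llm.chat(user_id, query)  # 2b-ii. main chat call
            prev_reward = simulator.score(response)  # 2b-iii. hypothetical reward hook
    llm.persist()  # 3. save state at the end of the run

As in the demo script, feedback for a turn is applied at the start of the following turn, so the last turn of each session never has its reward applied; a framework that wants to credit the final turn would need an extra apply_feedback call after the inner loop.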