#!/usr/bin/env python3
"""
Example: Using the PersonalizedLLM Interface for Evaluation.

This script demonstrates the evaluation interface that can be used by a
user simulator or evaluation framework.

Call sequence per evaluation run:
    1. reset_user(user_id) - Start fresh for this user's "life"
    2. For each session (s=1..S):
        a. reset_session(user_id) - New chat window
        b. For each turn (t=1..T):
            i.   [Turn 2+] apply_feedback() for the previous turn
            ii.  resp = chat(user_id, query)
            iii. [Simulator computes reward from the response]
    3. persist() - Save state at the end
"""

import os
import sys

# Add src to sys.path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../src"))

from personalization.serving import (
    PersonalizedLLM,
    AssistantResponse,
    Feedback,
)
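
# ---------------------------------------------------------------------------
# Hypothetical reward scorer (NOT part of the personalization.serving API).
# Step iii of the call sequence leaves reward computation to the user
# simulator, and main() below simply hard-codes values. This sketch shows one
# way a simulator *might* score a response: keyword matching against the
# user's stated preferences. The name, signature, and scoring scheme are
# illustrative assumptions only.
def example_keyword_reward(response: AssistantResponse,
                           preferred_terms: list[str]) -> float:
    """Toy reward in [0, 1]: fraction of preferred terms the answer mentions."""
    if not preferred_terms:
        return 0.5  # neutral reward while nothing is known about the user
    answer = response.answer.lower()
    hits = sum(1 for term in preferred_terms if term.lower() in answer)
    return hits / len(preferred_terms)
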
def main():
    print("=" * 60)
    print("PersonalizedLLM Evaluation Interface Demo")
    print("=" * 60)

    # Initialize the system.
    # Note: this loads models, which takes time and GPU memory.
    print("\n[1] Initializing PersonalizedLLM...")
    llm = PersonalizedLLM(
        user_store_path="data/users/user_store_eval_demo.npz",
        only_own_memories=True,
        enable_preference_extraction=True,
        enable_rl_updates=True,
    )

    # Define the test user
    user_id = "eval_demo_user"

    # Reset the user for a clean experiment
    print(f"\n[2] Resetting user: {user_id}")
    llm.reset_user(user_id)

    # Check the initial state
    print("\n[3] Initial user state:")
    print(f"    {llm.get_user_state_summary(user_id)}")

    # Simulate multiple sessions
    queries_per_session = [
        # Session 1: food preferences
        [
            "What's a good recipe for dinner tonight?",
            "I prefer vegetarian food with Asian flavors.",
            "Can you suggest something spicy?",
        ],
        # Session 2: test personalization retention
        [
            "What should I cook for lunch?",
            "Give me a quick meal idea.",
        ],
    ]

    all_responses = []

    for session_idx, session_queries in enumerate(queries_per_session):
        print(f"\n{'=' * 60}")
        print(f"SESSION {session_idx + 1}")
        print("=" * 60)

        # Reset the session (new chat window)
        llm.reset_session(user_id)
        print(f"[Session {session_idx + 1}] Started new session")

        session_responses = []

        for turn_idx, query in enumerate(session_queries):
            print(f"\n--- Turn {turn_idx + 1} ---")

            # Apply feedback for the previous turn (from turn 2 onwards)
            if turn_idx > 0:
                # Simulated feedback - in a real evaluation this comes from
                # the user simulator (see example_keyword_reward above for a
                # sketch of what such a scorer might look like).
                simulated_reward = 0.7 + 0.1 * (turn_idx % 2)  # varies by turn
                simulated_gating = 1.0  # feedback is always gated on for turns 2+
                feedback = Feedback(
                    user_id=user_id,
                    turn_id=turn_idx - 1,
                    reward=simulated_reward,
                    gating=simulated_gating,
                    meta={"source": "demo_simulator"},
                )
                print(
                    f"[Feedback] Applying: reward={simulated_reward:.2f}, "
                    f"gating={simulated_gating:.1f}"
                )
                llm.apply_feedback(feedback)

            # Main chat call
            print(f"User: {query}")
            response: AssistantResponse = llm.chat(user_id, query)

            # Truncate long answers for display
            if len(response.answer) > 200:
                print(f"Assistant: {response.answer[:200]}...")
            else:
                print(f"Assistant: {response.answer}")
            print(
                f"[Usage] prompt={response.usage.prompt_tokens}, "
                f"completion={response.usage.completion_tokens}, "
                f"model={response.usage.model}"
            )
            if response.debug:
                print(
                    f"[Debug] memories={len(response.debug.selected_memory_ids)}, "
                    f"z_long_norm={response.debug.extra.get('z_long_norm', 0):.4f}"
                )
                if response.debug.extracted_preferences:
                    print(
                        f"[Debug] Extracted "
                        f"{len(response.debug.extracted_preferences)} preferences"
                    )

            session_responses.append(response)

        all_responses.append(session_responses)

        # Show the user state after the session
        print(f"\n[Session {session_idx + 1}] Final state:")
        print(f"    {llm.get_user_state_summary(user_id)}")

    # Summary
    print(f"\n{'=' * 60}")
    print("EVALUATION SUMMARY")
    print("=" * 60)

    total_tokens = sum(
        r.usage.total_tokens for session in all_responses for r in session
    )
    total_turns = sum(len(s) for s in all_responses)

    print(f"Total sessions: {len(all_responses)}")
    print(f"Total turns: {total_turns}")
    print(f"Total tokens: {total_tokens}")
    print(f"Final user state: {llm.get_user_state_summary(user_id)}")

    # Persist (optional, for saving state between runs)
    # llm.persist()
    # print("\nState persisted to disk.")

    print("\nDemo complete!")


if __name__ == "__main__":
    main()
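
# ---------------------------------------------------------------------------
# Sketch: resuming a persisted run (an assumption, not verified behavior).
# If persist() is called at the end of a run (see the commented-out call in
# main()), a follow-up run could plausibly reload the saved state by pointing
# a fresh PersonalizedLLM at the same user_store_path and skipping
# reset_user(), e.g.:
#
#     llm = PersonalizedLLM(user_store_path="data/users/user_store_eval_demo.npz")
#     print(llm.get_user_state_summary("eval_demo_user"))  # state from the prior run
#     response = llm.chat("eval_demo_user", "What do I usually like to eat?")
#
# Whether the constructor reloads state from user_store_path automatically is
# an assumption about the serving module; verify before relying on it.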