From e43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 Mon Sep 17 00:00:00 2001
From: YurenHao0426
Date: Wed, 17 Dec 2025 04:29:37 -0600
Subject: Initial commit (clean history)

---
 scripts/eval_interface_example.py | 154 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 scripts/eval_interface_example.py

diff --git a/scripts/eval_interface_example.py b/scripts/eval_interface_example.py
new file mode 100644
index 0000000..d5dc6cd
--- /dev/null
+++ b/scripts/eval_interface_example.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""
+Example: Using the PersonalizedLLM Interface for Evaluation.
+
+This script demonstrates the evaluation interface that can be used
+by a user simulator or evaluation framework.
+
+Call sequence per evaluation run:
+1. reset_user(user_id) - Start fresh for this user's "life"
+2. For each session (s=1..S):
+   a. reset_session(user_id) - New chat window
+   b. For each turn (t=1..T):
+      i. [Turn 2+] apply_feedback() for previous turn
+      ii. resp = chat(user_id, query)
+      iii. [Simulator computes reward from response]
+3. persist() - Save state at end
+"""
+
+import sys
+import os
+
+# Add src to sys.path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../src"))
+
+from personalization.serving import (
+    PersonalizedLLM,
+    AssistantResponse,
+    Feedback,
+)
+
+
+def main():
+    print("=" * 60)
+    print("PersonalizedLLM Evaluation Interface Demo")
+    print("=" * 60)
+
+    # Initialize the system
+    # Note: This will load models, which takes time and GPU memory
+    print("\n[1] Initializing PersonalizedLLM...")
+
+    llm = PersonalizedLLM(
+        user_store_path="data/users/user_store_eval_demo.npz",
+        only_own_memories=True,
+        enable_preference_extraction=True,
+        enable_rl_updates=True,
+    )
+
+    # Define test user
+    user_id = "eval_demo_user"
+
+    # Reset user for clean experiment
+    print(f"\n[2] Resetting user: {user_id}")
+    llm.reset_user(user_id)
+
+    # Check initial state
+    print("\n[3] Initial user state:")
+    print(f"    {llm.get_user_state_summary(user_id)}")
+
+    # Simulate multiple sessions
+    num_sessions = 2
+    queries_per_session = [
+        # Session 1: Food preferences
+        [
+            "What's a good recipe for dinner tonight?",
+            "I prefer vegetarian food with Asian flavors.",
+            "Can you suggest something spicy?",
+        ],
+        # Session 2: Test personalization retention
+        [
+            "What should I cook for lunch?",
+            "Give me a quick meal idea.",
+        ],
+    ]
+
+    all_responses = []
+
+    for session_idx, session_queries in enumerate(queries_per_session):
+        print(f"\n{'=' * 60}")
+        print(f"SESSION {session_idx + 1}")
+        print("=" * 60)
+
+        # Reset session (new chat window)
+        llm.reset_session(user_id)
+        print(f"[Session {session_idx + 1}] Started new session")
+
+        session_responses = []
+
+        for turn_idx, query in enumerate(session_queries):
+            print(f"\n--- Turn {turn_idx + 1} ---")
+
+            # Apply feedback for previous turn (from turn 2 onwards)
+            if turn_idx > 0:
+                # Simulated feedback - in real eval, this comes from user simulator
+                simulated_reward = 0.7 + 0.1 * (turn_idx % 2)  # Varies by turn
+                simulated_gating = 1.0  # Always 1.0 here: this branch only runs from turn 2 onwards
+
+                feedback = Feedback(
+                    user_id=user_id,
+                    turn_id=turn_idx - 1,
+                    reward=simulated_reward,
+                    gating=simulated_gating,
+                    meta={"source": "demo_simulator"}
+                )
+
+                print(f"[Feedback] Applying: reward={simulated_reward:.2f}, gating={simulated_gating:.1f}")
+                llm.apply_feedback(feedback)
+
+            # Main chat call
+            print(f"User: {query}")
+            response: AssistantResponse = llm.chat(user_id, query)
+
+            print(f"Assistant: {response.answer[:200]}..." if len(response.answer) > 200 else f"Assistant: {response.answer}")
+            print(f"[Usage] prompt={response.usage.prompt_tokens}, completion={response.usage.completion_tokens}, model={response.usage.model}")
+
+            if response.debug:
+                print(f"[Debug] memories={len(response.debug.selected_memory_ids)}, z_long_norm={response.debug.extra.get('z_long_norm', 0):.4f}")
+                if response.debug.extracted_preferences:
+                    print(f"[Debug] Extracted {len(response.debug.extracted_preferences)} preferences")
+
+            session_responses.append(response)
+
+        all_responses.append(session_responses)
+
+        # Show user state after session
+        print(f"\n[Session {session_idx + 1}] Final state:")
+        print(f"    {llm.get_user_state_summary(user_id)}")
+
+    # Summary
+    print(f"\n{'=' * 60}")
+    print("EVALUATION SUMMARY")
+    print("=" * 60)
+
+    total_tokens = sum(
+        r.usage.total_tokens
+        for session in all_responses
+        for r in session
+    )
+    total_turns = sum(len(s) for s in all_responses)
+
+    print(f"Total sessions: {len(all_responses)}")
+    print(f"Total turns: {total_turns}")
+    print(f"Total tokens: {total_tokens}")
+    print(f"Final user state: {llm.get_user_state_summary(user_id)}")
+
+    # Persist (optional, for saving state between runs)
+    # llm.persist()
+    # print("\nState persisted to disk.")
+
+    print("\nDemo complete!")
+
+
+if __name__ == "__main__":
+    main()
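
For reference, below is a minimal sketch of the same call sequence written as a reusable driver rather than a demo, which is how the module docstring suggests a user simulator would invoke the interface. Only reset_user, reset_session, chat, apply_feedback, persist, and the Feedback fields come from the patch above; the simulator object and its score(response) hook are hypothetical stand-ins for whatever the evaluation framework provides.

from personalization.serving import PersonalizedLLM, Feedback


def run_eval(llm: PersonalizedLLM, simulator, user_id: str, sessions):
    """Drive one evaluation run following the docstring's call sequence."""
    llm.reset_user(user_id)  # 1. start fresh for this user's "life"
    for session_queries in sessions:  # 2. for each session s=1..S
        llm.reset_session(user_id)  # 2a. new chat window
        prev_reward = None
        for turn_idx, query in enumerate(session_queries):  # 2b. turns t=1..T
            if prev_reward is not None:  # 2b-i. feedback for the previous turn
                llm.apply_feedback(Feedback(
                    user_id=user_id,
                    turn_id=turn_idx - 1,
                    reward=prev_reward,
                    gating=1.0,
                    meta={"source": "simulator"},
                ))
            response = llm.chat(user_id, query)  # 2b-ii. main chat call
            prev_reward = simulator.score(response)  # 2b-iii. hypothetical reward hook
    llm.persist()  # 3. save state at the end of the run

As in the demo script, feedback for a turn is applied at the start of the following turn, so the last turn of each session never has its reward applied; a framework that wants to credit the final turn would need an extra apply_feedback call after the inner loop.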