path: root/scripts/eval_interface_example.py
author    YurenHao0426 <blackhao0426@gmail.com>    2025-12-17 04:29:37 -0600
committer YurenHao0426 <blackhao0426@gmail.com>    2025-12-17 04:29:37 -0600
commit    e43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 (patch)
tree      6ce8a00d2f8b9ebd83c894a27ea01ac50cfb2ff5 /scripts/eval_interface_example.py
Initial commit (clean history) (HEAD, main)
Diffstat (limited to 'scripts/eval_interface_example.py')
-rw-r--r--  scripts/eval_interface_example.py  154
1 file changed, 154 insertions(+), 0 deletions(-)
diff --git a/scripts/eval_interface_example.py b/scripts/eval_interface_example.py
new file mode 100644
index 0000000..d5dc6cd
--- /dev/null
+++ b/scripts/eval_interface_example.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""
+Example: Using the PersonalizedLLM Interface for Evaluation.
+
+This script demonstrates the evaluation interface that can be used
+by a user simulator or evaluation framework.
+
+Call sequence per evaluation run:
+1. reset_user(user_id) - Start fresh for this user's "life"
+2. For each session (s=1..S):
+ a. reset_session(user_id) - New chat window
+ b. For each turn (t=1..T):
+ i. [Turn 2+] apply_feedback() for previous turn
+ ii. resp = chat(user_id, query)
+ iii. [Simulator computes reward from response]
+3. persist() - Save state at end
+"""
+
+import sys
+import os
+
+# Add src to sys.path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../src"))
+
+from personalization.serving import (
+ PersonalizedLLM,
+ AssistantResponse,
+ Feedback,
+)
+
+
+def main():
+ print("=" * 60)
+ print("PersonalizedLLM Evaluation Interface Demo")
+ print("=" * 60)
+
+ # Initialize the system
+ # Note: This will load models, which takes time and GPU memory
+ print("\n[1] Initializing PersonalizedLLM...")
+
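+    # A rough reading of the constructor flags (assumed from their names, not
+    # verified against the serving module): only_own_memories restricts retrieval
+    # to this user's own memories, enable_preference_extraction mines preferences
+    # from turns, and enable_rl_updates lets apply_feedback() adjust state online.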
+ llm = PersonalizedLLM(
+ user_store_path="data/users/user_store_eval_demo.npz",
+ only_own_memories=True,
+ enable_preference_extraction=True,
+ enable_rl_updates=True,
+ )
+
+ # Define test user
+ user_id = "eval_demo_user"
+
+ # Reset user for clean experiment
+ print(f"\n[2] Resetting user: {user_id}")
+ llm.reset_user(user_id)
+
+ # Check initial state
+ print(f"\n[3] Initial user state:")
+ print(f" {llm.get_user_state_summary(user_id)}")
+
+ # Simulate multiple sessions
+ num_sessions = 2
+ queries_per_session = [
+ # Session 1: Food preferences
+ [
+ "What's a good recipe for dinner tonight?",
+ "I prefer vegetarian food with Asian flavors.",
+ "Can you suggest something spicy?",
+ ],
+ # Session 2: Test personalization retention
+ [
+ "What should I cook for lunch?",
+ "Give me a quick meal idea.",
+ ],
+ ]
+
+ all_responses = []
+
+ for session_idx, session_queries in enumerate(queries_per_session):
+ print(f"\n{'=' * 60}")
+ print(f"SESSION {session_idx + 1}")
+ print("=" * 60)
+
+ # Reset session (new chat window)
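+        # Per the call sequence in the docstring, this clears only the chat
+        # context; long-term user state (memories/preferences) carries over,
+        # which is what session 2 is meant to test.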
+ llm.reset_session(user_id)
+ print(f"[Session {session_idx + 1}] Started new session")
+
+ session_responses = []
+
+ for turn_idx, query in enumerate(session_queries):
+ print(f"\n--- Turn {turn_idx + 1} ---")
+
+ # Apply feedback for previous turn (from turn 2 onwards)
+ if turn_idx > 0:
+ # Simulated feedback - in real eval, this comes from user simulator
+ simulated_reward = 0.7 + 0.1 * (turn_idx % 2) # Varies by turn
+                simulated_gating = 1.0  # Always on here; turn_idx > 0 is guaranteed in this branch
+
+ feedback = Feedback(
+ user_id=user_id,
+ turn_id=turn_idx - 1,
+ reward=simulated_reward,
+ gating=simulated_gating,
+ meta={"source": "demo_simulator"}
+ )
+
+ print(f"[Feedback] Applying: reward={simulated_reward:.2f}, gating={simulated_gating:.1f}")
+ llm.apply_feedback(feedback)
+
+ # Main chat call
+ print(f"User: {query}")
+ response: AssistantResponse = llm.chat(user_id, query)
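+            # Fields used below: .answer (text), .usage (prompt/completion/total
+            # tokens plus model name), and optional .debug (selected memory ids,
+            # extracted preferences, extra diagnostics).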
+
+ print(f"Assistant: {response.answer[:200]}..." if len(response.answer) > 200 else f"Assistant: {response.answer}")
+ print(f"[Usage] prompt={response.usage.prompt_tokens}, completion={response.usage.completion_tokens}, model={response.usage.model}")
+
+ if response.debug:
+ print(f"[Debug] memories={len(response.debug.selected_memory_ids)}, z_long_norm={response.debug.extra.get('z_long_norm', 0):.4f}")
+ if response.debug.extracted_preferences:
+ print(f"[Debug] Extracted {len(response.debug.extracted_preferences)} preferences")
+
+ session_responses.append(response)
+
+ all_responses.append(session_responses)
+
+ # Show user state after session
+ print(f"\n[Session {session_idx + 1}] Final state:")
+ print(f" {llm.get_user_state_summary(user_id)}")
+
+ # Summary
+ print(f"\n{'=' * 60}")
+ print("EVALUATION SUMMARY")
+ print("=" * 60)
+
+ total_tokens = sum(
+ r.usage.total_tokens
+ for session in all_responses
+ for r in session
+ )
+ total_turns = sum(len(s) for s in all_responses)
+
+ print(f"Total sessions: {len(all_responses)}")
+ print(f"Total turns: {total_turns}")
+ print(f"Total tokens: {total_tokens}")
+ print(f"Final user state: {llm.get_user_state_summary(user_id)}")
+
+ # Persist (optional, for saving state between runs)
+ # llm.persist()
+ # print("\nState persisted to disk.")
+
+ print("\nDemo complete!")
+
+
+if __name__ == "__main__":
+ main()
+