1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
|
#!/usr/bin/env python3
"""
Example: Using the PersonalizedLLM Interface for Evaluation.
This script demonstrates the evaluation interface that can be used
by a user simulator or evaluation framework.
Call sequence per evaluation run:
1. reset_user(user_id) - Start fresh for this user's "life"
2. For each session (s=1..S):
a. reset_session(user_id) - New chat window
b. For each turn (t=1..T):
i. [Turn 2+] apply_feedback() for previous turn
ii. resp = chat(user_id, query)
iii. [Simulator computes reward from response]
3. persist() - Save state at end
"""
import sys
import os
# Add src to sys.path
# NOTE: makes the sibling ../src directory importable so the local
# `personalization` package resolves when this script is run directly
# (e.g. from an examples/ folder) without being installed.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../src"))
# Serving-layer API: the LLM facade plus its response/feedback dataclasses.
from personalization.serving import (
    PersonalizedLLM,
    AssistantResponse,
    Feedback,
)
def main():
    """Run a scripted two-session evaluation demo against PersonalizedLLM.

    Follows the call sequence documented in the module docstring:
    reset_user once, then per session: reset_session, and per turn:
    apply_feedback for the previous turn (turn 2+) followed by chat.
    Prints each response, its token usage, optional debug info, and
    user-state summaries, then a final aggregate summary.

    Returns:
        None. All output goes to stdout; state persistence is left
        commented out at the end.
    """
    print("=" * 60)
    print("PersonalizedLLM Evaluation Interface Demo")
    print("=" * 60)

    # Initialize the system.
    # Note: This will load models, which takes time and GPU memory
    print("\n[1] Initializing PersonalizedLLM...")
    llm = PersonalizedLLM(
        user_store_path="data/users/user_store_eval_demo.npz",
        only_own_memories=True,
        enable_preference_extraction=True,
        enable_rl_updates=True,
    )

    # Define test user
    user_id = "eval_demo_user"

    # Reset user for clean experiment
    print(f"\n[2] Resetting user: {user_id}")
    llm.reset_user(user_id)

    # Check initial state
    print(f"\n[3] Initial user state:")
    print(f" {llm.get_user_state_summary(user_id)}")

    # Scripted queries; the session count is implied by the list length.
    # (Removed the unused `num_sessions = 2` local from the original.)
    queries_per_session = [
        # Session 1: Food preferences
        [
            "What's a good recipe for dinner tonight?",
            "I prefer vegetarian food with Asian flavors.",
            "Can you suggest something spicy?",
        ],
        # Session 2: Test personalization retention
        [
            "What should I cook for lunch?",
            "Give me a quick meal idea.",
        ],
    ]

    all_responses = []
    for session_idx, session_queries in enumerate(queries_per_session):
        print(f"\n{'=' * 60}")
        print(f"SESSION {session_idx + 1}")
        print("=" * 60)

        # Reset session (new chat window)
        llm.reset_session(user_id)
        print(f"[Session {session_idx + 1}] Started new session")

        session_responses = []
        for turn_idx, query in enumerate(session_queries):
            print(f"\n--- Turn {turn_idx + 1} ---")

            # Apply feedback for previous turn (from turn 2 onwards)
            if turn_idx > 0:
                # Simulated feedback - in real eval, this comes from user simulator
                simulated_reward = 0.7 + 0.1 * (turn_idx % 2)  # Varies by turn
                # BUGFIX: the original `1.0 if turn_idx > 0 else 0.0` was dead
                # code — this branch only executes when turn_idx > 0, so the
                # gating value is always 1.0.
                simulated_gating = 1.0
                feedback = Feedback(
                    user_id=user_id,
                    turn_id=turn_idx - 1,
                    reward=simulated_reward,
                    gating=simulated_gating,
                    meta={"source": "demo_simulator"},
                )
                print(f"[Feedback] Applying: reward={simulated_reward:.2f}, gating={simulated_gating:.1f}")
                llm.apply_feedback(feedback)

            # Main chat call
            print(f"User: {query}")
            response: AssistantResponse = llm.chat(user_id, query)

            # Truncate long answers for console readability (same output as
            # the original ternary-wrapped print, just easier to read).
            if len(response.answer) > 200:
                print(f"Assistant: {response.answer[:200]}...")
            else:
                print(f"Assistant: {response.answer}")
            print(f"[Usage] prompt={response.usage.prompt_tokens}, completion={response.usage.completion_tokens}, model={response.usage.model}")

            if response.debug:
                print(f"[Debug] memories={len(response.debug.selected_memory_ids)}, z_long_norm={response.debug.extra.get('z_long_norm', 0):.4f}")
                if response.debug.extracted_preferences:
                    print(f"[Debug] Extracted {len(response.debug.extracted_preferences)} preferences")

            session_responses.append(response)

        all_responses.append(session_responses)

        # Show user state after session
        print(f"\n[Session {session_idx + 1}] Final state:")
        print(f" {llm.get_user_state_summary(user_id)}")

    # Summary
    print(f"\n{'=' * 60}")
    print("EVALUATION SUMMARY")
    print("=" * 60)
    total_tokens = sum(
        r.usage.total_tokens
        for session in all_responses
        for r in session
    )
    total_turns = sum(len(s) for s in all_responses)
    print(f"Total sessions: {len(all_responses)}")
    print(f"Total turns: {total_turns}")
    print(f"Total tokens: {total_tokens}")
    print(f"Final user state: {llm.get_user_state_summary(user_id)}")

    # Persist (optional, for saving state between runs)
    # llm.persist()
    # print("\nState persisted to disk.")

    print("\nDemo complete!")
# Standard script entry-point guard: run the demo only when executed
# directly, not when imported as a module.
if __name__ == "__main__":
    main()
|