#!/usr/bin/env python3
"""
Example: Using the PersonalizedLLM Interface for Evaluation.

This script demonstrates the evaluation interface that can be used
by a user simulator or evaluation framework.

Call sequence per evaluation run:
1. reset_user(user_id) - Start fresh for this user's "life"
2. For each session (s=1..S):
    a. reset_session(user_id) - New chat window
    b. For each turn (t=1..T):
        i.   [Turn 2+] apply_feedback() for previous turn
        ii.  resp = chat(user_id, query)
        iii. [Simulator computes reward from response]
3. persist() - Optionally save state at the end of the run
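
Run from the repository root:
    python scripts/eval_interface_example.py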
"""

import sys
import os

# Add src to sys.path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../src"))

from personalization.serving import (
    PersonalizedLLM,
    AssistantResponse,
    Feedback,
)


def main():
    print("=" * 60)
    print("PersonalizedLLM Evaluation Interface Demo")
    print("=" * 60)
    
    # Initialize the system
    # Note: This will load models, which takes time and GPU memory
    print("\n[1] Initializing PersonalizedLLM...")
    
    llm = PersonalizedLLM(
        user_store_path="data/users/user_store_eval_demo.npz",  # where per-user state is persisted
        only_own_memories=True,             # retrieve memories from this user only
        enable_preference_extraction=True,  # extract preferences from conversation turns
        enable_rl_updates=True,             # allow reward-gated updates via apply_feedback()
    )
    
    # Define test user
    user_id = "eval_demo_user"
    
    # Reset user for clean experiment
    print(f"\n[2] Resetting user: {user_id}")
    llm.reset_user(user_id)
    
    # Check initial state
    print(f"\n[3] Initial user state:")
    print(f"    {llm.get_user_state_summary(user_id)}")
    
    # Simulate multiple sessions
    num_sessions = 2
    queries_per_session = [
        # Session 1: Food preferences
        [
            "What's a good recipe for dinner tonight?",
            "I prefer vegetarian food with Asian flavors.",
            "Can you suggest something spicy?",
        ],
        # Session 2: Test personalization retention
        [
            "What should I cook for lunch?",
            "Give me a quick meal idea.",
        ],
    ]
    
    all_responses = []
    
    for session_idx, session_queries in enumerate(queries_per_session):
        print(f"\n{'=' * 60}")
        print(f"SESSION {session_idx + 1}")
        print("=" * 60)
        
        # Reset session (new chat window)
        llm.reset_session(user_id)
        print(f"[Session {session_idx + 1}] Started new session")
        
        session_responses = []
        
        for turn_idx, query in enumerate(session_queries):
            print(f"\n--- Turn {turn_idx + 1} ---")
            
            # Apply feedback for previous turn (from turn 2 onwards)
            if turn_idx > 0:
                # Simulated feedback - in a real eval, this comes from the user simulator
                simulated_reward = 0.7 + 0.1 * (turn_idx % 2)  # varies by turn
                simulated_gating = 1.0  # always gated on here; turn_idx > 0 by construction
                
                feedback = Feedback(
                    user_id=user_id,
                    turn_id=turn_idx - 1,
                    reward=simulated_reward,
                    gating=simulated_gating,
                    meta={"source": "demo_simulator"}
                )
                
                print(f"[Feedback] Applying: reward={simulated_reward:.2f}, gating={simulated_gating:.1f}")
                llm.apply_feedback(feedback)
            
            # Main chat call
            print(f"User: {query}")
            response: AssistantResponse = llm.chat(user_id, query)
            
            print(f"Assistant: {response.answer[:200]}..." if len(response.answer) > 200 else f"Assistant: {response.answer}")
            print(f"[Usage] prompt={response.usage.prompt_tokens}, completion={response.usage.completion_tokens}, model={response.usage.model}")
            
            if response.debug:
                print(f"[Debug] memories={len(response.debug.selected_memory_ids)}, z_long_norm={response.debug.extra.get('z_long_norm', 0):.4f}")
                if response.debug.extracted_preferences:
                    print(f"[Debug] Extracted {len(response.debug.extracted_preferences)} preferences")
            
            session_responses.append(response)
        
        all_responses.append(session_responses)
        
        # Show user state after session
        print(f"\n[Session {session_idx + 1}] Final state:")
        print(f"    {llm.get_user_state_summary(user_id)}")
    
    # Summary
    print(f"\n{'=' * 60}")
    print("EVALUATION SUMMARY")
    print("=" * 60)
    
    total_tokens = sum(
        r.usage.total_tokens 
        for session in all_responses 
        for r in session
    )
    total_turns = sum(len(s) for s in all_responses)
    
    print(f"Total sessions: {len(all_responses)}")
    print(f"Total turns: {total_turns}")
    print(f"Total tokens: {total_tokens}")
    print(f"Final user state: {llm.get_user_state_summary(user_id)}")
    
    # Persist (optional, for saving state between runs)
    # llm.persist()
    # print("\nState persisted to disk.")
    
    print("\nDemo complete!")


if __name__ == "__main__":
    main()