summaryrefslogtreecommitdiff
path: root/src/personalization/evaluation/demo/run_demo.py
blob: 805d046efcd667e05e255c7e99bf29514d6ad48f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
#!/usr/bin/env python3
"""
Demo Runner Script

A minimal demo to verify the evaluation pipeline works:
- Generates preference bank (5 topics × 5 prefs = 25 total)
- Creates 2 user profiles (10 prefs each)
- Runs 2 tasks per user
- Compares T1 (NoMemory) vs Y3 (RAG) agents

Usage:
    # With LLM servers running:
    python run_demo.py
    
    # Dry run (no LLM, uses fallback responses):
    python run_demo.py --dry-run
    
    # Specify output directory:
    python run_demo.py --output-dir /path/to/output
"""

import argparse
import os
import sys

# Add src to path
_src_path = os.path.join(os.path.dirname(__file__), "../../../..")
if _src_path not in sys.path:
    sys.path.insert(0, _src_path)


def run_preference_bank_demo():
    """Build the demo preference bank on disk and print a short summary.

    Returns:
        The generated preference bank object.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("STEP 1: Generate Preference Bank")
    print(banner)

    from personalization.evaluation.preference_bank.generator import generate_demo_bank

    output_dir = "data/eval/demo"
    os.makedirs(output_dir, exist_ok=True)

    # use_llm=False keeps this step fully offline.
    bank = generate_demo_bank(
        output_path=os.path.join(output_dir, "preference_bank.json"),
        use_llm=False,
    )

    total = bank.stats()["total_preferences"]
    print(f"\nGenerated preference bank with {total} preferences")
    print(f"Topics: {list(bank.topics.keys())}")

    # Preview: the first two preferences of the first two topics.
    print("\nSample preferences:")
    for topic_name, topic in list(bank.topics.items())[:2]:
        print(f"\n  {topic_name}:")
        for pref in topic.preferences[:2]:
            print(f"    - When {pref.condition}: {pref.action}")

    return bank


def run_profile_demo(bank):
    """Sample demo user profiles from *bank* and print a per-user summary.

    Args:
        bank: The preference bank to draw preferences from.

    Returns:
        The list of generated user profiles.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("STEP 2: Generate User Profiles")
    print(banner)

    from personalization.evaluation.profiles.generator import generate_demo_profiles

    profiles_path = os.path.join("data/eval/demo", "user_profiles.json")

    # Fixed seed keeps the sampled preference subsets reproducible across runs.
    profiles = generate_demo_profiles(
        bank=bank,
        num_users=2,
        prefs_per_user=10,
        output_path=profiles_path,
        seed=42,
    )

    print(f"\nGenerated {len(profiles)} user profiles")

    for profile in profiles:
        print(f"\n  {profile.user_id}:")
        print(f"    Persona: {profile.persona}")
        print(f"    Primary topics: {profile.primary_topics}")
        print(f"    Num preferences: {len(profile.preferences)}")

    return profiles


def run_agent_demo(dry_run: bool = True):
    """Smoke-test the no-memory baseline agent with a single trivial query.

    Args:
        dry_run: When True, the agent is given no API base — presumably this
            triggers its fallback response path; confirm against NoMemoryAgent.

    Returns:
        The constructed agent instance.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("STEP 3: Test Agent Responses")
    print(banner)

    from personalization.evaluation.baselines.no_memory import NoMemoryAgent

    api_base = None if dry_run else "http://localhost:8003/v1"
    agent = NoMemoryAgent(model_name="llama-8b", api_base=api_base)

    test_query = "What is 2 + 2?"
    response = agent.respond(
        user_id="test_user",
        query=test_query,
        conversation_history=[],
    )

    print(f"\nQuery: {test_query}")
    print(f"Response: {response.answer[:200]}...")
    print(f"Debug: {response.debug_info}")

    return agent


def run_user_simulator_demo(profiles, dry_run: bool = True):
    """Smoke-test the user simulator: one task, first profile, one turn.

    Args:
        profiles: User profiles; only the first one is used here.
        dry_run: When True, no API base is given — presumably this routes the
            simulator to fallback responses; confirm against UserSimulator.

    Returns:
        The configured simulator instance.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("STEP 4: Test User Simulator")
    print(banner)

    from personalization.evaluation.user_simulator.simulator import UserSimulator
    from personalization.evaluation.pipeline.evaluator import Task

    simulator = UserSimulator(
        model_name="Llama-3.3-70B-Instruct",
        api_base=None if dry_run else "http://localhost:8004/v1",
    )

    # Bind the simulator to the first profile and a tiny calculus task.
    profile = profiles[0]
    task = Task(
        task_id="test_001",
        dataset="test",
        problem="What is the derivative of x^2?",
        solution="2x",
        task_description="Solve this calculus problem:",
    )
    simulator.setup(
        profile=profile,
        task_description=task.task_description,
        problem=task.problem,
        solution=task.solution,
    )

    # Open with a generic assistant greeting and let the simulator produce
    # the first user turn.
    opening = [{"role": "assistant", "content": "How can I help you?"}]
    response = simulator.respond(opening)

    print(f"\nUser profile: {profile.user_id}")
    print(f"Task: {task.problem}")
    print(f"\nUser response: {response.response[:200]}...")
    print(f"Enforcement needed: {response.enforcement_needed}")
    print(f"Draft answer: {response.draft_answer}")

    return simulator


def run_full_demo(dry_run: bool = True, output_dir: str = "data/eval/demo"):
    """Run the end-to-end demo experiment and return its metrics.

    Args:
        dry_run: When True, both API endpoints point at an unused port
            (9999) — presumably so every call fails over to the fallback
            path instead of a live server; confirm against ExperimentRunner.
        output_dir: Directory for experiment artifacts.

    Returns:
        The metrics object produced by the experiment runner.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("STEP 5: Run Full Demo Experiment")
    print(banner)

    if dry_run:
        print("\n[DRY RUN MODE] Using fallback responses, no LLM calls\n")

    from personalization.evaluation.pipeline.runner import ExperimentRunner, ExperimentConfig

    agent_base = "http://localhost:9999/v1" if dry_run else "http://localhost:8003/v1"
    sim_base = "http://localhost:9999/v1" if dry_run else "http://localhost:8004/v1"

    config = ExperimentConfig(
        name="demo_experiment",
        output_dir=output_dir,
        num_users=2,
        prefs_per_user=10,
        tasks_per_user=2,  # just 2 tasks for a quick demo
        max_turns=10,      # short conversations
        run_no_memory=True,
        run_rag=False,     # RAG baseline needs more setup; skipped here
        run_rag_uv=False,
        agent_api_base=agent_base,
        user_sim_api_base=sim_base,
    )

    runner = ExperimentRunner(config)
    runner.setup()
    return runner.run()


def main():
    """CLI entry point: run one demo step or the whole sequence."""
    parser = argparse.ArgumentParser(description="Run evaluation demo")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Run without LLM (uses fallback responses)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/eval/demo",
        help="Output directory for results",
    )
    parser.add_argument(
        "--step",
        type=str,
        choices=["bank", "profiles", "agent", "simulator", "full", "all"],
        default="all",
        help="Which step to run",
    )
    args = parser.parse_args()

    banner = "=" * 60
    mode = "DRY RUN (no LLM)" if args.dry_run else "LIVE (requires LLM servers)"
    print("\n" + banner)
    print("PERSONALIZATION EVALUATION DEMO")
    print(banner)
    print(f"Mode: {mode}")
    print(f"Output: {args.output_dir}")
    print(banner)

    os.makedirs(args.output_dir, exist_ok=True)

    # Preference bank: regenerate when requested; otherwise reuse a cached
    # copy, falling back to generation if none exists yet.
    if args.step in ("bank", "all"):
        bank = run_preference_bank_demo()
    else:
        from personalization.evaluation.preference_bank.schemas import PreferenceBank
        bank_path = os.path.join(args.output_dir, "preference_bank.json")
        bank = (
            PreferenceBank.load(bank_path)
            if os.path.exists(bank_path)
            else run_preference_bank_demo()
        )

    # User profiles follow the same generate-or-load pattern.
    if args.step in ("profiles", "all"):
        profiles = run_profile_demo(bank)
    else:
        from personalization.evaluation.profiles.generator import UserProfileGenerator
        profiles_path = os.path.join(args.output_dir, "user_profiles.json")
        profiles = (
            UserProfileGenerator.load_profiles(profiles_path)
            if os.path.exists(profiles_path)
            else run_profile_demo(bank)
        )

    if args.step in ("agent", "all"):
        run_agent_demo(dry_run=args.dry_run)

    if args.step in ("simulator", "all"):
        run_user_simulator_demo(profiles, dry_run=args.dry_run)

    if args.step in ("full", "all"):
        run_full_demo(dry_run=args.dry_run, output_dir=args.output_dir)

    print("\n" + banner)
    print("DEMO COMPLETE!")
    print(banner)
    print(f"\nResults saved to: {args.output_dir}/")
    print("\nNext steps:")
    print("  1. Start LLM servers (vLLM/SGLang)")
    print("  2. Run without --dry-run flag")
    print("  3. Enable RAG baseline for full comparison")


if __name__ == "__main__":
    main()