diff options
Diffstat (limited to 'src/personalization/evaluation/demo')
| -rw-r--r-- | src/personalization/evaluation/demo/__init__.py | 3 | ||||
| -rw-r--r-- | src/personalization/evaluation/demo/run_demo.py | 273 |
2 files changed, 276 insertions, 0 deletions
#!/usr/bin/env python3
"""
Demo Runner Script

A minimal demo to verify the evaluation pipeline works:
- Generates preference bank (5 topics × 5 prefs = 25 total)
- Creates 2 user profiles (10 prefs each)
- Runs 3 tasks per user
- Compares T1 (NoMemory) vs Y3 (RAG) agents

Usage:
    # With LLM servers running:
    python run_demo.py

    # Dry run (no LLM, uses fallback responses):
    python run_demo.py --dry-run

    # Specify output directory:
    python run_demo.py --output-dir /path/to/output
"""

import argparse
import os
import sys

# Add the repository's src/ directory to sys.path so `personalization.*`
# imports resolve when this file is executed directly as a script.
_src_path = os.path.join(os.path.dirname(__file__), "../../../..")
if _src_path not in sys.path:
    sys.path.insert(0, _src_path)

# Default location for demo artifacts; callers can override (e.g. via
# --output-dir).  Previously this was hard-coded inside the step functions,
# which silently ignored the --output-dir flag for generated artifacts.
DEFAULT_OUTPUT_DIR = "data/eval/demo"


def run_preference_bank_demo(output_dir: str = DEFAULT_OUTPUT_DIR):
    """Generate and display a demo preference bank.

    Args:
        output_dir: Directory where ``preference_bank.json`` is written.

    Returns:
        The generated preference bank object (as returned by
        ``generate_demo_bank``).
    """
    print("\n" + "=" * 60)
    print("STEP 1: Generate Preference Bank")
    print("=" * 60)

    # Imported lazily so the script can print usage/help without the
    # project package being importable.
    from personalization.evaluation.preference_bank.generator import generate_demo_bank

    os.makedirs(output_dir, exist_ok=True)

    bank_path = os.path.join(output_dir, "preference_bank.json")
    bank = generate_demo_bank(output_path=bank_path, use_llm=False)

    print(f"\nGenerated preference bank with {bank.stats()['total_preferences']} preferences")
    print(f"Topics: {list(bank.topics.keys())}")

    # Show a small sample: first two topics, first two preferences each.
    print("\nSample preferences:")
    for topic_name, topic in list(bank.topics.items())[:2]:
        print(f"\n {topic_name}:")
        for pref in topic.preferences[:2]:
            print(f" - When {pref.condition}: {pref.action}")

    return bank


def run_profile_demo(bank, output_dir: str = DEFAULT_OUTPUT_DIR):
    """Generate demo user profiles from a preference bank.

    Args:
        bank: The preference bank to sample user preferences from.
        output_dir: Directory where ``user_profiles.json`` is written.

    Returns:
        The list of generated user profiles.
    """
    print("\n" + "=" * 60)
    print("STEP 2: Generate User Profiles")
    print("=" * 60)

    from personalization.evaluation.profiles.generator import generate_demo_profiles

    profiles_path = os.path.join(output_dir, "user_profiles.json")

    # Fixed seed keeps the demo deterministic across runs.
    profiles = generate_demo_profiles(
        bank=bank,
        num_users=2,
        prefs_per_user=10,
        output_path=profiles_path,
        seed=42,
    )

    print(f"\nGenerated {len(profiles)} user profiles")

    for profile in profiles:
        print(f"\n {profile.user_id}:")
        print(f" Persona: {profile.persona}")
        print(f" Primary topics: {profile.primary_topics}")
        print(f" Num preferences: {len(profile.preferences)}")

    return profiles


def run_agent_demo(dry_run: bool = True):
    """Test agent response generation with a trivial query.

    Args:
        dry_run: When True, no ``api_base`` is configured so the agent
            falls back to canned responses instead of calling an LLM.

    Returns:
        The constructed agent instance.
    """
    print("\n" + "=" * 60)
    print("STEP 3: Test Agent Responses")
    print("=" * 60)

    from personalization.evaluation.baselines.no_memory import NoMemoryAgent

    # Create agent (will use fallback if no LLM available).
    agent = NoMemoryAgent(
        model_name="llama-8b",
        api_base="http://localhost:8003/v1" if not dry_run else None,
    )

    test_query = "What is 2 + 2?"
    response = agent.respond(
        user_id="test_user",
        query=test_query,
        conversation_history=[],
    )

    print(f"\nQuery: {test_query}")
    print(f"Response: {response.answer[:200]}...")
    print(f"Debug: {response.debug_info}")

    return agent


def run_user_simulator_demo(profiles, dry_run: bool = True):
    """Test the user simulator on one profile and a toy calculus task.

    Args:
        profiles: Non-empty list of user profiles; the first one is used.
        dry_run: When True, no ``api_base`` is configured so the simulator
            falls back to canned responses instead of calling an LLM.

    Returns:
        The constructed simulator instance.
    """
    print("\n" + "=" * 60)
    print("STEP 4: Test User Simulator")
    print("=" * 60)

    from personalization.evaluation.user_simulator.simulator import UserSimulator
    from personalization.evaluation.pipeline.evaluator import Task

    simulator = UserSimulator(
        model_name="Llama-3.3-70B-Instruct",
        api_base="http://localhost:8004/v1" if not dry_run else None,
    )

    # Set up the simulator with the first profile and a tiny fixed task.
    profile = profiles[0]
    task = Task(
        task_id="test_001",
        dataset="test",
        problem="What is the derivative of x^2?",
        solution="2x",
        task_description="Solve this calculus problem:",
    )

    simulator.setup(
        profile=profile,
        task_description=task.task_description,
        problem=task.problem,
        solution=task.solution,
    )

    # Simulate the user's reply to an opening assistant turn.
    conversation = [
        {"role": "assistant", "content": "How can I help you?"}
    ]

    response = simulator.respond(conversation)

    print(f"\nUser profile: {profile.user_id}")
    print(f"Task: {task.problem}")
    print(f"\nUser response: {response.response[:200]}...")
    print(f"Enforcement needed: {response.enforcement_needed}")
    print(f"Draft answer: {response.draft_answer}")

    return simulator


def run_full_demo(dry_run: bool = True, output_dir: str = DEFAULT_OUTPUT_DIR):
    """Run the complete demo experiment end to end.

    Args:
        dry_run: When True, endpoints are pointed at an unused port so all
            LLM calls fail fast and fallback responses are used.
        output_dir: Directory where experiment results are written.

    Returns:
        The metrics object produced by the experiment runner.
    """
    print("\n" + "=" * 60)
    print("STEP 5: Run Full Demo Experiment")
    print("=" * 60)

    if dry_run:
        print("\n[DRY RUN MODE] Using fallback responses, no LLM calls\n")

    from personalization.evaluation.pipeline.runner import ExperimentRunner, ExperimentConfig

    # NOTE: in dry-run mode the api_base points at port 9999, which is
    # presumably unused so requests fail and fallbacks kick in — unlike the
    # earlier steps, which pass api_base=None for the same effect.
    config = ExperimentConfig(
        name="demo_experiment",
        output_dir=output_dir,
        num_users=2,
        prefs_per_user=10,
        tasks_per_user=2,  # Just 2 tasks for quick demo
        max_turns=10,  # Short conversations
        run_no_memory=True,
        run_rag=False,  # Skip RAG for initial demo (needs more setup)
        run_rag_uv=False,
        agent_api_base="http://localhost:8003/v1" if not dry_run else "http://localhost:9999/v1",
        user_sim_api_base="http://localhost:8004/v1" if not dry_run else "http://localhost:9999/v1",
    )

    runner = ExperimentRunner(config)
    runner.setup()
    metrics = runner.run()

    return metrics


def main():
    """Parse CLI arguments and run the selected demo step(s).

    Steps "bank" and "profiles" generate artifacts; other steps reuse
    existing artifacts from --output-dir when present, regenerating them
    otherwise.
    """
    parser = argparse.ArgumentParser(description="Run evaluation demo")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Run without LLM (uses fallback responses)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/eval/demo",
        help="Output directory for results",
    )
    parser.add_argument(
        "--step",
        type=str,
        choices=["bank", "profiles", "agent", "simulator", "full", "all"],
        default="all",
        help="Which step to run",
    )

    args = parser.parse_args()

    print("\n" + "=" * 60)
    print("PERSONALIZATION EVALUATION DEMO")
    print("=" * 60)
    print(f"Mode: {'DRY RUN (no LLM)' if args.dry_run else 'LIVE (requires LLM servers)'}")
    print(f"Output: {args.output_dir}")
    print("=" * 60)

    os.makedirs(args.output_dir, exist_ok=True)

    if args.step in ["bank", "all"]:
        bank = run_preference_bank_demo(output_dir=args.output_dir)
    else:
        # Load existing bank; regenerate it if missing.
        from personalization.evaluation.preference_bank.schemas import PreferenceBank
        bank_path = os.path.join(args.output_dir, "preference_bank.json")
        if os.path.exists(bank_path):
            bank = PreferenceBank.load(bank_path)
        else:
            bank = run_preference_bank_demo(output_dir=args.output_dir)

    if args.step in ["profiles", "all"]:
        profiles = run_profile_demo(bank, output_dir=args.output_dir)
    else:
        # Load existing profiles; regenerate them if missing.
        from personalization.evaluation.profiles.generator import UserProfileGenerator
        profiles_path = os.path.join(args.output_dir, "user_profiles.json")
        if os.path.exists(profiles_path):
            profiles = UserProfileGenerator.load_profiles(profiles_path)
        else:
            profiles = run_profile_demo(bank, output_dir=args.output_dir)

    if args.step in ["agent", "all"]:
        run_agent_demo(dry_run=args.dry_run)

    if args.step in ["simulator", "all"]:
        run_user_simulator_demo(profiles, dry_run=args.dry_run)

    if args.step in ["full", "all"]:
        run_full_demo(dry_run=args.dry_run, output_dir=args.output_dir)

    print("\n" + "=" * 60)
    print("DEMO COMPLETE!")
    print("=" * 60)
    print(f"\nResults saved to: {args.output_dir}/")
    print("\nNext steps:")
    print(" 1. Start LLM servers (vLLM/SGLang)")
    print(" 2. Run without --dry-run flag")
    print(" 3. Enable RAG baseline for full comparison")


if __name__ == "__main__":
    main()
