#!/usr/bin/env python3
"""
Demo Runner Script

A minimal demo to verify the evaluation pipeline works:
- Generates preference bank (5 topics × 5 prefs = 25 total)
- Creates 2 user profiles (10 prefs each)
- Runs 3 tasks per user
- Compares T1 (NoMemory) vs Y3 (RAG) agents

Usage:
    # With LLM servers running:
    python run_demo.py

    # Dry run (no LLM, uses fallback responses):
    python run_demo.py --dry-run

    # Specify output directory:
    python run_demo.py --output-dir /path/to/output
"""

import argparse
import os
import sys

# Add src to path.
# abspath() is required here: without it the sys.path entry (and the
# dedup membership check below) is relative to the invocation cwd and
# silently breaks if the process later changes its working directory.
_src_path = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "../../../..")
)
if _src_path not in sys.path:
    sys.path.insert(0, _src_path)


def run_preference_bank_demo():
    """Generate and display a demo preference bank.

    Returns:
        The generated preference bank object (has ``.stats()`` and
        ``.topics`` — presumably a ``PreferenceBank``; confirm against
        ``generate_demo_bank``'s return annotation).
    """
    print("\n" + "=" * 60)
    print("STEP 1: Generate Preference Bank")
    print("=" * 60)

    # Imported lazily so the sys.path insertion above has already run.
    from personalization.evaluation.preference_bank.generator import generate_demo_bank

    output_dir = "data/eval/demo"
    os.makedirs(output_dir, exist_ok=True)

    bank_path = os.path.join(output_dir, "preference_bank.json")
    # use_llm=False keeps this step runnable without any model server.
    bank = generate_demo_bank(output_path=bank_path, use_llm=False)

    print(f"\nGenerated preference bank with {bank.stats()['total_preferences']} preferences")
    print(f"Topics: {list(bank.topics.keys())}")

    # Show sample preferences (first 2 topics, first 2 prefs each).
    print("\nSample preferences:")
    for topic_name, topic in list(bank.topics.items())[:2]:
        print(f"\n {topic_name}:")
        for pref in topic.preferences[:2]:
            print(f" - When {pref.condition}: {pref.action}")

    return bank


def run_profile_demo(bank):
    """Generate demo user profiles drawn from *bank*.

    Args:
        bank: Preference bank produced by :func:`run_preference_bank_demo`.

    Returns:
        List of generated user profile objects.
    """
    print("\n" + "=" * 60)
    print("STEP 2: Generate User Profiles")
    print("=" * 60)

    from personalization.evaluation.profiles.generator import generate_demo_profiles

    output_dir = "data/eval/demo"
    profiles_path = os.path.join(output_dir, "user_profiles.json")

    # Fixed seed so repeated demo runs produce the same profiles.
    profiles = generate_demo_profiles(
        bank=bank,
        num_users=2,
        prefs_per_user=10,
        output_path=profiles_path,
        seed=42,
    )

    print(f"\nGenerated {len(profiles)} user profiles")
    for profile in profiles:
        print(f"\n {profile.user_id}:")
        print(f" Persona: {profile.persona}")
        print(f" Primary topics: {profile.primary_topics}")
        print(f" Num preferences: {len(profile.preferences)}")

    return profiles


def run_agent_demo(dry_run: bool = True):
    """Test agent response generation with the NoMemory baseline.

    Args:
        dry_run: When True, no ``api_base`` is passed so the agent falls
            back to canned responses instead of calling an LLM server.

    Returns:
        The constructed agent instance.
    """
    print("\n" + "=" * 60)
    print("STEP 3: Test Agent Responses")
    print("=" * 60)

    from personalization.evaluation.baselines.no_memory import NoMemoryAgent

    # Create agent (will use fallback if no LLM available)
    agent = NoMemoryAgent(
        model_name="llama-8b",
        api_base="http://localhost:8003/v1" if not dry_run else None,
    )

    # Test response
    test_query = "What is 2 + 2?"
    response = agent.respond(
        user_id="test_user",
        query=test_query,
        conversation_history=[],
    )

    print(f"\nQuery: {test_query}")
    print(f"Response: {response.answer[:200]}...")
    print(f"Debug: {response.debug_info}")

    return agent


def run_user_simulator_demo(profiles, dry_run: bool = True):
    """Test the user simulator on a toy calculus task.

    Args:
        profiles: Profiles from :func:`run_profile_demo`; the first one
            is used to configure the simulator.
        dry_run: When True, no ``api_base`` is passed so the simulator
            uses fallback responses instead of an LLM server.

    Returns:
        The configured simulator instance.
    """
    print("\n" + "=" * 60)
    print("STEP 4: Test User Simulator")
    print("=" * 60)

    from personalization.evaluation.user_simulator.simulator import UserSimulator
    from personalization.evaluation.pipeline.evaluator import Task

    # Create simulator
    simulator = UserSimulator(
        model_name="Llama-3.3-70B-Instruct",
        api_base="http://localhost:8004/v1" if not dry_run else None,
    )

    # Setup with first profile
    profile = profiles[0]
    task = Task(
        task_id="test_001",
        dataset="test",
        problem="What is the derivative of x^2?",
        solution="2x",
        task_description="Solve this calculus problem:",
    )

    simulator.setup(
        profile=profile,
        task_description=task.task_description,
        problem=task.problem,
        solution=task.solution,
    )

    # Simulate first turn: the assistant opens, the simulated user replies.
    conversation = [
        {"role": "assistant", "content": "How can I help you?"}
    ]
    response = simulator.respond(conversation)

    print(f"\nUser profile: {profile.user_id}")
    print(f"Task: {task.problem}")
    print(f"\nUser response: {response.response[:200]}...")
    print(f"Enforcement needed: {response.enforcement_needed}")
    print(f"Draft answer: {response.draft_answer}")

    return simulator


def run_full_demo(dry_run: bool = True, output_dir: str = "data/eval/demo"):
    """Run the complete demo experiment end to end.

    Args:
        dry_run: When True, both API bases point at an unused port
            (9999) so the runner's fallback paths are exercised instead
            of live LLM calls.
        output_dir: Directory where experiment artifacts are written.

    Returns:
        Metrics object returned by the experiment runner.
    """
    print("\n" + "=" * 60)
    print("STEP 5: Run Full Demo Experiment")
    print("=" * 60)

    if dry_run:
        print("\n[DRY RUN MODE] Using fallback responses, no LLM calls\n")

    from personalization.evaluation.pipeline.runner import ExperimentRunner, ExperimentConfig

    config = ExperimentConfig(
        name="demo_experiment",
        output_dir=output_dir,
        num_users=2,
        prefs_per_user=10,
        tasks_per_user=2,  # Just 2 tasks for quick demo
        max_turns=10,      # Short conversations
        run_no_memory=True,
        run_rag=False,     # Skip RAG for initial demo (needs more setup)
        run_rag_uv=False,
        # Port 9999 is a deliberately-dead endpoint for dry runs; the
        # config presumably requires a string here rather than None.
        agent_api_base="http://localhost:8003/v1" if not dry_run else "http://localhost:9999/v1",
        user_sim_api_base="http://localhost:8004/v1" if not dry_run else "http://localhost:9999/v1",
    )

    runner = ExperimentRunner(config)
    runner.setup()
    metrics = runner.run()

    return metrics


def main():
    """Parse CLI arguments and dispatch the requested demo step(s)."""
    parser = argparse.ArgumentParser(description="Run evaluation demo")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Run without LLM (uses fallback responses)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/eval/demo",
        help="Output directory for results",
    )
    parser.add_argument(
        "--step",
        type=str,
        choices=["bank", "profiles", "agent", "simulator", "full", "all"],
        default="all",
        help="Which step to run",
    )
    args = parser.parse_args()

    print("\n" + "=" * 60)
    print("PERSONALIZATION EVALUATION DEMO")
    print("=" * 60)
    print(f"Mode: {'DRY RUN (no LLM)' if args.dry_run else 'LIVE (requires LLM servers)'}")
    print(f"Output: {args.output_dir}")
    print("=" * 60)

    os.makedirs(args.output_dir, exist_ok=True)

    if args.step in ["bank", "all"]:
        bank = run_preference_bank_demo()
    else:
        # Load existing bank; regenerate it if none has been saved yet.
        from personalization.evaluation.preference_bank.schemas import PreferenceBank
        bank_path = os.path.join(args.output_dir, "preference_bank.json")
        if os.path.exists(bank_path):
            bank = PreferenceBank.load(bank_path)
        else:
            bank = run_preference_bank_demo()

    if args.step in ["profiles", "all"]:
        profiles = run_profile_demo(bank)
    else:
        # Load existing profiles; regenerate if none have been saved yet.
        from personalization.evaluation.profiles.generator import UserProfileGenerator
        profiles_path = os.path.join(args.output_dir, "user_profiles.json")
        if os.path.exists(profiles_path):
            profiles = UserProfileGenerator.load_profiles(profiles_path)
        else:
            profiles = run_profile_demo(bank)

    if args.step in ["agent", "all"]:
        run_agent_demo(dry_run=args.dry_run)

    if args.step in ["simulator", "all"]:
        run_user_simulator_demo(profiles, dry_run=args.dry_run)

    if args.step in ["full", "all"]:
        run_full_demo(dry_run=args.dry_run, output_dir=args.output_dir)

    print("\n" + "=" * 60)
    print("DEMO COMPLETE!")
    print("=" * 60)
    print(f"\nResults saved to: {args.output_dir}/")
    print("\nNext steps:")
    print(" 1. Start LLM servers (vLLM/SGLang)")
    print(" 2. Run without --dry-run flag")
    print(" 3. Enable RAG baseline for full comparison")


if __name__ == "__main__":
    main()