diff options
Diffstat (limited to 'src/personalization/evaluation/demo')
| -rw-r--r-- | src/personalization/evaluation/demo/__init__.py | 3 | ||||
| -rw-r--r-- | src/personalization/evaluation/demo/run_demo.py | 273 |
2 files changed, 276 insertions, 0 deletions
#!/usr/bin/env python3
"""
Demo Runner Script

A minimal demo to verify the evaluation pipeline works:
- Generates preference bank (5 topics × 5 prefs = 25 total)
- Creates 2 user profiles (10 prefs each)
- Runs 3 tasks per user
- Compares T1 (NoMemory) vs Y3 (RAG) agents

Usage:
    # With LLM servers running:
    python run_demo.py

    # Dry run (no LLM, uses fallback responses):
    python run_demo.py --dry-run

    # Specify output directory:
    python run_demo.py --output-dir /path/to/output
"""

import argparse
import os
import sys

# Add the repository's src/ directory to sys.path so `personalization.*`
# imports resolve when this file is executed directly as a script.
_src_path = os.path.join(os.path.dirname(__file__), "../../../..")
if _src_path not in sys.path:
    sys.path.insert(0, _src_path)

# Default location for demo artifacts; callers can override (e.g. via
# --output-dir).  Previously this was hard-coded inside the step functions,
# which silently ignored the --output-dir flag for generated artifacts.
DEFAULT_OUTPUT_DIR = "data/eval/demo"


def run_preference_bank_demo(output_dir: str = DEFAULT_OUTPUT_DIR):
    """Generate and display a demo preference bank.

    Args:
        output_dir: Directory where ``preference_bank.json`` is written.

    Returns:
        The generated preference bank object (as returned by
        ``generate_demo_bank``).
    """
    print("\n" + "=" * 60)
    print("STEP 1: Generate Preference Bank")
    print("=" * 60)

    # Imported lazily so the script can print usage/help without the
    # project package being importable.
    from personalization.evaluation.preference_bank.generator import generate_demo_bank

    os.makedirs(output_dir, exist_ok=True)

    bank_path = os.path.join(output_dir, "preference_bank.json")
    bank = generate_demo_bank(output_path=bank_path, use_llm=False)

    print(f"\nGenerated preference bank with {bank.stats()['total_preferences']} preferences")
    print(f"Topics: {list(bank.topics.keys())}")

    # Show a small sample: first two topics, first two preferences each.
    print("\nSample preferences:")
    for topic_name, topic in list(bank.topics.items())[:2]:
        print(f"\n {topic_name}:")
        for pref in topic.preferences[:2]:
            print(f" - When {pref.condition}: {pref.action}")

    return bank


def run_profile_demo(bank, output_dir: str = DEFAULT_OUTPUT_DIR):
    """Generate demo user profiles from a preference bank.

    Args:
        bank: The preference bank to sample user preferences from.
        output_dir: Directory where ``user_profiles.json`` is written.

    Returns:
        The list of generated user profiles.
    """
    print("\n" + "=" * 60)
    print("STEP 2: Generate User Profiles")
    print("=" * 60)

    from personalization.evaluation.profiles.generator import generate_demo_profiles

    profiles_path = os.path.join(output_dir, "user_profiles.json")

    # Fixed seed keeps the demo deterministic across runs.
    profiles = generate_demo_profiles(
        bank=bank,
        num_users=2,
        prefs_per_user=10,
        output_path=profiles_path,
        seed=42,
    )

    print(f"\nGenerated {len(profiles)} user profiles")

    for profile in profiles:
        print(f"\n {profile.user_id}:")
        print(f" Persona: {profile.persona}")
        print(f" Primary topics: {profile.primary_topics}")
        print(f" Num preferences: {len(profile.preferences)}")

    return profiles


def run_agent_demo(dry_run: bool = True):
    """Test agent response generation with a trivial query.

    Args:
        dry_run: When True, no ``api_base`` is configured so the agent
            falls back to canned responses instead of calling an LLM.

    Returns:
        The constructed agent instance.
    """
    print("\n" + "=" * 60)
    print("STEP 3: Test Agent Responses")
    print("=" * 60)

    from personalization.evaluation.baselines.no_memory import NoMemoryAgent

    # Create agent (will use fallback if no LLM available).
    agent = NoMemoryAgent(
        model_name="llama-8b",
        api_base="http://localhost:8003/v1" if not dry_run else None,
    )

    test_query = "What is 2 + 2?"
    response = agent.respond(
        user_id="test_user",
        query=test_query,
        conversation_history=[],
    )

    print(f"\nQuery: {test_query}")
    print(f"Response: {response.answer[:200]}...")
    print(f"Debug: {response.debug_info}")

    return agent


def run_user_simulator_demo(profiles, dry_run: bool = True):
    """Test the user simulator on one profile and a toy calculus task.

    Args:
        profiles: Non-empty list of user profiles; the first one is used.
        dry_run: When True, no ``api_base`` is configured so the simulator
            falls back to canned responses instead of calling an LLM.

    Returns:
        The constructed simulator instance.
    """
    print("\n" + "=" * 60)
    print("STEP 4: Test User Simulator")
    print("=" * 60)

    from personalization.evaluation.user_simulator.simulator import UserSimulator
    from personalization.evaluation.pipeline.evaluator import Task

    simulator = UserSimulator(
        model_name="Llama-3.3-70B-Instruct",
        api_base="http://localhost:8004/v1" if not dry_run else None,
    )

    # Set up the simulator with the first profile and a tiny fixed task.
    profile = profiles[0]
    task = Task(
        task_id="test_001",
        dataset="test",
        problem="What is the derivative of x^2?",
        solution="2x",
        task_description="Solve this calculus problem:",
    )

    simulator.setup(
        profile=profile,
        task_description=task.task_description,
        problem=task.problem,
        solution=task.solution,
    )

    # Simulate the user's reply to an opening assistant turn.
    conversation = [
        {"role": "assistant", "content": "How can I help you?"}
    ]

    response = simulator.respond(conversation)

    print(f"\nUser profile: {profile.user_id}")
    print(f"Task: {task.problem}")
    print(f"\nUser response: {response.response[:200]}...")
    print(f"Enforcement needed: {response.enforcement_needed}")
    print(f"Draft answer: {response.draft_answer}")

    return simulator


def run_full_demo(dry_run: bool = True, output_dir: str = DEFAULT_OUTPUT_DIR):
    """Run the complete demo experiment end to end.

    Args:
        dry_run: When True, endpoints are pointed at an unused port so all
            LLM calls fail fast and fallback responses are used.
        output_dir: Directory where experiment results are written.

    Returns:
        The metrics object produced by the experiment runner.
    """
    print("\n" + "=" * 60)
    print("STEP 5: Run Full Demo Experiment")
    print("=" * 60)

    if dry_run:
        print("\n[DRY RUN MODE] Using fallback responses, no LLM calls\n")

    from personalization.evaluation.pipeline.runner import ExperimentRunner, ExperimentConfig

    # NOTE: in dry-run mode the api_base points at port 9999, which is
    # presumably unused so requests fail and fallbacks kick in — unlike the
    # earlier steps, which pass api_base=None for the same effect.
    config = ExperimentConfig(
        name="demo_experiment",
        output_dir=output_dir,
        num_users=2,
        prefs_per_user=10,
        tasks_per_user=2,  # Just 2 tasks for quick demo
        max_turns=10,  # Short conversations
        run_no_memory=True,
        run_rag=False,  # Skip RAG for initial demo (needs more setup)
        run_rag_uv=False,
        agent_api_base="http://localhost:8003/v1" if not dry_run else "http://localhost:9999/v1",
        user_sim_api_base="http://localhost:8004/v1" if not dry_run else "http://localhost:9999/v1",
    )

    runner = ExperimentRunner(config)
    runner.setup()
    metrics = runner.run()

    return metrics


def main():
    """Parse CLI arguments and run the selected demo step(s).

    Steps "bank" and "profiles" generate artifacts; other steps reuse
    existing artifacts from --output-dir when present, regenerating them
    otherwise.
    """
    parser = argparse.ArgumentParser(description="Run evaluation demo")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Run without LLM (uses fallback responses)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/eval/demo",
        help="Output directory for results",
    )
    parser.add_argument(
        "--step",
        type=str,
        choices=["bank", "profiles", "agent", "simulator", "full", "all"],
        default="all",
        help="Which step to run",
    )

    args = parser.parse_args()

    print("\n" + "=" * 60)
    print("PERSONALIZATION EVALUATION DEMO")
    print("=" * 60)
    print(f"Mode: {'DRY RUN (no LLM)' if args.dry_run else 'LIVE (requires LLM servers)'}")
    print(f"Output: {args.output_dir}")
    print("=" * 60)

    os.makedirs(args.output_dir, exist_ok=True)

    if args.step in ["bank", "all"]:
        bank = run_preference_bank_demo(output_dir=args.output_dir)
    else:
        # Load existing bank; regenerate it if missing.
        from personalization.evaluation.preference_bank.schemas import PreferenceBank
        bank_path = os.path.join(args.output_dir, "preference_bank.json")
        if os.path.exists(bank_path):
            bank = PreferenceBank.load(bank_path)
        else:
            bank = run_preference_bank_demo(output_dir=args.output_dir)

    if args.step in ["profiles", "all"]:
        profiles = run_profile_demo(bank, output_dir=args.output_dir)
    else:
        # Load existing profiles; regenerate them if missing.
        from personalization.evaluation.profiles.generator import UserProfileGenerator
        profiles_path = os.path.join(args.output_dir, "user_profiles.json")
        if os.path.exists(profiles_path):
            profiles = UserProfileGenerator.load_profiles(profiles_path)
        else:
            profiles = run_profile_demo(bank, output_dir=args.output_dir)

    if args.step in ["agent", "all"]:
        run_agent_demo(dry_run=args.dry_run)

    if args.step in ["simulator", "all"]:
        run_user_simulator_demo(profiles, dry_run=args.dry_run)

    if args.step in ["full", "all"]:
        run_full_demo(dry_run=args.dry_run, output_dir=args.output_dir)

    print("\n" + "=" * 60)
    print("DEMO COMPLETE!")
    print("=" * 60)
    print(f"\nResults saved to: {args.output_dir}/")
    print("\nNext steps:")
    print(" 1. Start LLM servers (vLLM/SGLang)")
    print(" 2. Run without --dry-run flag")
    print(" 3. Enable RAG baseline for full comparison")


if __name__ == "__main__":
    main()
