summaryrefslogtreecommitdiff
path: root/src/personalization/evaluation/demo/run_demo.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/personalization/evaluation/demo/run_demo.py')
-rw-r--r--src/personalization/evaluation/demo/run_demo.py273
1 files changed, 273 insertions, 0 deletions
diff --git a/src/personalization/evaluation/demo/run_demo.py b/src/personalization/evaluation/demo/run_demo.py
new file mode 100644
index 0000000..805d046
--- /dev/null
+++ b/src/personalization/evaluation/demo/run_demo.py
@@ -0,0 +1,273 @@
+#!/usr/bin/env python3
+"""
+Demo Runner Script
+
+A minimal demo to verify the evaluation pipeline works:
+- Generates preference bank (5 topics × 5 prefs = 25 total)
+- Creates 2 user profiles (10 prefs each)
+- Runs 3 tasks per user
+- Compares T1 (NoMemory) vs Y3 (RAG) agents
+
+Usage:
+ # With LLM servers running:
+ python run_demo.py
+
+ # Dry run (no LLM, uses fallback responses):
+ python run_demo.py --dry-run
+
+ # Specify output directory:
+ python run_demo.py --output-dir /path/to/output
+"""
+
+import argparse
+import os
+import sys
+
+# Add src to path
+_src_path = os.path.join(os.path.dirname(__file__), "../../../..")
+if _src_path not in sys.path:
+ sys.path.insert(0, _src_path)
+
+
def run_preference_bank_demo(output_dir: str = "data/eval/demo"):
    """Generate and display a demo preference bank.

    Generates the bank without LLM calls (``use_llm=False``) and writes it to
    ``<output_dir>/preference_bank.json``.

    Args:
        output_dir: Directory to write ``preference_bank.json`` into.
            Defaults to the demo location so existing callers are unaffected;
            pass ``args.output_dir`` to honor the ``--output-dir`` flag.

    Returns:
        The generated preference bank object (has ``stats()`` and ``topics``).
    """
    print("\n" + "=" * 60)
    print("STEP 1: Generate Preference Bank")
    print("=" * 60)

    from personalization.evaluation.preference_bank.generator import generate_demo_bank

    os.makedirs(output_dir, exist_ok=True)

    bank_path = os.path.join(output_dir, "preference_bank.json")
    bank = generate_demo_bank(output_path=bank_path, use_llm=False)

    print(f"\nGenerated preference bank with {bank.stats()['total_preferences']} preferences")
    print(f"Topics: {list(bank.topics.keys())}")

    # Show only the first 2 topics x 2 preferences to keep console output short.
    print("\nSample preferences:")
    for topic_name, topic in list(bank.topics.items())[:2]:
        print(f"\n {topic_name}:")
        for pref in topic.preferences[:2]:
            print(f" - When {pref.condition}: {pref.action}")

    return bank
+
+
def run_profile_demo(bank, output_dir: str = "data/eval/demo"):
    """Generate demo user profiles from a preference bank.

    Writes the profiles to ``<output_dir>/user_profiles.json`` with a fixed
    seed so the demo is reproducible.

    Args:
        bank: Preference bank to sample user preferences from.
        output_dir: Directory to write ``user_profiles.json`` into.
            Defaults to the demo location so existing callers are unaffected;
            pass ``args.output_dir`` to honor the ``--output-dir`` flag.

    Returns:
        The list of generated user profiles.
    """
    print("\n" + "=" * 60)
    print("STEP 2: Generate User Profiles")
    print("=" * 60)

    from personalization.evaluation.profiles.generator import generate_demo_profiles

    os.makedirs(output_dir, exist_ok=True)
    profiles_path = os.path.join(output_dir, "user_profiles.json")

    profiles = generate_demo_profiles(
        bank=bank,
        num_users=2,
        prefs_per_user=10,
        output_path=profiles_path,
        seed=42,  # fixed seed keeps the demo deterministic
    )

    print(f"\nGenerated {len(profiles)} user profiles")

    for profile in profiles:
        print(f"\n {profile.user_id}:")
        print(f" Persona: {profile.persona}")
        print(f" Primary topics: {profile.primary_topics}")
        print(f" Num preferences: {len(profile.preferences)}")

    return profiles
+
+
def run_agent_demo(dry_run: bool = True):
    """Smoke-test agent response generation.

    Builds a NoMemoryAgent and sends it one trivial query. In dry-run mode
    the agent gets no API base, so it falls back to canned responses.

    Args:
        dry_run: When True, construct the agent without an LLM endpoint.

    Returns:
        The constructed agent instance.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("STEP 3: Test Agent Responses")
    print(banner)

    from personalization.evaluation.baselines.no_memory import NoMemoryAgent

    # No endpoint in dry-run mode -> agent uses its fallback path.
    endpoint = None if dry_run else "http://localhost:8003/v1"
    agent = NoMemoryAgent(model_name="llama-8b", api_base=endpoint)

    query = "What is 2 + 2?"
    reply = agent.respond(
        user_id="test_user",
        query=query,
        conversation_history=[],
    )

    print(f"\nQuery: {query}")
    print(f"Response: {reply.answer[:200]}...")
    print(f"Debug: {reply.debug_info}")

    return agent
+
+
def run_user_simulator_demo(profiles, dry_run: bool = True):
    """Smoke-test the user simulator with the first demo profile.

    Sets the simulator up with a tiny calculus task, feeds it a one-turn
    conversation, and prints the simulated user's reply.

    Args:
        profiles: Generated user profiles; only ``profiles[0]`` is used.
        dry_run: When True, construct the simulator without an LLM endpoint.

    Returns:
        The configured simulator instance.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("STEP 4: Test User Simulator")
    print(banner)

    from personalization.evaluation.user_simulator.simulator import UserSimulator
    from personalization.evaluation.pipeline.evaluator import Task

    # No endpoint in dry-run mode -> simulator uses its fallback path.
    endpoint = None if dry_run else "http://localhost:8004/v1"
    simulator = UserSimulator(
        model_name="Llama-3.3-70B-Instruct",
        api_base=endpoint,
    )

    first_profile = profiles[0]
    demo_task = Task(
        task_id="test_001",
        dataset="test",
        problem="What is the derivative of x^2?",
        solution="2x",
        task_description="Solve this calculus problem:",
    )

    simulator.setup(
        profile=first_profile,
        task_description=demo_task.task_description,
        problem=demo_task.problem,
        solution=demo_task.solution,
    )

    # Seed the conversation with a single assistant greeting.
    history = [{"role": "assistant", "content": "How can I help you?"}]
    sim_reply = simulator.respond(history)

    print(f"\nUser profile: {first_profile.user_id}")
    print(f"Task: {demo_task.problem}")
    print(f"\nUser response: {sim_reply.response[:200]}...")
    print(f"Enforcement needed: {sim_reply.enforcement_needed}")
    print(f"Draft answer: {sim_reply.draft_answer}")

    return simulator
+
+
def run_full_demo(dry_run: bool = True, output_dir: str = "data/eval/demo"):
    """Run the complete demo experiment through the pipeline runner.

    Configures a small experiment (2 users, 2 tasks each, NoMemory baseline
    only) and executes it end to end.

    Args:
        dry_run: When True, point the agent/simulator at an unreachable
            endpoint so fallback responses are used instead of LLM calls.
        output_dir: Directory the runner writes results into.

    Returns:
        The metrics object produced by the experiment run.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("STEP 5: Run Full Demo Experiment")
    print(banner)

    if dry_run:
        print("\n[DRY RUN MODE] Using fallback responses, no LLM calls\n")

    from personalization.evaluation.pipeline.runner import ExperimentRunner, ExperimentConfig

    # Port 9999 is intentionally unreachable in dry-run mode, forcing fallbacks.
    agent_endpoint = "http://localhost:9999/v1" if dry_run else "http://localhost:8003/v1"
    sim_endpoint = "http://localhost:9999/v1" if dry_run else "http://localhost:8004/v1"

    config = ExperimentConfig(
        name="demo_experiment",
        output_dir=output_dir,
        num_users=2,
        prefs_per_user=10,
        tasks_per_user=2,  # Just 2 tasks for quick demo
        max_turns=10,  # Short conversations
        run_no_memory=True,
        run_rag=False,  # Skip RAG for initial demo (needs more setup)
        run_rag_uv=False,
        agent_api_base=agent_endpoint,
        user_sim_api_base=sim_endpoint,
    )

    runner = ExperimentRunner(config)
    runner.setup()
    return runner.run()
+
+
def main():
    """CLI entry point: parse arguments and run the selected demo step(s).

    Steps "bank" and "profiles" regenerate their artifacts; other steps
    reuse saved artifacts from the output directory when present, falling
    back to regeneration otherwise.
    """
    parser = argparse.ArgumentParser(description="Run evaluation demo")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Run without LLM (uses fallback responses)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/eval/demo",
        help="Output directory for results",
    )
    parser.add_argument(
        "--step",
        type=str,
        choices=["bank", "profiles", "agent", "simulator", "full", "all"],
        default="all",
        help="Which step to run",
    )
    args = parser.parse_args()

    banner = "=" * 60
    mode_label = "DRY RUN (no LLM)" if args.dry_run else "LIVE (requires LLM servers)"
    print("\n" + banner)
    print("PERSONALIZATION EVALUATION DEMO")
    print(banner)
    print(f"Mode: {mode_label}")
    print(f"Output: {args.output_dir}")
    print(banner)

    os.makedirs(args.output_dir, exist_ok=True)
    step = args.step

    # Preference bank: regenerate when requested, else load a saved one.
    if step in ("bank", "all"):
        bank = run_preference_bank_demo()
    else:
        from personalization.evaluation.preference_bank.schemas import PreferenceBank

        bank_path = os.path.join(args.output_dir, "preference_bank.json")
        bank = (
            PreferenceBank.load(bank_path)
            if os.path.exists(bank_path)
            else run_preference_bank_demo()
        )

    # User profiles: same regenerate-or-load pattern.
    if step in ("profiles", "all"):
        profiles = run_profile_demo(bank)
    else:
        from personalization.evaluation.profiles.generator import UserProfileGenerator

        profiles_path = os.path.join(args.output_dir, "user_profiles.json")
        profiles = (
            UserProfileGenerator.load_profiles(profiles_path)
            if os.path.exists(profiles_path)
            else run_profile_demo(bank)
        )

    if step in ("agent", "all"):
        run_agent_demo(dry_run=args.dry_run)

    if step in ("simulator", "all"):
        run_user_simulator_demo(profiles, dry_run=args.dry_run)

    if step in ("full", "all"):
        run_full_demo(dry_run=args.dry_run, output_dir=args.output_dir)

    print("\n" + banner)
    print("DEMO COMPLETE!")
    print(banner)
    print(f"\nResults saved to: {args.output_dir}/")
    print("\nNext steps:")
    print(" 1. Start LLM servers (vLLM/SGLang)")
    print(" 2. Run without --dry-run flag")
    print(" 3. Enable RAG baseline for full comparison")


if __name__ == "__main__":
    main()
+
+