#!/usr/bin/env python3
"""
Demo Runner Script

A minimal demo to verify the evaluation pipeline works:
- Generates preference bank (5 topics × 5 prefs = 25 total)
- Creates 2 user profiles (10 prefs each)
- Runs 3 tasks per user
- Compares T1 (NoMemory) vs Y3 (RAG) agents

Usage:
    # With LLM servers running:
    python run_demo.py

    # Dry run (no LLM, uses fallback responses):
    python run_demo.py --dry-run

    # Specify output directory:
    python run_demo.py --output-dir /path/to/output
"""

import argparse
import os
import sys

# Add src to path.
# abspath() is required here: without it the sys.path entry (and the
# dedup membership check below) is relative to the invocation cwd and
# silently breaks if the process later changes its working directory.
_src_path = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "../../../..")
)
if _src_path not in sys.path:
    sys.path.insert(0, _src_path)


def run_preference_bank_demo():
    """Generate and display a demo preference bank.

    Returns:
        The generated preference bank object (has ``.stats()`` and
        ``.topics`` — presumably a ``PreferenceBank``; confirm against
        ``generate_demo_bank``'s return annotation).
    """
    print("\n" + "=" * 60)
    print("STEP 1: Generate Preference Bank")
    print("=" * 60)

    # Imported lazily so the sys.path insertion above has already run.
    from personalization.evaluation.preference_bank.generator import generate_demo_bank

    output_dir = "data/eval/demo"
    os.makedirs(output_dir, exist_ok=True)

    bank_path = os.path.join(output_dir, "preference_bank.json")
    # use_llm=False keeps this step runnable without any model server.
    bank = generate_demo_bank(output_path=bank_path, use_llm=False)

    print(f"\nGenerated preference bank with {bank.stats()['total_preferences']} preferences")
    print(f"Topics: {list(bank.topics.keys())}")

    # Show sample preferences (first 2 topics, first 2 prefs each).
    print("\nSample preferences:")
    for topic_name, topic in list(bank.topics.items())[:2]:
        print(f"\n {topic_name}:")
        for pref in topic.preferences[:2]:
            print(f" - When {pref.condition}: {pref.action}")

    return bank


def run_profile_demo(bank):
    """Generate demo user profiles drawn from *bank*.

    Args:
        bank: Preference bank produced by :func:`run_preference_bank_demo`.

    Returns:
        List of generated user profile objects.
    """
    print("\n" + "=" * 60)
    print("STEP 2: Generate User Profiles")
    print("=" * 60)

    from personalization.evaluation.profiles.generator import generate_demo_profiles

    output_dir = "data/eval/demo"
    profiles_path = os.path.join(output_dir, "user_profiles.json")

    # Fixed seed so repeated demo runs produce the same profiles.
    profiles = generate_demo_profiles(
        bank=bank,
        num_users=2,
        prefs_per_user=10,
        output_path=profiles_path,
        seed=42,
    )

    print(f"\nGenerated {len(profiles)} user profiles")
    for profile in profiles:
        print(f"\n {profile.user_id}:")
        print(f" Persona: {profile.persona}")
        print(f" Primary topics: {profile.primary_topics}")
        print(f" Num preferences: {len(profile.preferences)}")

    return profiles


def run_agent_demo(dry_run: bool = True):
    """Test agent response generation with the NoMemory baseline.

    Args:
        dry_run: When True, no ``api_base`` is passed so the agent falls
            back to canned responses instead of calling an LLM server.

    Returns:
        The constructed agent instance.
    """
    print("\n" + "=" * 60)
    print("STEP 3: Test Agent Responses")
    print("=" * 60)

    from personalization.evaluation.baselines.no_memory import NoMemoryAgent

    # Create agent (will use fallback if no LLM available)
    agent = NoMemoryAgent(
        model_name="llama-8b",
        api_base="http://localhost:8003/v1" if not dry_run else None,
    )

    # Test response
    test_query = "What is 2 + 2?"
    response = agent.respond(
        user_id="test_user",
        query=test_query,
        conversation_history=[],
    )

    print(f"\nQuery: {test_query}")
    print(f"Response: {response.answer[:200]}...")
    print(f"Debug: {response.debug_info}")

    return agent


def run_user_simulator_demo(profiles, dry_run: bool = True):
    """Test the user simulator on a toy calculus task.

    Args:
        profiles: Profiles from :func:`run_profile_demo`; the first one
            is used to configure the simulator.
        dry_run: When True, no ``api_base`` is passed so the simulator
            uses fallback responses instead of an LLM server.

    Returns:
        The configured simulator instance.
    """
    print("\n" + "=" * 60)
    print("STEP 4: Test User Simulator")
    print("=" * 60)

    from personalization.evaluation.user_simulator.simulator import UserSimulator
    from personalization.evaluation.pipeline.evaluator import Task

    # Create simulator
    simulator = UserSimulator(
        model_name="Llama-3.3-70B-Instruct",
        api_base="http://localhost:8004/v1" if not dry_run else None,
    )

    # Setup with first profile
    profile = profiles[0]
    task = Task(
        task_id="test_001",
        dataset="test",
        problem="What is the derivative of x^2?",
        solution="2x",
        task_description="Solve this calculus problem:",
    )

    simulator.setup(
        profile=profile,
        task_description=task.task_description,
        problem=task.problem,
        solution=task.solution,
    )

    # Simulate first turn: the assistant opens, the simulated user replies.
    conversation = [
        {"role": "assistant", "content": "How can I help you?"}
    ]
    response = simulator.respond(conversation)

    print(f"\nUser profile: {profile.user_id}")
    print(f"Task: {task.problem}")
    print(f"\nUser response: {response.response[:200]}...")
    print(f"Enforcement needed: {response.enforcement_needed}")
    print(f"Draft answer: {response.draft_answer}")

    return simulator


def run_full_demo(dry_run: bool = True, output_dir: str = "data/eval/demo"):
    """Run the complete demo experiment end to end.

    Args:
        dry_run: When True, both API bases point at an unused port
            (9999) so the runner's fallback paths are exercised instead
            of live LLM calls.
        output_dir: Directory where experiment artifacts are written.

    Returns:
        Metrics object returned by the experiment runner.
    """
    print("\n" + "=" * 60)
    print("STEP 5: Run Full Demo Experiment")
    print("=" * 60)

    if dry_run:
        print("\n[DRY RUN MODE] Using fallback responses, no LLM calls\n")

    from personalization.evaluation.pipeline.runner import ExperimentRunner, ExperimentConfig

    config = ExperimentConfig(
        name="demo_experiment",
        output_dir=output_dir,
        num_users=2,
        prefs_per_user=10,
        tasks_per_user=2,  # Just 2 tasks for quick demo
        max_turns=10,      # Short conversations
        run_no_memory=True,
        run_rag=False,     # Skip RAG for initial demo (needs more setup)
        run_rag_uv=False,
        # Port 9999 is a deliberately-dead endpoint for dry runs; the
        # config presumably requires a string here rather than None.
        agent_api_base="http://localhost:8003/v1" if not dry_run else "http://localhost:9999/v1",
        user_sim_api_base="http://localhost:8004/v1" if not dry_run else "http://localhost:9999/v1",
    )

    runner = ExperimentRunner(config)
    runner.setup()
    metrics = runner.run()

    return metrics


def main():
    """Parse CLI arguments and dispatch the requested demo step(s)."""
    parser = argparse.ArgumentParser(description="Run evaluation demo")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Run without LLM (uses fallback responses)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/eval/demo",
        help="Output directory for results",
    )
    parser.add_argument(
        "--step",
        type=str,
        choices=["bank", "profiles", "agent", "simulator", "full", "all"],
        default="all",
        help="Which step to run",
    )
    args = parser.parse_args()

    print("\n" + "=" * 60)
    print("PERSONALIZATION EVALUATION DEMO")
    print("=" * 60)
    print(f"Mode: {'DRY RUN (no LLM)' if args.dry_run else 'LIVE (requires LLM servers)'}")
    print(f"Output: {args.output_dir}")
    print("=" * 60)

    os.makedirs(args.output_dir, exist_ok=True)

    if args.step in ["bank", "all"]:
        bank = run_preference_bank_demo()
    else:
        # Load existing bank; regenerate it if none has been saved yet.
        from personalization.evaluation.preference_bank.schemas import PreferenceBank
        bank_path = os.path.join(args.output_dir, "preference_bank.json")
        if os.path.exists(bank_path):
            bank = PreferenceBank.load(bank_path)
        else:
            bank = run_preference_bank_demo()

    if args.step in ["profiles", "all"]:
        profiles = run_profile_demo(bank)
    else:
        # Load existing profiles; regenerate if none have been saved yet.
        from personalization.evaluation.profiles.generator import UserProfileGenerator
        profiles_path = os.path.join(args.output_dir, "user_profiles.json")
        if os.path.exists(profiles_path):
            profiles = UserProfileGenerator.load_profiles(profiles_path)
        else:
            profiles = run_profile_demo(bank)

    if args.step in ["agent", "all"]:
        run_agent_demo(dry_run=args.dry_run)

    if args.step in ["simulator", "all"]:
        run_user_simulator_demo(profiles, dry_run=args.dry_run)

    if args.step in ["full", "all"]:
        run_full_demo(dry_run=args.dry_run, output_dir=args.output_dir)

    print("\n" + "=" * 60)
    print("DEMO COMPLETE!")
    print("=" * 60)
    print(f"\nResults saved to: {args.output_dir}/")
    print("\nNext steps:")
    print(" 1. Start LLM servers (vLLM/SGLang)")
    print(" 2. Run without --dry-run flag")
    print(" 3. Enable RAG baseline for full comparison")


if __name__ == "__main__":
    main()