#!/usr/bin/env python3
"""
Minimal test script to validate that multi-turn conversations work correctly.

This runs a single profile with a single session to verify:
1. LocalUserAgent loads and generates responses
2. The multi-turn conversation loop works
3. Metrics are properly extracted
"""

import sys
import json
from pathlib import Path

# Make repo-level packages importable
sys.path.insert(0, str(Path(__file__).parent.parent))
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from agents.local_user_agent import LocalUserAgent, SharedLocalUserAgent, TERMINATION_SIGNAL


def test_user_agent_standalone():
    """Test LocalUserAgent in isolation."""
    print("=" * 60)
    print("TEST 1: LocalUserAgent Standalone")
    print("=" * 60)

    user_agent = LocalUserAgent(
        user_task_description="Help solve a math problem",
        problem="What is 2 + 2?",
        user_persona="A student learning math",
        user_preferences="- Show step-by-step solutions\n- Use simple language",
    )

    # Simulate a conversation that the assistant has just opened
    conversation = [{"role": "assistant", "content": "How can I help you today?"}]

    print("\nGenerating user response...")
    response = user_agent.generate_user_response(conversation)

    if response:
        print(f"SUCCESS! User response: {response.get('response', 'N/A')[:200]}...")
        print(f"Should terminate: {response.get('should_terminate', 'N/A')}")
        print(f"Draft answer: {response.get('draft_answer', 'N/A')[:100]}...")
        return True
    else:
        print("FAILED! User agent returned None")
        return False


def test_multiturn_conversation():
    """Test a full multi-turn conversation with an agent adapter."""
    print("\n" + "=" * 60)
    print("TEST 2: Multi-turn Conversation")
    print("=" * 60)

    from adapters.personalized_llm_adapter import create_baseline_adapter

    # Create a simple agent adapter (vanilla mode)
    print("\nCreating vanilla adapter...")
    adapter = create_baseline_adapter("vanilla")
    adapter.initialize()

    # Load a test profile
    profile_path = Path(__file__).parent.parent / "data/complex_profiles_v2/profiles_100.jsonl"
    with open(profile_path) as f:
        profile = json.loads(f.readline())
    print(f"Loaded profile: {profile.get('user_id', 'unknown')}")

    # Create user agent
    problem = "What is 15% of 80?"
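    # Build the user-preference prompt from the first three profile preferences
    # (this assumes the profile JSON carries a "preferences" list of strings).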
    user_prefs = profile.get("preferences", [])[:3]
    pref_str = "\n".join([f"- {p}" for p in user_prefs])
    print(f"\nUser preferences:\n{pref_str}")

    user_agent = SharedLocalUserAgent(
        user_task_description="Solve the math problem",
        problem=problem,
        user_persona=profile.get("persona", "A user"),
        user_preferences=pref_str,
    )

    # Start session
    adapter.start_session(user_id=profile.get("user_id", "test"))

    # Run multi-turn conversation
    conversation = [{"role": "assistant", "content": "How can I help you today?"}]
    turns = []
    max_turns = 5

    print(f"\nStarting {max_turns}-turn conversation...")
    for turn_num in range(max_turns):
        print(f"\n--- Turn {turn_num + 1} ---")

        # User turn
        user_response = user_agent.generate_user_response(conversation)
        if user_response is None:
            print("User agent failed!")
            break

        user_msg = user_response.get("response", "")
        print(f"USER: {user_msg[:150]}...")
        conversation.append({"role": "user", "content": user_msg})
        turns.append({"role": "user", "content": user_msg})

        # Check termination
        if user_response.get("should_terminate", False) or TERMINATION_SIGNAL in user_msg:
            print("\n[User terminated conversation]")
            break

        # Agent turn (the history passed in excludes the user message just appended)
        response = adapter.generate_response(user_msg, conversation[:-1])
        agent_msg = (
            response.get("response", str(response))
            if isinstance(response, dict)
            else str(response)
        )
        print(f"AGENT: {agent_msg[:150]}...")
        conversation.append({"role": "assistant", "content": agent_msg})
        turns.append({"role": "assistant", "content": agent_msg})

    # End session
    adapter.end_session()

    print("\n--- Results ---")
    print(f"Total turns: {len(turns)}")
    print(f"User turns: {len([t for t in turns if t['role'] == 'user'])}")
    print(f"Agent turns: {len([t for t in turns if t['role'] == 'assistant'])}")

    return len(turns) > 2  # Success requires more than a single user/agent exchange


def test_full_session():
    """Test run_single_session from ExperimentRunner."""
    print("\n" + "=" * 60)
    print("TEST 3: Full run_single_session")
    print("=" * 60)

    from run_experiments import ExperimentRunner, ExperimentConfig
    from adapters.personalized_llm_adapter import create_baseline_adapter

    config = ExperimentConfig(
        methods=["vanilla"],
        datasets=["math-500"],
        n_profiles=1,
        n_sessions_per_profile=1,
        max_turns_per_session=5,
        output_dir="/tmp/test_multiturn",
        profile_path=str(Path(__file__).parent.parent / "data/complex_profiles_v2/profiles_100.jsonl"),
    )

    print("\nCreating ExperimentRunner...")
    runner = ExperimentRunner(config)

    # Get the first profile and problem
    profile = runner.profiles[0]
    dataset = list(runner.datasets.values())[0]
    sample = dataset.get_testset()[0]
    problem = {
        "problem": sample.problem,
        "solution": sample.solution,
        "problem_id": sample.problem_id,
        "domain": sample.domain,
    }

    print("\nRunning single session...")
    print(f"Profile: {profile.get('user_id', 'unknown')}")
    print(f"Problem: {problem['problem'][:100]}...")

    # Create adapter
    adapter = create_baseline_adapter("vanilla")
    adapter.initialize()

    result = runner.run_single_session(
        method="vanilla",
        profile=profile,
        problem=problem,
        is_conflict_query=False,
        adapter=adapter,
    )

    print("\n--- Session Results ---")
    print(f"Total turns: {result['metrics']['total_turns']}")
    print(f"Task success: {result['metrics']['task_success']}")
    print(f"Enforcement count: {result['metrics']['enforcement_count']}")
    print(f"User tokens: {result['metrics']['user_token_count']}")
    print(f"Agent tokens: {result['metrics']['agent_token_count']}")
    print(f"Compliance scores: {result['metrics']['preference_compliance_scores']}")

    if result['conversation']:
        print(f"\nConversation ({len(result['conversation']['turns'])} messages):")
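        # Preview only the first six messages, truncated, to keep output short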
        for turn in result['conversation']['turns'][:6]:
            print(f"  [{turn['role']}]: {turn['content'][:80]}...")

    return result['metrics']['total_turns'] > 2


if __name__ == "__main__":
    print("\n" + "=" * 60)
    print("MULTI-TURN CONVERSATION VALIDATION TEST")
    print("=" * 60)

    results = {}

    # Test 1: User agent standalone
    try:
        results["user_agent"] = test_user_agent_standalone()
    except Exception as e:
        print(f"TEST 1 FAILED: {e}")
        import traceback
        traceback.print_exc()
        results["user_agent"] = False

    # Test 2: Multi-turn conversation
    try:
        results["multiturn"] = test_multiturn_conversation()
    except Exception as e:
        print(f"TEST 2 FAILED: {e}")
        import traceback
        traceback.print_exc()
        results["multiturn"] = False

    # Test 3: Full session (only if test 2 passed)
    if results.get("multiturn", False):
        try:
            results["full_session"] = test_full_session()
        except Exception as e:
            print(f"TEST 3 FAILED: {e}")
            import traceback
            traceback.print_exc()
            results["full_session"] = False
    else:
        print("\nSkipping TEST 3 (TEST 2 failed)")
        results["full_session"] = False

    # Summary
    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)
    for test_name, passed in results.items():
        status = "PASS" if passed else "FAIL"
        print(f"  {test_name}: {status}")

    all_passed = all(results.values())
    print(f"\nOverall: {'ALL TESTS PASSED' if all_passed else 'SOME TESTS FAILED'}")
    sys.exit(0 if all_passed else 1)
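# Usage: run this file directly with Python 3; the sys.path inserts above make
# the repo-relative imports resolve regardless of the working directory.
# The exit code is 0 only when all three tests pass, so CI can gate on it.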