""" Extended prompts for complex preference evaluation. Key changes from original: 1. Step-by-step prompts to make sessions longer (more turns = more preference opportunities) 2. User prompts that both ENFORCE and EXPRESS DISAPPOINTMENT 3. More complex conditional preference handling """ # ============================================================================= # Step-by-Step Encouragement Prompts (Added to Problem Descriptions) # ============================================================================= STEP_BY_STEP_MATH_PROMPT = """ IMPORTANT: This is a challenging problem. Please work through it carefully: 1. First, identify what type of problem this is and what mathematical concepts apply 2. State any assumptions or conditions that are relevant 3. Break down the solution into clear steps, explaining your reasoning at each step 4. After reaching an answer, verify it makes sense (check edge cases, units, etc.) 5. Summarize the key insights from this problem Take your time. Show ALL your work and reasoning. Do not skip steps. """ STEP_BY_STEP_CODE_PROMPT = """ IMPORTANT: This requires careful implementation. Please proceed methodically: 1. First, understand the requirements completely - ask clarifying questions if needed 2. Identify edge cases and constraints before writing any code 3. Outline your approach in pseudocode or plain language 4. Implement step by step, explaining the logic of each component 5. Consider time and space complexity 6. Test your solution with example inputs and edge cases 7. Refactor if there are obvious improvements Show your reasoning throughout. Quality matters more than speed. """ STEP_BY_STEP_REASONING_PROMPT = """ IMPORTANT: This problem requires careful logical reasoning: 1. Read the problem carefully and identify ALL relevant information 2. State any assumptions you're making explicitly 3. Work through the logic step by step, explaining each deduction 4. Check for any logical fallacies or gaps in your reasoning 5. Consider alternative interpretations or approaches 6. State your final conclusion clearly with confidence level Think out loud. Show your complete reasoning process. """ STEP_BY_STEP_SCIENCE_PROMPT = """ IMPORTANT: This is a complex scientific problem: 1. Identify the core concepts and principles involved 2. State any assumptions, constants, or boundary conditions 3. Set up the problem mathematically if applicable 4. Work through the solution systematically 5. Interpret your results - do they make physical/scientific sense? 6. Consider limitations of your approach Be thorough. Scientific problems reward careful, systematic thinking. """ def get_step_by_step_prompt(domain: str) -> str: """Get the appropriate step-by-step prompt for a domain.""" prompts = { "math": STEP_BY_STEP_MATH_PROMPT, "math-hard": STEP_BY_STEP_MATH_PROMPT, "math-500": STEP_BY_STEP_MATH_PROMPT, "aime": STEP_BY_STEP_MATH_PROMPT, "theoremqa": STEP_BY_STEP_MATH_PROMPT, "code": STEP_BY_STEP_CODE_PROMPT, "humaneval": STEP_BY_STEP_CODE_PROMPT, "bigcodebench": STEP_BY_STEP_CODE_PROMPT, "livecodebench": STEP_BY_STEP_CODE_PROMPT, "scicode": STEP_BY_STEP_CODE_PROMPT, "logiqa": STEP_BY_STEP_REASONING_PROMPT, "mmlu": STEP_BY_STEP_REASONING_PROMPT, "medqa": STEP_BY_STEP_SCIENCE_PROMPT, "gpqa": STEP_BY_STEP_SCIENCE_PROMPT, } return prompts.get(domain, STEP_BY_STEP_REASONING_PROMPT) # ============================================================================= # Enhanced User Simulator Prompt (Supports Both Enforcement and Disappointment) # ============================================================================= USER_SIMULATOR_PROMPT_COMPLEX = """ You are a user simulator collaborating with an agent to solve a problem. You will be provided with a problem description, and you must get the agent to help you solve it. You have specific interaction preferences that you MUST enforce throughout the conversation. # Problem Description {user_task_description} {problem} {step_by_step_prompt} Note: the agent cannot see this problem description or your preferences. # User Persona {user_persona} # User Preferences (CONDITIONAL - Apply Based on Situation) {user_preferences} These preferences are CONDITIONAL - they apply in specific situations. When multiple preferences COULD apply, choose the one whose condition best matches the current context. # Preference Enforcement Protocol You have TWO modes of signaling preference violations: ## Mode 1: EXPLICIT ENFORCEMENT (Strong Signal) Use when the agent clearly violates a preference: - State your preference directly: "I prefer X, can you adjust your response?" - Be specific about what needs to change - Do NOT update your draft answer - Do NOT proceed until corrected ## Mode 2: DISAPPOINTMENT EXPRESSION (Mild Signal) Use when the response is suboptimal but not a clear violation: - Express mild dissatisfaction: "Hmm, that's not quite what I was looking for..." - Hint at what you wanted: "I was hoping for something more like..." - You MAY update draft answer if the information is still useful - Give the agent a chance to improve Choose Mode 1 for clear violations, Mode 2 for subtle misalignments. # Draft Answer Management - Maintain a working draft starting with "I don't know" - Update ONLY when: 1. Agent provides helpful information, AND 2. Agent follows your preferences (or you're using Mode 2 and info is still useful) - When using Mode 1 enforcement: NEVER update draft, NEVER proceed # Making Sessions Longer (Important!) - Break the problem into parts - don't try to solve everything at once - Ask clarifying questions even if you understand - Request explanations of intermediate steps - Ask "why" and "how" frequently - Request verification of partial solutions - This creates more opportunities to reveal and enforce preferences! # Conversation Termination Only terminate when: 1. Your draft answer fully solves the problem, OR 2. You've tried multiple times and the agent cannot help When ready to terminate, respond with "{termination_signal}". # Output Format: {{ "preference_reasoning": str, # Which preferences apply to this situation and why? "enforcement_mode": str, # "none", "mode_1_enforce", or "mode_2_disappointment" "enforcement_reason": str, # If enforcing, why and which preference? "draft_answer": str, # Current working answer "should_terminate": bool, "response": str # Your response to the agent }} Output a valid JSON object. Use double quotes, escape special characters properly. """ # ============================================================================= # LLM Judge Prompts # ============================================================================= CONFLICT_RESOLUTION_JUDGE_PROMPT = """ You are an expert judge evaluating whether a conversational agent correctly resolved a preference conflict. # Situation The user has the following CONDITIONAL preferences: {user_preferences} # The Query {query} # Conflict Analysis This query triggers a CONFLICT between these preferences: - Preference A: {preference_a} - Preference B: {preference_b} The CORRECT preference to apply is: {correct_preference} Reason: {resolution_reason} # Agent's Response {agent_response} # Your Task Evaluate whether the agent's response correctly applied the right preference. Consider: 1. Did the agent follow Preference A, Preference B, both, or neither? 2. Was the agent's choice appropriate for the context of the query? 3. Did trying to satisfy both preferences create an inconsistent response? # Output Format: {{ "preference_followed": str, # "A", "B", "both", "neither" "correct_choice_made": bool, # True if agent followed the correct preference "reasoning": str, # Explanation of your judgment "score": float # 0.0 = wrong, 0.5 = partial, 1.0 = correct }} Output valid JSON only. """ PREFERENCE_ADHERENCE_JUDGE_PROMPT = """ You are an expert judge evaluating whether a conversational agent adhered to user preferences. # User Preferences {user_preferences} # Query {query} # Agent Response {agent_response} # Your Task For EACH preference, determine if the agent adhered to it (if applicable). # Output Format: {{ "preferences_evaluation": [ {{ "preference_id": str, "applicable": bool, # Is this preference relevant to this query? "adhered": bool, # If applicable, did the agent follow it? "evidence": str # Quote from response showing adherence or violation }}, ... ], "overall_adherence_score": float, # 0.0 to 1.0 "violated_preferences": [str], # List of preference IDs violated "reasoning": str }} Output valid JSON only. """ TASK_ACCURACY_JUDGE_PROMPT = """ You are an expert judge evaluating whether a user's final answer is correct. # Problem {problem} # Ground Truth Solution {ground_truth} # User's Final Draft Answer {user_answer} # Your Task Determine if the user's answer is correct. Be lenient on formatting but strict on substance. For math problems: The numerical answer must be correct. For code problems: The logic must be correct (minor syntax issues OK). For reasoning problems: The conclusion and key reasoning must be correct. # Output Format: {{ "is_correct": bool, "correctness_score": float, # 0.0 = wrong, 0.5 = partial, 1.0 = correct "reasoning": str, "key_errors": [str] # List any errors found }} Output valid JSON only. """ USER_EFFORT_ANALYSIS_PROMPT = """ You are analyzing user effort in a conversation. # Conversation {conversation} # User Preferences {user_preferences} # Your Task Count and categorize user effort: 1. **Explicit Enforcements**: User directly states their preference - Example: "Please use bullet points like I asked" 2. **Disappointment Expressions**: User expresses dissatisfaction without explicit enforcement - Example: "Hmm, that's not quite what I was hoping for" 3. **Clarification Requests**: User asks for clarification due to misalignment - Example: "Could you explain that differently?" 4. **Repetitions**: User repeats information they already provided # Output Format: {{ "explicit_enforcements": int, "disappointment_expressions": int, "clarification_requests": int, "repetitions": int, "total_user_effort": int, "enforcement_details": [ {{"turn": int, "type": str, "quote": str}}, ... ] }} Output valid JSON only. """ # ============================================================================= # LLM Judge Implementation # ============================================================================= class LLMJudge: """ LLM-based judge for evaluation. Uses a powerful model (e.g., Llama-70B) to evaluate: - Conflict resolution accuracy - Preference adherence - Task accuracy - User effort analysis """ def __init__(self, model_name: str = "meta-llama/Llama-3.3-70B-Instruct"): self.model_name = model_name self._model = None def _get_model(self): """Lazy load model.""" if self._model is None: try: import litellm self._model = litellm except ImportError: raise ImportError("litellm required for LLM judge") return self._model def _call_llm(self, prompt: str) -> str: """Call the LLM and get response.""" model = self._get_model() response = model.completion( model=self.model_name, messages=[{"role": "user", "content": prompt}], temperature=0.0, # Deterministic for evaluation max_tokens=2048, ) return response.choices[0].message.content def _parse_json(self, text: str) -> dict: """Parse JSON from LLM response.""" import json import re # Try direct parse try: return json.loads(text) except json.JSONDecodeError: pass # Try to extract JSON from markdown code block match = re.search(r'```(?:json)?\s*([\s\S]*?)```', text) if match: try: return json.loads(match.group(1)) except json.JSONDecodeError: pass # Try to find JSON object match = re.search(r'\{[\s\S]*\}', text) if match: try: return json.loads(match.group()) except json.JSONDecodeError: pass return {"error": "Failed to parse JSON", "raw": text} def judge_conflict_resolution( self, query: str, agent_response: str, user_preferences: list, preference_a: dict, preference_b: dict, correct_preference: str, resolution_reason: str ) -> dict: """Judge whether agent correctly resolved a preference conflict.""" prompt = CONFLICT_RESOLUTION_JUDGE_PROMPT.format( user_preferences="\n".join([f"- {p}" for p in user_preferences]), query=query, preference_a=f"{preference_a['condition']}: {preference_a['action']}", preference_b=f"{preference_b['condition']}: {preference_b['action']}", correct_preference=correct_preference, resolution_reason=resolution_reason, agent_response=agent_response ) response = self._call_llm(prompt) return self._parse_json(response) def judge_preference_adherence( self, query: str, agent_response: str, user_preferences: list ) -> dict: """Judge whether agent adhered to user preferences.""" pref_str = "\n".join([ f"- [{p.get('pref_id', i)}] When {p['condition']}: {p['action']}" for i, p in enumerate(user_preferences) ]) prompt = PREFERENCE_ADHERENCE_JUDGE_PROMPT.format( user_preferences=pref_str, query=query, agent_response=agent_response ) response = self._call_llm(prompt) return self._parse_json(response) def judge_task_accuracy( self, problem: str, ground_truth: str, user_answer: str ) -> dict: """Judge whether user's final answer is correct.""" prompt = TASK_ACCURACY_JUDGE_PROMPT.format( problem=problem, ground_truth=ground_truth, user_answer=user_answer ) response = self._call_llm(prompt) return self._parse_json(response) def analyze_user_effort( self, conversation: list, user_preferences: list ) -> dict: """Analyze user effort in a conversation.""" conv_str = "\n".join([ f"{msg['role'].upper()}: {msg['content']}" for msg in conversation ]) pref_str = "\n".join([ f"- When {p['condition']}: {p['action']}" for p in user_preferences ]) prompt = USER_EFFORT_ANALYSIS_PROMPT.format( conversation=conv_str, user_preferences=pref_str ) response = self._call_llm(prompt) return self._parse_json(response) # ============================================================================= # Helper Functions # ============================================================================= def enhance_problem_with_step_by_step(problem: str, domain: str) -> str: """Add step-by-step prompt to a problem description.""" step_prompt = get_step_by_step_prompt(domain) return f"{problem}\n\n{step_prompt}" def format_preferences_for_user_prompt(preferences: list) -> str: """Format conditional preferences for the user prompt.""" formatted = [] for i, pref in enumerate(preferences): condition = pref.get('condition', 'always') action = pref.get('action', '') conflict_group = pref.get('conflict_group') line = f"{i+1}. **When {condition}**: {action}" if conflict_group: line += f" [Group: {conflict_group}]" formatted.append(line) return "\n".join(formatted)