summaryrefslogtreecommitdiff
path: root/collaborativeagents/prompts_extended.py
diff options
context:
space:
mode:
Diffstat (limited to 'collaborativeagents/prompts_extended.py')
-rw-r--r--collaborativeagents/prompts_extended.py500
1 files changed, 500 insertions, 0 deletions
diff --git a/collaborativeagents/prompts_extended.py b/collaborativeagents/prompts_extended.py
new file mode 100644
index 0000000..fbb9cad
--- /dev/null
+++ b/collaborativeagents/prompts_extended.py
@@ -0,0 +1,500 @@
+"""
+Extended prompts for complex preference evaluation.
+
+Key changes from original:
+1. Step-by-step prompts to make sessions longer (more turns = more preference opportunities)
+2. User prompts that both ENFORCE and EXPRESS DISAPPOINTMENT
+3. More complex conditional preference handling
+"""
+
+# =============================================================================
+# Step-by-Step Encouragement Prompts (Added to Problem Descriptions)
+# =============================================================================
+
STEP_BY_STEP_MATH_PROMPT = """
IMPORTANT: This is a challenging problem. Please work through it carefully:

1. First, identify what type of problem this is and what mathematical concepts apply
2. State any assumptions or conditions that are relevant
3. Break down the solution into clear steps, explaining your reasoning at each step
4. After reaching an answer, verify it makes sense (check edge cases, units, etc.)
5. Summarize the key insights from this problem

Take your time. Show ALL your work and reasoning. Do not skip steps.
"""

STEP_BY_STEP_CODE_PROMPT = """
IMPORTANT: This requires careful implementation. Please proceed methodically:

1. First, understand the requirements completely - ask clarifying questions if needed
2. Identify edge cases and constraints before writing any code
3. Outline your approach in pseudocode or plain language
4. Implement step by step, explaining the logic of each component
5. Consider time and space complexity
6. Test your solution with example inputs and edge cases
7. Refactor if there are obvious improvements

Show your reasoning throughout. Quality matters more than speed.
"""

STEP_BY_STEP_REASONING_PROMPT = """
IMPORTANT: This problem requires careful logical reasoning:

1. Read the problem carefully and identify ALL relevant information
2. State any assumptions you're making explicitly
3. Work through the logic step by step, explaining each deduction
4. Check for any logical fallacies or gaps in your reasoning
5. Consider alternative interpretations or approaches
6. State your final conclusion clearly with confidence level

Think out loud. Show your complete reasoning process.
"""

STEP_BY_STEP_SCIENCE_PROMPT = """
IMPORTANT: This is a complex scientific problem:

1. Identify the core concepts and principles involved
2. State any assumptions, constants, or boundary conditions
3. Set up the problem mathematically if applicable
4. Work through the solution systematically
5. Interpret your results - do they make physical/scientific sense?
6. Consider limitations of your approach

Be thorough. Scientific problems reward careful, systematic thinking.
"""


def get_step_by_step_prompt(domain: str) -> str:
    """Return the step-by-step prompt appropriate for a benchmark domain.

    Matching is case-insensitive and tolerant of surrounding whitespace
    (so "MATH" or " math " resolve like "math"); all previously accepted
    lowercase keys behave exactly as before. Unknown domains fall back to
    the general reasoning prompt.
    """
    # Group the benchmark aliases by the prompt they map to, then expand
    # into a flat alias -> prompt lookup. This avoids repeating each
    # prompt constant once per alias.
    domain_groups = {
        STEP_BY_STEP_MATH_PROMPT: ("math", "math-hard", "math-500", "aime", "theoremqa"),
        STEP_BY_STEP_CODE_PROMPT: ("code", "humaneval", "bigcodebench", "livecodebench", "scicode"),
        STEP_BY_STEP_REASONING_PROMPT: ("logiqa", "mmlu"),
        STEP_BY_STEP_SCIENCE_PROMPT: ("medqa", "gpqa"),
    }
    prompts = {
        alias: prompt
        for prompt, aliases in domain_groups.items()
        for alias in aliases
    }
    return prompts.get(domain.strip().lower(), STEP_BY_STEP_REASONING_PROMPT)
+
+
+# =============================================================================
+# Enhanced User Simulator Prompt (Supports Both Enforcement and Disappointment)
+# =============================================================================
+
# Template for the user-simulator system prompt. Filled via str.format with:
# {user_task_description}, {problem}, {step_by_step_prompt}, {user_persona},
# {user_preferences}, {termination_signal}. The doubled braces ({{ ... }})
# around the output-format schema escape str.format so the JSON skeleton
# reaches the model literally.
USER_SIMULATOR_PROMPT_COMPLEX = """
You are a user simulator collaborating with an agent to solve a problem. You will be provided with a
problem description, and you must get the agent to help you solve it. You have specific interaction
preferences that you MUST enforce throughout the conversation.

# Problem Description
{user_task_description}
{problem}

{step_by_step_prompt}

Note: the agent cannot see this problem description or your preferences.

# User Persona
{user_persona}

# User Preferences (CONDITIONAL - Apply Based on Situation)
{user_preferences}

These preferences are CONDITIONAL - they apply in specific situations. When multiple preferences
COULD apply, choose the one whose condition best matches the current context.

# Preference Enforcement Protocol

You have TWO modes of signaling preference violations:

## Mode 1: EXPLICIT ENFORCEMENT (Strong Signal)
Use when the agent clearly violates a preference:
- State your preference directly: "I prefer X, can you adjust your response?"
- Be specific about what needs to change
- Do NOT update your draft answer
- Do NOT proceed until corrected

## Mode 2: DISAPPOINTMENT EXPRESSION (Mild Signal)
Use when the response is suboptimal but not a clear violation:
- Express mild dissatisfaction: "Hmm, that's not quite what I was looking for..."
- Hint at what you wanted: "I was hoping for something more like..."
- You MAY update draft answer if the information is still useful
- Give the agent a chance to improve

Choose Mode 1 for clear violations, Mode 2 for subtle misalignments.

# Draft Answer Management
- Maintain a working draft starting with "I don't know"
- Update ONLY when:
  1. Agent provides helpful information, AND
  2. Agent follows your preferences (or you're using Mode 2 and info is still useful)
- When using Mode 1 enforcement: NEVER update draft, NEVER proceed

# Making Sessions Longer (Important!)
- Break the problem into parts - don't try to solve everything at once
- Ask clarifying questions even if you understand
- Request explanations of intermediate steps
- Ask "why" and "how" frequently
- Request verification of partial solutions
- This creates more opportunities to reveal and enforce preferences!

# Conversation Termination
Only terminate when:
1. Your draft answer fully solves the problem, OR
2. You've tried multiple times and the agent cannot help

When ready to terminate, respond with "{termination_signal}".

# Output Format:
{{
    "preference_reasoning": str,  # Which preferences apply to this situation and why?
    "enforcement_mode": str,  # "none", "mode_1_enforce", or "mode_2_disappointment"
    "enforcement_reason": str,  # If enforcing, why and which preference?
    "draft_answer": str,  # Current working answer
    "should_terminate": bool,
    "response": str  # Your response to the agent
}}

Output a valid JSON object. Use double quotes, escape special characters properly.
"""
+
+
+# =============================================================================
+# LLM Judge Prompts
+# =============================================================================
+
# Judge template: did the agent pick the right side of a preference conflict?
# Filled via str.format with {user_preferences}, {query}, {preference_a},
# {preference_b}, {correct_preference}, {resolution_reason}, {agent_response}.
# Doubled braces escape the literal JSON schema from str.format.
CONFLICT_RESOLUTION_JUDGE_PROMPT = """
You are an expert judge evaluating whether a conversational agent correctly resolved a preference conflict.

# Situation
The user has the following CONDITIONAL preferences:
{user_preferences}

# The Query
{query}

# Conflict Analysis
This query triggers a CONFLICT between these preferences:
- Preference A: {preference_a}
- Preference B: {preference_b}

The CORRECT preference to apply is: {correct_preference}
Reason: {resolution_reason}

# Agent's Response
{agent_response}

# Your Task
Evaluate whether the agent's response correctly applied the right preference.

Consider:
1. Did the agent follow Preference A, Preference B, both, or neither?
2. Was the agent's choice appropriate for the context of the query?
3. Did trying to satisfy both preferences create an inconsistent response?

# Output Format:
{{
    "preference_followed": str,  # "A", "B", "both", "neither"
    "correct_choice_made": bool,  # True if agent followed the correct preference
    "reasoning": str,  # Explanation of your judgment
    "score": float  # 0.0 = wrong, 0.5 = partial, 1.0 = correct
}}

Output valid JSON only.
"""


# Judge template: per-preference adherence check for a single agent turn.
# Filled via str.format with {user_preferences}, {query}, {agent_response}.
PREFERENCE_ADHERENCE_JUDGE_PROMPT = """
You are an expert judge evaluating whether a conversational agent adhered to user preferences.

# User Preferences
{user_preferences}

# Query
{query}

# Agent Response
{agent_response}

# Your Task
For EACH preference, determine if the agent adhered to it (if applicable).

# Output Format:
{{
    "preferences_evaluation": [
        {{
            "preference_id": str,
            "applicable": bool,  # Is this preference relevant to this query?
            "adhered": bool,  # If applicable, did the agent follow it?
            "evidence": str  # Quote from response showing adherence or violation
        }},
        ...
    ],
    "overall_adherence_score": float,  # 0.0 to 1.0
    "violated_preferences": [str],  # List of preference IDs violated
    "reasoning": str
}}

Output valid JSON only.
"""


# Judge template: grade the user's final draft answer against ground truth.
# Filled via str.format with {problem}, {ground_truth}, {user_answer}.
TASK_ACCURACY_JUDGE_PROMPT = """
You are an expert judge evaluating whether a user's final answer is correct.

# Problem
{problem}

# Ground Truth Solution
{ground_truth}

# User's Final Draft Answer
{user_answer}

# Your Task
Determine if the user's answer is correct. Be lenient on formatting but strict on substance.

For math problems: The numerical answer must be correct.
For code problems: The logic must be correct (minor syntax issues OK).
For reasoning problems: The conclusion and key reasoning must be correct.

# Output Format:
{{
    "is_correct": bool,
    "correctness_score": float,  # 0.0 = wrong, 0.5 = partial, 1.0 = correct
    "reasoning": str,
    "key_errors": [str]  # List any errors found
}}

Output valid JSON only.
"""


# Judge template: count enforcement/disappointment/clarification/repetition
# events in a full conversation transcript. Filled via str.format with
# {conversation} and {user_preferences}.
USER_EFFORT_ANALYSIS_PROMPT = """
You are analyzing user effort in a conversation.

# Conversation
{conversation}

# User Preferences
{user_preferences}

# Your Task
Count and categorize user effort:

1. **Explicit Enforcements**: User directly states their preference
   - Example: "Please use bullet points like I asked"

2. **Disappointment Expressions**: User expresses dissatisfaction without explicit enforcement
   - Example: "Hmm, that's not quite what I was hoping for"

3. **Clarification Requests**: User asks for clarification due to misalignment
   - Example: "Could you explain that differently?"

4. **Repetitions**: User repeats information they already provided

# Output Format:
{{
    "explicit_enforcements": int,
    "disappointment_expressions": int,
    "clarification_requests": int,
    "repetitions": int,
    "total_user_effort": int,
    "enforcement_details": [
        {{"turn": int, "type": str, "quote": str}},
        ...
    ]
}}

Output valid JSON only.
"""
+
+
+# =============================================================================
+# LLM Judge Implementation
+# =============================================================================
+
class LLMJudge:
    """
    LLM-based judge for evaluation.

    Uses a powerful model (e.g., Llama-70B) to evaluate:
    - Conflict resolution accuracy
    - Preference adherence
    - Task accuracy
    - User effort analysis

    The litellm client is imported lazily, so constructing a judge never
    pays the import cost (or fails) unless a judgment is actually requested.
    """

    def __init__(self, model_name: str = "meta-llama/Llama-3.3-70B-Instruct"):
        # Model identifier handed to litellm.completion on every call.
        self.model_name = model_name
        # Lazily-bound litellm module; populated by _get_model().
        self._model = None

    def _get_model(self):
        """Lazy-load and cache the litellm module used for all LLM calls.

        Raises:
            ImportError: if litellm is not installed (original import error
                is chained so the root cause stays in the traceback).
        """
        if self._model is None:
            try:
                import litellm
            except ImportError as err:
                raise ImportError("litellm required for LLM judge") from err
            self._model = litellm
        return self._model

    def _call_llm(self, prompt: str) -> str:
        """Send *prompt* as a single user message and return the reply text."""
        model = self._get_model()
        response = model.completion(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,  # Deterministic for evaluation
            max_tokens=2048,
        )
        return response.choices[0].message.content

    def _parse_json(self, text: str) -> dict:
        """Best-effort extraction of a JSON object from an LLM response.

        Candidates are tried in order of decreasing reliability: the raw
        text, the contents of a fenced ``` / ```json block, then the
        outermost brace-delimited span (greedy, to keep nested objects
        intact). On total failure, returns {"error": ..., "raw": text}
        instead of raising, so callers always receive a dict.
        """
        import json
        import re

        candidates = [text]

        fenced = re.search(r'```(?:json)?\s*([\s\S]*?)```', text)
        if fenced:
            candidates.append(fenced.group(1))

        braced = re.search(r'\{[\s\S]*\}', text)
        if braced:
            candidates.append(braced.group())

        for candidate in candidates:
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                continue

        return {"error": "Failed to parse JSON", "raw": text}

    def judge_conflict_resolution(
        self,
        query: str,
        agent_response: str,
        user_preferences: list,
        preference_a: dict,
        preference_b: dict,
        correct_preference: str,
        resolution_reason: str
    ) -> dict:
        """Judge whether agent correctly resolved a preference conflict.

        preference_a / preference_b must contain 'condition' and 'action'
        keys; the verdict dict follows CONFLICT_RESOLUTION_JUDGE_PROMPT's
        output schema (or the _parse_json error dict on parse failure).
        """
        prompt = CONFLICT_RESOLUTION_JUDGE_PROMPT.format(
            user_preferences="\n".join([f"- {p}" for p in user_preferences]),
            query=query,
            preference_a=f"{preference_a['condition']}: {preference_a['action']}",
            preference_b=f"{preference_b['condition']}: {preference_b['action']}",
            correct_preference=correct_preference,
            resolution_reason=resolution_reason,
            agent_response=agent_response
        )

        response = self._call_llm(prompt)
        return self._parse_json(response)

    def judge_preference_adherence(
        self,
        query: str,
        agent_response: str,
        user_preferences: list
    ) -> dict:
        """Judge whether agent adhered to user preferences.

        Each preference dict needs 'condition' and 'action'; an optional
        'pref_id' labels it in the verdict (falls back to its list index).
        """
        pref_str = "\n".join([
            f"- [{p.get('pref_id', i)}] When {p['condition']}: {p['action']}"
            for i, p in enumerate(user_preferences)
        ])

        prompt = PREFERENCE_ADHERENCE_JUDGE_PROMPT.format(
            user_preferences=pref_str,
            query=query,
            agent_response=agent_response
        )

        response = self._call_llm(prompt)
        return self._parse_json(response)

    def judge_task_accuracy(
        self,
        problem: str,
        ground_truth: str,
        user_answer: str
    ) -> dict:
        """Judge whether user's final answer is correct vs. the ground truth."""
        prompt = TASK_ACCURACY_JUDGE_PROMPT.format(
            problem=problem,
            ground_truth=ground_truth,
            user_answer=user_answer
        )

        response = self._call_llm(prompt)
        return self._parse_json(response)

    def analyze_user_effort(
        self,
        conversation: list,
        user_preferences: list
    ) -> dict:
        """Analyze user effort in a conversation.

        conversation: list of {'role': str, 'content': str} messages.
        user_preferences: list of {'condition': str, 'action': str} dicts.
        """
        conv_str = "\n".join([
            f"{msg['role'].upper()}: {msg['content']}"
            for msg in conversation
        ])

        pref_str = "\n".join([
            f"- When {p['condition']}: {p['action']}"
            for p in user_preferences
        ])

        prompt = USER_EFFORT_ANALYSIS_PROMPT.format(
            conversation=conv_str,
            user_preferences=pref_str
        )

        response = self._call_llm(prompt)
        return self._parse_json(response)
+
+
+# =============================================================================
+# Helper Functions
+# =============================================================================
+
def enhance_problem_with_step_by_step(problem: str, domain: str) -> str:
    """Append the domain-appropriate step-by-step prompt to *problem*,
    separated by a blank line."""
    return "\n\n".join([problem, get_step_by_step_prompt(domain)])
+
+
def format_preferences_for_user_prompt(preferences: list) -> str:
    """Render conditional preferences as a numbered markdown list.

    Each preference dict may carry 'condition' (default "always"),
    'action' (default ""), and an optional 'conflict_group' that is
    appended as a "[Group: ...]" tag when present and non-empty.
    """
    lines = []
    for number, pref in enumerate(preferences, start=1):
        entry = (
            f"{number}. **When {pref.get('condition', 'always')}**: "
            f"{pref.get('action', '')}"
        )
        group = pref.get('conflict_group')
        if group:
            entry = f"{entry} [Group: {group}]"
        lines.append(entry)
    return "\n".join(lines)