summaryrefslogtreecommitdiff
path: root/collaborativeagents/prompts_extended.py
diff options
context:
space:
mode:
Diffstat (limited to 'collaborativeagents/prompts_extended.py')
-rw-r--r--collaborativeagents/prompts_extended.py500
1 files changed, 500 insertions, 0 deletions
diff --git a/collaborativeagents/prompts_extended.py b/collaborativeagents/prompts_extended.py
new file mode 100644
index 0000000..fbb9cad
--- /dev/null
+++ b/collaborativeagents/prompts_extended.py
@@ -0,0 +1,500 @@
+"""
+Extended prompts for complex preference evaluation.
+
+Key changes from original:
+1. Step-by-step prompts to make sessions longer (more turns = more preference opportunities)
+2. User prompts that both ENFORCE and EXPRESS DISAPPOINTMENT
+3. More complex conditional preference handling
+"""
+
+# =============================================================================
+# Step-by-Step Encouragement Prompts (Added to Problem Descriptions)
+# =============================================================================
+
STEP_BY_STEP_MATH_PROMPT = """
IMPORTANT: This is a challenging problem. Please work through it carefully:

1. First, identify what type of problem this is and what mathematical concepts apply
2. State any assumptions or conditions that are relevant
3. Break down the solution into clear steps, explaining your reasoning at each step
4. After reaching an answer, verify it makes sense (check edge cases, units, etc.)
5. Summarize the key insights from this problem

Take your time. Show ALL your work and reasoning. Do not skip steps.
"""

STEP_BY_STEP_CODE_PROMPT = """
IMPORTANT: This requires careful implementation. Please proceed methodically:

1. First, understand the requirements completely - ask clarifying questions if needed
2. Identify edge cases and constraints before writing any code
3. Outline your approach in pseudocode or plain language
4. Implement step by step, explaining the logic of each component
5. Consider time and space complexity
6. Test your solution with example inputs and edge cases
7. Refactor if there are obvious improvements

Show your reasoning throughout. Quality matters more than speed.
"""

STEP_BY_STEP_REASONING_PROMPT = """
IMPORTANT: This problem requires careful logical reasoning:

1. Read the problem carefully and identify ALL relevant information
2. State any assumptions you're making explicitly
3. Work through the logic step by step, explaining each deduction
4. Check for any logical fallacies or gaps in your reasoning
5. Consider alternative interpretations or approaches
6. State your final conclusion clearly with confidence level

Think out loud. Show your complete reasoning process.
"""

STEP_BY_STEP_SCIENCE_PROMPT = """
IMPORTANT: This is a complex scientific problem:

1. Identify the core concepts and principles involved
2. State any assumptions, constants, or boundary conditions
3. Set up the problem mathematically if applicable
4. Work through the solution systematically
5. Interpret your results - do they make physical/scientific sense?
6. Consider limitations of your approach

Be thorough. Scientific problems reward careful, systematic thinking.
"""


def get_step_by_step_prompt(domain: str) -> str:
    """Return the step-by-step prompt appropriate for a benchmark domain.

    Matching is case-insensitive and tolerant of surrounding whitespace
    (so "MATH" or " math " resolve like "math"); all previously accepted
    lowercase keys behave exactly as before. Unknown domains fall back to
    the general reasoning prompt.
    """
    # Group the benchmark aliases by the prompt they map to, then expand
    # into a flat alias -> prompt lookup. This avoids repeating each
    # prompt constant once per alias.
    domain_groups = {
        STEP_BY_STEP_MATH_PROMPT: ("math", "math-hard", "math-500", "aime", "theoremqa"),
        STEP_BY_STEP_CODE_PROMPT: ("code", "humaneval", "bigcodebench", "livecodebench", "scicode"),
        STEP_BY_STEP_REASONING_PROMPT: ("logiqa", "mmlu"),
        STEP_BY_STEP_SCIENCE_PROMPT: ("medqa", "gpqa"),
    }
    prompts = {
        alias: prompt
        for prompt, aliases in domain_groups.items()
        for alias in aliases
    }
    return prompts.get(domain.strip().lower(), STEP_BY_STEP_REASONING_PROMPT)
+
+
+# =============================================================================
+# Enhanced User Simulator Prompt (Supports Both Enforcement and Disappointment)
+# =============================================================================
+
# Template for the user-simulator system prompt. Filled via str.format with:
# {user_task_description}, {problem}, {step_by_step_prompt}, {user_persona},
# {user_preferences}, {termination_signal}. The doubled braces ({{ ... }})
# around the output-format schema escape str.format so the JSON skeleton
# reaches the model literally.
USER_SIMULATOR_PROMPT_COMPLEX = """
You are a user simulator collaborating with an agent to solve a problem. You will be provided with a
problem description, and you must get the agent to help you solve it. You have specific interaction
preferences that you MUST enforce throughout the conversation.

# Problem Description
{user_task_description}
{problem}

{step_by_step_prompt}

Note: the agent cannot see this problem description or your preferences.

# User Persona
{user_persona}

# User Preferences (CONDITIONAL - Apply Based on Situation)
{user_preferences}

These preferences are CONDITIONAL - they apply in specific situations. When multiple preferences
COULD apply, choose the one whose condition best matches the current context.

# Preference Enforcement Protocol

You have TWO modes of signaling preference violations:

## Mode 1: EXPLICIT ENFORCEMENT (Strong Signal)
Use when the agent clearly violates a preference:
- State your preference directly: "I prefer X, can you adjust your response?"
- Be specific about what needs to change
- Do NOT update your draft answer
- Do NOT proceed until corrected

## Mode 2: DISAPPOINTMENT EXPRESSION (Mild Signal)
Use when the response is suboptimal but not a clear violation:
- Express mild dissatisfaction: "Hmm, that's not quite what I was looking for..."
- Hint at what you wanted: "I was hoping for something more like..."
- You MAY update draft answer if the information is still useful
- Give the agent a chance to improve

Choose Mode 1 for clear violations, Mode 2 for subtle misalignments.

# Draft Answer Management
- Maintain a working draft starting with "I don't know"
- Update ONLY when:
  1. Agent provides helpful information, AND
  2. Agent follows your preferences (or you're using Mode 2 and info is still useful)
- When using Mode 1 enforcement: NEVER update draft, NEVER proceed

# Making Sessions Longer (Important!)
- Break the problem into parts - don't try to solve everything at once
- Ask clarifying questions even if you understand
- Request explanations of intermediate steps
- Ask "why" and "how" frequently
- Request verification of partial solutions
- This creates more opportunities to reveal and enforce preferences!

# Conversation Termination
Only terminate when:
1. Your draft answer fully solves the problem, OR
2. You've tried multiple times and the agent cannot help

When ready to terminate, respond with "{termination_signal}".

# Output Format:
{{
    "preference_reasoning": str,  # Which preferences apply to this situation and why?
    "enforcement_mode": str,  # "none", "mode_1_enforce", or "mode_2_disappointment"
    "enforcement_reason": str,  # If enforcing, why and which preference?
    "draft_answer": str,  # Current working answer
    "should_terminate": bool,
    "response": str  # Your response to the agent
}}

Output a valid JSON object. Use double quotes, escape special characters properly.
"""
+
+
+# =============================================================================
+# LLM Judge Prompts
+# =============================================================================
+
# Judge template: did the agent pick the right side of a preference conflict?
# Filled via str.format with {user_preferences}, {query}, {preference_a},
# {preference_b}, {correct_preference}, {resolution_reason}, {agent_response}.
# Doubled braces escape the literal JSON schema from str.format.
CONFLICT_RESOLUTION_JUDGE_PROMPT = """
You are an expert judge evaluating whether a conversational agent correctly resolved a preference conflict.

# Situation
The user has the following CONDITIONAL preferences:
{user_preferences}

# The Query
{query}

# Conflict Analysis
This query triggers a CONFLICT between these preferences:
- Preference A: {preference_a}
- Preference B: {preference_b}

The CORRECT preference to apply is: {correct_preference}
Reason: {resolution_reason}

# Agent's Response
{agent_response}

# Your Task
Evaluate whether the agent's response correctly applied the right preference.

Consider:
1. Did the agent follow Preference A, Preference B, both, or neither?
2. Was the agent's choice appropriate for the context of the query?
3. Did trying to satisfy both preferences create an inconsistent response?

# Output Format:
{{
    "preference_followed": str,  # "A", "B", "both", "neither"
    "correct_choice_made": bool,  # True if agent followed the correct preference
    "reasoning": str,  # Explanation of your judgment
    "score": float  # 0.0 = wrong, 0.5 = partial, 1.0 = correct
}}

Output valid JSON only.
"""


# Judge template: per-preference adherence check for a single agent turn.
# Filled via str.format with {user_preferences}, {query}, {agent_response}.
PREFERENCE_ADHERENCE_JUDGE_PROMPT = """
You are an expert judge evaluating whether a conversational agent adhered to user preferences.

# User Preferences
{user_preferences}

# Query
{query}

# Agent Response
{agent_response}

# Your Task
For EACH preference, determine if the agent adhered to it (if applicable).

# Output Format:
{{
    "preferences_evaluation": [
        {{
            "preference_id": str,
            "applicable": bool,  # Is this preference relevant to this query?
            "adhered": bool,  # If applicable, did the agent follow it?
            "evidence": str  # Quote from response showing adherence or violation
        }},
        ...
    ],
    "overall_adherence_score": float,  # 0.0 to 1.0
    "violated_preferences": [str],  # List of preference IDs violated
    "reasoning": str
}}

Output valid JSON only.
"""


# Judge template: grade the user's final draft answer against ground truth.
# Filled via str.format with {problem}, {ground_truth}, {user_answer}.
TASK_ACCURACY_JUDGE_PROMPT = """
You are an expert judge evaluating whether a user's final answer is correct.

# Problem
{problem}

# Ground Truth Solution
{ground_truth}

# User's Final Draft Answer
{user_answer}

# Your Task
Determine if the user's answer is correct. Be lenient on formatting but strict on substance.

For math problems: The numerical answer must be correct.
For code problems: The logic must be correct (minor syntax issues OK).
For reasoning problems: The conclusion and key reasoning must be correct.

# Output Format:
{{
    "is_correct": bool,
    "correctness_score": float,  # 0.0 = wrong, 0.5 = partial, 1.0 = correct
    "reasoning": str,
    "key_errors": [str]  # List any errors found
}}

Output valid JSON only.
"""


# Judge template: count enforcement/disappointment/clarification/repetition
# events in a full conversation transcript. Filled via str.format with
# {conversation} and {user_preferences}.
USER_EFFORT_ANALYSIS_PROMPT = """
You are analyzing user effort in a conversation.

# Conversation
{conversation}

# User Preferences
{user_preferences}

# Your Task
Count and categorize user effort:

1. **Explicit Enforcements**: User directly states their preference
   - Example: "Please use bullet points like I asked"

2. **Disappointment Expressions**: User expresses dissatisfaction without explicit enforcement
   - Example: "Hmm, that's not quite what I was hoping for"

3. **Clarification Requests**: User asks for clarification due to misalignment
   - Example: "Could you explain that differently?"

4. **Repetitions**: User repeats information they already provided

# Output Format:
{{
    "explicit_enforcements": int,
    "disappointment_expressions": int,
    "clarification_requests": int,
    "repetitions": int,
    "total_user_effort": int,
    "enforcement_details": [
        {{"turn": int, "type": str, "quote": str}},
        ...
    ]
}}

Output valid JSON only.
"""
+
+
+# =============================================================================
+# LLM Judge Implementation
+# =============================================================================
+
class LLMJudge:
    """
    LLM-based judge for evaluation.

    Uses a powerful model (e.g., Llama-70B) to evaluate:
    - Conflict resolution accuracy
    - Preference adherence
    - Task accuracy
    - User effort analysis

    The litellm client is imported lazily, so constructing a judge never
    pays the import cost (or fails) unless a judgment is actually requested.
    """

    def __init__(self, model_name: str = "meta-llama/Llama-3.3-70B-Instruct"):
        # Model identifier handed to litellm.completion on every call.
        self.model_name = model_name
        # Lazily-bound litellm module; populated by _get_model().
        self._model = None

    def _get_model(self):
        """Lazy-load and cache the litellm module used for all LLM calls.

        Raises:
            ImportError: if litellm is not installed (original import error
                is chained so the root cause stays in the traceback).
        """
        if self._model is None:
            try:
                import litellm
            except ImportError as err:
                raise ImportError("litellm required for LLM judge") from err
            self._model = litellm
        return self._model

    def _call_llm(self, prompt: str) -> str:
        """Send *prompt* as a single user message and return the reply text."""
        model = self._get_model()
        response = model.completion(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,  # Deterministic for evaluation
            max_tokens=2048,
        )
        return response.choices[0].message.content

    def _parse_json(self, text: str) -> dict:
        """Best-effort extraction of a JSON object from an LLM response.

        Candidates are tried in order of decreasing reliability: the raw
        text, the contents of a fenced ``` / ```json block, then the
        outermost brace-delimited span (greedy, to keep nested objects
        intact). On total failure, returns {"error": ..., "raw": text}
        instead of raising, so callers always receive a dict.
        """
        import json
        import re

        candidates = [text]

        fenced = re.search(r'```(?:json)?\s*([\s\S]*?)```', text)
        if fenced:
            candidates.append(fenced.group(1))

        braced = re.search(r'\{[\s\S]*\}', text)
        if braced:
            candidates.append(braced.group())

        for candidate in candidates:
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                continue

        return {"error": "Failed to parse JSON", "raw": text}

    def judge_conflict_resolution(
        self,
        query: str,
        agent_response: str,
        user_preferences: list,
        preference_a: dict,
        preference_b: dict,
        correct_preference: str,
        resolution_reason: str
    ) -> dict:
        """Judge whether agent correctly resolved a preference conflict.

        preference_a / preference_b must contain 'condition' and 'action'
        keys; the verdict dict follows CONFLICT_RESOLUTION_JUDGE_PROMPT's
        output schema (or the _parse_json error dict on parse failure).
        """
        prompt = CONFLICT_RESOLUTION_JUDGE_PROMPT.format(
            user_preferences="\n".join([f"- {p}" for p in user_preferences]),
            query=query,
            preference_a=f"{preference_a['condition']}: {preference_a['action']}",
            preference_b=f"{preference_b['condition']}: {preference_b['action']}",
            correct_preference=correct_preference,
            resolution_reason=resolution_reason,
            agent_response=agent_response
        )

        response = self._call_llm(prompt)
        return self._parse_json(response)

    def judge_preference_adherence(
        self,
        query: str,
        agent_response: str,
        user_preferences: list
    ) -> dict:
        """Judge whether agent adhered to user preferences.

        Each preference dict needs 'condition' and 'action'; an optional
        'pref_id' labels it in the verdict (falls back to its list index).
        """
        pref_str = "\n".join([
            f"- [{p.get('pref_id', i)}] When {p['condition']}: {p['action']}"
            for i, p in enumerate(user_preferences)
        ])

        prompt = PREFERENCE_ADHERENCE_JUDGE_PROMPT.format(
            user_preferences=pref_str,
            query=query,
            agent_response=agent_response
        )

        response = self._call_llm(prompt)
        return self._parse_json(response)

    def judge_task_accuracy(
        self,
        problem: str,
        ground_truth: str,
        user_answer: str
    ) -> dict:
        """Judge whether user's final answer is correct vs. the ground truth."""
        prompt = TASK_ACCURACY_JUDGE_PROMPT.format(
            problem=problem,
            ground_truth=ground_truth,
            user_answer=user_answer
        )

        response = self._call_llm(prompt)
        return self._parse_json(response)

    def analyze_user_effort(
        self,
        conversation: list,
        user_preferences: list
    ) -> dict:
        """Analyze user effort in a conversation.

        conversation: list of {'role': str, 'content': str} messages.
        user_preferences: list of {'condition': str, 'action': str} dicts.
        """
        conv_str = "\n".join([
            f"{msg['role'].upper()}: {msg['content']}"
            for msg in conversation
        ])

        pref_str = "\n".join([
            f"- When {p['condition']}: {p['action']}"
            for p in user_preferences
        ])

        prompt = USER_EFFORT_ANALYSIS_PROMPT.format(
            conversation=conv_str,
            user_preferences=pref_str
        )

        response = self._call_llm(prompt)
        return self._parse_json(response)
+
+
+# =============================================================================
+# Helper Functions
+# =============================================================================
+
def enhance_problem_with_step_by_step(problem: str, domain: str) -> str:
    """Append the domain-appropriate step-by-step prompt to *problem*,
    separated by a blank line."""
    return "\n\n".join([problem, get_step_by_step_prompt(domain)])
+
+
def format_preferences_for_user_prompt(preferences: list) -> str:
    """Render conditional preferences as a numbered markdown list.

    Each preference dict may carry 'condition' (default "always"),
    'action' (default ""), and an optional 'conflict_group' that is
    appended as a "[Group: ...]" tag when present and non-empty.
    """
    lines = []
    for number, pref in enumerate(preferences, start=1):
        entry = (
            f"{number}. **When {pref.get('condition', 'always')}**: "
            f"{pref.get('action', '')}"
        )
        group = pref.get('conflict_group')
        if group:
            entry = f"{entry} [Group: {group}]"
        lines.append(entry)
    return "\n".join(lines)