diff options
| author | YurenHao0426 <blackhao0426@gmail.com> | 2026-01-27 09:57:37 -0600 |
|---|---|---|
| committer | YurenHao0426 <blackhao0426@gmail.com> | 2026-01-27 09:57:37 -0600 |
| commit | dc801c07cf38b0c495686463e6ca6f871a64440e (patch) | |
| tree | 599f03114775921dbc472403c701f4a3a8ea188a /collaborativeagents/prompts_extended.py | |
| parent | e43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 (diff) | |
Add collaborativeagents module and update gitignore
- Add collaborativeagents subproject with adapters, agents, and evaluation modules
- Update .gitignore to exclude large binary files (.whl, .tar), wandb logs, and results
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Diffstat (limited to 'collaborativeagents/prompts_extended.py')
| -rw-r--r-- | collaborativeagents/prompts_extended.py | 500 |
1 file changed, 500 insertions, 0 deletions
"""
Extended prompts for complex preference evaluation.

Key changes from original:
1. Step-by-step prompts to make sessions longer (more turns = more preference opportunities)
2. User prompts that both ENFORCE and EXPRESS DISAPPOINTMENT
3. More complex conditional preference handling
"""

# =============================================================================
# Step-by-Step Encouragement Prompts (Added to Problem Descriptions)
# =============================================================================

STEP_BY_STEP_MATH_PROMPT = """
IMPORTANT: This is a challenging problem. Please work through it carefully:

1. First, identify what type of problem this is and what mathematical concepts apply
2. State any assumptions or conditions that are relevant
3. Break down the solution into clear steps, explaining your reasoning at each step
4. After reaching an answer, verify it makes sense (check edge cases, units, etc.)
5. Summarize the key insights from this problem

Take your time. Show ALL your work and reasoning. Do not skip steps.
"""

STEP_BY_STEP_CODE_PROMPT = """
IMPORTANT: This requires careful implementation. Please proceed methodically:

1. First, understand the requirements completely - ask clarifying questions if needed
2. Identify edge cases and constraints before writing any code
3. Outline your approach in pseudocode or plain language
4. Implement step by step, explaining the logic of each component
5. Consider time and space complexity
6. Test your solution with example inputs and edge cases
7. Refactor if there are obvious improvements

Show your reasoning throughout. Quality matters more than speed.
"""

STEP_BY_STEP_REASONING_PROMPT = """
IMPORTANT: This problem requires careful logical reasoning:

1. Read the problem carefully and identify ALL relevant information
2. State any assumptions you're making explicitly
3. Work through the logic step by step, explaining each deduction
4. Check for any logical fallacies or gaps in your reasoning
5. Consider alternative interpretations or approaches
6. State your final conclusion clearly with confidence level

Think out loud. Show your complete reasoning process.
"""

STEP_BY_STEP_SCIENCE_PROMPT = """
IMPORTANT: This is a complex scientific problem:

1. Identify the core concepts and principles involved
2. State any assumptions, constants, or boundary conditions
3. Set up the problem mathematically if applicable
4. Work through the solution systematically
5. Interpret your results - do they make physical/scientific sense?
6. Consider limitations of your approach

Be thorough. Scientific problems reward careful, systematic thinking.
"""


def get_step_by_step_prompt(domain: str) -> str:
    """Get the appropriate step-by-step prompt for a domain.

    Args:
        domain: Benchmark/domain identifier (e.g. "math", "humaneval", "gpqa").

    Returns:
        The step-by-step prompt string for that domain; unknown domains fall
        back to the generic reasoning prompt.
    """
    prompts = {
        "math": STEP_BY_STEP_MATH_PROMPT,
        "math-hard": STEP_BY_STEP_MATH_PROMPT,
        "math-500": STEP_BY_STEP_MATH_PROMPT,
        "aime": STEP_BY_STEP_MATH_PROMPT,
        "theoremqa": STEP_BY_STEP_MATH_PROMPT,
        "code": STEP_BY_STEP_CODE_PROMPT,
        "humaneval": STEP_BY_STEP_CODE_PROMPT,
        "bigcodebench": STEP_BY_STEP_CODE_PROMPT,
        "livecodebench": STEP_BY_STEP_CODE_PROMPT,
        "scicode": STEP_BY_STEP_CODE_PROMPT,
        "logiqa": STEP_BY_STEP_REASONING_PROMPT,
        "mmlu": STEP_BY_STEP_REASONING_PROMPT,
        "medqa": STEP_BY_STEP_SCIENCE_PROMPT,
        "gpqa": STEP_BY_STEP_SCIENCE_PROMPT,
    }
    # Reasoning prompt is the safest default for unrecognized domains.
    return prompts.get(domain, STEP_BY_STEP_REASONING_PROMPT)


# =============================================================================
# Enhanced User Simulator Prompt (Supports Both Enforcement and Disappointment)
# =============================================================================

USER_SIMULATOR_PROMPT_COMPLEX = """
You are a user simulator collaborating with an agent to solve a problem. You will be provided with a
problem description, and you must get the agent to help you solve it. You have specific interaction
preferences that you MUST enforce throughout the conversation.

# Problem Description
{user_task_description}
{problem}

{step_by_step_prompt}

Note: the agent cannot see this problem description or your preferences.

# User Persona
{user_persona}

# User Preferences (CONDITIONAL - Apply Based on Situation)
{user_preferences}

These preferences are CONDITIONAL - they apply in specific situations. When multiple preferences
COULD apply, choose the one whose condition best matches the current context.

# Preference Enforcement Protocol

You have TWO modes of signaling preference violations:

## Mode 1: EXPLICIT ENFORCEMENT (Strong Signal)
Use when the agent clearly violates a preference:
- State your preference directly: "I prefer X, can you adjust your response?"
- Be specific about what needs to change
- Do NOT update your draft answer
- Do NOT proceed until corrected

## Mode 2: DISAPPOINTMENT EXPRESSION (Mild Signal)
Use when the response is suboptimal but not a clear violation:
- Express mild dissatisfaction: "Hmm, that's not quite what I was looking for..."
- Hint at what you wanted: "I was hoping for something more like..."
- You MAY update draft answer if the information is still useful
- Give the agent a chance to improve

Choose Mode 1 for clear violations, Mode 2 for subtle misalignments.

# Draft Answer Management
- Maintain a working draft starting with "I don't know"
- Update ONLY when:
  1. Agent provides helpful information, AND
  2. Agent follows your preferences (or you're using Mode 2 and info is still useful)
- When using Mode 1 enforcement: NEVER update draft, NEVER proceed

# Making Sessions Longer (Important!)
- Break the problem into parts - don't try to solve everything at once
- Ask clarifying questions even if you understand
- Request explanations of intermediate steps
- Ask "why" and "how" frequently
- Request verification of partial solutions
- This creates more opportunities to reveal and enforce preferences!

# Conversation Termination
Only terminate when:
1. Your draft answer fully solves the problem, OR
2. You've tried multiple times and the agent cannot help

When ready to terminate, respond with "{termination_signal}".

# Output Format:
{{
    "preference_reasoning": str,  # Which preferences apply to this situation and why?
    "enforcement_mode": str,  # "none", "mode_1_enforce", or "mode_2_disappointment"
    "enforcement_reason": str,  # If enforcing, why and which preference?
    "draft_answer": str,  # Current working answer
    "should_terminate": bool,
    "response": str  # Your response to the agent
}}

Output a valid JSON object. Use double quotes, escape special characters properly.
"""


# =============================================================================
# LLM Judge Prompts
# =============================================================================

CONFLICT_RESOLUTION_JUDGE_PROMPT = """
You are an expert judge evaluating whether a conversational agent correctly resolved a preference conflict.

# Situation
The user has the following CONDITIONAL preferences:
{user_preferences}

# The Query
{query}

# Conflict Analysis
This query triggers a CONFLICT between these preferences:
- Preference A: {preference_a}
- Preference B: {preference_b}

The CORRECT preference to apply is: {correct_preference}
Reason: {resolution_reason}

# Agent's Response
{agent_response}

# Your Task
Evaluate whether the agent's response correctly applied the right preference.

Consider:
1. Did the agent follow Preference A, Preference B, both, or neither?
2. Was the agent's choice appropriate for the context of the query?
3. Did trying to satisfy both preferences create an inconsistent response?

# Output Format:
{{
    "preference_followed": str,  # "A", "B", "both", "neither"
    "correct_choice_made": bool,  # True if agent followed the correct preference
    "reasoning": str,  # Explanation of your judgment
    "score": float  # 0.0 = wrong, 0.5 = partial, 1.0 = correct
}}

Output valid JSON only.
"""


PREFERENCE_ADHERENCE_JUDGE_PROMPT = """
You are an expert judge evaluating whether a conversational agent adhered to user preferences.

# User Preferences
{user_preferences}

# Query
{query}

# Agent Response
{agent_response}

# Your Task
For EACH preference, determine if the agent adhered to it (if applicable).

# Output Format:
{{
    "preferences_evaluation": [
        {{
            "preference_id": str,
            "applicable": bool,  # Is this preference relevant to this query?
            "adhered": bool,  # If applicable, did the agent follow it?
            "evidence": str  # Quote from response showing adherence or violation
        }},
        ...
    ],
    "overall_adherence_score": float,  # 0.0 to 1.0
    "violated_preferences": [str],  # List of preference IDs violated
    "reasoning": str
}}

Output valid JSON only.
"""


TASK_ACCURACY_JUDGE_PROMPT = """
You are an expert judge evaluating whether a user's final answer is correct.

# Problem
{problem}

# Ground Truth Solution
{ground_truth}

# User's Final Draft Answer
{user_answer}

# Your Task
Determine if the user's answer is correct. Be lenient on formatting but strict on substance.

For math problems: The numerical answer must be correct.
For code problems: The logic must be correct (minor syntax issues OK).
For reasoning problems: The conclusion and key reasoning must be correct.

# Output Format:
{{
    "is_correct": bool,
    "correctness_score": float,  # 0.0 = wrong, 0.5 = partial, 1.0 = correct
    "reasoning": str,
    "key_errors": [str]  # List any errors found
}}

Output valid JSON only.
"""


USER_EFFORT_ANALYSIS_PROMPT = """
You are analyzing user effort in a conversation.

# Conversation
{conversation}

# User Preferences
{user_preferences}

# Your Task
Count and categorize user effort:

1. **Explicit Enforcements**: User directly states their preference
   - Example: "Please use bullet points like I asked"

2. **Disappointment Expressions**: User expresses dissatisfaction without explicit enforcement
   - Example: "Hmm, that's not quite what I was hoping for"

3. **Clarification Requests**: User asks for clarification due to misalignment
   - Example: "Could you explain that differently?"

4. **Repetitions**: User repeats information they already provided

# Output Format:
{{
    "explicit_enforcements": int,
    "disappointment_expressions": int,
    "clarification_requests": int,
    "repetitions": int,
    "total_user_effort": int,
    "enforcement_details": [
        {{"turn": int, "type": str, "quote": str}},
        ...
    ]
}}

Output valid JSON only.
"""


# =============================================================================
# LLM Judge Implementation
# =============================================================================

class LLMJudge:
    """
    LLM-based judge for evaluation.

    Uses a powerful model (e.g., Llama-70B) to evaluate:
    - Conflict resolution accuracy
    - Preference adherence
    - Task accuracy
    - User effort analysis
    """

    def __init__(self, model_name: str = "meta-llama/Llama-3.3-70B-Instruct"):
        # Model identifier passed through to litellm.completion().
        self.model_name = model_name
        # Lazily bound litellm module; None until first _get_model() call.
        self._model = None

    def _get_model(self):
        """Lazy load model.

        Raises:
            ImportError: If litellm is not installed (original import failure
                is chained as the cause).
        """
        if self._model is None:
            try:
                import litellm
                self._model = litellm
            except ImportError as err:
                # Chain the underlying failure so the real cause stays visible.
                raise ImportError("litellm required for LLM judge") from err
        return self._model

    def _call_llm(self, prompt: str) -> str:
        """Call the LLM and get response text for a single-turn user prompt."""
        model = self._get_model()
        response = model.completion(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,  # Deterministic for evaluation
            max_tokens=2048,
        )
        return response.choices[0].message.content

    def _parse_json(self, text: str) -> dict:
        """Parse JSON from LLM response.

        Tries, in order: direct parse, a fenced ```json code block, then the
        outermost {...} span. On total failure returns an error dict carrying
        the raw text instead of raising, so callers always get a dict.
        """
        import json
        import re

        # Try direct parse
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass

        # Try to extract JSON from markdown code block
        match = re.search(r'```(?:json)?\s*([\s\S]*?)```', text)
        if match:
            try:
                return json.loads(match.group(1))
            except json.JSONDecodeError:
                pass

        # Try to find JSON object (greedy: first "{" through last "}")
        match = re.search(r'\{[\s\S]*\}', text)
        if match:
            try:
                return json.loads(match.group())
            except json.JSONDecodeError:
                pass

        return {"error": "Failed to parse JSON", "raw": text}

    def judge_conflict_resolution(
        self,
        query: str,
        agent_response: str,
        user_preferences: list,
        preference_a: dict,
        preference_b: dict,
        correct_preference: str,
        resolution_reason: str
    ) -> dict:
        """Judge whether agent correctly resolved a preference conflict.

        preference_a / preference_b are dicts with 'condition' and 'action'
        keys; the result is the judge's parsed JSON verdict.
        """
        prompt = CONFLICT_RESOLUTION_JUDGE_PROMPT.format(
            user_preferences="\n".join([f"- {p}" for p in user_preferences]),
            query=query,
            preference_a=f"{preference_a['condition']}: {preference_a['action']}",
            preference_b=f"{preference_b['condition']}: {preference_b['action']}",
            correct_preference=correct_preference,
            resolution_reason=resolution_reason,
            agent_response=agent_response
        )

        response = self._call_llm(prompt)
        return self._parse_json(response)

    def judge_preference_adherence(
        self,
        query: str,
        agent_response: str,
        user_preferences: list
    ) -> dict:
        """Judge whether agent adhered to user preferences.

        Each preference dict needs 'condition' and 'action'; 'pref_id' is
        optional and defaults to the list index.
        """
        pref_str = "\n".join([
            f"- [{p.get('pref_id', i)}] When {p['condition']}: {p['action']}"
            for i, p in enumerate(user_preferences)
        ])

        prompt = PREFERENCE_ADHERENCE_JUDGE_PROMPT.format(
            user_preferences=pref_str,
            query=query,
            agent_response=agent_response
        )

        response = self._call_llm(prompt)
        return self._parse_json(response)

    def judge_task_accuracy(
        self,
        problem: str,
        ground_truth: str,
        user_answer: str
    ) -> dict:
        """Judge whether user's final answer is correct."""
        prompt = TASK_ACCURACY_JUDGE_PROMPT.format(
            problem=problem,
            ground_truth=ground_truth,
            user_answer=user_answer
        )

        response = self._call_llm(prompt)
        return self._parse_json(response)

    def analyze_user_effort(
        self,
        conversation: list,
        user_preferences: list
    ) -> dict:
        """Analyze user effort in a conversation.

        conversation is a list of {'role': ..., 'content': ...} messages;
        user_preferences is a list of {'condition', 'action'} dicts.
        """
        conv_str = "\n".join([
            f"{msg['role'].upper()}: {msg['content']}"
            for msg in conversation
        ])

        pref_str = "\n".join([
            f"- When {p['condition']}: {p['action']}"
            for p in user_preferences
        ])

        prompt = USER_EFFORT_ANALYSIS_PROMPT.format(
            conversation=conv_str,
            user_preferences=pref_str
        )

        response = self._call_llm(prompt)
        return self._parse_json(response)


# =============================================================================
# Helper Functions
# =============================================================================

def enhance_problem_with_step_by_step(problem: str, domain: str) -> str:
    """Add step-by-step prompt to a problem description."""
    step_prompt = get_step_by_step_prompt(domain)
    return f"{problem}\n\n{step_prompt}"


def format_preferences_for_user_prompt(preferences: list) -> str:
    """Format conditional preferences for the user prompt.

    Each preference dict may carry 'condition' (default "always"), 'action'
    (default ""), and an optional 'conflict_group' tag appended in brackets.
    """
    formatted = []
    for i, pref in enumerate(preferences):
        condition = pref.get('condition', 'always')
        action = pref.get('action', '')
        conflict_group = pref.get('conflict_group')

        line = f"{i+1}. **When {condition}**: {action}"
        if conflict_group:
            line += f" [Group: {conflict_group}]"

        formatted.append(line)

    return "\n".join(formatted)
