summaryrefslogtreecommitdiff
path: root/collaborativeagents/prompts_extended.py
blob: fbb9cad978221de92bc5bc4f4ea7780f60979b24 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
"""
Extended prompts for complex preference evaluation.

Key changes from original:
1. Step-by-step prompts to make sessions longer (more turns = more preference opportunities)
2. User prompts that both ENFORCE and EXPRESS DISAPPOINTMENT
3. More complex conditional preference handling
"""

# =============================================================================
# Step-by-Step Encouragement Prompts (Added to Problem Descriptions)
# =============================================================================

# Appended to math problem statements (math, aime, theoremqa, ...) to push the
# agent into a slow, fully worked, self-verified solution.
STEP_BY_STEP_MATH_PROMPT = """
IMPORTANT: This is a challenging problem. Please work through it carefully:

1. First, identify what type of problem this is and what mathematical concepts apply
2. State any assumptions or conditions that are relevant
3. Break down the solution into clear steps, explaining your reasoning at each step
4. After reaching an answer, verify it makes sense (check edge cases, units, etc.)
5. Summarize the key insights from this problem

Take your time. Show ALL your work and reasoning. Do not skip steps.
"""

# Appended to coding tasks (humaneval, bigcodebench, ...): requirements ->
# edge cases -> plan -> implementation -> complexity -> tests -> refactor.
STEP_BY_STEP_CODE_PROMPT = """
IMPORTANT: This requires careful implementation. Please proceed methodically:

1. First, understand the requirements completely - ask clarifying questions if needed
2. Identify edge cases and constraints before writing any code
3. Outline your approach in pseudocode or plain language
4. Implement step by step, explaining the logic of each component
5. Consider time and space complexity
6. Test your solution with example inputs and edge cases
7. Refactor if there are obvious improvements

Show your reasoning throughout. Quality matters more than speed.
"""

# Appended to logic/knowledge tasks (logiqa, mmlu); also the fallback prompt
# for unrecognized domains in get_step_by_step_prompt().
STEP_BY_STEP_REASONING_PROMPT = """
IMPORTANT: This problem requires careful logical reasoning:

1. Read the problem carefully and identify ALL relevant information
2. State any assumptions you're making explicitly
3. Work through the logic step by step, explaining each deduction
4. Check for any logical fallacies or gaps in your reasoning
5. Consider alternative interpretations or approaches
6. State your final conclusion clearly with confidence level

Think out loud. Show your complete reasoning process.
"""

# Appended to scientific QA tasks (medqa, gpqa): setup -> systematic solution
# -> sanity-check of the result's physical plausibility.
STEP_BY_STEP_SCIENCE_PROMPT = """
IMPORTANT: This is a complex scientific problem:

1. Identify the core concepts and principles involved
2. State any assumptions, constants, or boundary conditions
3. Set up the problem mathematically if applicable
4. Work through the solution systematically
5. Interpret your results - do they make physical/scientific sense?
6. Consider limitations of your approach

Be thorough. Scientific problems reward careful, systematic thinking.
"""


def get_step_by_step_prompt(domain: str) -> str:
    """Return the step-by-step instruction block matching *domain*.

    Math-style benchmarks get the math prompt, coding benchmarks the code
    prompt, scientific QA the science prompt; everything else (including
    logiqa/mmlu and any unknown domain) falls back to the reasoning prompt.
    """
    math_domains = {"math", "math-hard", "math-500", "aime", "theoremqa"}
    code_domains = {"code", "humaneval", "bigcodebench", "livecodebench", "scicode"}
    science_domains = {"medqa", "gpqa"}

    if domain in math_domains:
        return STEP_BY_STEP_MATH_PROMPT
    if domain in code_domains:
        return STEP_BY_STEP_CODE_PROMPT
    if domain in science_domains:
        return STEP_BY_STEP_SCIENCE_PROMPT
    # logiqa / mmlu map here explicitly; unknown domains use the same fallback.
    return STEP_BY_STEP_REASONING_PROMPT


# =============================================================================
# Enhanced User Simulator Prompt (Supports Both Enforcement and Disappointment)
# =============================================================================

# System prompt for the user-simulator LLM. str.format() placeholders:
# {user_task_description}, {problem}, {step_by_step_prompt}, {user_persona},
# {user_preferences}, {termination_signal}. The doubled braces ({{ ... }}) in
# the output-format section are escapes so .format() emits a literal JSON
# template rather than treating the keys as placeholders.
USER_SIMULATOR_PROMPT_COMPLEX = """
You are a user simulator collaborating with an agent to solve a problem. You will be provided with a
problem description, and you must get the agent to help you solve it. You have specific interaction
preferences that you MUST enforce throughout the conversation.

# Problem Description
{user_task_description}
{problem}

{step_by_step_prompt}

Note: the agent cannot see this problem description or your preferences.

# User Persona
{user_persona}

# User Preferences (CONDITIONAL - Apply Based on Situation)
{user_preferences}

These preferences are CONDITIONAL - they apply in specific situations. When multiple preferences
COULD apply, choose the one whose condition best matches the current context.

# Preference Enforcement Protocol

You have TWO modes of signaling preference violations:

## Mode 1: EXPLICIT ENFORCEMENT (Strong Signal)
Use when the agent clearly violates a preference:
- State your preference directly: "I prefer X, can you adjust your response?"
- Be specific about what needs to change
- Do NOT update your draft answer
- Do NOT proceed until corrected

## Mode 2: DISAPPOINTMENT EXPRESSION (Mild Signal)
Use when the response is suboptimal but not a clear violation:
- Express mild dissatisfaction: "Hmm, that's not quite what I was looking for..."
- Hint at what you wanted: "I was hoping for something more like..."
- You MAY update draft answer if the information is still useful
- Give the agent a chance to improve

Choose Mode 1 for clear violations, Mode 2 for subtle misalignments.

# Draft Answer Management
- Maintain a working draft starting with "I don't know"
- Update ONLY when:
  1. Agent provides helpful information, AND
  2. Agent follows your preferences (or you're using Mode 2 and info is still useful)
- When using Mode 1 enforcement: NEVER update draft, NEVER proceed

# Making Sessions Longer (Important!)
- Break the problem into parts - don't try to solve everything at once
- Ask clarifying questions even if you understand
- Request explanations of intermediate steps
- Ask "why" and "how" frequently
- Request verification of partial solutions
- This creates more opportunities to reveal and enforce preferences!

# Conversation Termination
Only terminate when:
1. Your draft answer fully solves the problem, OR
2. You've tried multiple times and the agent cannot help

When ready to terminate, respond with "{termination_signal}".

# Output Format:
{{
    "preference_reasoning": str,  # Which preferences apply to this situation and why?
    "enforcement_mode": str,  # "none", "mode_1_enforce", or "mode_2_disappointment"
    "enforcement_reason": str,  # If enforcing, why and which preference?
    "draft_answer": str,  # Current working answer
    "should_terminate": bool,
    "response": str  # Your response to the agent
}}

Output a valid JSON object. Use double quotes, escape special characters properly.
"""


# =============================================================================
# LLM Judge Prompts
# =============================================================================

# Judge prompt for conflict-resolution scoring; formatted by
# LLMJudge.judge_conflict_resolution(). Placeholders: {user_preferences},
# {query}, {preference_a}, {preference_b}, {correct_preference},
# {resolution_reason}, {agent_response}. Doubled braces escape the literal
# JSON output template for str.format().
CONFLICT_RESOLUTION_JUDGE_PROMPT = """
You are an expert judge evaluating whether a conversational agent correctly resolved a preference conflict.

# Situation
The user has the following CONDITIONAL preferences:
{user_preferences}

# The Query
{query}

# Conflict Analysis
This query triggers a CONFLICT between these preferences:
- Preference A: {preference_a}
- Preference B: {preference_b}

The CORRECT preference to apply is: {correct_preference}
Reason: {resolution_reason}

# Agent's Response
{agent_response}

# Your Task
Evaluate whether the agent's response correctly applied the right preference.

Consider:
1. Did the agent follow Preference A, Preference B, both, or neither?
2. Was the agent's choice appropriate for the context of the query?
3. Did trying to satisfy both preferences create an inconsistent response?

# Output Format:
{{
    "preference_followed": str,  # "A", "B", "both", "neither"
    "correct_choice_made": bool,  # True if agent followed the correct preference
    "reasoning": str,  # Explanation of your judgment
    "score": float  # 0.0 = wrong, 0.5 = partial, 1.0 = correct
}}

Output valid JSON only.
"""


# Judge prompt for per-preference adherence scoring; formatted by
# LLMJudge.judge_preference_adherence(). Placeholders: {user_preferences},
# {query}, {agent_response}.
PREFERENCE_ADHERENCE_JUDGE_PROMPT = """
You are an expert judge evaluating whether a conversational agent adhered to user preferences.

# User Preferences
{user_preferences}

# Query
{query}

# Agent Response
{agent_response}

# Your Task
For EACH preference, determine if the agent adhered to it (if applicable).

# Output Format:
{{
    "preferences_evaluation": [
        {{
            "preference_id": str,
            "applicable": bool,  # Is this preference relevant to this query?
            "adhered": bool,  # If applicable, did the agent follow it?
            "evidence": str  # Quote from response showing adherence or violation
        }},
        ...
    ],
    "overall_adherence_score": float,  # 0.0 to 1.0
    "violated_preferences": [str],  # List of preference IDs violated
    "reasoning": str
}}

Output valid JSON only.
"""


# Judge prompt for grading the user's final draft answer against ground truth;
# formatted by LLMJudge.judge_task_accuracy(). Placeholders: {problem},
# {ground_truth}, {user_answer}.
TASK_ACCURACY_JUDGE_PROMPT = """
You are an expert judge evaluating whether a user's final answer is correct.

# Problem
{problem}

# Ground Truth Solution
{ground_truth}

# User's Final Draft Answer
{user_answer}

# Your Task
Determine if the user's answer is correct. Be lenient on formatting but strict on substance.

For math problems: The numerical answer must be correct.
For code problems: The logic must be correct (minor syntax issues OK).
For reasoning problems: The conclusion and key reasoning must be correct.

# Output Format:
{{
    "is_correct": bool,
    "correctness_score": float,  # 0.0 = wrong, 0.5 = partial, 1.0 = correct
    "reasoning": str,
    "key_errors": [str]  # List any errors found
}}

Output valid JSON only.
"""


# Judge prompt for counting enforcement/disappointment/clarification/repetition
# events; formatted by LLMJudge.analyze_user_effort(). Placeholders:
# {conversation}, {user_preferences}.
USER_EFFORT_ANALYSIS_PROMPT = """
You are analyzing user effort in a conversation.

# Conversation
{conversation}

# User Preferences
{user_preferences}

# Your Task
Count and categorize user effort:

1. **Explicit Enforcements**: User directly states their preference
   - Example: "Please use bullet points like I asked"

2. **Disappointment Expressions**: User expresses dissatisfaction without explicit enforcement
   - Example: "Hmm, that's not quite what I was hoping for"

3. **Clarification Requests**: User asks for clarification due to misalignment
   - Example: "Could you explain that differently?"

4. **Repetitions**: User repeats information they already provided

# Output Format:
{{
    "explicit_enforcements": int,
    "disappointment_expressions": int,
    "clarification_requests": int,
    "repetitions": int,
    "total_user_effort": int,
    "enforcement_details": [
        {{"turn": int, "type": str, "quote": str}},
        ...
    ]
}}

Output valid JSON only.
"""


# =============================================================================
# LLM Judge Implementation
# =============================================================================

class LLMJudge:
    """LLM-backed evaluator for collaborative-agent sessions.

    Delegates every judgment to a strong instruction-tuned model (default:
    Llama-3.3-70B via litellm) across four tasks:
    - conflict resolution accuracy
    - per-preference adherence
    - final-answer correctness
    - user-effort analysis

    litellm is imported lazily so the module can be loaded without it.
    """

    def __init__(self, model_name: str = "meta-llama/Llama-3.3-70B-Instruct"):
        self.model_name = model_name
        self._model = None  # cached litellm module, populated on first use

    def _get_model(self):
        """Import litellm on first access and cache it."""
        if self._model is not None:
            return self._model
        try:
            import litellm
        except ImportError:
            raise ImportError("litellm required for LLM judge")
        self._model = litellm
        return self._model

    def _call_llm(self, prompt: str) -> str:
        """Send *prompt* as a single user message and return the reply text."""
        completion = self._get_model().completion(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,  # deterministic judging
            max_tokens=2048,
        )
        return completion.choices[0].message.content

    def _parse_json(self, text: str) -> dict:
        """Best-effort JSON extraction from an LLM reply.

        Tries, in order: the raw text, the contents of a fenced ``` block,
        and the outermost {...} span. Returns an error dict (keys "error"
        and "raw") when none of them parse.
        """
        import json
        import re

        candidates = [text]

        fenced = re.search(r'```(?:json)?\s*([\s\S]*?)```', text)
        if fenced:
            candidates.append(fenced.group(1))

        braced = re.search(r'\{[\s\S]*\}', text)
        if braced:
            candidates.append(braced.group())

        for candidate in candidates:
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                continue

        return {"error": "Failed to parse JSON", "raw": text}

    def judge_conflict_resolution(
        self,
        query: str,
        agent_response: str,
        user_preferences: list,
        preference_a: dict,
        preference_b: dict,
        correct_preference: str,
        resolution_reason: str
    ) -> dict:
        """Score whether the agent applied the correct side of a preference conflict."""
        pref_lines = "\n".join(f"- {p}" for p in user_preferences)
        prompt = CONFLICT_RESOLUTION_JUDGE_PROMPT.format(
            user_preferences=pref_lines,
            query=query,
            preference_a=f"{preference_a['condition']}: {preference_a['action']}",
            preference_b=f"{preference_b['condition']}: {preference_b['action']}",
            correct_preference=correct_preference,
            resolution_reason=resolution_reason,
            agent_response=agent_response,
        )
        return self._parse_json(self._call_llm(prompt))

    def judge_preference_adherence(
        self,
        query: str,
        agent_response: str,
        user_preferences: list
    ) -> dict:
        """Score per-preference adherence of the agent's response."""
        # Falls back to the list index when a preference carries no pref_id.
        rendered = "\n".join(
            f"- [{pref.get('pref_id', idx)}] When {pref['condition']}: {pref['action']}"
            for idx, pref in enumerate(user_preferences)
        )
        prompt = PREFERENCE_ADHERENCE_JUDGE_PROMPT.format(
            user_preferences=rendered,
            query=query,
            agent_response=agent_response,
        )
        return self._parse_json(self._call_llm(prompt))

    def judge_task_accuracy(
        self,
        problem: str,
        ground_truth: str,
        user_answer: str
    ) -> dict:
        """Grade the user's final draft answer against the ground truth."""
        prompt = TASK_ACCURACY_JUDGE_PROMPT.format(
            problem=problem,
            ground_truth=ground_truth,
            user_answer=user_answer,
        )
        return self._parse_json(self._call_llm(prompt))

    def analyze_user_effort(
        self,
        conversation: list,
        user_preferences: list
    ) -> dict:
        """Count enforcement/disappointment/clarification/repetition events.

        *conversation* is a list of {"role", "content"} message dicts.
        """
        transcript = "\n".join(
            f"{message['role'].upper()}: {message['content']}"
            for message in conversation
        )
        rendered = "\n".join(
            f"- When {pref['condition']}: {pref['action']}"
            for pref in user_preferences
        )
        prompt = USER_EFFORT_ANALYSIS_PROMPT.format(
            conversation=transcript,
            user_preferences=rendered,
        )
        return self._parse_json(self._call_llm(prompt))


# =============================================================================
# Helper Functions
# =============================================================================

def enhance_problem_with_step_by_step(problem: str, domain: str) -> str:
    """Append the domain-appropriate step-by-step instructions to *problem*."""
    return "\n\n".join((problem, get_step_by_step_prompt(domain)))


def format_preferences_for_user_prompt(preferences: list) -> str:
    """Render conditional preferences as a 1-based numbered list.

    Each preference dict may carry 'condition' (default "always"), 'action'
    (default ""), and an optional 'conflict_group' that, when truthy, is
    appended as a "[Group: ...]" tag.
    """
    rendered = []
    for number, pref in enumerate(preferences, start=1):
        entry = f"{number}. **When {pref.get('condition', 'always')}**: {pref.get('action', '')}"
        group = pref.get('conflict_group')
        if group:
            entry += f" [Group: {group}]"
        rendered.append(entry)
    return "\n".join(rendered)