1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
|
"""
Conflict Scenario Generator
Generates queries that deliberately trigger preference conflicts.
The key insight: RAG naturally resolves conflicts by retrieving ONLY
the relevant preference, while context-based methods see ALL preferences
and get confused.
Design principles:
1. Every test query should trigger 2+ conflicting preferences
2. Only ONE preference is correct given the full context
3. RAG retrieves the correct one (high similarity to query)
4. Context methods see both and often pick wrong one or try to satisfy both
"""
import json
import random
from dataclasses import dataclass, field
from typing import Optional
from pathlib import Path
# ============================================================================
# Conflict Templates
# ============================================================================
@dataclass
class ConflictScenario:
    """A scenario that triggers a preference conflict."""
    # Unique id, formatted as "<conflict_group>_<index:03d>" (see _create_scenario).
    scenario_id: str
    conflict_group: str
    # The user query that deliberately triggers 2+ conflicting preferences.
    query: str
    context_cues: list  # What makes the correct preference clear
    triggered_prefs: list  # Preference IDs that could apply
    correct_pref_id: str  # The one that SHOULD apply
    wrong_pref_ids: list  # The ones that should NOT apply
    why_correct: str  # Explanation for ground truth
    expected_rag_behavior: str  # What RAG should do
    expected_context_failure: str  # How context methods fail
# Core conflict scenarios - each designed to fail context methods
# Maps conflict-group name -> list of scenario templates. Template keys:
#   query:             user query that triggers the conflict
#   context_cues:      cues in the query that disambiguate the correct pref
#   correct_for:       ground-truth description of the right resolution
#   why_context_fails: expected failure mode of context-stuffing methods
#   why_rag_wins:      expected behavior of retrieval-based resolution
CONFLICT_TEMPLATES = {
    # =========================================================================
    # FORMAT CONFLICTS
    # =========================================================================
    "format_bullets_vs_numbered": [
        {
            "query": "What are the steps to deploy a Docker container? Also list the common mistakes to avoid.",
            "context_cues": ["steps to deploy = procedure", "list mistakes = enumeration"],
            "correct_for": "both apply to different parts",
            "why_context_fails": "Context sees both prefs, might use one format for everything",
            "why_rag_wins": "RAG retrieves procedure-pref for deploy part, list-pref for mistakes part"
        },
        {
            "query": "Walk me through setting up CI/CD - what tools should I consider?",
            "context_cues": ["walk through = sequential", "consider = options"],
            "correct_for": "numbered for walkthrough, bullets for tools",
            "why_context_fails": "Mixes formats inconsistently",
            "why_rag_wins": "Retrieves appropriate format preference per section"
        },
        {
            "query": "How do I configure nginx? Give me the key parameters.",
            "context_cues": ["how do I = procedure", "key parameters = list"],
            "correct_for": "numbered steps + bulleted parameters",
            "why_context_fails": "Context methods apply one format to all",
            "why_rag_wins": "Separate retrieval for procedure vs enumeration context"
        }
    ],
    "format_answer_first_vs_buildup": [
        {
            "query": "What's the time complexity of quicksort and why?",
            "context_cues": ["what's = direct question", "why = needs explanation"],
            "correct_for": "answer first (O(n log n)), then explain why",
            "why_context_fails": "Either gives answer without why, or long buildup first",
            "why_rag_wins": "Retrieves 'answer first' for 'what's', builds explanation for 'why'"
        },
        {
            "query": "Explain how neural networks learn - what's backpropagation?",
            "context_cues": ["explain how = learning", "what's = definition needed"],
            "correct_for": "build up intuition for 'how', then define backprop",
            "why_context_fails": "Starts with backprop definition (answer first) losing context",
            "why_rag_wins": "Identifies learning intent first, answer-seeking second"
        }
    ],
    # =========================================================================
    # VERBOSITY CONFLICTS
    # =========================================================================
    "verbosity_concise_vs_detailed": [
        {
            "query": "Quick question - how does the GIL work in Python?",
            "context_cues": ["quick question = brevity cue", "GIL = complex topic"],
            "correct_for": "concise (user said quick)",
            "why_context_fails": "Sees 'complex topic' pref, gives long explanation",
            "why_rag_wins": "Explicit brevity cue has higher retrieval score"
        },
        {
            "query": "Briefly explain the proof of the halting problem.",
            "context_cues": ["briefly = brevity", "proof = normally detailed"],
            "correct_for": "concise - user explicitly asked for brief",
            "why_context_fails": "Proof preference triggers long format",
            "why_rag_wins": "'Briefly' in query matches concise preference strongly"
        },
        {
            "query": "TL;DR on microservices vs monolith for a startup?",
            "context_cues": ["TL;DR = max brevity", "comparison = could be detailed"],
            "correct_for": "ultra-concise comparison",
            "why_context_fails": "Comparison pref might trigger table/detailed analysis",
            "why_rag_wins": "TL;DR keyword retrieves brevity preference"
        },
        {
            "query": "In detail, what's 2+2?",
            "context_cues": ["in detail = verbosity cue", "2+2 = trivial"],
            "correct_for": "brief (topic too simple for detail)",
            "why_context_fails": "Might over-explain simple arithmetic",
            "why_rag_wins": "Query simplicity context overrides detail cue"
        }
    ],
    # =========================================================================
    # CODE STYLE CONFLICTS
    # =========================================================================
    "code_naming_convention": [
        {
            "query": "Write a function to parse JSON, show it in Python and JavaScript.",
            "context_cues": ["Python = snake_case", "JavaScript = camelCase"],
            "correct_for": "snake_case for Python version, camelCase for JS version",
            "why_context_fails": "Picks one convention for both, or inconsistent",
            "why_rag_wins": "Language detection triggers correct convention per block"
        },
        {
            "query": "Convert this Python script to TypeScript: def get_user_data(): ...",
            "context_cues": ["Python source = snake_case", "TypeScript target = camelCase"],
            "correct_for": "convert snake_case to camelCase in TypeScript output",
            "why_context_fails": "Might keep snake_case in TypeScript",
            "why_rag_wins": "Output language triggers appropriate convention"
        },
        {
            "query": "Write SQL to join users and orders, then show Python code to run it.",
            "context_cues": ["SQL = UPPERCASE keywords", "Python = snake_case"],
            "correct_for": "SQL: SELECT, FROM; Python: result_set, fetch_data",
            "why_context_fails": "Style bleeds across languages",
            "why_rag_wins": "Separate retrieval for each language context"
        }
    ],
    "code_comment_style": [
        {
            "query": "Here's a 5-line utility function, explain what each part does.",
            "context_cues": ["5-line = short", "explain each part = inline comments"],
            "correct_for": "inline comments for each line",
            "why_context_fails": "Might use docstring style for short code",
            "why_rag_wins": "Short code + explanation request = inline comments"
        },
        {
            "query": "Write a complete data processing class with documentation.",
            "context_cues": ["complete class = production code", "documentation = docstrings"],
            "correct_for": "docstrings at class/method level, minimal inline",
            "why_context_fails": "Over-comments with inline explanations",
            "why_rag_wins": "Class + documentation context triggers docstring pref"
        }
    ],
    "code_review_scope": [
        {
            "query": "Review this code for bugs, I need to ship it today.",
            "context_cues": ["review = code review", "ship today = urgent, bugs only"],
            "correct_for": "bugs only, skip style",
            "why_context_fails": "Still comments on style issues",
            "why_rag_wins": "Urgency cue + 'bugs' retrieves bugs-only preference"
        },
        {
            "query": "Look at my code and help me improve it for the codebase.",
            "context_cues": ["improve = refactor scope", "for codebase = style matters"],
            "correct_for": "both logic and style suggestions",
            "why_context_fails": "Might only focus on bugs",
            "why_rag_wins": "'Improve' and 'codebase' retrieve full-review pref"
        }
    ],
    # =========================================================================
    # INTERACTION CONFLICTS
    # =========================================================================
    "interaction_autonomy": [
        {
            "query": "Refactor the authentication module.",
            "context_cues": ["refactor = significant change", "no specific instruction"],
            "correct_for": "confirm approach first",
            "why_context_fails": "Might just start refactoring without plan",
            "why_rag_wins": "Ambiguous scope triggers confirmation pref"
        },
        {
            "query": "Change the variable name from 'x' to 'count' in line 5.",
            "context_cues": ["specific instruction", "single change"],
            "correct_for": "execute directly, no confirmation needed",
            "why_context_fails": "Might still ask for confirmation",
            "why_rag_wins": "Specific instruction retrieves execute-directly pref"
        },
        {
            "query": "Update the database schema to add user preferences - it's complex.",
            "context_cues": ["update schema = significant", "complex = acknowledged"],
            "correct_for": "definitely confirm - user said it's complex",
            "why_context_fails": "Might dive in because 'update' sounds actionable",
            "why_rag_wins": "'Complex' keyword strongly triggers confirmation"
        }
    ],
    "interaction_guidance": [
        {
            "query": "Should I use Redis or Memcached for caching?",
            "context_cues": ["should I = asking for recommendation", "or = comparison"],
            "correct_for": "give recommendation with rationale",
            "why_context_fails": "Gives neutral pros/cons without recommendation",
            "why_rag_wins": "'Should I' retrieves recommendation preference"
        },
        {
            "query": "Compare React, Vue, and Angular for my project.",
            "context_cues": ["compare = explicit comparison", "my project = context needed"],
            "correct_for": "table format with tradeoffs",
            "why_context_fails": "Might just recommend one or give long prose",
            "why_rag_wins": "'Compare' retrieves comparison-table preference"
        }
    ],
    # =========================================================================
    # MATH/EXPLANATION CONFLICTS
    # =========================================================================
    "math_detail_level": [
        {
            "query": "What's the derivative of x^2? I'm preparing for an exam.",
            "context_cues": ["what's = direct ask", "exam prep = practice context"],
            "correct_for": "show steps + give practice problem",
            "why_context_fails": "Just gives answer (2x) without exam context",
            "why_rag_wins": "'Exam' retrieves practice-problem preference"
        },
        {
            "query": "Verify my answer: integral of sin(x) = -cos(x) + C. Is this right?",
            "context_cues": ["verify = checking work", "is this right = confirmation"],
            "correct_for": "check step by step, confirm or point out issue",
            "why_context_fails": "Might re-derive from scratch",
            "why_rag_wins": "'Verify' retrieves check-their-work preference"
        }
    ],
    "math_approach": [
        {
            "query": "What's the probability of rolling two sixes?",
            "context_cues": ["probability = statistics", "rolling dice = intuitive example"],
            "correct_for": "intuition first (1 in 36), then formula",
            "why_context_fails": "Starts with P(A∩B) = P(A)P(B) formula",
            "why_rag_wins": "Statistics topic retrieves intuition-first preference"
        },
        {
            "query": "Prove that the sum of angles in a triangle is 180°.",
            "context_cues": ["prove = formal proof", "geometry = visual possible"],
            "correct_for": "structured proof format per preference",
            "why_context_fails": "Might give intuitive explanation instead of proof",
            "why_rag_wins": "'Prove' retrieves proof-format preference"
        }
    ],
    # =========================================================================
    # DOMAIN CONFLICTS
    # =========================================================================
    "domain_example_position": [
        {
            "query": "How do I use the requests library in Python?",
            "context_cues": ["how do I use = practical/API", "library = code example helpful"],
            "correct_for": "minimal example first, then explain parameters",
            "why_context_fails": "Explains parameters first, example last",
            "why_rag_wins": "API/library context retrieves example-first preference"
        },
        {
            "query": "What is dynamic programming?",
            "context_cues": ["what is = concept/theory", "definition needed"],
            "correct_for": "definition first, then example, then edge cases",
            "why_context_fails": "Might lead with example (Fibonacci)",
            "why_rag_wins": "Theory context retrieves definition-first preference"
        }
    ],
    # =========================================================================
    # OUTPUT ARTIFACT CONFLICTS
    # =========================================================================
    "output_code_presentation": [
        {
            "query": "Give me a sorting function I can use, I'm in a hurry.",
            "context_cues": ["give me = copyable", "in a hurry = no explanation"],
            "correct_for": "single code block, no prose",
            "why_context_fails": "Adds explanatory prose between code",
            "why_rag_wins": "'Give me' + 'hurry' retrieves copy-paste preference"
        },
        {
            "query": "Teach me how to implement quicksort step by step.",
            "context_cues": ["teach me = learning", "step by step = chunked"],
            "correct_for": "code in small chunks with explanation between",
            "why_context_fails": "Gives full implementation at once",
            "why_rag_wins": "'Teach' + 'step by step' retrieves chunked preference"
        }
    ],
    # =========================================================================
    # CORRECTION STYLE CONFLICTS
    # =========================================================================
    "correction_severity": [
        {
            "query": "I'm using a hashmap to store my data, is this right?",
            "context_cues": ["hashmap = might mean dict/map", "is this right = validation"],
            "correct_for": "gentle inline (hashmap is fine, also called dict)",
            "why_context_fails": "Might pedantically correct terminology",
            "why_rag_wins": "Minor terminology + validation retrieves gentle-correction pref"
        },
        {
            "query": "I think recursion is just loops with extra steps, right?",
            "context_cues": ["fundamental misconception", "asking for validation"],
            "correct_for": "directly address misconception before proceeding",
            "why_context_fails": "Might gloss over and just show recursion",
            "why_rag_wins": "Fundamental error retrieves explicit-correction preference"
        }
    ],
    # =========================================================================
    # MULTI-DOMAIN CONFLICTS (hardest!)
    # =========================================================================
    "multi_domain_complex": [
        {
            "query": "Quick question - walk me through implementing a binary tree in Python with proper documentation.",
            "context_cues": ["quick = brief", "walk through = detailed", "documentation = thorough"],
            "correct_for": "quick wins (explicit), but include docstrings (documentation ask)",
            "why_context_fails": "Confused by conflicting signals, inconsistent response",
            "why_rag_wins": "Explicit brevity cue retrieved, documentation pref adds docstrings"
        },
        {
            "query": "I'm debugging my ML model and it's not converging. This is frustrating! Compare Adam vs SGD for me.",
            "context_cues": ["debugging = focus on issue", "frustrating = emotional", "compare = table"],
            "correct_for": "acknowledge frustration, then comparison table for optimizers",
            "why_context_fails": "Might skip emotional acknowledgment or wrong format",
            "why_rag_wins": "Frustration pref + comparison pref both retrieved, applied in order"
        },
        {
            "query": "Review this Python code and convert it to JavaScript. Focus on bugs first.",
            "context_cues": ["review = bugs per 'focus' cue", "convert = language change"],
            "correct_for": "Python review (bugs only) + JS conversion (camelCase)",
            "why_context_fails": "Applies wrong scope or wrong naming convention",
            "why_rag_wins": "Multiple relevant prefs retrieved per task segment"
        }
    ]
}
# ============================================================================
# Scenario Generator
# ============================================================================
class ConflictScenarioGenerator:
    """Generates conflict scenarios from templates and user profiles.

    A "profile" is a dict with at least a 'preferences' list; each preference
    is a dict carrying 'pref_id' and, optionally, 'conflict_group',
    'priority_context' (keywords) and 'condition' (a phrase that may appear
    in a query). Sampling is driven by a private seeded RNG so results are
    reproducible per generator instance.
    """

    def __init__(self, profile: dict = None, seed: int = 42):
        """Store the profile, index its preferences by id, and seed the RNG."""
        self.profile = profile
        # Index preferences by pref_id for O(1) lookup; empty when no profile.
        self.preferences = {p['pref_id']: p for p in profile['preferences']} if profile else {}
        # Private RNG so this generator neither perturbs nor depends on the
        # global `random` state.
        self.random = random.Random(seed)

    def generate_for_profile(self, preferences: list, domain: str = None) -> dict:
        """Generate a single conflict scenario for given preferences and domain.

        Returns a dict with the chosen template query and the conflicting
        preferences, or None when no conflict group has at least two
        preferences with a matching template. `domain` is currently unused
        and kept for interface compatibility.
        """
        # Bucket the preferences by their conflict_group tag (untagged
        # preferences cannot conflict and are skipped).
        conflict_groups = {}
        for pref in preferences:
            cg = pref.get('conflict_group')
            if cg:
                conflict_groups.setdefault(cg, []).append(pref)
        # Use the first group that has a real conflict (2+ prefs) and a
        # known template.
        for cg, prefs in conflict_groups.items():
            if len(prefs) >= 2 and cg in CONFLICT_TEMPLATES:
                template = self.random.choice(CONFLICT_TEMPLATES[cg])
                return {
                    "query": template['query'],
                    "conflict_group": cg,
                    "preferences": prefs,
                    "expected_preference": prefs[0]['pref_id'],  # First one as expected
                }
        return None

    def generate_scenarios(self, num_per_conflict_type: int = 3) -> list:
        """Generate conflict scenarios based on profile's preferences.

        Requires a profile with a 'preferences' list; returns a list of
        ConflictScenario objects, up to `num_per_conflict_type` per
        conflict group the profile actually has a conflict in.
        """
        scenarios = []
        for conflict_group, templates in CONFLICT_TEMPLATES.items():
            # Check if this conflict group exists in user's preferences.
            relevant_prefs = [
                p for p in self.profile['preferences']
                if p.get('conflict_group') == conflict_group
            ]
            if len(relevant_prefs) < 2:
                continue  # Need at least 2 prefs to have a conflict
            # Sample templates without replacement, capped at availability.
            selected_templates = self.random.sample(
                templates,
                min(num_per_conflict_type, len(templates))
            )
            for i, template in enumerate(selected_templates):
                scenario = self._create_scenario(
                    conflict_group, template, relevant_prefs, i
                )
                if scenario:
                    scenarios.append(scenario)
        return scenarios

    def _create_scenario(
        self,
        conflict_group: str,
        template: dict,
        relevant_prefs: list,
        index: int
    ) -> "ConflictScenario":
        """Create a ConflictScenario from a template.

        The ground-truth preference is picked by keyword matching against
        the template's query; all other relevant preferences are recorded
        as the "wrong" ones.
        """
        query = template['query']
        correct_pref = self._determine_correct_preference(query, relevant_prefs)
        wrong_prefs = [p for p in relevant_prefs if p['pref_id'] != correct_pref['pref_id']]
        return ConflictScenario(
            scenario_id=f"{conflict_group}_{index:03d}",
            conflict_group=conflict_group,
            query=query,
            context_cues=template.get('context_cues', []),
            triggered_prefs=[p['pref_id'] for p in relevant_prefs],
            correct_pref_id=correct_pref['pref_id'],
            wrong_pref_ids=[p['pref_id'] for p in wrong_prefs],
            why_correct=template.get('correct_for', ''),
            expected_rag_behavior=template.get('why_rag_wins', ''),
            expected_context_failure=template.get('why_context_fails', '')
        )

    def _determine_correct_preference(self, query: str, prefs: list) -> dict:
        """
        Determine which preference is correct for a query.

        Scores each preference: +1 per 'priority_context' keyword found in
        the query, +2 when its 'condition' phrase appears in the query.
        Returns the highest-scoring preference (earliest wins on ties).
        Raises ValueError when `prefs` is empty.
        """
        if not prefs:
            raise ValueError("prefs must be non-empty")
        query_lower = query.lower()
        best_pref, best_score = prefs[0], -1
        for pref in prefs:
            score = sum(
                1 for keyword in pref.get('priority_context', [])
                if keyword.lower() in query_lower
            )
            # BUGFIX: only award the +2 condition bonus when a non-empty
            # condition actually appears in the query. Previously the
            # empty-string default satisfied `'' in query_lower` for every
            # query, handing +2 to any preference *without* a condition and
            # letting it outvote genuine keyword matches.
            condition = pref.get('condition', '').lower()
            if condition and condition in query_lower:
                score += 2
            # Strict '>' keeps the earliest maximum — same tie-break the
            # previous stable descending sort produced.
            if score > best_score:
                best_pref, best_score = pref, score
        return best_pref
def generate_conflict_enriched_dataset(
    profiles_path: str,
    output_path: str,
    scenarios_per_conflict: int = 3,
    seed: int = 42
):
    """
    Generate a dataset where every query triggers at least one conflict.

    Reads user profiles from `profiles_path` (JSONL, one profile per line),
    generates up to `scenarios_per_conflict` scenarios per conflict type per
    profile, writes them to `output_path` as JSONL, prints coverage counts,
    and returns the scenario dicts.
    """
    # Load profiles; skip blank lines so a trailing newline in the JSONL
    # file doesn't crash json.loads.
    with open(profiles_path) as f:
        profiles = [json.loads(line) for line in f if line.strip()]
    all_scenarios = []
    conflict_coverage = {}
    for profile in profiles:
        generator = ConflictScenarioGenerator(profile, seed)
        for scenario in generator.generate_scenarios(scenarios_per_conflict):
            all_scenarios.append({
                'user_id': profile['user_id'],
                'scenario_id': scenario.scenario_id,
                'conflict_group': scenario.conflict_group,
                'query': scenario.query,
                'context_cues': scenario.context_cues,
                'triggered_prefs': scenario.triggered_prefs,
                'correct_pref_id': scenario.correct_pref_id,
                'wrong_pref_ids': scenario.wrong_pref_ids,
                'why_correct': scenario.why_correct,
                'expected_rag_behavior': scenario.expected_rag_behavior,
                'expected_context_failure': scenario.expected_context_failure
            })
            # Track how many scenarios each conflict type produced.
            cg = scenario.conflict_group
            conflict_coverage[cg] = conflict_coverage.get(cg, 0) + 1
    # Save as JSONL, creating the output directory if needed.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        for scenario in all_scenarios:
            f.write(json.dumps(scenario) + '\n')
    print(f"Generated {len(all_scenarios)} conflict scenarios")
    print("Coverage by conflict type:")  # was an f-string with no placeholder
    for cg, count in sorted(conflict_coverage.items()):
        print(f"  {cg}: {count}")
    return all_scenarios
def create_evaluation_harness(scenarios: list) -> dict:
    """
    Create an evaluation harness that programmatically checks
    if the correct preference was applied.
    """
    # Bucket scenarios by their conflict type.
    grouped = {}
    for item in scenarios:
        grouped.setdefault(item['conflict_group'], []).append(item)
    # Per-conflict-type programmatic checkers, keyed by aspect name.
    checkers = {
        "format_structure": check_format_structure,
        "verbosity": check_verbosity,
        "naming_convention": check_naming_convention,
        "answer_position": check_answer_position,
        # ... more evaluators
    }
    return {
        "total_scenarios": len(scenarios),
        "by_conflict_type": grouped,
        "evaluation_functions": checkers,
    }
# ============================================================================
# Evaluation Functions (check if correct preference was applied)
# ============================================================================
def check_format_structure(response: str, correct_pref: dict) -> bool:
    """Check if response uses correct format (bullets vs numbered).

    Detection is line-anchored: a bullet item is a line starting with
    '-', '*' or '•' plus whitespace; a numbered item is a line starting
    with digits plus '.' or ')'. The previous character-level scan
    counted any hyphen in prose (e.g. "step-by-step") as a bullet and
    "3.14" as a numbered item. Returns True when the preference does not
    constrain structure.
    """
    import re
    lines = response.splitlines()
    has_bullets = any(re.match(r'\s*[-*•]\s+\S', line) for line in lines)
    has_numbers = any(re.match(r'\s*\d+[.)]\s+\S', line) for line in lines)
    action = correct_pref.get('action', '').lower()
    if 'bullet' in action:
        return has_bullets and not has_numbers
    elif 'numbered' in action:
        return has_numbers
    return True  # Can't determine
def check_verbosity(response: str, correct_pref: dict) -> bool:
    """Check if response matches verbosity preference.

    Uses rough word-count thresholds: under 100 words counts as concise,
    over 150 as detailed. Returns True when the preference's action text
    implies neither.
    """
    word_count = len(response.split())
    # Hoisted: the action string was re-fetched and re-lowercased for
    # every comparison below.
    action = correct_pref.get('action', '').lower()
    if 'concise' in action or '3 sentences' in action:
        return word_count < 100  # Rough threshold
    elif 'detailed' in action:
        return word_count > 150
    return True
def check_naming_convention(response: str, correct_pref: dict) -> bool:
    """Check if code uses correct naming convention.

    Heuristic: lowercase_words_with_underscores signal snake_case;
    lowerUpperLower runs signal camelCase. Returns True when the
    preference names neither convention.
    """
    import re
    action = correct_pref.get('action', '').lower()
    snake_re = r'[a-z]+_[a-z]+'
    camel_re = r'[a-z]+[A-Z][a-z]+'
    if 'snake_case' in action:
        # Should have underscores, no camelCase.
        has_snake = bool(re.search(snake_re, response))
        has_camel = bool(re.search(camel_re, response))
        return has_snake and not has_camel
    # BUGFIX: compare against the lowercased token. The original tested
    # `'camelCase' in action.lower()`, which can never be True (the
    # needle contains an uppercase 'C'), so camelCase preferences always
    # fell through to the permissive `return True`.
    elif 'camelcase' in action:
        return bool(re.search(camel_re, response))
    return True
def check_answer_position(response: str, correct_pref: dict) -> bool:
    """Check if answer comes first or explanation builds up."""
    # Approximate the opening sentence: text before the first period,
    # falling back to the first 100 characters when there is none.
    if '.' in response:
        opening = response.split('.')[0]
    else:
        opening = response[:100]
    opening = opening.lower()
    action = correct_pref.get('action', '').lower()
    if 'answer first' in action:
        # The opening should read as a direct statement of the answer.
        for marker in ['is', 'are', 'the answer', 'yes', 'no', 'it\'s']:
            if marker in opening:
                return True
        return False
    if 'build up' in action:
        # The opening should read as scene-setting / explanation.
        for marker in ['let\'s', 'first', 'to understand', 'consider']:
            if marker in opening:
                return True
        return False
    return True
# ============================================================================
# Main
# ============================================================================
# CLI entry point: generate conflict scenarios from a profiles JSONL file.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    # JSONL file with one user profile (including 'preferences') per line.
    parser.add_argument("--profiles", default="collaborativeagents/data/complex_profiles/profiles.jsonl")
    # Destination JSONL for the generated scenarios.
    parser.add_argument("--output", default="collaborativeagents/data/conflict_scenarios.jsonl")
    # Max scenarios sampled per conflict type per profile.
    parser.add_argument("--scenarios_per_conflict", type=int, default=3)
    # RNG seed for reproducible template sampling.
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()
    scenarios = generate_conflict_enriched_dataset(
        args.profiles,
        args.output,
        args.scenarios_per_conflict,
        args.seed
    )
|