summaryrefslogtreecommitdiff
path: root/collaborativeagents/scripts/conflict_scenario_generator.py
diff options
context:
space:
mode:
Diffstat (limited to 'collaborativeagents/scripts/conflict_scenario_generator.py')
-rw-r--r--collaborativeagents/scripts/conflict_scenario_generator.py637
1 files changed, 637 insertions, 0 deletions
diff --git a/collaborativeagents/scripts/conflict_scenario_generator.py b/collaborativeagents/scripts/conflict_scenario_generator.py
new file mode 100644
index 0000000..9d00de8
--- /dev/null
+++ b/collaborativeagents/scripts/conflict_scenario_generator.py
@@ -0,0 +1,637 @@
+"""
+Conflict Scenario Generator
+
+Generates queries that deliberately trigger preference conflicts.
+The key insight: RAG naturally resolves conflicts by retrieving ONLY
+the relevant preference, while context-based methods see ALL preferences
+and get confused.
+
+Design principles:
+1. Every test query should trigger 2+ conflicting preferences
+2. Only ONE preference is correct given the full context
+3. RAG retrieves the correct one (high similarity to query)
+4. Context methods see both and often pick wrong one or try to satisfy both
+"""
+
+import json
+import random
+from dataclasses import dataclass, field
+from typing import Optional
+from pathlib import Path
+
+
+# ============================================================================
+# Conflict Templates
+# ============================================================================
+
@dataclass
class ConflictScenario:
    """A scenario that triggers a preference conflict.

    Bundles a query with the set of preferences it could trigger, the
    single ground-truth preference that SHOULD win, and the expected
    behavior of RAG vs. context-based methods for later evaluation.
    """
    scenario_id: str  # Unique id, formatted "<conflict_group>_<index>"
    conflict_group: str  # Key into CONFLICT_TEMPLATES
    query: str  # The user query that triggers the conflict
    context_cues: list[str]  # What makes the correct preference clear
    triggered_prefs: list[str]  # Preference IDs that could apply
    correct_pref_id: str  # The one that SHOULD apply
    wrong_pref_ids: list[str]  # The ones that should NOT apply
    why_correct: str  # Explanation for ground truth
    expected_rag_behavior: str  # What RAG should do
    expected_context_failure: str  # How context methods fail
+
+
# Core conflict scenarios - each designed to fail context methods.
# Maps conflict_group name -> list of template dicts. Each template carries:
#   query             - user query that triggers the conflict
#   context_cues      - cues in the query that disambiguate the correct pref
#   correct_for       - ground-truth note on which preference should win
#   why_context_fails - expected failure mode of context-based methods
#   why_rag_wins      - expected behavior of the RAG baseline
CONFLICT_TEMPLATES = {
    # =========================================================================
    # FORMAT CONFLICTS
    # =========================================================================
    "format_bullets_vs_numbered": [
        {
            "query": "What are the steps to deploy a Docker container? Also list the common mistakes to avoid.",
            "context_cues": ["steps to deploy = procedure", "list mistakes = enumeration"],
            "correct_for": "both apply to different parts",
            "why_context_fails": "Context sees both prefs, might use one format for everything",
            "why_rag_wins": "RAG retrieves procedure-pref for deploy part, list-pref for mistakes part"
        },
        {
            "query": "Walk me through setting up CI/CD - what tools should I consider?",
            "context_cues": ["walk through = sequential", "consider = options"],
            "correct_for": "numbered for walkthrough, bullets for tools",
            "why_context_fails": "Mixes formats inconsistently",
            "why_rag_wins": "Retrieves appropriate format preference per section"
        },
        {
            "query": "How do I configure nginx? Give me the key parameters.",
            "context_cues": ["how do I = procedure", "key parameters = list"],
            "correct_for": "numbered steps + bulleted parameters",
            "why_context_fails": "Context methods apply one format to all",
            "why_rag_wins": "Separate retrieval for procedure vs enumeration context"
        }
    ],

    "format_answer_first_vs_buildup": [
        {
            "query": "What's the time complexity of quicksort and why?",
            "context_cues": ["what's = direct question", "why = needs explanation"],
            "correct_for": "answer first (O(n log n)), then explain why",
            "why_context_fails": "Either gives answer without why, or long buildup first",
            "why_rag_wins": "Retrieves 'answer first' for 'what's', builds explanation for 'why'"
        },
        {
            "query": "Explain how neural networks learn - what's backpropagation?",
            "context_cues": ["explain how = learning", "what's = definition needed"],
            "correct_for": "build up intuition for 'how', then define backprop",
            "why_context_fails": "Starts with backprop definition (answer first) losing context",
            "why_rag_wins": "Identifies learning intent first, answer-seeking second"
        }
    ],

    # =========================================================================
    # VERBOSITY CONFLICTS
    # =========================================================================
    "verbosity_concise_vs_detailed": [
        {
            "query": "Quick question - how does the GIL work in Python?",
            "context_cues": ["quick question = brevity cue", "GIL = complex topic"],
            "correct_for": "concise (user said quick)",
            "why_context_fails": "Sees 'complex topic' pref, gives long explanation",
            "why_rag_wins": "Explicit brevity cue has higher retrieval score"
        },
        {
            "query": "Briefly explain the proof of the halting problem.",
            "context_cues": ["briefly = brevity", "proof = normally detailed"],
            "correct_for": "concise - user explicitly asked for brief",
            "why_context_fails": "Proof preference triggers long format",
            "why_rag_wins": "'Briefly' in query matches concise preference strongly"
        },
        {
            "query": "TL;DR on microservices vs monolith for a startup?",
            "context_cues": ["TL;DR = max brevity", "comparison = could be detailed"],
            "correct_for": "ultra-concise comparison",
            "why_context_fails": "Comparison pref might trigger table/detailed analysis",
            "why_rag_wins": "TL;DR keyword retrieves brevity preference"
        },
        {
            "query": "In detail, what's 2+2?",
            "context_cues": ["in detail = verbosity cue", "2+2 = trivial"],
            "correct_for": "brief (topic too simple for detail)",
            "why_context_fails": "Might over-explain simple arithmetic",
            "why_rag_wins": "Query simplicity context overrides detail cue"
        }
    ],

    # =========================================================================
    # CODE STYLE CONFLICTS
    # =========================================================================
    "code_naming_convention": [
        {
            "query": "Write a function to parse JSON, show it in Python and JavaScript.",
            "context_cues": ["Python = snake_case", "JavaScript = camelCase"],
            "correct_for": "snake_case for Python version, camelCase for JS version",
            "why_context_fails": "Picks one convention for both, or inconsistent",
            "why_rag_wins": "Language detection triggers correct convention per block"
        },
        {
            "query": "Convert this Python script to TypeScript: def get_user_data(): ...",
            "context_cues": ["Python source = snake_case", "TypeScript target = camelCase"],
            "correct_for": "convert snake_case to camelCase in TypeScript output",
            "why_context_fails": "Might keep snake_case in TypeScript",
            "why_rag_wins": "Output language triggers appropriate convention"
        },
        {
            "query": "Write SQL to join users and orders, then show Python code to run it.",
            "context_cues": ["SQL = UPPERCASE keywords", "Python = snake_case"],
            "correct_for": "SQL: SELECT, FROM; Python: result_set, fetch_data",
            "why_context_fails": "Style bleeds across languages",
            "why_rag_wins": "Separate retrieval for each language context"
        }
    ],

    "code_comment_style": [
        {
            "query": "Here's a 5-line utility function, explain what each part does.",
            "context_cues": ["5-line = short", "explain each part = inline comments"],
            "correct_for": "inline comments for each line",
            "why_context_fails": "Might use docstring style for short code",
            "why_rag_wins": "Short code + explanation request = inline comments"
        },
        {
            "query": "Write a complete data processing class with documentation.",
            "context_cues": ["complete class = production code", "documentation = docstrings"],
            "correct_for": "docstrings at class/method level, minimal inline",
            "why_context_fails": "Over-comments with inline explanations",
            "why_rag_wins": "Class + documentation context triggers docstring pref"
        }
    ],

    "code_review_scope": [
        {
            "query": "Review this code for bugs, I need to ship it today.",
            "context_cues": ["review = code review", "ship today = urgent, bugs only"],
            "correct_for": "bugs only, skip style",
            "why_context_fails": "Still comments on style issues",
            "why_rag_wins": "Urgency cue + 'bugs' retrieves bugs-only preference"
        },
        {
            "query": "Look at my code and help me improve it for the codebase.",
            "context_cues": ["improve = refactor scope", "for codebase = style matters"],
            "correct_for": "both logic and style suggestions",
            "why_context_fails": "Might only focus on bugs",
            "why_rag_wins": "'Improve' and 'codebase' retrieve full-review pref"
        }
    ],

    # =========================================================================
    # INTERACTION CONFLICTS
    # =========================================================================
    "interaction_autonomy": [
        {
            "query": "Refactor the authentication module.",
            "context_cues": ["refactor = significant change", "no specific instruction"],
            "correct_for": "confirm approach first",
            "why_context_fails": "Might just start refactoring without plan",
            "why_rag_wins": "Ambiguous scope triggers confirmation pref"
        },
        {
            "query": "Change the variable name from 'x' to 'count' in line 5.",
            "context_cues": ["specific instruction", "single change"],
            "correct_for": "execute directly, no confirmation needed",
            "why_context_fails": "Might still ask for confirmation",
            "why_rag_wins": "Specific instruction retrieves execute-directly pref"
        },
        {
            "query": "Update the database schema to add user preferences - it's complex.",
            "context_cues": ["update schema = significant", "complex = acknowledged"],
            "correct_for": "definitely confirm - user said it's complex",
            "why_context_fails": "Might dive in because 'update' sounds actionable",
            "why_rag_wins": "'Complex' keyword strongly triggers confirmation"
        }
    ],

    "interaction_guidance": [
        {
            "query": "Should I use Redis or Memcached for caching?",
            "context_cues": ["should I = asking for recommendation", "or = comparison"],
            "correct_for": "give recommendation with rationale",
            "why_context_fails": "Gives neutral pros/cons without recommendation",
            "why_rag_wins": "'Should I' retrieves recommendation preference"
        },
        {
            "query": "Compare React, Vue, and Angular for my project.",
            "context_cues": ["compare = explicit comparison", "my project = context needed"],
            "correct_for": "table format with tradeoffs",
            "why_context_fails": "Might just recommend one or give long prose",
            "why_rag_wins": "'Compare' retrieves comparison-table preference"
        }
    ],

    # =========================================================================
    # MATH/EXPLANATION CONFLICTS
    # =========================================================================
    "math_detail_level": [
        {
            "query": "What's the derivative of x^2? I'm preparing for an exam.",
            "context_cues": ["what's = direct ask", "exam prep = practice context"],
            "correct_for": "show steps + give practice problem",
            "why_context_fails": "Just gives answer (2x) without exam context",
            "why_rag_wins": "'Exam' retrieves practice-problem preference"
        },
        {
            "query": "Verify my answer: integral of sin(x) = -cos(x) + C. Is this right?",
            "context_cues": ["verify = checking work", "is this right = confirmation"],
            "correct_for": "check step by step, confirm or point out issue",
            "why_context_fails": "Might re-derive from scratch",
            "why_rag_wins": "'Verify' retrieves check-their-work preference"
        }
    ],

    "math_approach": [
        {
            "query": "What's the probability of rolling two sixes?",
            "context_cues": ["probability = statistics", "rolling dice = intuitive example"],
            "correct_for": "intuition first (1 in 36), then formula",
            "why_context_fails": "Starts with P(A∩B) = P(A)P(B) formula",
            "why_rag_wins": "Statistics topic retrieves intuition-first preference"
        },
        {
            "query": "Prove that the sum of angles in a triangle is 180°.",
            "context_cues": ["prove = formal proof", "geometry = visual possible"],
            "correct_for": "structured proof format per preference",
            "why_context_fails": "Might give intuitive explanation instead of proof",
            "why_rag_wins": "'Prove' retrieves proof-format preference"
        }
    ],

    # =========================================================================
    # DOMAIN CONFLICTS
    # =========================================================================
    "domain_example_position": [
        {
            "query": "How do I use the requests library in Python?",
            "context_cues": ["how do I use = practical/API", "library = code example helpful"],
            "correct_for": "minimal example first, then explain parameters",
            "why_context_fails": "Explains parameters first, example last",
            "why_rag_wins": "API/library context retrieves example-first preference"
        },
        {
            "query": "What is dynamic programming?",
            "context_cues": ["what is = concept/theory", "definition needed"],
            "correct_for": "definition first, then example, then edge cases",
            "why_context_fails": "Might lead with example (Fibonacci)",
            "why_rag_wins": "Theory context retrieves definition-first preference"
        }
    ],

    # =========================================================================
    # OUTPUT ARTIFACT CONFLICTS
    # =========================================================================
    "output_code_presentation": [
        {
            "query": "Give me a sorting function I can use, I'm in a hurry.",
            "context_cues": ["give me = copyable", "in a hurry = no explanation"],
            "correct_for": "single code block, no prose",
            "why_context_fails": "Adds explanatory prose between code",
            "why_rag_wins": "'Give me' + 'hurry' retrieves copy-paste preference"
        },
        {
            "query": "Teach me how to implement quicksort step by step.",
            "context_cues": ["teach me = learning", "step by step = chunked"],
            "correct_for": "code in small chunks with explanation between",
            "why_context_fails": "Gives full implementation at once",
            "why_rag_wins": "'Teach' + 'step by step' retrieves chunked preference"
        }
    ],

    # =========================================================================
    # CORRECTION STYLE CONFLICTS
    # =========================================================================
    "correction_severity": [
        {
            "query": "I'm using a hashmap to store my data, is this right?",
            "context_cues": ["hashmap = might mean dict/map", "is this right = validation"],
            "correct_for": "gentle inline (hashmap is fine, also called dict)",
            "why_context_fails": "Might pedantically correct terminology",
            "why_rag_wins": "Minor terminology + validation retrieves gentle-correction pref"
        },
        {
            "query": "I think recursion is just loops with extra steps, right?",
            "context_cues": ["fundamental misconception", "asking for validation"],
            "correct_for": "directly address misconception before proceeding",
            "why_context_fails": "Might gloss over and just show recursion",
            "why_rag_wins": "Fundamental error retrieves explicit-correction preference"
        }
    ],

    # =========================================================================
    # MULTI-DOMAIN CONFLICTS (hardest!)
    # =========================================================================
    "multi_domain_complex": [
        {
            "query": "Quick question - walk me through implementing a binary tree in Python with proper documentation.",
            "context_cues": ["quick = brief", "walk through = detailed", "documentation = thorough"],
            "correct_for": "quick wins (explicit), but include docstrings (documentation ask)",
            "why_context_fails": "Confused by conflicting signals, inconsistent response",
            "why_rag_wins": "Explicit brevity cue retrieved, documentation pref adds docstrings"
        },
        {
            "query": "I'm debugging my ML model and it's not converging. This is frustrating! Compare Adam vs SGD for me.",
            "context_cues": ["debugging = focus on issue", "frustrating = emotional", "compare = table"],
            "correct_for": "acknowledge frustration, then comparison table for optimizers",
            "why_context_fails": "Might skip emotional acknowledgment or wrong format",
            "why_rag_wins": "Frustration pref + comparison pref both retrieved, applied in order"
        },
        {
            "query": "Review this Python code and convert it to JavaScript. Focus on bugs first.",
            "context_cues": ["review = bugs per 'focus' cue", "convert = language change"],
            "correct_for": "Python review (bugs only) + JS conversion (camelCase)",
            "why_context_fails": "Applies wrong scope or wrong naming convention",
            "why_rag_wins": "Multiple relevant prefs retrieved per task segment"
        }
    ]
}
+
+
+# ============================================================================
+# Scenario Generator
+# ============================================================================
+
class ConflictScenarioGenerator:
    """Generates conflict scenarios from templates and user profiles."""

    def __init__(self, profile: Optional[dict] = None, seed: int = 42):
        """
        Args:
            profile: User profile dict with a 'preferences' list; each
                preference carries a 'pref_id'. May be None when only
                generate_for_profile() is used.
            seed: Seed for the private RNG so sampling is reproducible.
        """
        self.profile = profile
        # Index preferences by id for O(1) lookup; empty without a profile.
        self.preferences = {p['pref_id']: p for p in profile['preferences']} if profile else {}
        self.random = random.Random(seed)

    def generate_for_profile(self, preferences: list, domain: str = None) -> Optional[dict]:
        """Generate a single conflict scenario for given preferences and domain.

        Args:
            preferences: Preference dicts, optionally tagged with 'conflict_group'.
            domain: Currently unused; kept for interface stability.

        Returns:
            A scenario dict for the first conflict group that has >= 2
            preferences and a matching template, or None when no such
            group exists.
        """
        # Bucket preferences by their conflict group; untagged prefs are skipped.
        conflict_groups = {}
        for pref in preferences:
            cg = pref.get('conflict_group')
            if cg:
                conflict_groups.setdefault(cg, []).append(pref)

        # A conflict needs at least 2 competing preferences and a known template.
        for cg, prefs in conflict_groups.items():
            if len(prefs) >= 2 and cg in CONFLICT_TEMPLATES:
                template = self.random.choice(CONFLICT_TEMPLATES[cg])
                return {
                    "query": template['query'],
                    "conflict_group": cg,
                    "preferences": prefs,
                    "expected_preference": prefs[0]['pref_id'],  # First one as expected
                }
        return None

    def generate_scenarios(self, num_per_conflict_type: int = 3) -> list:
        """Generate conflict scenarios based on the profile's preferences.

        For every conflict group covered by >= 2 of the user's preferences,
        samples up to num_per_conflict_type templates and turns each into a
        ConflictScenario.
        """
        scenarios = []

        for conflict_group, templates in CONFLICT_TEMPLATES.items():
            # Check if this conflict group exists in the user's preferences.
            relevant_prefs = [
                p for p in self.profile['preferences']
                if p.get('conflict_group') == conflict_group
            ]

            if len(relevant_prefs) < 2:
                continue  # Need at least 2 prefs to have a conflict

            # Sample without replacement, capped by template availability.
            selected_templates = self.random.sample(
                templates,
                min(num_per_conflict_type, len(templates))
            )

            for i, template in enumerate(selected_templates):
                scenario = self._create_scenario(
                    conflict_group, template, relevant_prefs, i
                )
                if scenario:
                    scenarios.append(scenario)

        return scenarios

    def _create_scenario(
        self,
        conflict_group: str,
        template: dict,
        relevant_prefs: list,
        index: int
    ) -> "ConflictScenario":
        """Create a ConflictScenario from a template.

        The correct preference is chosen by matching the template query's
        context cues against each preference (see
        _determine_correct_preference); all other triggered preferences
        become the "wrong" set.
        """
        query = template['query']
        correct_pref = self._determine_correct_preference(query, relevant_prefs)
        wrong_prefs = [p for p in relevant_prefs if p['pref_id'] != correct_pref['pref_id']]

        return ConflictScenario(
            scenario_id=f"{conflict_group}_{index:03d}",
            conflict_group=conflict_group,
            query=query,
            context_cues=template.get('context_cues', []),
            triggered_prefs=[p['pref_id'] for p in relevant_prefs],
            correct_pref_id=correct_pref['pref_id'],
            wrong_pref_ids=[p['pref_id'] for p in wrong_prefs],
            why_correct=template.get('correct_for', ''),
            expected_rag_behavior=template.get('why_rag_wins', ''),
            expected_context_failure=template.get('why_context_fails', '')
        )

    def _determine_correct_preference(self, query: str, prefs: list) -> dict:
        """
        Determine which preference is correct for a query.

        Scores each preference by keyword matching: +1 per
        'priority_context' keyword found in the query, +2 when the
        preference's 'condition' appears in the query. Returns the
        highest-scoring preference (first one on a tie).
        """
        query_lower = query.lower()
        scores = []

        for pref in prefs:
            score = 0
            for keyword in pref.get('priority_context', []):
                if keyword.lower() in query_lower:
                    score += 1
            # Bonus for condition match. Guard against the empty default:
            # '' is a substring of every string, so without the truthiness
            # check every preference LACKING a condition would get the bonus.
            condition = pref.get('condition', '').lower()
            if condition and condition in query_lower:
                score += 2
            scores.append((pref, score))

        # Return the highest-scoring preference.
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[0][0] if scores else prefs[0]
+
+
def generate_conflict_enriched_dataset(
    profiles_path: str,
    output_path: str,
    scenarios_per_conflict: int = 3,
    seed: int = 42
):
    """
    Generate a dataset where every query triggers at least one conflict.

    Reads one user profile per JSONL line from profiles_path, generates
    conflict scenarios per profile, writes them as JSONL to output_path,
    prints a coverage summary, and returns the scenario dicts.
    """
    # One profile per JSONL line.
    with open(profiles_path) as f:
        profiles = [json.loads(line) for line in f]

    all_scenarios = []
    conflict_coverage = {}

    for profile in profiles:
        generator = ConflictScenarioGenerator(profile, seed)
        for scenario in generator.generate_scenarios(scenarios_per_conflict):
            all_scenarios.append({
                'user_id': profile['user_id'],
                'scenario_id': scenario.scenario_id,
                'conflict_group': scenario.conflict_group,
                'query': scenario.query,
                'context_cues': scenario.context_cues,
                'triggered_prefs': scenario.triggered_prefs,
                'correct_pref_id': scenario.correct_pref_id,
                'wrong_pref_ids': scenario.wrong_pref_ids,
                'why_correct': scenario.why_correct,
                'expected_rag_behavior': scenario.expected_rag_behavior,
                'expected_context_failure': scenario.expected_context_failure
            })
            # Track how often each conflict type is exercised.
            group = scenario.conflict_group
            conflict_coverage[group] = conflict_coverage.get(group, 0) + 1

    # Persist as JSONL, creating parent directories on demand.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        for record in all_scenarios:
            f.write(json.dumps(record) + '\n')

    print(f"Generated {len(all_scenarios)} conflict scenarios")
    print(f"Coverage by conflict type:")
    for group, count in sorted(conflict_coverage.items()):
        print(f"  {group}: {count}")

    return all_scenarios
+
+
def create_evaluation_harness(scenarios: list) -> dict:
    """
    Create an evaluation harness that programmatically checks
    if the correct preference was applied.

    Groups scenarios by conflict type and attaches the per-type
    evaluation callables.
    """
    # Group scenarios by their conflict type.
    by_conflict_type = {}
    for scenario in scenarios:
        by_conflict_type.setdefault(scenario['conflict_group'], []).append(scenario)

    return {
        "total_scenarios": len(scenarios),
        "by_conflict_type": by_conflict_type,
        # Evaluation callables keyed by the aspect they check.
        "evaluation_functions": {
            "format_structure": check_format_structure,
            "verbosity": check_verbosity,
            "naming_convention": check_naming_convention,
            "answer_position": check_answer_position,
            # ... more evaluators
        },
    }
+
+
+# ============================================================================
+# Evaluation Functions (check if correct preference was applied)
+# ============================================================================
+
def check_format_structure(response: str, correct_pref: dict) -> bool:
    """Check if response uses the correct format (bullets vs numbered).

    Heuristic: list markers are only recognized at the start of a line,
    so prose hyphens ("well-known"), emphasis asterisks, and decimals
    like "3.14" are not mistaken for list items (the previous substring
    check misfired on all of those).

    Returns True when the response matches the preference's format, or
    when the preference's 'action' names neither format (can't determine).
    """
    import re

    # Bullet marker (•, -, *) or numbered marker (1. / 1)) at line start.
    has_bullets = bool(re.search(r'(?m)^\s*[•\-\*]\s+', response))
    has_numbers = bool(re.search(r'(?m)^\s*\d+[.)]\s+', response))

    action = correct_pref.get('action', '').lower()
    if 'bullet' in action:
        return has_bullets and not has_numbers
    elif 'numbered' in action:
        return has_numbers
    return True  # Can't determine
+
+
def check_verbosity(response: str, correct_pref: dict) -> bool:
    """Check if response matches the verbosity preference.

    Uses rough word-count thresholds: under 100 words counts as concise,
    over 150 as detailed. Returns True when the preference's 'action'
    mentions neither (can't determine).
    """
    word_count = len(response.split())
    action = correct_pref.get('action', '').lower()

    if 'concise' in action or '3 sentences' in action:
        return word_count < 100  # Rough threshold
    if 'detailed' in action:
        return word_count > 150
    return True
+
+
def check_naming_convention(response: str, correct_pref: dict) -> bool:
    """Check if code uses the naming convention the preference asks for.

    Heuristic regex scan of the whole response: word_word counts as
    snake_case, wordWord as camelCase. Returns True when the
    preference's 'action' names neither convention (can't determine).

    Fix: the camelCase branch previously tested for the literal
    'camelCase' inside a lowercased string, which can never match, so
    camelCase preferences always passed unchecked.
    """
    import re

    action = correct_pref.get('action', '').lower()
    has_snake = bool(re.search(r'[a-z]+_[a-z]+', response))
    has_camel = bool(re.search(r'[a-z]+[A-Z][a-z]+', response))

    if 'snake_case' in action:
        # Should have underscores and no camelCase identifiers.
        return has_snake and not has_camel
    elif 'camelcase' in action:  # compare lowercased token to lowercased action
        return has_camel
    return True
+
+
def check_answer_position(response: str, correct_pref: dict) -> bool:
    """Check if the answer comes first or the explanation builds up.

    Simplified heuristic: only the opening sentence (or first 100 chars
    when there is no period) is inspected for direct-answer vs build-up
    marker phrases.
    """
    if '.' in response:
        opening = response.split('.')[0]
    else:
        opening = response[:100]
    opening = opening.lower()
    action = correct_pref.get('action', '').lower()

    if 'answer first' in action:
        # A direct opening states the answer outright.
        direct_markers = ('is', 'are', 'the answer', 'yes', 'no', "it's")
        for marker in direct_markers:
            if marker in opening:
                return True
        return False
    if 'build up' in action:
        # An explanatory opening eases the reader in.
        buildup_markers = ("let's", 'first', 'to understand', 'consider')
        for marker in buildup_markers:
            if marker in opening:
                return True
        return False
    return True
+
+
+# ============================================================================
+# Main
+# ============================================================================
+
if __name__ == "__main__":
    import argparse

    # CLI entry point: generate the conflict-scenario dataset.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--profiles", default="collaborativeagents/data/complex_profiles/profiles.jsonl")
    arg_parser.add_argument("--output", default="collaborativeagents/data/conflict_scenarios.jsonl")
    arg_parser.add_argument("--scenarios_per_conflict", type=int, default=3)
    arg_parser.add_argument("--seed", type=int, default=42)
    cli_args = arg_parser.parse_args()

    generate_conflict_enriched_dataset(
        cli_args.profiles,
        cli_args.output,
        cli_args.scenarios_per_conflict,
        cli_args.seed,
    )