summaryrefslogtreecommitdiff
path: root/collaborativeagents/scripts/conflict_scenario_generator.py
diff options
context:
space:
mode:
Diffstat (limited to 'collaborativeagents/scripts/conflict_scenario_generator.py')
-rw-r--r--collaborativeagents/scripts/conflict_scenario_generator.py637
1 files changed, 637 insertions, 0 deletions
diff --git a/collaborativeagents/scripts/conflict_scenario_generator.py b/collaborativeagents/scripts/conflict_scenario_generator.py
new file mode 100644
index 0000000..9d00de8
--- /dev/null
+++ b/collaborativeagents/scripts/conflict_scenario_generator.py
@@ -0,0 +1,637 @@
+"""
+Conflict Scenario Generator
+
+Generates queries that deliberately trigger preference conflicts.
+The key insight: RAG naturally resolves conflicts by retrieving ONLY
+the relevant preference, while context-based methods see ALL preferences
+and get confused.
+
+Design principles:
+1. Every test query should trigger 2+ conflicting preferences
+2. Only ONE preference is correct given the full context
+3. RAG retrieves the correct one (high similarity to query)
+4. Context methods see both and often pick wrong one or try to satisfy both
+"""
+
+import json
+import random
+from dataclasses import dataclass, field
+from typing import Optional
+from pathlib import Path
+
+
+# ============================================================================
+# Conflict Templates
+# ============================================================================
+
@dataclass
class ConflictScenario:
    """A scenario that triggers a preference conflict.

    Bundles a query with the set of preferences it could trigger, the
    single ground-truth preference that SHOULD win, and the expected
    behavior of RAG vs. context-based methods for later evaluation.
    """
    scenario_id: str  # Unique id, formatted "<conflict_group>_<index>"
    conflict_group: str  # Key into CONFLICT_TEMPLATES
    query: str  # The user query that triggers the conflict
    context_cues: list[str]  # What makes the correct preference clear
    triggered_prefs: list[str]  # Preference IDs that could apply
    correct_pref_id: str  # The one that SHOULD apply
    wrong_pref_ids: list[str]  # The ones that should NOT apply
    why_correct: str  # Explanation for ground truth
    expected_rag_behavior: str  # What RAG should do
    expected_context_failure: str  # How context methods fail
+
+
# Core conflict scenarios - each designed to fail context methods.
# Maps conflict_group name -> list of template dicts. Each template carries:
#   query             - user query that triggers the conflict
#   context_cues      - cues in the query that disambiguate the correct pref
#   correct_for       - ground-truth note on which preference should win
#   why_context_fails - expected failure mode of context-based methods
#   why_rag_wins      - expected behavior of the RAG baseline
CONFLICT_TEMPLATES = {
    # =========================================================================
    # FORMAT CONFLICTS
    # =========================================================================
    "format_bullets_vs_numbered": [
        {
            "query": "What are the steps to deploy a Docker container? Also list the common mistakes to avoid.",
            "context_cues": ["steps to deploy = procedure", "list mistakes = enumeration"],
            "correct_for": "both apply to different parts",
            "why_context_fails": "Context sees both prefs, might use one format for everything",
            "why_rag_wins": "RAG retrieves procedure-pref for deploy part, list-pref for mistakes part"
        },
        {
            "query": "Walk me through setting up CI/CD - what tools should I consider?",
            "context_cues": ["walk through = sequential", "consider = options"],
            "correct_for": "numbered for walkthrough, bullets for tools",
            "why_context_fails": "Mixes formats inconsistently",
            "why_rag_wins": "Retrieves appropriate format preference per section"
        },
        {
            "query": "How do I configure nginx? Give me the key parameters.",
            "context_cues": ["how do I = procedure", "key parameters = list"],
            "correct_for": "numbered steps + bulleted parameters",
            "why_context_fails": "Context methods apply one format to all",
            "why_rag_wins": "Separate retrieval for procedure vs enumeration context"
        }
    ],

    "format_answer_first_vs_buildup": [
        {
            "query": "What's the time complexity of quicksort and why?",
            "context_cues": ["what's = direct question", "why = needs explanation"],
            "correct_for": "answer first (O(n log n)), then explain why",
            "why_context_fails": "Either gives answer without why, or long buildup first",
            "why_rag_wins": "Retrieves 'answer first' for 'what's', builds explanation for 'why'"
        },
        {
            "query": "Explain how neural networks learn - what's backpropagation?",
            "context_cues": ["explain how = learning", "what's = definition needed"],
            "correct_for": "build up intuition for 'how', then define backprop",
            "why_context_fails": "Starts with backprop definition (answer first) losing context",
            "why_rag_wins": "Identifies learning intent first, answer-seeking second"
        }
    ],

    # =========================================================================
    # VERBOSITY CONFLICTS
    # =========================================================================
    "verbosity_concise_vs_detailed": [
        {
            "query": "Quick question - how does the GIL work in Python?",
            "context_cues": ["quick question = brevity cue", "GIL = complex topic"],
            "correct_for": "concise (user said quick)",
            "why_context_fails": "Sees 'complex topic' pref, gives long explanation",
            "why_rag_wins": "Explicit brevity cue has higher retrieval score"
        },
        {
            "query": "Briefly explain the proof of the halting problem.",
            "context_cues": ["briefly = brevity", "proof = normally detailed"],
            "correct_for": "concise - user explicitly asked for brief",
            "why_context_fails": "Proof preference triggers long format",
            "why_rag_wins": "'Briefly' in query matches concise preference strongly"
        },
        {
            "query": "TL;DR on microservices vs monolith for a startup?",
            "context_cues": ["TL;DR = max brevity", "comparison = could be detailed"],
            "correct_for": "ultra-concise comparison",
            "why_context_fails": "Comparison pref might trigger table/detailed analysis",
            "why_rag_wins": "TL;DR keyword retrieves brevity preference"
        },
        {
            "query": "In detail, what's 2+2?",
            "context_cues": ["in detail = verbosity cue", "2+2 = trivial"],
            "correct_for": "brief (topic too simple for detail)",
            "why_context_fails": "Might over-explain simple arithmetic",
            "why_rag_wins": "Query simplicity context overrides detail cue"
        }
    ],

    # =========================================================================
    # CODE STYLE CONFLICTS
    # =========================================================================
    "code_naming_convention": [
        {
            "query": "Write a function to parse JSON, show it in Python and JavaScript.",
            "context_cues": ["Python = snake_case", "JavaScript = camelCase"],
            "correct_for": "snake_case for Python version, camelCase for JS version",
            "why_context_fails": "Picks one convention for both, or inconsistent",
            "why_rag_wins": "Language detection triggers correct convention per block"
        },
        {
            "query": "Convert this Python script to TypeScript: def get_user_data(): ...",
            "context_cues": ["Python source = snake_case", "TypeScript target = camelCase"],
            "correct_for": "convert snake_case to camelCase in TypeScript output",
            "why_context_fails": "Might keep snake_case in TypeScript",
            "why_rag_wins": "Output language triggers appropriate convention"
        },
        {
            "query": "Write SQL to join users and orders, then show Python code to run it.",
            "context_cues": ["SQL = UPPERCASE keywords", "Python = snake_case"],
            "correct_for": "SQL: SELECT, FROM; Python: result_set, fetch_data",
            "why_context_fails": "Style bleeds across languages",
            "why_rag_wins": "Separate retrieval for each language context"
        }
    ],

    "code_comment_style": [
        {
            "query": "Here's a 5-line utility function, explain what each part does.",
            "context_cues": ["5-line = short", "explain each part = inline comments"],
            "correct_for": "inline comments for each line",
            "why_context_fails": "Might use docstring style for short code",
            "why_rag_wins": "Short code + explanation request = inline comments"
        },
        {
            "query": "Write a complete data processing class with documentation.",
            "context_cues": ["complete class = production code", "documentation = docstrings"],
            "correct_for": "docstrings at class/method level, minimal inline",
            "why_context_fails": "Over-comments with inline explanations",
            "why_rag_wins": "Class + documentation context triggers docstring pref"
        }
    ],

    "code_review_scope": [
        {
            "query": "Review this code for bugs, I need to ship it today.",
            "context_cues": ["review = code review", "ship today = urgent, bugs only"],
            "correct_for": "bugs only, skip style",
            "why_context_fails": "Still comments on style issues",
            "why_rag_wins": "Urgency cue + 'bugs' retrieves bugs-only preference"
        },
        {
            "query": "Look at my code and help me improve it for the codebase.",
            "context_cues": ["improve = refactor scope", "for codebase = style matters"],
            "correct_for": "both logic and style suggestions",
            "why_context_fails": "Might only focus on bugs",
            "why_rag_wins": "'Improve' and 'codebase' retrieve full-review pref"
        }
    ],

    # =========================================================================
    # INTERACTION CONFLICTS
    # =========================================================================
    "interaction_autonomy": [
        {
            "query": "Refactor the authentication module.",
            "context_cues": ["refactor = significant change", "no specific instruction"],
            "correct_for": "confirm approach first",
            "why_context_fails": "Might just start refactoring without plan",
            "why_rag_wins": "Ambiguous scope triggers confirmation pref"
        },
        {
            "query": "Change the variable name from 'x' to 'count' in line 5.",
            "context_cues": ["specific instruction", "single change"],
            "correct_for": "execute directly, no confirmation needed",
            "why_context_fails": "Might still ask for confirmation",
            "why_rag_wins": "Specific instruction retrieves execute-directly pref"
        },
        {
            "query": "Update the database schema to add user preferences - it's complex.",
            "context_cues": ["update schema = significant", "complex = acknowledged"],
            "correct_for": "definitely confirm - user said it's complex",
            "why_context_fails": "Might dive in because 'update' sounds actionable",
            "why_rag_wins": "'Complex' keyword strongly triggers confirmation"
        }
    ],

    "interaction_guidance": [
        {
            "query": "Should I use Redis or Memcached for caching?",
            "context_cues": ["should I = asking for recommendation", "or = comparison"],
            "correct_for": "give recommendation with rationale",
            "why_context_fails": "Gives neutral pros/cons without recommendation",
            "why_rag_wins": "'Should I' retrieves recommendation preference"
        },
        {
            "query": "Compare React, Vue, and Angular for my project.",
            "context_cues": ["compare = explicit comparison", "my project = context needed"],
            "correct_for": "table format with tradeoffs",
            "why_context_fails": "Might just recommend one or give long prose",
            "why_rag_wins": "'Compare' retrieves comparison-table preference"
        }
    ],

    # =========================================================================
    # MATH/EXPLANATION CONFLICTS
    # =========================================================================
    "math_detail_level": [
        {
            "query": "What's the derivative of x^2? I'm preparing for an exam.",
            "context_cues": ["what's = direct ask", "exam prep = practice context"],
            "correct_for": "show steps + give practice problem",
            "why_context_fails": "Just gives answer (2x) without exam context",
            "why_rag_wins": "'Exam' retrieves practice-problem preference"
        },
        {
            "query": "Verify my answer: integral of sin(x) = -cos(x) + C. Is this right?",
            "context_cues": ["verify = checking work", "is this right = confirmation"],
            "correct_for": "check step by step, confirm or point out issue",
            "why_context_fails": "Might re-derive from scratch",
            "why_rag_wins": "'Verify' retrieves check-their-work preference"
        }
    ],

    "math_approach": [
        {
            "query": "What's the probability of rolling two sixes?",
            "context_cues": ["probability = statistics", "rolling dice = intuitive example"],
            "correct_for": "intuition first (1 in 36), then formula",
            "why_context_fails": "Starts with P(A∩B) = P(A)P(B) formula",
            "why_rag_wins": "Statistics topic retrieves intuition-first preference"
        },
        {
            "query": "Prove that the sum of angles in a triangle is 180°.",
            "context_cues": ["prove = formal proof", "geometry = visual possible"],
            "correct_for": "structured proof format per preference",
            "why_context_fails": "Might give intuitive explanation instead of proof",
            "why_rag_wins": "'Prove' retrieves proof-format preference"
        }
    ],

    # =========================================================================
    # DOMAIN CONFLICTS
    # =========================================================================
    "domain_example_position": [
        {
            "query": "How do I use the requests library in Python?",
            "context_cues": ["how do I use = practical/API", "library = code example helpful"],
            "correct_for": "minimal example first, then explain parameters",
            "why_context_fails": "Explains parameters first, example last",
            "why_rag_wins": "API/library context retrieves example-first preference"
        },
        {
            "query": "What is dynamic programming?",
            "context_cues": ["what is = concept/theory", "definition needed"],
            "correct_for": "definition first, then example, then edge cases",
            "why_context_fails": "Might lead with example (Fibonacci)",
            "why_rag_wins": "Theory context retrieves definition-first preference"
        }
    ],

    # =========================================================================
    # OUTPUT ARTIFACT CONFLICTS
    # =========================================================================
    "output_code_presentation": [
        {
            "query": "Give me a sorting function I can use, I'm in a hurry.",
            "context_cues": ["give me = copyable", "in a hurry = no explanation"],
            "correct_for": "single code block, no prose",
            "why_context_fails": "Adds explanatory prose between code",
            "why_rag_wins": "'Give me' + 'hurry' retrieves copy-paste preference"
        },
        {
            "query": "Teach me how to implement quicksort step by step.",
            "context_cues": ["teach me = learning", "step by step = chunked"],
            "correct_for": "code in small chunks with explanation between",
            "why_context_fails": "Gives full implementation at once",
            "why_rag_wins": "'Teach' + 'step by step' retrieves chunked preference"
        }
    ],

    # =========================================================================
    # CORRECTION STYLE CONFLICTS
    # =========================================================================
    "correction_severity": [
        {
            "query": "I'm using a hashmap to store my data, is this right?",
            "context_cues": ["hashmap = might mean dict/map", "is this right = validation"],
            "correct_for": "gentle inline (hashmap is fine, also called dict)",
            "why_context_fails": "Might pedantically correct terminology",
            "why_rag_wins": "Minor terminology + validation retrieves gentle-correction pref"
        },
        {
            "query": "I think recursion is just loops with extra steps, right?",
            "context_cues": ["fundamental misconception", "asking for validation"],
            "correct_for": "directly address misconception before proceeding",
            "why_context_fails": "Might gloss over and just show recursion",
            "why_rag_wins": "Fundamental error retrieves explicit-correction preference"
        }
    ],

    # =========================================================================
    # MULTI-DOMAIN CONFLICTS (hardest!)
    # =========================================================================
    "multi_domain_complex": [
        {
            "query": "Quick question - walk me through implementing a binary tree in Python with proper documentation.",
            "context_cues": ["quick = brief", "walk through = detailed", "documentation = thorough"],
            "correct_for": "quick wins (explicit), but include docstrings (documentation ask)",
            "why_context_fails": "Confused by conflicting signals, inconsistent response",
            "why_rag_wins": "Explicit brevity cue retrieved, documentation pref adds docstrings"
        },
        {
            "query": "I'm debugging my ML model and it's not converging. This is frustrating! Compare Adam vs SGD for me.",
            "context_cues": ["debugging = focus on issue", "frustrating = emotional", "compare = table"],
            "correct_for": "acknowledge frustration, then comparison table for optimizers",
            "why_context_fails": "Might skip emotional acknowledgment or wrong format",
            "why_rag_wins": "Frustration pref + comparison pref both retrieved, applied in order"
        },
        {
            "query": "Review this Python code and convert it to JavaScript. Focus on bugs first.",
            "context_cues": ["review = bugs per 'focus' cue", "convert = language change"],
            "correct_for": "Python review (bugs only) + JS conversion (camelCase)",
            "why_context_fails": "Applies wrong scope or wrong naming convention",
            "why_rag_wins": "Multiple relevant prefs retrieved per task segment"
        }
    ]
}
+
+
+# ============================================================================
+# Scenario Generator
+# ============================================================================
+
class ConflictScenarioGenerator:
    """Generates conflict scenarios from templates and user profiles."""

    def __init__(self, profile: Optional[dict] = None, seed: int = 42):
        """
        Args:
            profile: User profile dict with a 'preferences' list; each
                preference carries a 'pref_id'. May be None when only
                generate_for_profile() is used.
            seed: Seed for the private RNG so sampling is reproducible.
        """
        self.profile = profile
        # Index preferences by id for O(1) lookup; empty without a profile.
        self.preferences = {p['pref_id']: p for p in profile['preferences']} if profile else {}
        self.random = random.Random(seed)

    def generate_for_profile(self, preferences: list, domain: str = None) -> Optional[dict]:
        """Generate a single conflict scenario for given preferences and domain.

        Args:
            preferences: Preference dicts, optionally tagged with 'conflict_group'.
            domain: Currently unused; kept for interface stability.

        Returns:
            A scenario dict for the first conflict group that has >= 2
            preferences and a matching template, or None when no such
            group exists.
        """
        # Bucket preferences by their conflict group; untagged prefs are skipped.
        conflict_groups = {}
        for pref in preferences:
            cg = pref.get('conflict_group')
            if cg:
                conflict_groups.setdefault(cg, []).append(pref)

        # A conflict needs at least 2 competing preferences and a known template.
        for cg, prefs in conflict_groups.items():
            if len(prefs) >= 2 and cg in CONFLICT_TEMPLATES:
                template = self.random.choice(CONFLICT_TEMPLATES[cg])
                return {
                    "query": template['query'],
                    "conflict_group": cg,
                    "preferences": prefs,
                    "expected_preference": prefs[0]['pref_id'],  # First one as expected
                }
        return None

    def generate_scenarios(self, num_per_conflict_type: int = 3) -> list:
        """Generate conflict scenarios based on the profile's preferences.

        For every conflict group covered by >= 2 of the user's preferences,
        samples up to num_per_conflict_type templates and turns each into a
        ConflictScenario.
        """
        scenarios = []

        for conflict_group, templates in CONFLICT_TEMPLATES.items():
            # Check if this conflict group exists in the user's preferences.
            relevant_prefs = [
                p for p in self.profile['preferences']
                if p.get('conflict_group') == conflict_group
            ]

            if len(relevant_prefs) < 2:
                continue  # Need at least 2 prefs to have a conflict

            # Sample without replacement, capped by template availability.
            selected_templates = self.random.sample(
                templates,
                min(num_per_conflict_type, len(templates))
            )

            for i, template in enumerate(selected_templates):
                scenario = self._create_scenario(
                    conflict_group, template, relevant_prefs, i
                )
                if scenario:
                    scenarios.append(scenario)

        return scenarios

    def _create_scenario(
        self,
        conflict_group: str,
        template: dict,
        relevant_prefs: list,
        index: int
    ) -> "ConflictScenario":
        """Create a ConflictScenario from a template.

        The correct preference is chosen by matching the template query's
        context cues against each preference (see
        _determine_correct_preference); all other triggered preferences
        become the "wrong" set.
        """
        query = template['query']
        correct_pref = self._determine_correct_preference(query, relevant_prefs)
        wrong_prefs = [p for p in relevant_prefs if p['pref_id'] != correct_pref['pref_id']]

        return ConflictScenario(
            scenario_id=f"{conflict_group}_{index:03d}",
            conflict_group=conflict_group,
            query=query,
            context_cues=template.get('context_cues', []),
            triggered_prefs=[p['pref_id'] for p in relevant_prefs],
            correct_pref_id=correct_pref['pref_id'],
            wrong_pref_ids=[p['pref_id'] for p in wrong_prefs],
            why_correct=template.get('correct_for', ''),
            expected_rag_behavior=template.get('why_rag_wins', ''),
            expected_context_failure=template.get('why_context_fails', '')
        )

    def _determine_correct_preference(self, query: str, prefs: list) -> dict:
        """
        Determine which preference is correct for a query.

        Scores each preference by keyword matching: +1 per
        'priority_context' keyword found in the query, +2 when the
        preference's 'condition' appears in the query. Returns the
        highest-scoring preference (first one on a tie).
        """
        query_lower = query.lower()
        scores = []

        for pref in prefs:
            score = 0
            for keyword in pref.get('priority_context', []):
                if keyword.lower() in query_lower:
                    score += 1
            # Bonus for condition match. Guard against the empty default:
            # '' is a substring of every string, so without the truthiness
            # check every preference LACKING a condition would get the bonus.
            condition = pref.get('condition', '').lower()
            if condition and condition in query_lower:
                score += 2
            scores.append((pref, score))

        # Return the highest-scoring preference.
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[0][0] if scores else prefs[0]
+
+
def generate_conflict_enriched_dataset(
    profiles_path: str,
    output_path: str,
    scenarios_per_conflict: int = 3,
    seed: int = 42
):
    """
    Generate a dataset where every query triggers at least one conflict.

    Reads one user profile per JSONL line from profiles_path, generates
    conflict scenarios per profile, writes them as JSONL to output_path,
    prints a coverage summary, and returns the scenario dicts.
    """
    # One profile per JSONL line.
    with open(profiles_path) as f:
        profiles = [json.loads(line) for line in f]

    all_scenarios = []
    conflict_coverage = {}

    for profile in profiles:
        generator = ConflictScenarioGenerator(profile, seed)
        for scenario in generator.generate_scenarios(scenarios_per_conflict):
            all_scenarios.append({
                'user_id': profile['user_id'],
                'scenario_id': scenario.scenario_id,
                'conflict_group': scenario.conflict_group,
                'query': scenario.query,
                'context_cues': scenario.context_cues,
                'triggered_prefs': scenario.triggered_prefs,
                'correct_pref_id': scenario.correct_pref_id,
                'wrong_pref_ids': scenario.wrong_pref_ids,
                'why_correct': scenario.why_correct,
                'expected_rag_behavior': scenario.expected_rag_behavior,
                'expected_context_failure': scenario.expected_context_failure
            })
            # Track how often each conflict type is exercised.
            group = scenario.conflict_group
            conflict_coverage[group] = conflict_coverage.get(group, 0) + 1

    # Persist as JSONL, creating parent directories on demand.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        for record in all_scenarios:
            f.write(json.dumps(record) + '\n')

    print(f"Generated {len(all_scenarios)} conflict scenarios")
    print(f"Coverage by conflict type:")
    for group, count in sorted(conflict_coverage.items()):
        print(f"  {group}: {count}")

    return all_scenarios
+
+
def create_evaluation_harness(scenarios: list) -> dict:
    """
    Create an evaluation harness that programmatically checks
    if the correct preference was applied.

    Groups scenarios by conflict type and attaches the per-type
    evaluation callables.
    """
    # Group scenarios by their conflict type.
    by_conflict_type = {}
    for scenario in scenarios:
        by_conflict_type.setdefault(scenario['conflict_group'], []).append(scenario)

    return {
        "total_scenarios": len(scenarios),
        "by_conflict_type": by_conflict_type,
        # Evaluation callables keyed by the aspect they check.
        "evaluation_functions": {
            "format_structure": check_format_structure,
            "verbosity": check_verbosity,
            "naming_convention": check_naming_convention,
            "answer_position": check_answer_position,
            # ... more evaluators
        },
    }
+
+
+# ============================================================================
+# Evaluation Functions (check if correct preference was applied)
+# ============================================================================
+
def check_format_structure(response: str, correct_pref: dict) -> bool:
    """Check if response uses the correct format (bullets vs numbered).

    Heuristic: list markers are only recognized at the start of a line,
    so prose hyphens ("well-known"), emphasis asterisks, and decimals
    like "3.14" are not mistaken for list items (the previous substring
    check misfired on all of those).

    Returns True when the response matches the preference's format, or
    when the preference's 'action' names neither format (can't determine).
    """
    import re

    # Bullet marker (•, -, *) or numbered marker (1. / 1)) at line start.
    has_bullets = bool(re.search(r'(?m)^\s*[•\-\*]\s+', response))
    has_numbers = bool(re.search(r'(?m)^\s*\d+[.)]\s+', response))

    action = correct_pref.get('action', '').lower()
    if 'bullet' in action:
        return has_bullets and not has_numbers
    elif 'numbered' in action:
        return has_numbers
    return True  # Can't determine
+
+
def check_verbosity(response: str, correct_pref: dict) -> bool:
    """Check if response matches the verbosity preference.

    Uses rough word-count thresholds: under 100 words counts as concise,
    over 150 as detailed. Returns True when the preference's 'action'
    mentions neither (can't determine).
    """
    word_count = len(response.split())
    action = correct_pref.get('action', '').lower()

    if 'concise' in action or '3 sentences' in action:
        return word_count < 100  # Rough threshold
    if 'detailed' in action:
        return word_count > 150
    return True
+
+
def check_naming_convention(response: str, correct_pref: dict) -> bool:
    """Check if code uses the naming convention the preference asks for.

    Heuristic regex scan of the whole response: word_word counts as
    snake_case, wordWord as camelCase. Returns True when the
    preference's 'action' names neither convention (can't determine).

    Fix: the camelCase branch previously tested for the literal
    'camelCase' inside a lowercased string, which can never match, so
    camelCase preferences always passed unchecked.
    """
    import re

    action = correct_pref.get('action', '').lower()
    has_snake = bool(re.search(r'[a-z]+_[a-z]+', response))
    has_camel = bool(re.search(r'[a-z]+[A-Z][a-z]+', response))

    if 'snake_case' in action:
        # Should have underscores and no camelCase identifiers.
        return has_snake and not has_camel
    elif 'camelcase' in action:  # compare lowercased token to lowercased action
        return has_camel
    return True
+
+
def check_answer_position(response: str, correct_pref: dict) -> bool:
    """Check if the answer comes first or the explanation builds up.

    Simplified heuristic: only the opening sentence (or first 100 chars
    when there is no period) is inspected for direct-answer vs build-up
    marker phrases.
    """
    if '.' in response:
        opening = response.split('.')[0]
    else:
        opening = response[:100]
    opening = opening.lower()
    action = correct_pref.get('action', '').lower()

    if 'answer first' in action:
        # A direct opening states the answer outright.
        direct_markers = ('is', 'are', 'the answer', 'yes', 'no', "it's")
        for marker in direct_markers:
            if marker in opening:
                return True
        return False
    if 'build up' in action:
        # An explanatory opening eases the reader in.
        buildup_markers = ("let's", 'first', 'to understand', 'consider')
        for marker in buildup_markers:
            if marker in opening:
                return True
        return False
    return True
+
+
+# ============================================================================
+# Main
+# ============================================================================
+
if __name__ == "__main__":
    import argparse

    # CLI entry point: generate the conflict-scenario dataset.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--profiles", default="collaborativeagents/data/complex_profiles/profiles.jsonl")
    arg_parser.add_argument("--output", default="collaborativeagents/data/conflict_scenarios.jsonl")
    arg_parser.add_argument("--scenarios_per_conflict", type=int, default=3)
    arg_parser.add_argument("--seed", type=int, default=42)
    cli_args = arg_parser.parse_args()

    generate_conflict_enriched_dataset(
        cli_args.profiles,
        cli_args.output,
        cli_args.scenarios_per_conflict,
        cli_args.seed,
    )