""" Conflict Scenario Generator Generates queries that deliberately trigger preference conflicts. The key insight: RAG naturally resolves conflicts by retrieving ONLY the relevant preference, while context-based methods see ALL preferences and get confused. Design principles: 1. Every test query should trigger 2+ conflicting preferences 2. Only ONE preference is correct given the full context 3. RAG retrieves the correct one (high similarity to query) 4. Context methods see both and often pick wrong one or try to satisfy both """ import json import random from dataclasses import dataclass, field from typing import Optional from pathlib import Path # ============================================================================ # Conflict Templates # ============================================================================ @dataclass class ConflictScenario: """A scenario that triggers a preference conflict.""" scenario_id: str conflict_group: str query: str context_cues: list # What makes the correct preference clear triggered_prefs: list # Preference IDs that could apply correct_pref_id: str # The one that SHOULD apply wrong_pref_ids: list # The ones that should NOT apply why_correct: str # Explanation for ground truth expected_rag_behavior: str # What RAG should do expected_context_failure: str # How context methods fail # Core conflict scenarios - each designed to fail context methods CONFLICT_TEMPLATES = { # ========================================================================= # FORMAT CONFLICTS # ========================================================================= "format_bullets_vs_numbered": [ { "query": "What are the steps to deploy a Docker container? Also list the common mistakes to avoid.", "context_cues": ["steps to deploy = procedure", "list mistakes = enumeration"], "correct_for": "both apply to different parts", "why_context_fails": "Context sees both prefs, might use one format for everything", "why_rag_wins": "RAG retrieves procedure-pref for deploy part, list-pref for mistakes part" }, { "query": "Walk me through setting up CI/CD - what tools should I consider?", "context_cues": ["walk through = sequential", "consider = options"], "correct_for": "numbered for walkthrough, bullets for tools", "why_context_fails": "Mixes formats inconsistently", "why_rag_wins": "Retrieves appropriate format preference per section" }, { "query": "How do I configure nginx? Give me the key parameters.", "context_cues": ["how do I = procedure", "key parameters = list"], "correct_for": "numbered steps + bulleted parameters", "why_context_fails": "Context methods apply one format to all", "why_rag_wins": "Separate retrieval for procedure vs enumeration context" } ], "format_answer_first_vs_buildup": [ { "query": "What's the time complexity of quicksort and why?", "context_cues": ["what's = direct question", "why = needs explanation"], "correct_for": "answer first (O(n log n)), then explain why", "why_context_fails": "Either gives answer without why, or long buildup first", "why_rag_wins": "Retrieves 'answer first' for 'what's', builds explanation for 'why'" }, { "query": "Explain how neural networks learn - what's backpropagation?", "context_cues": ["explain how = learning", "what's = definition needed"], "correct_for": "build up intuition for 'how', then define backprop", "why_context_fails": "Starts with backprop definition (answer first) losing context", "why_rag_wins": "Identifies learning intent first, answer-seeking second" } ], # ========================================================================= # VERBOSITY CONFLICTS # ========================================================================= "verbosity_concise_vs_detailed": [ { "query": "Quick question - how does the GIL work in Python?", "context_cues": ["quick question = brevity cue", "GIL = complex topic"], "correct_for": "concise (user said quick)", "why_context_fails": "Sees 'complex topic' pref, gives long explanation", "why_rag_wins": "Explicit brevity cue has higher retrieval score" }, { "query": "Briefly explain the proof of the halting problem.", "context_cues": ["briefly = brevity", "proof = normally detailed"], "correct_for": "concise - user explicitly asked for brief", "why_context_fails": "Proof preference triggers long format", "why_rag_wins": "'Briefly' in query matches concise preference strongly" }, { "query": "TL;DR on microservices vs monolith for a startup?", "context_cues": ["TL;DR = max brevity", "comparison = could be detailed"], "correct_for": "ultra-concise comparison", "why_context_fails": "Comparison pref might trigger table/detailed analysis", "why_rag_wins": "TL;DR keyword retrieves brevity preference" }, { "query": "In detail, what's 2+2?", "context_cues": ["in detail = verbosity cue", "2+2 = trivial"], "correct_for": "brief (topic too simple for detail)", "why_context_fails": "Might over-explain simple arithmetic", "why_rag_wins": "Query simplicity context overrides detail cue" } ], # ========================================================================= # CODE STYLE CONFLICTS # ========================================================================= "code_naming_convention": [ { "query": "Write a function to parse JSON, show it in Python and JavaScript.", "context_cues": ["Python = snake_case", "JavaScript = camelCase"], "correct_for": "snake_case for Python version, camelCase for JS version", "why_context_fails": "Picks one convention for both, or inconsistent", "why_rag_wins": "Language detection triggers correct convention per block" }, { "query": "Convert this Python script to TypeScript: def get_user_data(): ...", "context_cues": ["Python source = snake_case", "TypeScript target = camelCase"], "correct_for": "convert snake_case to camelCase in TypeScript output", "why_context_fails": "Might keep snake_case in TypeScript", "why_rag_wins": "Output language triggers appropriate convention" }, { "query": "Write SQL to join users and orders, then show Python code to run it.", "context_cues": ["SQL = UPPERCASE keywords", "Python = snake_case"], "correct_for": "SQL: SELECT, FROM; Python: result_set, fetch_data", "why_context_fails": "Style bleeds across languages", "why_rag_wins": "Separate retrieval for each language context" } ], "code_comment_style": [ { "query": "Here's a 5-line utility function, explain what each part does.", "context_cues": ["5-line = short", "explain each part = inline comments"], "correct_for": "inline comments for each line", "why_context_fails": "Might use docstring style for short code", "why_rag_wins": "Short code + explanation request = inline comments" }, { "query": "Write a complete data processing class with documentation.", "context_cues": ["complete class = production code", "documentation = docstrings"], "correct_for": "docstrings at class/method level, minimal inline", "why_context_fails": "Over-comments with inline explanations", "why_rag_wins": "Class + documentation context triggers docstring pref" } ], "code_review_scope": [ { "query": "Review this code for bugs, I need to ship it today.", "context_cues": ["review = code review", "ship today = urgent, bugs only"], "correct_for": "bugs only, skip style", "why_context_fails": "Still comments on style issues", "why_rag_wins": "Urgency cue + 'bugs' retrieves bugs-only preference" }, { "query": "Look at my code and help me improve it for the codebase.", "context_cues": ["improve = refactor scope", "for codebase = style matters"], "correct_for": "both logic and style suggestions", "why_context_fails": "Might only focus on bugs", "why_rag_wins": "'Improve' and 'codebase' retrieve full-review pref" } ], # ========================================================================= # INTERACTION CONFLICTS # ========================================================================= "interaction_autonomy": [ { "query": "Refactor the authentication module.", "context_cues": ["refactor = significant change", "no specific instruction"], "correct_for": "confirm approach first", "why_context_fails": "Might just start refactoring without plan", "why_rag_wins": "Ambiguous scope triggers confirmation pref" }, { "query": "Change the variable name from 'x' to 'count' in line 5.", "context_cues": ["specific instruction", "single change"], "correct_for": "execute directly, no confirmation needed", "why_context_fails": "Might still ask for confirmation", "why_rag_wins": "Specific instruction retrieves execute-directly pref" }, { "query": "Update the database schema to add user preferences - it's complex.", "context_cues": ["update schema = significant", "complex = acknowledged"], "correct_for": "definitely confirm - user said it's complex", "why_context_fails": "Might dive in because 'update' sounds actionable", "why_rag_wins": "'Complex' keyword strongly triggers confirmation" } ], "interaction_guidance": [ { "query": "Should I use Redis or Memcached for caching?", "context_cues": ["should I = asking for recommendation", "or = comparison"], "correct_for": "give recommendation with rationale", "why_context_fails": "Gives neutral pros/cons without recommendation", "why_rag_wins": "'Should I' retrieves recommendation preference" }, { "query": "Compare React, Vue, and Angular for my project.", "context_cues": ["compare = explicit comparison", "my project = context needed"], "correct_for": "table format with tradeoffs", "why_context_fails": "Might just recommend one or give long prose", "why_rag_wins": "'Compare' retrieves comparison-table preference" } ], # ========================================================================= # MATH/EXPLANATION CONFLICTS # ========================================================================= "math_detail_level": [ { "query": "What's the derivative of x^2? I'm preparing for an exam.", "context_cues": ["what's = direct ask", "exam prep = practice context"], "correct_for": "show steps + give practice problem", "why_context_fails": "Just gives answer (2x) without exam context", "why_rag_wins": "'Exam' retrieves practice-problem preference" }, { "query": "Verify my answer: integral of sin(x) = -cos(x) + C. Is this right?", "context_cues": ["verify = checking work", "is this right = confirmation"], "correct_for": "check step by step, confirm or point out issue", "why_context_fails": "Might re-derive from scratch", "why_rag_wins": "'Verify' retrieves check-their-work preference" } ], "math_approach": [ { "query": "What's the probability of rolling two sixes?", "context_cues": ["probability = statistics", "rolling dice = intuitive example"], "correct_for": "intuition first (1 in 36), then formula", "why_context_fails": "Starts with P(A∩B) = P(A)P(B) formula", "why_rag_wins": "Statistics topic retrieves intuition-first preference" }, { "query": "Prove that the sum of angles in a triangle is 180°.", "context_cues": ["prove = formal proof", "geometry = visual possible"], "correct_for": "structured proof format per preference", "why_context_fails": "Might give intuitive explanation instead of proof", "why_rag_wins": "'Prove' retrieves proof-format preference" } ], # ========================================================================= # DOMAIN CONFLICTS # ========================================================================= "domain_example_position": [ { "query": "How do I use the requests library in Python?", "context_cues": ["how do I use = practical/API", "library = code example helpful"], "correct_for": "minimal example first, then explain parameters", "why_context_fails": "Explains parameters first, example last", "why_rag_wins": "API/library context retrieves example-first preference" }, { "query": "What is dynamic programming?", "context_cues": ["what is = concept/theory", "definition needed"], "correct_for": "definition first, then example, then edge cases", "why_context_fails": "Might lead with example (Fibonacci)", "why_rag_wins": "Theory context retrieves definition-first preference" } ], # ========================================================================= # OUTPUT ARTIFACT CONFLICTS # ========================================================================= "output_code_presentation": [ { "query": "Give me a sorting function I can use, I'm in a hurry.", "context_cues": ["give me = copyable", "in a hurry = no explanation"], "correct_for": "single code block, no prose", "why_context_fails": "Adds explanatory prose between code", "why_rag_wins": "'Give me' + 'hurry' retrieves copy-paste preference" }, { "query": "Teach me how to implement quicksort step by step.", "context_cues": ["teach me = learning", "step by step = chunked"], "correct_for": "code in small chunks with explanation between", "why_context_fails": "Gives full implementation at once", "why_rag_wins": "'Teach' + 'step by step' retrieves chunked preference" } ], # ========================================================================= # CORRECTION STYLE CONFLICTS # ========================================================================= "correction_severity": [ { "query": "I'm using a hashmap to store my data, is this right?", "context_cues": ["hashmap = might mean dict/map", "is this right = validation"], "correct_for": "gentle inline (hashmap is fine, also called dict)", "why_context_fails": "Might pedantically correct terminology", "why_rag_wins": "Minor terminology + validation retrieves gentle-correction pref" }, { "query": "I think recursion is just loops with extra steps, right?", "context_cues": ["fundamental misconception", "asking for validation"], "correct_for": "directly address misconception before proceeding", "why_context_fails": "Might gloss over and just show recursion", "why_rag_wins": "Fundamental error retrieves explicit-correction preference" } ], # ========================================================================= # MULTI-DOMAIN CONFLICTS (hardest!) # ========================================================================= "multi_domain_complex": [ { "query": "Quick question - walk me through implementing a binary tree in Python with proper documentation.", "context_cues": ["quick = brief", "walk through = detailed", "documentation = thorough"], "correct_for": "quick wins (explicit), but include docstrings (documentation ask)", "why_context_fails": "Confused by conflicting signals, inconsistent response", "why_rag_wins": "Explicit brevity cue retrieved, documentation pref adds docstrings" }, { "query": "I'm debugging my ML model and it's not converging. This is frustrating! Compare Adam vs SGD for me.", "context_cues": ["debugging = focus on issue", "frustrating = emotional", "compare = table"], "correct_for": "acknowledge frustration, then comparison table for optimizers", "why_context_fails": "Might skip emotional acknowledgment or wrong format", "why_rag_wins": "Frustration pref + comparison pref both retrieved, applied in order" }, { "query": "Review this Python code and convert it to JavaScript. Focus on bugs first.", "context_cues": ["review = bugs per 'focus' cue", "convert = language change"], "correct_for": "Python review (bugs only) + JS conversion (camelCase)", "why_context_fails": "Applies wrong scope or wrong naming convention", "why_rag_wins": "Multiple relevant prefs retrieved per task segment" } ] } # ============================================================================ # Scenario Generator # ============================================================================ class ConflictScenarioGenerator: """Generates conflict scenarios from templates and user profiles.""" def __init__(self, profile: dict = None, seed: int = 42): self.profile = profile self.preferences = {p['pref_id']: p for p in profile['preferences']} if profile else {} self.random = random.Random(seed) def generate_for_profile(self, preferences: list, domain: str = None) -> dict: """Generate a single conflict scenario for given preferences and domain.""" # Find conflict groups in these preferences conflict_groups = {} for pref in preferences: # Handle both dict preferences (with conflict_group) and string preferences if isinstance(pref, dict): cg = pref.get('conflict_group') if cg: if cg not in conflict_groups: conflict_groups[cg] = [] conflict_groups[cg].append(pref) # String preferences don't have conflict groups - skip them # Find a conflict group with at least 2 preferences for cg, prefs in conflict_groups.items(): if len(prefs) >= 2 and cg in CONFLICT_TEMPLATES: templates = CONFLICT_TEMPLATES[cg] template = self.random.choice(templates) return { "query": template['query'], "conflict_group": cg, "preferences": prefs, "expected_preference": prefs[0]['pref_id'], # First one as expected } return None def generate_scenarios(self, num_per_conflict_type: int = 3) -> list: """Generate conflict scenarios based on profile's preferences.""" scenarios = [] for conflict_group, templates in CONFLICT_TEMPLATES.items(): # Check if this conflict group exists in user's preferences relevant_prefs = [ p for p in self.profile['preferences'] if p.get('conflict_group') == conflict_group ] if len(relevant_prefs) < 2: continue # Need at least 2 prefs to have a conflict # Generate scenarios from templates selected_templates = self.random.sample( templates, min(num_per_conflict_type, len(templates)) ) for i, template in enumerate(selected_templates): scenario = self._create_scenario( conflict_group, template, relevant_prefs, i ) if scenario: scenarios.append(scenario) return scenarios def _create_scenario( self, conflict_group: str, template: dict, relevant_prefs: list, index: int ) -> ConflictScenario: """Create a scenario from a template.""" # Determine which preference is correct # Based on context cues in the query query = template['query'] correct_pref = self._determine_correct_preference(query, relevant_prefs) wrong_prefs = [p for p in relevant_prefs if p['pref_id'] != correct_pref['pref_id']] return ConflictScenario( scenario_id=f"{conflict_group}_{index:03d}", conflict_group=conflict_group, query=query, context_cues=template.get('context_cues', []), triggered_prefs=[p['pref_id'] for p in relevant_prefs], correct_pref_id=correct_pref['pref_id'], wrong_pref_ids=[p['pref_id'] for p in wrong_prefs], why_correct=template.get('correct_for', ''), expected_rag_behavior=template.get('why_rag_wins', ''), expected_context_failure=template.get('why_context_fails', '') ) def _determine_correct_preference(self, query: str, prefs: list) -> dict: """ Determine which preference is correct for a query. Uses keyword matching on priority_context. """ query_lower = query.lower() scores = [] for pref in prefs: score = 0 for keyword in pref.get('priority_context', []): if keyword.lower() in query_lower: score += 1 # Bonus for condition match if pref.get('condition', '').lower() in query_lower: score += 2 scores.append((pref, score)) # Return highest scoring preference scores.sort(key=lambda x: x[1], reverse=True) return scores[0][0] if scores else prefs[0] def generate_conflict_enriched_dataset( profiles_path: str, output_path: str, scenarios_per_conflict: int = 3, seed: int = 42 ): """ Generate a dataset where every query triggers at least one conflict. """ profiles = [] with open(profiles_path) as f: for line in f: profiles.append(json.loads(line)) all_scenarios = [] conflict_coverage = {} for profile in profiles: generator = ConflictScenarioGenerator(profile, seed) scenarios = generator.generate_scenarios(scenarios_per_conflict) for scenario in scenarios: scenario_dict = { 'user_id': profile['user_id'], 'scenario_id': scenario.scenario_id, 'conflict_group': scenario.conflict_group, 'query': scenario.query, 'context_cues': scenario.context_cues, 'triggered_prefs': scenario.triggered_prefs, 'correct_pref_id': scenario.correct_pref_id, 'wrong_pref_ids': scenario.wrong_pref_ids, 'why_correct': scenario.why_correct, 'expected_rag_behavior': scenario.expected_rag_behavior, 'expected_context_failure': scenario.expected_context_failure } all_scenarios.append(scenario_dict) # Track coverage cg = scenario.conflict_group conflict_coverage[cg] = conflict_coverage.get(cg, 0) + 1 # Save Path(output_path).parent.mkdir(parents=True, exist_ok=True) with open(output_path, 'w') as f: for scenario in all_scenarios: f.write(json.dumps(scenario) + '\n') print(f"Generated {len(all_scenarios)} conflict scenarios") print(f"Coverage by conflict type:") for cg, count in sorted(conflict_coverage.items()): print(f" {cg}: {count}") return all_scenarios def create_evaluation_harness(scenarios: list) -> dict: """ Create an evaluation harness that programmatically checks if the correct preference was applied. """ harness = { "total_scenarios": len(scenarios), "by_conflict_type": {}, "evaluation_functions": {} } # Group by conflict type for scenario in scenarios: cg = scenario['conflict_group'] if cg not in harness['by_conflict_type']: harness['by_conflict_type'][cg] = [] harness['by_conflict_type'][cg].append(scenario) # Add evaluation functions for each conflict type harness['evaluation_functions'] = { "format_structure": check_format_structure, "verbosity": check_verbosity, "naming_convention": check_naming_convention, "answer_position": check_answer_position, # ... more evaluators } return harness # ============================================================================ # Evaluation Functions (check if correct preference was applied) # ============================================================================ def check_format_structure(response: str, correct_pref: dict) -> bool: """Check if response uses correct format (bullets vs numbered).""" has_bullets = bool(any(c in response for c in ['•', '-', '*'])) has_numbers = bool(any(f"{i}." in response or f"{i})" in response for i in range(1, 10))) if 'bullet' in correct_pref.get('action', '').lower(): return has_bullets and not has_numbers elif 'numbered' in correct_pref.get('action', '').lower(): return has_numbers return True # Can't determine def check_verbosity(response: str, correct_pref: dict) -> bool: """Check if response matches verbosity preference.""" word_count = len(response.split()) if 'concise' in correct_pref.get('action', '').lower() or \ '3 sentences' in correct_pref.get('action', '').lower(): return word_count < 100 # Rough threshold elif 'detailed' in correct_pref.get('action', '').lower(): return word_count > 150 return True def check_naming_convention(response: str, correct_pref: dict) -> bool: """Check if code uses correct naming convention.""" import re # Look for function/variable definitions if 'snake_case' in correct_pref.get('action', '').lower(): # Should have underscores, no camelCase has_snake = bool(re.search(r'[a-z]+_[a-z]+', response)) has_camel = bool(re.search(r'[a-z]+[A-Z][a-z]+', response)) return has_snake and not has_camel elif 'camelCase' in correct_pref.get('action', '').lower(): has_camel = bool(re.search(r'[a-z]+[A-Z][a-z]+', response)) return has_camel return True def check_answer_position(response: str, correct_pref: dict) -> bool: """Check if answer comes first or explanation builds up.""" # Simplified: check if response starts with answer-like content first_sentence = response.split('.')[0] if '.' in response else response[:100] if 'answer first' in correct_pref.get('action', '').lower(): # First sentence should be direct direct_indicators = ['is', 'are', 'the answer', 'yes', 'no', 'it\'s'] return any(ind in first_sentence.lower() for ind in direct_indicators) elif 'build up' in correct_pref.get('action', '').lower(): # First sentence should be explanatory buildup_indicators = ['let\'s', 'first', 'to understand', 'consider'] return any(ind in first_sentence.lower() for ind in buildup_indicators) return True # ============================================================================ # Main # ============================================================================ if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument("--profiles", default="collaborativeagents/data/complex_profiles/profiles.jsonl") parser.add_argument("--output", default="collaborativeagents/data/conflict_scenarios.jsonl") parser.add_argument("--scenarios_per_conflict", type=int, default=3) parser.add_argument("--seed", type=int, default=42) args = parser.parse_args() scenarios = generate_conflict_enriched_dataset( args.profiles, args.output, args.scenarios_per_conflict, args.seed )