summaryrefslogtreecommitdiff
path: root/collaborativeagents/scripts/generate_complex_profiles.py
diff options
context:
space:
mode:
authorYurenHao0426 <blackhao0426@gmail.com>2026-01-27 09:57:37 -0600
committerYurenHao0426 <blackhao0426@gmail.com>2026-01-27 09:57:37 -0600
commitdc801c07cf38b0c495686463e6ca6f871a64440e (patch)
tree599f03114775921dbc472403c701f4a3a8ea188a /collaborativeagents/scripts/generate_complex_profiles.py
parente43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 (diff)
Add collaborativeagents module and update gitignore
- Add collaborativeagents subproject with adapters, agents, and evaluation modules - Update .gitignore to exclude large binary files (.whl, .tar), wandb logs, and results Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Diffstat (limited to 'collaborativeagents/scripts/generate_complex_profiles.py')
-rw-r--r--collaborativeagents/scripts/generate_complex_profiles.py719
1 files changed, 719 insertions, 0 deletions
diff --git a/collaborativeagents/scripts/generate_complex_profiles.py b/collaborativeagents/scripts/generate_complex_profiles.py
new file mode 100644
index 0000000..3838413
--- /dev/null
+++ b/collaborativeagents/scripts/generate_complex_profiles.py
@@ -0,0 +1,719 @@
+"""
+Generate complex user profiles with conditional preferences using LLM.
+
+This script generates user profiles with ~40 situation-dependent preferences
+designed to stress-test retrieval-based personalization systems.
+"""
+
+import json
+import random
+from pathlib import Path
+from typing import Optional
+from dataclasses import dataclass, field, asdict
+import hashlib
+
+# Will use litellm for generation
+try:
+ import litellm
+except ImportError:
+ litellm = None
+
+
+# ============================================================================
+# Schema Definitions
+# ============================================================================
+
@dataclass
class ConditionalPreference:
    """A single situation-dependent user preference.

    The condition says when the preference applies; the action says what the
    user wants to happen in that situation.
    """
    pref_id: str
    condition: str  # trigger situation for this preference
    action: str  # behavior the user expects when triggered
    conflict_group: Optional[str] = None  # id of the group this pref may clash with
    priority_context: list = field(default_factory=list)  # keywords that fire this pref

    def to_memory_card_format(self) -> dict:
        """Serialize to the dict layout used by the personalization system's MemoryCard."""
        card = dict(
            condition=self.condition,
            action=self.action,
            confidence=1.0,
            source="user_profile",
            pref_id=self.pref_id,
            conflict_group=self.conflict_group,
            priority_context=self.priority_context,
        )
        return card

    def to_natural_language(self) -> str:
        """Render the preference as a single English sentence."""
        return f"When {self.condition}, {self.action}."
+
+
@dataclass
class ConflictGroup:
    """Defines a group of preferences that may conflict.

    Instances are assembled in generate_full_profile from the conflict_group
    tags carried by individual preferences.
    """
    group_id: str
    # Human-readable summary of the conflict. NOTE(review): generate_full_profile
    # currently fills this with the same "resolution" text as resolution_rule —
    # confirm whether a distinct description was intended.
    description: str
    resolution_rule: str  # How to programmatically resolve
    member_pref_ids: list = field(default_factory=list)  # pref_ids belonging to this group
+
+
+@dataclass
+class UserProfile:
+ """A complex user profile with conditional preferences."""
+ user_id: str
+ persona: str # High-level description
+ preferences: list # List of ConditionalPreference
+ conflict_groups: dict = field(default_factory=dict) # group_id -> ConflictGroup
+
+ def get_preferences_by_category(self) -> dict:
+ """Group preferences by their category (derived from pref_id prefix)."""
+ categories = {}
+ for pref in self.preferences:
+ cat = pref.pref_id.split('_')[0]
+ if cat not in categories:
+ categories[cat] = []
+ categories[cat].append(pref)
+ return categories
+
+ def get_conflicting_preferences(self, query: str) -> list:
+ """Find preferences that might conflict for a given query."""
+ # Simple keyword matching - in practice, use embeddings
+ triggered = []
+ query_lower = query.lower()
+ for pref in self.preferences:
+ for keyword in pref.priority_context:
+ if keyword.lower() in query_lower:
+ triggered.append(pref)
+ break
+
+ # Group by conflict group
+ conflicts = {}
+ for pref in triggered:
+ if pref.conflict_group:
+ if pref.conflict_group not in conflicts:
+ conflicts[pref.conflict_group] = []
+ conflicts[pref.conflict_group].append(pref)
+
+ # Return groups with more than one triggered preference
+ return {k: v for k, v in conflicts.items() if len(v) > 1}
+
+ def to_dict(self) -> dict:
+ return {
+ "user_id": self.user_id,
+ "persona": self.persona,
+ "preferences": [asdict(p) for p in self.preferences],
+ "conflict_groups": {k: asdict(v) for k, v in self.conflict_groups.items()},
+ "meta": {
+ "total_preferences": len(self.preferences),
+ "total_conflict_groups": len(self.conflict_groups)
+ }
+ }
+
+
+# ============================================================================
+# Preference Templates for LLM Generation
+# ============================================================================
+
# Category -> generation spec for LLM-driven preference synthesis.
# Each entry carries:
#   description:       human-readable summary of the category
#   num_preferences:   how many preferences generate_preferences_with_llm requests
#   example_conflicts: hints injected into the shared prompt so the LLM creates
#                      deliberately conflicting preference pairs
#   generation_prompt: category-specific instructions with a {n} placeholder.
#                      NOTE(review): generate_preferences_with_llm formats the
#                      shared LLM_GENERATION_PROMPT instead; this field appears
#                      unused in this file — verify before removing.
PREFERENCE_CATEGORIES = {
    "response_format": {
        "description": "How responses should be structured",
        "num_preferences": 4,
        "example_conflicts": ["bullets vs numbered", "answer-first vs build-up"],
        "generation_prompt": """Generate {n} preferences about response formatting.
Include conflicting pairs like:
- When to use bullet points vs numbered lists
- When to give answer first vs build up to it
Each preference must have a specific condition (when it applies) and action (what to do)."""
    },
    "verbosity": {
        "description": "How detailed responses should be",
        "num_preferences": 5,
        "example_conflicts": ["concise vs detailed", "explain why vs just answer"],
        "generation_prompt": """Generate {n} preferences about response verbosity.
Include conflicting pairs like:
- Brief responses vs detailed explanations
- When to explain reasoning vs just give answer
Conditions should include cue phrases like 'quick question', 'briefly', etc."""
    },
    "code_style": {
        "description": "Programming and code preferences",
        "num_preferences": 8,
        "example_conflicts": ["naming conventions by language", "comment styles", "review focus"],
        "generation_prompt": """Generate {n} preferences about code style.
Include:
- Language-specific naming conventions (Python snake_case, JS camelCase, etc.)
- Comment styles for different code lengths
- Code review focus (bugs only vs style too)
- Error handling preferences"""
    },
    "math_style": {
        "description": "Mathematical explanation preferences",
        "num_preferences": 6,
        "example_conflicts": ["step-by-step vs intuition", "formal vs informal"],
        "generation_prompt": """Generate {n} preferences about mathematical explanations.
Include:
- When to show detailed steps vs high-level approach
- Intuition-first vs formula-first for statistics
- How to structure proofs
- Verification requests"""
    },
    "interaction_pattern": {
        "description": "How to interact with user",
        "num_preferences": 6,
        "example_conflicts": ["confirm vs execute", "recommend vs list options"],
        "generation_prompt": """Generate {n} preferences about interaction patterns.
Include:
- When to confirm before acting vs execute directly
- When to recommend vs present options
- How to handle user emotions (frustration, gratitude)"""
    },
    "domain_specific": {
        "description": "Preferences for specific technical domains",
        "num_preferences": 6,
        "example_conflicts": ["example-first vs definition-first"],
        "generation_prompt": """Generate {n} domain-specific preferences for:
- Machine learning explanations
- System design discussions
- API/library usage
- Data structures (include complexity)"""
    },
    "error_correction": {
        "description": "How to handle user mistakes",
        "num_preferences": 4,
        "example_conflicts": ["gentle vs direct correction"],
        "generation_prompt": """Generate {n} preferences about error correction.
Include:
- Minor terminology errors vs fundamental misconceptions
- Code bugs
- Correcting own previous responses"""
    },
    "output_artifacts": {
        "description": "How to present code and commands",
        "num_preferences": 4,
        "example_conflicts": ["single block vs chunked"],
        "generation_prompt": """Generate {n} preferences about output artifacts.
Include:
- Copyable code blocks vs explained chunks
- Command presentation
- Language specification in code fences"""
    }
}
+
+
# Shared prompt template for generating one category's preferences.
# Placeholders filled by generate_preferences_with_llm: num_prefs,
# category_name, category_description, example_conflicts, extra_context,
# category_prefix (first two characters of the category name).
# The doubled braces {{ }} in the JSON example are literal braces after .format().
LLM_GENERATION_PROMPT = """You are generating user preferences for a personalization benchmark.

## Task
Generate {num_prefs} conditional preferences for the category: {category_name}
Description: {category_description}

## Requirements
1. Each preference must have:
   - A specific CONDITION (when it applies, including trigger phrases/situations)
   - An ACTION (what the user prefers to happen)
   - A CONFLICT_GROUP (if this preference might conflict with another)
   - PRIORITY_CONTEXT (list of keywords that trigger this preference)

2. Include at least one pair of CONFLICTING preferences that could both be triggered
   by different aspects of the same query. The conflict should be resolvable by
   looking at the specific context.

3. Conditions should be:
   - Specific and observable (not vague like "when appropriate")
   - Include trigger phrases users might say
   - Cover different situations within this category

4. Example conflicts for this category: {example_conflicts}

## Additional Context (if any)
{extra_context}

## Output Format
Return a JSON array of preferences:
```json
[
  {{
    "pref_id": "{category_prefix}_001",
    "condition": "specific situation or trigger phrase",
    "action": "what the user prefers",
    "conflict_group": "group_name or null",
    "priority_context": ["keyword1", "keyword2"]
  }},
  ...
]
```

Generate exactly {num_prefs} preferences."""
+
+
# Prompt template for generating a short persona blurb; the single
# {preference_summary} placeholder is filled by generate_persona_with_llm.
PERSONA_GENERATION_PROMPT = """Generate a realistic user persona for a software developer/researcher.

## Requirements
1. The persona should feel like a real person with:
   - A professional background (role, experience level, domain)
   - Communication style tendencies
   - Learning preferences
   - Work context (startup vs enterprise, solo vs team)

2. The persona should naturally motivate the preferences that will be assigned.

3. Keep it to 2-3 sentences.

## Preference Summary
This user will have preferences in these areas:
{preference_summary}

## Examples of good personas:
- "A senior backend engineer at a fintech startup who values efficiency and directness. Prefers practical solutions over theoretical discussions, and likes to understand the 'why' behind recommendations."
- "A PhD student in machine learning who is meticulous about mathematical rigor. Appreciates step-by-step derivations and often cross-references multiple sources before accepting an explanation."
- "A junior developer transitioning from frontend to full-stack. Learns best through examples and appreciates patient, incremental explanations without condescension."

## Output
Return only the persona text (2-3 sentences), no JSON or formatting."""
+
+
+# ============================================================================
+# Conflict Resolution Logic
+# ============================================================================
+
# Conflict group -> resolution spec, consumed by resolve_conflict.
#   signals:    signal-name -> keywords; a keyword found in the query scores
#               candidates whose priority_context overlaps the signal name or
#               contains the keyword itself
#   resolution: human-readable rule describing which side should win
CONFLICT_RESOLUTION_RULES = {
    "format_structure": {
        "signals": {
            "bullets": ["options", "alternatives", "list", "multiple", "comparison", "pros and cons"],
            "numbered": ["steps", "procedure", "how to", "setup", "install", "first", "then", "sequence"]
        },
        "resolution": "sequential_process -> numbered; parallel_items -> bullets"
    },
    "answer_position": {
        "signals": {
            "answer_first": ["what is", "what's", "tell me", "give me", "?"],
            "build_up": ["explain", "why", "how does", "teach", "help me understand"]
        },
        "resolution": "direct_question -> answer_first; learning_intent -> build_up"
    },
    "response_length": {
        "signals": {
            "concise": ["quick", "brief", "short", "tldr", "in a nutshell", "one line"],
            "detailed": ["explain", "elaborate", "in detail", "thoroughly", "complex", "proof"]
        },
        "resolution": "explicit_brevity_cue -> concise (overrides topic complexity)"
    },
    "naming_convention": {
        "signals": {
            "snake_case": ["python", ".py", "def ", "import "],
            "camelCase": ["javascript", "typescript", ".js", ".ts", "const ", "let ", "function "],
            "UPPER_keywords": ["sql", "SELECT", "FROM", "WHERE", "database"]
        },
        "resolution": "determined by programming language detection"
    },
    "autonomy": {
        "signals": {
            "confirm": ["should I", "would you like", "complex", "multiple parts", "project"],
            "execute": ["do this", "make this", "just", "please", "now"]
        },
        "resolution": "ambiguous_task -> confirm; clear_instruction -> execute"
    },
    "code_presentation": {
        "signals": {
            "single_block": ["copy", "paste", "use this", "give me the code", "full code"],
            "chunked": ["teach", "explain", "understand", "walk through", "learn"]
        },
        "resolution": "copy_intent -> single_block; learning_intent -> chunked"
    }
}
+
+
def resolve_conflict(conflict_group: str, query: str, candidates: list) -> Optional[str]:
    """
    Pick the winning preference in a conflict via keyword-signal scoring.

    Args:
        conflict_group: The conflict group ID (key into CONFLICT_RESOLUTION_RULES)
        query: The user query to score signals against
        candidates: List of ConditionalPreference objects in this conflict

    Returns:
        pref_id of the highest-scoring preference, or None when the group has
        no resolution rule or no candidate scores above zero.
    """
    if conflict_group not in CONFLICT_RESOLUTION_RULES:
        return None

    rule_spec = CONFLICT_RESOLUTION_RULES[conflict_group]
    lowered_query = query.lower()

    # A candidate earns a point whenever a signal keyword found in the query
    # relates to one of its priority_context entries (by overlap with the
    # signal's name), plus another point if the keyword itself appears in
    # that priority_context entry.
    scores = {}
    for candidate in candidates:
        tally = 0
        for signal_name, keywords in rule_spec["signals"].items():
            signal_low = signal_name.lower()
            for keyword in keywords:
                keyword_low = keyword.lower()
                if keyword_low not in lowered_query:
                    continue
                for context in candidate.priority_context:
                    context_low = context.lower()
                    if context_low in signal_low or signal_low in context_low:
                        tally += 1
                    if keyword_low in context_low:
                        tally += 1
        scores[candidate.pref_id] = tally

    # Highest-scoring candidate wins, but only with a strictly positive score.
    if scores:
        best = max(scores, key=scores.get)
        if scores[best] > 0:
            return best
    return None
+
+
def create_conflict_test_case(conflict_group: str, preferences: list) -> Optional[dict]:
    """
    Create a test case that triggers a specific conflict.

    Fixes: removed an unused local (``rules``) and corrected the return
    annotation — two paths return None, so the type is Optional[dict].

    Args:
        conflict_group: The conflict group ID; must appear both in
            CONFLICT_RESOLUTION_RULES and in the hand-written queries below.
        preferences: All ConditionalPreference objects for the profile; only
            members of this conflict group are used.

    Returns a dict with:
    - query: A query that triggers multiple preferences
    - triggered_pref_ids: List of preference IDs triggered
    - correct_pref_id: The preference that should win (None when
      resolve_conflict cannot pick a winner)
    - resolution_reason: Why this preference wins
    or None when no test case is defined for this conflict group.
    """
    if conflict_group not in CONFLICT_RESOLUTION_RULES:
        return None

    # Hand-crafted queries that deliberately trigger conflicting preferences
    test_cases = {
        "format_structure": {
            "query": "How do I set up a Python virtual environment? List the main options.",
            "ambiguity": "Both 'set up' (procedure->numbered) and 'list options' (parallel->bullets)",
            "resolution": "Primary intent is setup procedure -> numbered steps"
        },
        "response_length": {
            "query": "Quick question - how does backpropagation work?",
            "ambiguity": "'Quick question' (concise) vs 'how does X work' (complex topic)",
            "resolution": "Explicit brevity cue 'quick question' overrides topic complexity"
        },
        "answer_position": {
            "query": "What is gradient descent and why is it used?",
            "ambiguity": "'What is' (answer first) vs 'why' (build up explanation)",
            "resolution": "Combined question: give brief answer, then explain why"
        },
        "naming_convention": {
            "query": "Write a function to parse JSON in both Python and JavaScript",
            "ambiguity": "Two languages with different conventions",
            "resolution": "Use appropriate convention for each: snake_case for Python, camelCase for JS"
        },
        "autonomy": {
            "query": "Refactor this authentication module to use JWT",
            "ambiguity": "'Refactor' is complex, but instruction is specific",
            "resolution": "Should confirm approach before major refactor"
        },
        "code_presentation": {
            "query": "I want to understand how this sorting algorithm works, give me the code",
            "ambiguity": "'understand' (chunked) vs 'give me the code' (single block)",
            "resolution": "Learning intent detected -> chunked with explanations"
        }
    }

    if conflict_group in test_cases:
        tc = test_cases[conflict_group]
        # Collect the preferences participating in this conflict group, then
        # let the programmatic resolver pick the expected winner.
        triggered = [p for p in preferences if p.conflict_group == conflict_group]
        winner = resolve_conflict(conflict_group, tc["query"], triggered)

        return {
            "conflict_group": conflict_group,
            "query": tc["query"],
            "ambiguity": tc["ambiguity"],
            "triggered_pref_ids": [p.pref_id for p in triggered],
            "correct_pref_id": winner,
            "resolution_reason": tc["resolution"]
        }

    return None
+
+
+# ============================================================================
+# LLM-based Profile Generation
+# ============================================================================
+
def generate_preferences_with_llm(
    category: str,
    model: str = "gpt-4o-mini",
    extra_context: str = ""
) -> list:
    """Generate preferences for a category using LLM.

    Args:
        category: Key into PREFERENCE_CATEGORIES.
        model: litellm model identifier.
        extra_context: Optional extra text injected into the shared prompt.

    Returns:
        List of ConditionalPreference parsed from the model's JSON output.

    Raises:
        ImportError: If litellm is not installed.
        json.JSONDecodeError: If the response is unparseable and no JSON
            array can be extracted from it.
    """
    if litellm is None:
        raise ImportError("litellm required for LLM generation")

    cat_info = PREFERENCE_CATEGORIES[category]
    # NOTE(review): category_prefix is only the first two characters (e.g. "re"
    # for "response_format"), so pref_ids like "re_001" will not round-trip to
    # the full category name via pref_id.split('_')[0] — confirm intended.
    prompt = LLM_GENERATION_PROMPT.format(
        num_prefs=cat_info["num_preferences"],
        category_name=category,
        category_description=cat_info["description"],
        example_conflicts=", ".join(cat_info["example_conflicts"]),
        category_prefix=category[:2],
        extra_context=extra_context or "None"
    )

    # json_object mode forces a JSON response even though the prompt asks for
    # an array; a model wrapping the array as {"preferences": [...]} is
    # unwrapped below.
    response = litellm.completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )

    content = response.choices[0].message.content
    # Extract JSON from response
    try:
        data = json.loads(content)
        if isinstance(data, dict) and "preferences" in data:
            data = data["preferences"]
        return [ConditionalPreference(**p) for p in data]
    except json.JSONDecodeError:
        # Try to extract JSON array from markdown code block
        import re
        match = re.search(r'\[[\s\S]*\]', content)
        if match:
            data = json.loads(match.group())
            return [ConditionalPreference(**p) for p in data]
        raise
+
+
def generate_persona_with_llm(
    preferences: list,
    model: str = "gpt-4o-mini"
) -> str:
    """Generate a persona description that plausibly motivates the given preferences.

    Buckets truncated action snippets by category (first pref_id segment),
    formats the persona prompt with that summary, and returns the LLM's
    stripped response text.
    """
    if litellm is None:
        raise ImportError("litellm required for LLM generation")

    # Bucket truncated action snippets by the category prefix of each pref_id.
    by_cat = {}
    for pref in preferences:
        category = pref.pref_id.split('_')[0]
        by_cat.setdefault(category, []).append(pref.action[:50] + "...")

    # At most three sample actions per category make it into the summary.
    summary_lines = [f"- {cat}: {', '.join(actions[:3])}" for cat, actions in by_cat.items()]
    summary = "\n".join(summary_lines)

    prompt = PERSONA_GENERATION_PROMPT.format(preference_summary=summary)
    response = litellm.completion(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content.strip()
+
+
def generate_full_profile(
    user_id: str,
    model: str = "gpt-4o-mini",
    categories: Optional[list] = None
) -> UserProfile:
    """Generate a complete user profile with all preferences.

    Args:
        user_id: Identifier stored on the resulting profile.
        model: litellm model used for both preference and persona generation.
        categories: Subset of PREFERENCE_CATEGORIES keys to generate;
            defaults to all categories.

    Returns:
        A UserProfile with LLM-generated preferences, a matching persona,
        and conflict groups assembled from each preference's conflict_group tag.
    """
    if categories is None:
        categories = list(PREFERENCE_CATEGORIES.keys())

    all_preferences = []
    for cat in categories:
        prefs = generate_preferences_with_llm(cat, model)
        all_preferences.extend(prefs)

    persona = generate_persona_with_llm(all_preferences, model)

    # Build conflict groups from the conflict_group tags the LLM assigned.
    # NOTE(review): description and resolution_rule are both set to the same
    # "resolution" string from CONFLICT_RESOLUTION_RULES — confirm whether
    # description was meant to hold different text.
    conflict_groups = {}
    for pref in all_preferences:
        if pref.conflict_group:
            if pref.conflict_group not in conflict_groups:
                conflict_groups[pref.conflict_group] = ConflictGroup(
                    group_id=pref.conflict_group,
                    description=CONFLICT_RESOLUTION_RULES.get(pref.conflict_group, {}).get("resolution", ""),
                    resolution_rule=CONFLICT_RESOLUTION_RULES.get(pref.conflict_group, {}).get("resolution", ""),
                    member_pref_ids=[]
                )
            conflict_groups[pref.conflict_group].member_pref_ids.append(pref.pref_id)

    return UserProfile(
        user_id=user_id,
        persona=persona,
        preferences=all_preferences,
        conflict_groups=conflict_groups
    )
+
+
+# ============================================================================
+# Dataset Loading and Challenging Question Selection
+# ============================================================================
+
# Registry of challenging question sources. Each entry has a dataset "source"
# (HF-hub-style path), an optional per-example "filter" predicate, and a flag
# to encourage step-by-step prompting. NOTE(review): nothing in this file
# loads these datasets — presumably consumed by a sibling evaluation script.
CHALLENGING_DATASETS = {
    # Existing datasets with difficulty filtering
    "math-hard": {
        "source": "lighteval/MATH-Hard",
        "filter": lambda x: x.get("level") in ["Level 4", "Level 5"],
        "encourage_step_by_step": True
    },
    "humaneval-hard": {
        "source": "openai_humaneval",
        "filter": lambda x: len(x.get("prompt", "")) > 200,  # Longer problems
        "encourage_step_by_step": True
    },

    # New challenging datasets to add
    "gpqa": {
        "source": "Idavidrein/gpqa",
        "description": "PhD-level science questions",
        "filter": lambda x: x.get("difficulty") == "hard",
        "encourage_step_by_step": True
    },
    "theoremqa": {
        "source": "wenhu/TheoremQA",
        "description": "Theorem-based math requiring multi-step proofs",
        "filter": None,
        "encourage_step_by_step": True
    },
    "livecodebench": {
        "source": "livecodebench/livecodebench",
        "description": "Recent competitive programming problems",
        "filter": lambda x: x.get("difficulty") in ["medium", "hard"],
        "encourage_step_by_step": True
    },
    "aime": {
        "source": "AI-MO/aimo-progress-prize",
        "description": "American Invitational Mathematics Examination",
        "filter": None,
        "encourage_step_by_step": True
    },
    "scicode": {
        "source": "scicode-bench/SciCode",
        "description": "Scientific computing problems",
        "filter": None,
        "encourage_step_by_step": True
    }
}
+
+
# Prompt suffixes that encourage explicit step-by-step reasoning, keyed by
# rough task type. NOTE(review): not referenced elsewhere in this file —
# presumably appended to questions by a sibling evaluation script.
STEP_BY_STEP_PROMPT_ADDITIONS = {
    "math": """
When solving this problem:
1. First identify what type of problem this is
2. State the key concepts/theorems needed
3. Work through the solution step by step
4. Verify your answer
Take your time and show your reasoning at each step.""",

    "code": """
When solving this problem:
1. First understand the requirements and edge cases
2. Outline your approach before writing code
3. Implement step by step, explaining your logic
4. Consider time/space complexity
5. Test with example inputs
Show your reasoning throughout.""",

    "reasoning": """
When solving this problem:
1. Carefully read and identify the key information
2. State any assumptions you're making
3. Work through the logic step by step
4. Check for any flaws in your reasoning
5. State your conclusion clearly
Take your time and explain your thought process."""
}
+
+
+# ============================================================================
+# Batch Generation Script
+# ============================================================================
+
def generate_profiles_batch(
    num_profiles: int,
    output_path: Path,
    model: str = "gpt-4o-mini",
    seed: int = 42
) -> list:
    """Generate multiple user profiles and save them as JSONL.

    Fix: removed the unused per-category random weights local (the planned
    category emphasis was never wired into generation).

    Args:
        num_profiles: Number of profiles to attempt to generate.
        output_path: JSONL file to write; parent directories are created.
        model: litellm model for profile generation.
        seed: Seeds the RNG and derives deterministic user ids.

    Returns:
        List of successfully generated UserProfile objects; generation
        failures are reported and skipped, so the list may be shorter
        than num_profiles.
    """
    random.seed(seed)
    profiles = []

    for i in range(num_profiles):
        # Deterministic, seed-dependent short id for each profile.
        user_id = f"user_{hashlib.md5(f'{seed}_{i}'.encode()).hexdigest()[:8]}"

        # TODO: optionally vary which categories are emphasized per user
        # (stronger code preferences for some users, math for others).
        try:
            profile = generate_full_profile(user_id, model)
            profiles.append(profile)
            print(f"Generated profile {i+1}/{num_profiles}: {user_id}")
        except Exception as e:
            # Best-effort batch: report the failure and keep going.
            print(f"Error generating profile {i+1}: {e}")
            continue

    # Save profiles, one JSON object per line.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        for profile in profiles:
            f.write(json.dumps(profile.to_dict()) + '\n')

    print(f"Saved {len(profiles)} profiles to {output_path}")
    return profiles
+
+
def generate_conflict_test_suite(profiles: list, output_path: Path) -> list:
    """Generate test cases for conflict resolution evaluation.

    Args:
        profiles: UserProfile objects whose conflict groups seed the cases.
        output_path: JSON file to write the collected test cases to.

    Returns:
        The list of test-case dicts (also written to output_path).
    """
    test_cases = []

    for profile in profiles:
        for conflict_group in profile.conflict_groups:
            tc = create_conflict_test_case(
                conflict_group,
                profile.preferences
            )
            if tc:
                # Tag each case with its owner so evaluation can find the profile.
                tc["user_id"] = profile.user_id
                test_cases.append(tc)

    with open(output_path, 'w') as f:
        json.dump(test_cases, f, indent=2)

    print(f"Generated {len(test_cases)} conflict test cases")
    return test_cases
+
+
+# ============================================================================
+# Main
+# ============================================================================
+
# CLI entry point: generate a batch of profiles (always) and, optionally,
# a conflict-resolution test suite derived from them.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--num_profiles", type=int, default=10)
    parser.add_argument("--output_dir", type=str, default="collaborativeagents/data/complex_profiles")
    parser.add_argument("--model", type=str, default="gpt-4o-mini")
    parser.add_argument("--seed", type=int, default=42)
    # Off by default: conflict tests require the hand-written queries in
    # create_conflict_test_case to cover the generated conflict groups.
    parser.add_argument("--generate_conflicts", action="store_true")

    args = parser.parse_args()

    output_dir = Path(args.output_dir)

    # Generate profiles
    profiles = generate_profiles_batch(
        num_profiles=args.num_profiles,
        output_path=output_dir / "profiles.jsonl",
        model=args.model,
        seed=args.seed
    )

    # Generate conflict test cases
    if args.generate_conflicts:
        generate_conflict_test_suite(
            profiles,
            output_path=output_dir / "conflict_tests.json"
        )