summaryrefslogtreecommitdiff
path: root/collaborativeagents/datasets_extended.py
diff options
context:
space:
mode:
authorYurenHao0426 <blackhao0426@gmail.com>2026-01-27 09:57:37 -0600
committerYurenHao0426 <blackhao0426@gmail.com>2026-01-27 09:57:37 -0600
commitdc801c07cf38b0c495686463e6ca6f871a64440e (patch)
tree599f03114775921dbc472403c701f4a3a8ea188a /collaborativeagents/datasets_extended.py
parente43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 (diff)
Add collaborativeagents module and update gitignore
- Add collaborativeagents subproject with adapters, agents, and evaluation modules - Update .gitignore to exclude large binary files (.whl, .tar), wandb logs, and results Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Diffstat (limited to 'collaborativeagents/datasets_extended.py')
-rw-r--r--collaborativeagents/datasets_extended.py823
1 files changed, 823 insertions, 0 deletions
diff --git a/collaborativeagents/datasets_extended.py b/collaborativeagents/datasets_extended.py
new file mode 100644
index 0000000..93a4ce8
--- /dev/null
+++ b/collaborativeagents/datasets_extended.py
@@ -0,0 +1,823 @@
+"""
+Extended datasets for challenging personalization evaluation.
+
+New datasets added:
+- GPQA: PhD-level science questions
+- TheoremQA: Theorem-based math proofs
+- LiveCodeBench: Recent competitive programming
+- AIME: American Invitational Mathematics Examination
+- SciCode: Scientific computing problems
+
+All datasets encourage step-by-step problem solving for longer sessions.
+"""
+
+from abc import ABC, abstractmethod
+from typing import List, Dict, Any, Optional
+from dataclasses import dataclass
+import json
+from pathlib import Path
+
+try:
+ from datasets import load_dataset
+ HF_AVAILABLE = True
+except ImportError:
+ HF_AVAILABLE = False
+ print("Warning: huggingface datasets not available")
+
+
@dataclass
class DatasetSample:
    """A single sample from a dataset."""
    # Problem statement shown to the user/assistant.
    problem: str
    # Ground-truth answer or reference solution text.
    solution: str
    # Unique identifier, e.g. "math500_12" or a HumanEval task_id.
    problem_id: str
    # Coarse domain label ("math", "code", "reasoning", ...).
    domain: str
    # Dataset-specific difficulty tag (e.g. "Level 5", "phd"), if any.
    difficulty: Optional[str] = None
    # Dataset-specific extras (subject, unit tests, tags, ...).
    metadata: Optional[Dict] = None
+
+
class BaseDataset(ABC):
    """Base class for all datasets.

    Subclasses implement ``_load_data`` plus the ``name``/``domain``/
    ``task_description`` properties. Split loading is lazy and memoized:
    each split is fetched at most once per instance.
    """

    def __init__(self, eval_size: int = 100, train_size: int = 100, cache_dir: Optional[str] = None):
        # Caps on how many samples get_testset / get_trainset return.
        self.eval_size = eval_size
        self.train_size = train_size
        # NOTE(review): cache_dir is stored but never forwarded to
        # load_dataset by any subclass in this file — confirm intent.
        self.cache_dir = cache_dir
        # Lazily-populated caches; None means "not loaded yet".
        self._test_data: Optional[List[DatasetSample]] = None
        self._train_data: Optional[List[DatasetSample]] = None

    @property
    @abstractmethod
    def name(self) -> str:
        """Short registry identifier, e.g. "math-500"."""
        pass

    @property
    @abstractmethod
    def domain(self) -> str:
        """Coarse task domain ("math", "code", ...) copied onto samples."""
        pass

    @property
    @abstractmethod
    def task_description(self) -> str:
        """Description of the task for user simulator."""
        pass

    @abstractmethod
    def _load_data(self, split: str) -> List[DatasetSample]:
        """Return all samples for `split` ("train" or "test")."""
        pass

    def get_testset(self) -> List[DatasetSample]:
        # Load once, truncate to eval_size, and cache.
        if self._test_data is None:
            self._test_data = self._load_data("test")[:self.eval_size]
        return self._test_data

    def get_trainset(self) -> List[DatasetSample]:
        # Load once, truncate to train_size, and cache.
        if self._train_data is None:
            self._train_data = self._load_data("train")[:self.train_size]
        return self._train_data
+
+
+# =============================================================================
+# Existing Datasets (Enhanced with step-by-step prompts)
+# =============================================================================
+
class MATH500Dataset(BaseDataset):
    """MATH-500 problems, framed to encourage stepwise collaboration."""

    @property
    def name(self) -> str:
        return "math-500"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are trying to solve a mathematics problem. The problem requires careful
reasoning and step-by-step work. You will collaborate with an AI assistant to understand
and solve the problem. Break the problem into parts and work through each step carefully.
Ask the assistant to explain their reasoning at each step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        """Fetch MATH-500; the dataset ships only a "test" split, so
        `split` is deliberately ignored."""
        if not HF_AVAILABLE:
            return []

        rows = load_dataset("HuggingFaceH4/MATH-500", split="test")
        return [
            DatasetSample(
                problem=row["problem"],
                solution=row["answer"],
                problem_id=f"math500_{idx}",
                domain="math",
                difficulty=row.get("level"),
                metadata={"type": row.get("type")},
            )
            for idx, row in enumerate(rows)
        ]
+
+
class MATHHardDataset(BaseDataset):
    """MATH-Hard: only Level 4 and Level 5 competition problems."""

    @property
    def name(self) -> str:
        return "math-hard"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are working on a challenging mathematics competition problem. These problems
require deep mathematical insight and careful reasoning. Work through the problem step by step,
explaining your approach clearly. Don't hesitate to ask for hints or verification of your reasoning."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        """Load the requested split and keep only the two hardest tiers."""
        if not HF_AVAILABLE:
            return []

        hf_split = "test" if split == "test" else "train"
        rows = load_dataset("lighteval/MATH-Hard", split=hf_split)

        samples: List[DatasetSample] = []
        for idx, row in enumerate(rows):
            difficulty = row.get("level", "")
            # idx tracks the original row position, so ids stay unique
            # but are not contiguous after filtering.
            if difficulty in ("Level 4", "Level 5"):
                samples.append(DatasetSample(
                    problem=row["problem"],
                    solution=row.get("answer", row.get("solution", "")),
                    problem_id=f"mathhard_{idx}",
                    domain="math",
                    difficulty=difficulty,
                    metadata={"type": row.get("type")},
                ))

        return samples
+
+
class HumanEvalDataset(BaseDataset):
    """HumanEval Python function-completion benchmark."""

    @property
    def name(self) -> str:
        return "humaneval"

    @property
    def domain(self) -> str:
        return "code"

    @property
    def task_description(self) -> str:
        return """You are implementing a Python function. Think through the problem carefully,
consider edge cases, and implement the solution step by step. Ask for clarification on any
ambiguous requirements. Discuss your approach before writing code."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        """Fetch HumanEval; only a "test" split exists, so `split` is unused."""
        if not HF_AVAILABLE:
            return []

        rows = load_dataset("openai/openai_humaneval", split="test")
        return [
            DatasetSample(
                problem=row["prompt"],
                solution=row["canonical_solution"],
                problem_id=row["task_id"],
                domain="code",
                metadata={"entry_point": row["entry_point"], "test": row["test"]},
            )
            for row in rows
        ]
+
+
class BigCodeBenchDataset(BaseDataset):
    """BigCodeBench - harder code generation."""

    @property
    def name(self) -> str:
        return "bigcodebench"

    @property
    def domain(self) -> str:
        return "code"

    @property
    def task_description(self) -> str:
        return """You are working on a complex programming task that requires multiple libraries
and careful implementation. Break down the problem, discuss the approach, and implement step by step.
Ask about library choices and implementation details."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        """Fetch BigCodeBench; the release is exposed as split "v0.1.2",
        so `split` is deliberately ignored."""
        if not HF_AVAILABLE:
            return []

        rows = load_dataset("bigcode/bigcodebench", split="v0.1.2")
        return [
            DatasetSample(
                problem=row["instruct_prompt"],
                solution=row["canonical_solution"],
                problem_id=row["task_id"],
                domain="code",
                difficulty="hard",
                metadata={"libs": row.get("libs", [])},
            )
            for row in rows
        ]
+
+
class LogiQADataset(BaseDataset):
    """LogiQA logical reasoning."""

    @property
    def name(self) -> str:
        return "logiqa"

    @property
    def domain(self) -> str:
        return "reasoning"

    @property
    def task_description(self) -> str:
        return """You are solving a logical reasoning problem. Read the passage carefully,
analyze each answer choice, and reason through the logic step by step. Explain your
reasoning process clearly."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        """Load LogiQA and render each item as a lettered multiple-choice prompt."""
        if not HF_AVAILABLE:
            return []

        hf_split = split if split in ["train", "test"] else "test"
        rows = load_dataset("lucasmccabe/logiqa", split=hf_split)

        samples: List[DatasetSample] = []
        for idx, row in enumerate(rows):
            # Label the options A, B, C, ... in their original order.
            labelled = [f"{chr(65 + pos)}. {text}" for pos, text in enumerate(row["options"])]
            options_block = "\n".join(labelled)
            prompt = f"{row['context']}\n\nQuestion: {row['query']}\n\nOptions:\n{options_block}"

            samples.append(DatasetSample(
                problem=prompt,
                solution=chr(65 + row["correct_option"]),
                problem_id=f"logiqa_{idx}",
                domain="reasoning",
            ))

        return samples
+
+
class MMLUDataset(BaseDataset):
    """MMLU multi-domain knowledge.

    Loads only a hand-picked set of hard subjects and formats each item
    as a lettered multiple-choice prompt.
    """

    @property
    def name(self) -> str:
        return "mmlu"

    @property
    def domain(self) -> str:
        return "knowledge"

    @property
    def task_description(self) -> str:
        return """You are answering a knowledge question that requires domain expertise.
Think through the question carefully, consider what you know about the topic, and
reason to the correct answer. Explain your thought process."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        # NOTE(review): `split` is ignored — both get_trainset and
        # get_testset end up on split="test", so the first eval_size/
        # train_size samples overlap. Confirm whether the train side
        # should use MMLU's dev/validation split instead.
        if not HF_AVAILABLE:
            return []

        # Load hard subjects
        hard_subjects = ["abstract_algebra", "college_mathematics", "college_physics",
                         "formal_logic", "high_school_physics", "machine_learning"]

        samples = []
        for subject in hard_subjects:
            try:
                ds = load_dataset("cais/mmlu", subject, split="test")
                for i, item in enumerate(ds):
                    choices = item["choices"]
                    # Label choices A, B, C, ... in order; `answer` is an index.
                    options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(choices)])
                    problem = f"{item['question']}\n\nOptions:\n{options_str}"
                    answer = chr(65 + item["answer"])

                    samples.append(DatasetSample(
                        problem=problem,
                        solution=answer,
                        problem_id=f"mmlu_{subject}_{i}",
                        domain="knowledge",
                        metadata={"subject": subject}
                    ))
            except Exception:
                # Best-effort: a subject that fails to download is skipped
                # silently rather than aborting the whole load.
                continue

        return samples
+
+
class MedQADataset(BaseDataset):
    """MedQA medical knowledge."""

    @property
    def name(self) -> str:
        return "medqa"

    @property
    def domain(self) -> str:
        return "medical"

    @property
    def task_description(self) -> str:
        return """You are answering a medical knowledge question. Consider the clinical
scenario carefully, think through the pathophysiology, and reason to the correct answer.
Explain your medical reasoning step by step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        """Load MedQA (English source config) and format options for display."""
        if not HF_AVAILABLE:
            return []

        hf_split = split if split in ["train", "test"] else "test"
        rows = load_dataset("bigbio/med_qa", "med_qa_en_source", split=hf_split)

        samples: List[DatasetSample] = []
        for idx, row in enumerate(rows):
            opts = row["options"]
            # Options may arrive as a {label: text} mapping or a plain list.
            if isinstance(opts, dict):
                options_block = "\n".join(f"{k}. {v}" for k, v in opts.items())
            else:
                options_block = "\n".join(f"{chr(65 + pos)}. {text}" for pos, text in enumerate(opts))

            samples.append(DatasetSample(
                problem=f"{row['question']}\n\nOptions:\n{options_block}",
                solution=row["answer_idx"],
                problem_id=f"medqa_{idx}",
                domain="medical",
            ))

        return samples
+
+
+# =============================================================================
+# NEW Challenging Datasets
+# =============================================================================
+
class GPQADataset(BaseDataset):
    """GPQA - Graduate-level PhD science questions.

    Extremely challenging questions that require deep domain expertise.
    Perfect for testing complex, multi-step reasoning preferences.
    """

    @property
    def name(self) -> str:
        return "gpqa"

    @property
    def domain(self) -> str:
        return "science"

    @property
    def task_description(self) -> str:
        return """You are working on a PhD-level science question that requires deep domain expertise.
These questions are extremely challenging and require careful, methodical reasoning.
Break the problem into parts, discuss the relevant concepts, and work through each step.
Don't hesitate to ask for clarification or verification of your reasoning at each step.
Consider multiple approaches before committing to an answer."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        # GPQA diamond is the hardest subset
        try:
            ds = load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train")
        except Exception:
            return []

        samples = []
        for i, item in enumerate(ds):
            # Format the multiple choice
            # NOTE(review): these keys assume columns named choice_A..choice_D
            # and correct_answer. The Idavidrein/gpqa card describes columns
            # like "Correct Answer"/"Incorrect Answer 1..3" — confirm the
            # schema, otherwise every prompt gets an empty option list.
            # NOTE(review): missing choices are filtered out BEFORE letters
            # are assigned, so letters can shift relative to the original
            # choice_* keys; and the "A" fallback silently fabricates a
            # correct answer when the field is absent.
            choices = [item.get(f"choice_{c}", "") for c in ["A", "B", "C", "D"] if item.get(f"choice_{c}")]
            options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(choices)])
            problem = f"{item['question']}\n\nOptions:\n{options_str}"

            samples.append(DatasetSample(
                problem=problem,
                solution=item.get("correct_answer", "A"),
                problem_id=f"gpqa_{i}",
                domain="science",
                difficulty="phd",
                metadata={"subdomain": item.get("subdomain", "unknown")}
            ))

        return samples
+
+
class TheoremQADataset(BaseDataset):
    """TheoremQA - Theorem-based mathematical reasoning.

    Requires applying mathematical theorems to solve problems.
    Tests formal mathematical reasoning and explanation preferences.
    """

    @property
    def name(self) -> str:
        return "theoremqa"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are solving a theorem-based mathematics problem. This requires identifying
the relevant mathematical theorems, understanding their conditions, and applying them correctly.
Work through the problem step by step:
1. Identify what theorems might apply
2. Verify the conditions are met
3. Apply the theorem carefully
4. Verify the result
Discuss your reasoning at each step and ask for verification when needed."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        """Fetch TheoremQA's test split; `split` is not consulted."""
        if not HF_AVAILABLE:
            return []

        try:
            rows = load_dataset("TIGER-Lab/TheoremQA", split="test")
        except Exception:
            return []

        return [
            DatasetSample(
                problem=row["question"],
                solution=str(row.get("answer", "")),
                problem_id=f"theoremqa_{idx}",
                domain="math",
                difficulty="hard",
                metadata={
                    "theorem": row.get("theorem", ""),
                    "field": row.get("field", ""),
                },
            )
            for idx, row in enumerate(rows)
        ]
+
+
class AIMEDataset(BaseDataset):
    """AIME - American Invitational Mathematics Examination.

    Competition-level math problems requiring creative problem-solving.
    Answers are integers from 0-999.
    """

    @property
    def name(self) -> str:
        return "aime"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are working on an AIME (American Invitational Mathematics Examination) problem.
These are competition math problems that require creative problem-solving approaches.
The answer is always an integer from 000 to 999.
Work through the problem systematically:
1. Understand what the problem is asking
2. Explore different approaches
3. Calculate carefully
4. Verify your answer
Discuss your thought process and ask for hints if you're stuck."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        # `split` is not consulted; both sources below are loaded with a
        # fixed split name.
        if not HF_AVAILABLE:
            return []

        # Try to load AIME from available sources
        try:
            ds = load_dataset("AI-MO/aimo-validation-aime", split="train")
        except Exception:
            # Fallback to MATH competition problems
            # NOTE(review): hendrycks/competition_math items may not carry a
            # "source" field, in which case this filter yields an empty list
            # — verify against the actual dataset schema.
            try:
                ds = load_dataset("hendrycks/competition_math", split="test")
                ds = [item for item in ds if "AIME" in item.get("source", "")]
            except Exception:
                return []

        samples = []
        for i, item in enumerate(ds):
            # Field names differ between the two sources, hence the
            # problem/question and answer/solution fallback chains.
            samples.append(DatasetSample(
                problem=item.get("problem", item.get("question", "")),
                solution=str(item.get("answer", item.get("solution", ""))),
                problem_id=f"aime_{i}",
                domain="math",
                difficulty="competition",
                metadata={"year": item.get("year", ""), "problem_num": item.get("problem_number", "")}
            ))

        return samples
+
+
class LiveCodeBenchDataset(BaseDataset):
    """LiveCodeBench - Recent competitive programming problems.

    Problems from recent programming contests (post-training cutoff).
    Tests code generation on truly novel problems.
    """

    @property
    def name(self) -> str:
        return "livecodebench"

    @property
    def domain(self) -> str:
        return "code"

    @property
    def task_description(self) -> str:
        return """You are solving a competitive programming problem from recent contests.
These problems require careful algorithm design and implementation.
Approach systematically:
1. Understand the problem constraints
2. Identify the algorithm pattern (DP, graphs, greedy, etc.)
3. Design the solution approach
4. Implement carefully with attention to edge cases
5. Analyze time/space complexity
Discuss your approach before coding and verify your logic at each step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        """Fetch LiveCodeBench's test split; `split` is not consulted."""
        if not HF_AVAILABLE:
            return []

        try:
            rows = load_dataset("livecodebench/livecodebench", split="test")
        except Exception:
            return []

        return [
            DatasetSample(
                problem=row.get("question_content", row.get("problem", "")),
                solution=row.get("solution", ""),
                problem_id=row.get("question_id", f"lcb_{idx}"),
                domain="code",
                difficulty=row.get("difficulty", "unknown"),
                metadata={
                    "contest": row.get("contest_name", ""),
                    "date": row.get("contest_date", ""),
                    "tags": row.get("tags", []),
                },
            )
            for idx, row in enumerate(rows)
        ]
+
+
class SciCodeDataset(BaseDataset):
    """SciCode - Scientific computing problems.

    Requires domain knowledge + coding ability.
    Tests both scientific reasoning and implementation preferences.
    """

    @property
    def name(self) -> str:
        return "scicode"

    @property
    def domain(self) -> str:
        return "science-code"

    @property
    def task_description(self) -> str:
        return """You are implementing a scientific computing solution. This requires both
domain knowledge (physics, chemistry, biology, etc.) and programming expertise.
Approach the problem by:
1. Understanding the scientific concepts involved
2. Formulating the mathematical model
3. Designing the computational approach
4. Implementing with proper numerical methods
5. Validating the results make scientific sense
Discuss the science and the code at each step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        """Fetch SciCode's test split; `split` is not consulted."""
        if not HF_AVAILABLE:
            return []

        try:
            rows = load_dataset("xlangai/SciCode", split="test")
        except Exception:
            return []

        return [
            DatasetSample(
                problem=row.get("problem", ""),
                solution=row.get("solution", ""),
                problem_id=f"scicode_{idx}",
                domain="science-code",
                difficulty="hard",
                metadata={
                    "discipline": row.get("discipline", ""),
                    "libraries": row.get("libraries", []),
                },
            )
            for idx, row in enumerate(rows)
        ]
+
+
+# =============================================================================
+# Dataset Registry
+# =============================================================================
+
# Maps registry name -> dataset class. Keys must match each class's
# `name` property; get_dataset / get_all_datasets / get_challenging_datasets
# all dispatch through this table.
DATASET_REGISTRY = {
    # Existing (enhanced)
    "math-500": MATH500Dataset,
    "math-hard": MATHHardDataset,
    "humaneval": HumanEvalDataset,
    "bigcodebench": BigCodeBenchDataset,
    "logiqa": LogiQADataset,
    "mmlu": MMLUDataset,
    "medqa": MedQADataset,
    # New challenging datasets
    "gpqa": GPQADataset,
    "theoremqa": TheoremQADataset,
    "aime": AIMEDataset,
    "livecodebench": LiveCodeBenchDataset,
    "scicode": SciCodeDataset,
}
+
+
def get_dataset(name: str, **kwargs) -> BaseDataset:
    """Instantiate the dataset registered under `name`.

    Raises ValueError (listing the valid names) for unknown keys.
    """
    factory = DATASET_REGISTRY.get(name)
    if factory is None:
        raise ValueError(f"Unknown dataset: {name}. Available: {list(DATASET_REGISTRY.keys())}")
    return factory(**kwargs)
+
+
def get_all_datasets(**kwargs) -> Dict[str, BaseDataset]:
    """Instantiate every registered dataset, keyed by registry name."""
    instances: Dict[str, BaseDataset] = {}
    for key, factory in DATASET_REGISTRY.items():
        instances[key] = factory(**kwargs)
    return instances
+
+
def get_challenging_datasets(**kwargs) -> Dict[str, BaseDataset]:
    """Instantiate only the newer, harder datasets."""
    hard_names = ("gpqa", "theoremqa", "aime", "livecodebench", "scicode")
    return {key: DATASET_REGISTRY[key](**kwargs) for key in hard_names}
+
+
+# =============================================================================
+# Step-by-Step Query Wrapper
+# =============================================================================
+
def wrap_with_step_by_step_prompt(sample: DatasetSample) -> str:
    """Wrap a problem with prompts encouraging step-by-step interaction.

    This makes sessions longer and creates more opportunities for
    preference expression/violation. Picks a domain-specific template
    when one exists, otherwise a generic one.
    """
    # Fallback for domains without a dedicated template.
    generic_template = """Let's work through this step by step:
1. Understand the problem
2. Plan our approach
3. Execute carefully
4. Verify our answer

Problem:
{problem}

Let's start by understanding what we need to do."""

    domain_prompts = {
        "math": """Let's solve this step by step. Please:
1. First, help me understand what the problem is asking
2. Then, let's identify the key concepts/formulas needed
3. Work through the solution one step at a time
4. Verify our answer at the end

Problem:
{problem}

Let's start by understanding the problem. What is it asking?""",

        "code": """Let's implement this systematically. Please:
1. First, clarify the requirements and edge cases
2. Discuss the algorithm approach before coding
3. Implement step by step, explaining each part
4. Test with examples

Problem:
{problem}

Let's start by understanding the requirements. What are the inputs, outputs, and edge cases?""",

        "reasoning": """Let's think through this carefully. Please:
1. Break down the key information in the passage
2. Analyze each answer choice
3. Eliminate wrong answers with clear reasoning
4. Verify the correct answer

Problem:
{problem}

Let's start by identifying the key facts in this passage.""",

        "science": """Let's approach this PhD-level problem systematically. Please:
1. Identify the domain and key concepts involved
2. Recall relevant theories/equations
3. Work through the reasoning step by step
4. Verify our conclusion is scientifically sound

Problem:
{problem}

Let's start by identifying what field this question is from and what concepts we'll need.""",

        "science-code": """This combines scientific knowledge with coding. Let's:
1. Understand the scientific concepts first
2. Formulate the mathematical approach
3. Design the algorithm
4. Implement and validate

Problem:
{problem}

Let's start by understanding the science behind this problem.""",
    }

    chosen = domain_prompts.get(sample.domain, generic_template)
    return chosen.format(problem=sample.problem)
+
+
+# =============================================================================
+# Conflict-Inducing Query Augmentation
+# =============================================================================
+
def augment_for_conflict_testing(sample: DatasetSample, conflict_type: str) -> str:
    """Augment a query to trigger specific preference conflicts.

    Args:
        sample: The base problem
        conflict_type: Type of conflict to trigger

    Returns:
        Augmented query that triggers the conflict, or the raw problem
        when `conflict_type` is unrecognized.
    """
    # Each template embeds two pulls that a preference model must trade off.
    conflict_augmentations = {
        # Verbosity: "quick" framing around a complex problem.
        "verbosity": "Quick question - {problem}",

        # Format: asks for examples AND a summary at once.
        "format": "Can you explain this with examples and also give me a summary? {problem}",

        # Tone: frustration paired with a desire to learn properly.
        "tone": "I'm so frustrated with this! But I really want to understand it properly. {problem}",

        # Code style: two target languages in one request.
        "code_style": "I need this in Python first, then JavaScript. {problem}",

        # Detail: overview and specifics requested simultaneously.
        "detail": "Give me the big picture but also the specific details. {problem}",

        # Guidance: walkthrough vs. just-the-answer.
        "guidance": "Walk me through this but also just show me the answer if it's simple. {problem}",

        # Time pressure: hurried yet thorough.
        "time_pressure": "I'm in a hurry but this is important so don't skip anything. {problem}",

        # Approach: respect my attempt vs. start over.
        "approach": "I tried [some approach] but maybe start fresh with a better way? {problem}",
    }

    template = conflict_augmentations.get(conflict_type)
    if template is None:
        return sample.problem
    return template.format(problem=sample.problem)
+
+
+if __name__ == "__main__":
+ # Test loading datasets
+ print("Testing dataset loading...")
+
+ for name, cls in DATASET_REGISTRY.items():
+ try:
+ ds = cls(eval_size=5)
+ samples = ds.get_testset()
+ print(f"{name}: {len(samples)} samples loaded")
+ if samples:
+ print(f" Sample: {samples[0].problem[:100]}...")
+ except Exception as e:
+ print(f"{name}: Failed - {e}")