"""
Extended datasets for challenging personalization evaluation.

New datasets added:
- GPQA: PhD-level science questions
- TheoremQA: Theorem-based math proofs
- LiveCodeBench: Recent competitive programming
- AIME: American Invitational Mathematics Examination
- SciCode: Scientific computing problems

All datasets encourage step-by-step problem solving for longer sessions.
"""

from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
import json
import random
from pathlib import Path

try:
    from datasets import load_dataset
    HF_AVAILABLE = True
except ImportError:
    # Every loader degrades to an empty sample list when the HF `datasets`
    # package is missing, so the module stays importable without it.
    HF_AVAILABLE = False
    print("Warning: huggingface datasets not available")


@dataclass
class DatasetSample:
    """A single sample from a dataset."""
    problem: str                      # full problem statement shown to the user
    solution: str                     # reference answer (letter, number, text, or code)
    problem_id: str                   # stable id, unique within one dataset
    domain: str                       # e.g. "math", "code", "reasoning", "science"
    difficulty: Optional[str] = None  # dataset-specific difficulty label, if any
    metadata: Optional[Dict] = None   # extra per-dataset fields (subject, tags, ...)


class BaseDataset(ABC):
    """Base class for all datasets.

    Subclasses implement :meth:`_load_data` and the ``name`` / ``domain`` /
    ``task_description`` properties.  Loading is lazy: the first call to
    :meth:`get_testset` / :meth:`get_trainset` loads and caches the split.

    NOTE(review): several subclasses ignore the ``split`` argument and always
    load the same HF split, so train and test sets may overlap for those
    datasets -- confirm before using them for train/eval separation.
    """

    def __init__(self, eval_size: int = 100, train_size: int = 100,
                 cache_dir: Optional[str] = None):
        """
        Args:
            eval_size: Maximum number of samples returned by get_testset().
            train_size: Maximum number of samples returned by get_trainset().
            cache_dir: Reserved for a local dataset cache.  NOTE(review):
                currently unused -- it is never forwarded to load_dataset().
        """
        self.eval_size = eval_size
        self.train_size = train_size
        self.cache_dir = cache_dir
        # Lazily-populated caches for the two splits.
        self._test_data: Optional[List[DatasetSample]] = None
        self._train_data: Optional[List[DatasetSample]] = None

    @property
    @abstractmethod
    def name(self) -> str:
        """Registry key / short identifier of the dataset."""

    @property
    @abstractmethod
    def domain(self) -> str:
        """Domain tag applied to every sample (e.g. "math", "code")."""

    @property
    @abstractmethod
    def task_description(self) -> str:
        """Description of the task for user simulator."""

    @abstractmethod
    def _load_data(self, split: str) -> List[DatasetSample]:
        """Load and return all samples for ``split`` ("train" or "test")."""

    def get_testset(self) -> List[DatasetSample]:
        """Return (and cache) up to ``eval_size`` test samples."""
        if self._test_data is None:
            self._test_data = self._load_data("test")[:self.eval_size]
        return self._test_data

    def get_trainset(self) -> List[DatasetSample]:
        """Return (and cache) up to ``train_size`` train samples."""
        if self._train_data is None:
            self._train_data = self._load_data("train")[:self.train_size]
        return self._train_data


# =============================================================================
# Existing Datasets (Enhanced with step-by-step prompts)
# =============================================================================

class MATH500Dataset(BaseDataset):
    """MATH-500 dataset with step-by-step encouragement."""

    @property
    def name(self) -> str:
        return "math-500"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are trying to solve a mathematics problem. The problem requires careful
reasoning and step-by-step work. You will collaborate with an AI assistant to understand
and solve the problem. Break the problem into parts and work through each step carefully.
Ask the assistant to explain their reasoning at each step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        # MATH-500 only publishes a "test" split, so ``split`` is ignored here
        # (train and test therefore share the same underlying pool).
        ds = load_dataset("HuggingFaceH4/MATH-500", split="test")
        samples = []

        for i, item in enumerate(ds):
            samples.append(DatasetSample(
                problem=item["problem"],
                solution=item["answer"],
                problem_id=f"math500_{i}",
                domain="math",
                difficulty=item.get("level"),
                metadata={"type": item.get("type")}
            ))

        return samples


class MATHHardDataset(BaseDataset):
    """MATH-Hard (Level 4-5 only)."""

    @property
    def name(self) -> str:
        return "math-hard"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are working on a challenging mathematics competition problem. These problems
require deep mathematical insight and careful reasoning. Work through the problem step by step,
explaining your approach clearly. Don't hesitate to ask for hints or verification of your reasoning."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        ds = load_dataset("lighteval/MATH-Hard", split="test" if split == "test" else "train")
        samples = []

        for i, item in enumerate(ds):
            # Keep only the two hardest difficulty tiers.
            level = item.get("level", "")
            if level not in ["Level 4", "Level 5"]:
                continue

            samples.append(DatasetSample(
                problem=item["problem"],
                solution=item.get("answer", item.get("solution", "")),
                problem_id=f"mathhard_{i}",
                domain="math",
                difficulty=level,
                metadata={"type": item.get("type")}
            ))

        return samples


class HumanEvalDataset(BaseDataset):
    """HumanEval code generation."""

    @property
    def name(self) -> str:
        return "humaneval"

    @property
    def domain(self) -> str:
        return "code"

    @property
    def task_description(self) -> str:
        return """You are implementing a Python function. Think through the problem carefully,
consider edge cases, and implement the solution step by step. Ask for clarification on any
ambiguous requirements. Discuss your approach before writing code."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        # HumanEval ships a single "test" split; ``split`` is ignored.
        ds = load_dataset("openai/openai_humaneval", split="test")
        samples = []

        for item in ds:
            samples.append(DatasetSample(
                problem=item["prompt"],
                solution=item["canonical_solution"],
                problem_id=item["task_id"],
                domain="code",
                # entry_point + test are kept so an evaluator can execute the
                # canonical unit tests against generated code.
                metadata={"entry_point": item["entry_point"], "test": item["test"]}
            ))

        return samples


class BigCodeBenchDataset(BaseDataset):
    """BigCodeBench - harder code generation."""

    @property
    def name(self) -> str:
        return "bigcodebench"

    @property
    def domain(self) -> str:
        return "code"

    @property
    def task_description(self) -> str:
        return """You are working on a complex programming task that requires multiple libraries
and careful implementation. Break down the problem, discuss the approach, and implement step by step.
Ask about library choices and implementation details."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        # BigCodeBench versions its releases as split names; pin v0.1.2.
        ds = load_dataset("bigcode/bigcodebench", split="v0.1.2")
        samples = []

        for item in ds:
            samples.append(DatasetSample(
                problem=item["instruct_prompt"],
                solution=item["canonical_solution"],
                problem_id=item["task_id"],
                domain="code",
                difficulty="hard",
                metadata={"libs": item.get("libs", [])}
            ))

        return samples


class LogiQADataset(BaseDataset):
    """LogiQA logical reasoning."""

    @property
    def name(self) -> str:
        return "logiqa"

    @property
    def domain(self) -> str:
        return "reasoning"

    @property
    def task_description(self) -> str:
        return """You are solving a logical reasoning problem. Read the passage carefully,
analyze each answer choice, and reason through the logic step by step. Explain your
reasoning process clearly."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        ds = load_dataset("lucasmccabe/logiqa", split=split if split in ["train", "test"] else "test")
        samples = []

        for i, item in enumerate(ds):
            # Render options as "A. ...", "B. ..." and encode the answer as a letter.
            options = item["options"]
            options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(options)])
            problem = f"{item['context']}\n\nQuestion: {item['query']}\n\nOptions:\n{options_str}"
            answer = chr(65 + item["correct_option"])

            samples.append(DatasetSample(
                problem=problem,
                solution=answer,
                problem_id=f"logiqa_{i}",
                domain="reasoning"
            ))

        return samples


class MMLUDataset(BaseDataset):
    """MMLU multi-domain knowledge (hard STEM/logic subjects only)."""

    @property
    def name(self) -> str:
        return "mmlu"

    @property
    def domain(self) -> str:
        return "knowledge"

    @property
    def task_description(self) -> str:
        return """You are answering a knowledge question that requires domain expertise.
Think through the question carefully, consider what you know about the topic, and
reason to the correct answer. Explain your thought process."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        # Load hard subjects only; ``split`` is ignored (always "test").
        hard_subjects = ["abstract_algebra", "college_mathematics", "college_physics",
                         "formal_logic", "high_school_physics", "machine_learning"]

        samples = []
        for subject in hard_subjects:
            try:
                ds = load_dataset("cais/mmlu", subject, split="test")
                for i, item in enumerate(ds):
                    choices = item["choices"]
                    options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(choices)])
                    problem = f"{item['question']}\n\nOptions:\n{options_str}"
                    # MMLU stores the answer as an index; convert to a letter.
                    answer = chr(65 + item["answer"])

                    samples.append(DatasetSample(
                        problem=problem,
                        solution=answer,
                        problem_id=f"mmlu_{subject}_{i}",
                        domain="knowledge",
                        metadata={"subject": subject}
                    ))
            except Exception:
                # Best-effort: skip any subject that fails to download/parse.
                continue

        return samples


class MedQADataset(BaseDataset):
    """MedQA medical knowledge."""

    @property
    def name(self) -> str:
        return "medqa"

    @property
    def domain(self) -> str:
        return "medical"

    @property
    def task_description(self) -> str:
        return """You are answering a medical knowledge question. Consider the clinical
scenario carefully, think through the pathophysiology, and reason to the correct answer.
Explain your medical reasoning step by step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        ds = load_dataset("bigbio/med_qa", "med_qa_en_source", split=split if split in ["train", "test"] else "test")
        samples = []

        for i, item in enumerate(ds):
            # Options may be a {letter: text} dict or a plain list depending on
            # the dataset version; handle both.
            options = item["options"]
            if isinstance(options, dict):
                options_str = "\n".join([f"{k}. {v}" for k, v in options.items()])
            else:
                options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(options)])

            problem = f"{item['question']}\n\nOptions:\n{options_str}"

            samples.append(DatasetSample(
                problem=problem,
                solution=item["answer_idx"],
                problem_id=f"medqa_{i}",
                domain="medical"
            ))

        return samples


# =============================================================================
# NEW Challenging Datasets
# =============================================================================

class GPQADataset(BaseDataset):
    """GPQA - Graduate-level PhD science questions.

    Extremely challenging questions that require deep domain expertise.
    Perfect for testing complex, multi-step reasoning preferences.
    """

    @property
    def name(self) -> str:
        return "gpqa"

    @property
    def domain(self) -> str:
        return "science"

    @property
    def task_description(self) -> str:
        return """You are working on a PhD-level science question that requires deep domain expertise.
These questions are extremely challenging and require careful, methodical reasoning.
Break the problem into parts, discuss the relevant concepts, and work through each step.
Don't hesitate to ask for clarification or verification of your reasoning at each step.
Consider multiple approaches before committing to an answer."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        # GPQA diamond is the hardest subset; it only publishes a "train" split.
        try:
            ds = load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train")
        except Exception:
            return []

        samples = []
        for i, item in enumerate(ds):
            # BUGFIX: the published GPQA schema stores answers in
            # "Correct Answer" / "Incorrect Answer 1..3" columns (see the
            # dataset card) -- the previous "choice_A..choice_D" keys do not
            # exist, which silently produced empty option lists and a
            # hard-coded "A" solution.  Fall back to the legacy layout if the
            # expected columns are absent.
            correct = item.get("Correct Answer") or item.get("correct_answer", "")
            wrong = [item.get(f"Incorrect Answer {k}") for k in (1, 2, 3)]
            wrong = [w for w in wrong if w]

            if correct and wrong:
                choices = wrong + [correct]
                # Deterministic per-item shuffle: the answer letter is stable
                # across runs but not always in the same position.
                random.Random(i).shuffle(choices)
                answer = chr(65 + choices.index(correct))
            else:
                # Legacy lettered-choice layout (original behavior).
                choices = [item.get(f"choice_{c}", "") for c in ["A", "B", "C", "D"]
                           if item.get(f"choice_{c}")]
                answer = item.get("correct_answer", "A")

            options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(choices)])
            question = item.get("Question", item.get("question", ""))
            problem = f"{question}\n\nOptions:\n{options_str}"

            samples.append(DatasetSample(
                problem=problem,
                solution=answer,
                problem_id=f"gpqa_{i}",
                domain="science",
                difficulty="phd",
                metadata={"subdomain": item.get("Subdomain", item.get("subdomain", "unknown"))}
            ))

        return samples


class TheoremQADataset(BaseDataset):
    """TheoremQA - Theorem-based mathematical reasoning.

    Requires applying mathematical theorems to solve problems.
    Tests formal mathematical reasoning and explanation preferences.
    """

    @property
    def name(self) -> str:
        return "theoremqa"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are solving a theorem-based mathematics problem. This requires identifying
the relevant mathematical theorems, understanding their conditions, and applying them correctly.
Work through the problem step by step:
1. Identify what theorems might apply
2. Verify the conditions are met
3. Apply the theorem carefully
4. Verify the result
Discuss your reasoning at each step and ask for verification when needed."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        try:
            ds = load_dataset("TIGER-Lab/TheoremQA", split="test")
        except Exception:
            return []

        samples = []
        for i, item in enumerate(ds):
            samples.append(DatasetSample(
                problem=item["question"],
                # Answers may be numeric; normalize to str for the dataclass.
                solution=str(item.get("answer", "")),
                problem_id=f"theoremqa_{i}",
                domain="math",
                difficulty="hard",
                metadata={
                    "theorem": item.get("theorem", ""),
                    "field": item.get("field", "")
                }
            ))

        return samples


class AIMEDataset(BaseDataset):
    """AIME - American Invitational Mathematics Examination.

    Competition-level math problems requiring creative problem-solving.
    Answers are integers from 0-999.
    """

    @property
    def name(self) -> str:
        return "aime"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are working on an AIME (American Invitational Mathematics Examination) problem.
These are competition math problems that require creative problem-solving approaches.
The answer is always an integer from 000 to 999.
Work through the problem systematically:
1. Understand what the problem is asking
2. Explore different approaches
3. Calculate carefully
4. Verify your answer
Discuss your thought process and ask for hints if you're stuck."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        # Try to load AIME from available sources.
        try:
            ds = load_dataset("AI-MO/aimo-validation-aime", split="train")
        except Exception:
            # Fallback: filter AIME problems out of the MATH competition set.
            try:
                ds = load_dataset("hendrycks/competition_math", split="test")
                ds = [item for item in ds if "AIME" in item.get("source", "")]
            except Exception:
                return []

        samples = []
        for i, item in enumerate(ds):
            # Field names differ between the two sources; probe both.
            samples.append(DatasetSample(
                problem=item.get("problem", item.get("question", "")),
                solution=str(item.get("answer", item.get("solution", ""))),
                problem_id=f"aime_{i}",
                domain="math",
                difficulty="competition",
                metadata={"year": item.get("year", ""), "problem_num": item.get("problem_number", "")}
            ))

        return samples


class LiveCodeBenchDataset(BaseDataset):
    """LiveCodeBench - Recent competitive programming problems.

    Problems from recent programming contests (post-training cutoff).
    Tests code generation on truly novel problems.
    """

    @property
    def name(self) -> str:
        return "livecodebench"

    @property
    def domain(self) -> str:
        return "code"

    @property
    def task_description(self) -> str:
        return """You are solving a competitive programming problem from recent contests.
These problems require careful algorithm design and implementation.
Approach systematically:
1. Understand the problem constraints
2. Identify the algorithm pattern (DP, graphs, greedy, etc.)
3. Design the solution approach
4. Implement carefully with attention to edge cases
5. Analyze time/space complexity
Discuss your approach before coding and verify your logic at each step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        try:
            ds = load_dataset("livecodebench/livecodebench", split="test")
        except Exception:
            return []

        samples = []
        for i, item in enumerate(ds):
            samples.append(DatasetSample(
                problem=item.get("question_content", item.get("problem", "")),
                solution=item.get("solution", ""),
                problem_id=item.get("question_id", f"lcb_{i}"),
                domain="code",
                difficulty=item.get("difficulty", "unknown"),
                metadata={
                    "contest": item.get("contest_name", ""),
                    "date": item.get("contest_date", ""),
                    "tags": item.get("tags", [])
                }
            ))

        return samples


class SciCodeDataset(BaseDataset):
    """SciCode - Scientific computing problems.

    Requires domain knowledge + coding ability.
    Tests both scientific reasoning and implementation preferences.
    """

    @property
    def name(self) -> str:
        return "scicode"

    @property
    def domain(self) -> str:
        return "science-code"

    @property
    def task_description(self) -> str:
        return """You are implementing a scientific computing solution. This requires both
domain knowledge (physics, chemistry, biology, etc.) and programming expertise.
Approach the problem by:
1. Understanding the scientific concepts involved
2. Formulating the mathematical model
3. Designing the computational approach
4. Implementing with proper numerical methods
5. Validating the results make scientific sense
Discuss the science and the code at each step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        try:
            ds = load_dataset("xlangai/SciCode", split="test")
        except Exception:
            return []

        samples = []
        for i, item in enumerate(ds):
            samples.append(DatasetSample(
                problem=item.get("problem", ""),
                solution=item.get("solution", ""),
                problem_id=f"scicode_{i}",
                domain="science-code",
                difficulty="hard",
                metadata={
                    "discipline": item.get("discipline", ""),
                    "libraries": item.get("libraries", [])
                }
            ))

        return samples


# =============================================================================
# Dataset Registry
# =============================================================================

DATASET_REGISTRY = {
    # Existing (enhanced)
    "math-500": MATH500Dataset,
    "math-hard": MATHHardDataset,
    "humaneval": HumanEvalDataset,
    "bigcodebench": BigCodeBenchDataset,
    "logiqa": LogiQADataset,
    "mmlu": MMLUDataset,
    "medqa": MedQADataset,
    # New challenging datasets
    "gpqa": GPQADataset,
    "theoremqa": TheoremQADataset,
    "aime": AIMEDataset,
    "livecodebench": LiveCodeBenchDataset,
    "scicode": SciCodeDataset,
}


def get_dataset(name: str, **kwargs) -> BaseDataset:
    """Get a dataset by name.

    Args:
        name: Registry key (see ``DATASET_REGISTRY``).
        **kwargs: Forwarded to the dataset constructor
            (eval_size, train_size, cache_dir).

    Raises:
        ValueError: If ``name`` is not a registered dataset.
    """
    if name not in DATASET_REGISTRY:
        raise ValueError(f"Unknown dataset: {name}. Available: {list(DATASET_REGISTRY.keys())}")
    return DATASET_REGISTRY[name](**kwargs)


def get_all_datasets(**kwargs) -> Dict[str, BaseDataset]:
    """Get all available datasets, keyed by registry name."""
    return {name: cls(**kwargs) for name, cls in DATASET_REGISTRY.items()}


def get_challenging_datasets(**kwargs) -> Dict[str, BaseDataset]:
    """Get only the new challenging datasets."""
    challenging = ["gpqa", "theoremqa", "aime", "livecodebench", "scicode"]
    return {name: DATASET_REGISTRY[name](**kwargs) for name in challenging}


# =============================================================================
# Step-by-Step Query Wrapper
# =============================================================================

def wrap_with_step_by_step_prompt(sample: DatasetSample) -> str:
    """Wrap a problem with prompts encouraging step-by-step interaction.

    This makes sessions longer and creates more opportunities for
    preference expression/violation.

    Args:
        sample: The problem to wrap; its ``domain`` selects the template.

    Returns:
        The problem embedded in a domain-appropriate step-by-step prompt.
    """
    domain_prompts = {
        "math": """Let's solve this step by step. Please:
1. First, help me understand what the problem is asking
2. Then, let's identify the key concepts/formulas needed
3. Work through the solution one step at a time
4. Verify our answer at the end

Problem:
{problem}

Let's start by understanding the problem. What is it asking?""",

        "code": """Let's implement this systematically. Please:
1. First, clarify the requirements and edge cases
2. Discuss the algorithm approach before coding
3. Implement step by step, explaining each part
4. Test with examples

Problem:
{problem}

Let's start by understanding the requirements. What are the inputs, outputs, and edge cases?""",

        "reasoning": """Let's think through this carefully. Please:
1. Break down the key information in the passage
2. Analyze each answer choice
3. Eliminate wrong answers with clear reasoning
4. Verify the correct answer

Problem:
{problem}

Let's start by identifying the key facts in this passage.""",

        "science": """Let's approach this PhD-level problem systematically. Please:
1. Identify the domain and key concepts involved
2. Recall relevant theories/equations
3. Work through the reasoning step by step
4. Verify our conclusion is scientifically sound

Problem:
{problem}

Let's start by identifying what field this question is from and what concepts we'll need.""",

        "science-code": """This combines scientific knowledge with coding. Let's:
1. Understand the scientific concepts first
2. Formulate the mathematical approach
3. Design the algorithm
4. Implement and validate

Problem:
{problem}

Let's start by understanding the science behind this problem.""",
    }

    # Get the appropriate prompt or default to generic.  Note: str.format only
    # substitutes the template's {problem} slot; braces inside the problem
    # text itself are inserted verbatim and are not re-interpreted.
    domain = sample.domain
    if domain in domain_prompts:
        template = domain_prompts[domain]
    else:
        template = """Let's work through this step by step:
1. Understand the problem
2. Plan our approach
3. Execute carefully
4. Verify our answer

Problem:
{problem}

Let's start by understanding what we need to do."""

    return template.format(problem=sample.problem)


# =============================================================================
# Conflict-Inducing Query Augmentation
# =============================================================================

def augment_for_conflict_testing(sample: DatasetSample, conflict_type: str) -> str:
    """Augment a query to trigger specific preference conflicts.

    Args:
        sample: The base problem
        conflict_type: Type of conflict to trigger (see keys below);
            unknown types return the problem unchanged.

    Returns:
        Augmented query that triggers the conflict
    """
    conflict_augmentations = {
        # Verbosity conflict: "quick" + complex problem
        "verbosity": "Quick question - {problem}",

        # Format conflict: asks for both structure types
        "format": "Can you explain this with examples and also give me a summary? {problem}",

        # Tone conflict: frustrated + learning context
        "tone": "I'm so frustrated with this! But I really want to understand it properly. {problem}",

        # Code style conflict: multi-language context
        "code_style": "I need this in Python first, then JavaScript. {problem}",

        # Detail conflict: overview + specifics requested
        "detail": "Give me the big picture but also the specific details. {problem}",

        # Guidance conflict: incremental + full solution
        "guidance": "Walk me through this but also just show me the answer if it's simple. {problem}",

        # Rushed + thorough
        "time_pressure": "I'm in a hurry but this is important so don't skip anything. {problem}",

        # My attempt + fresh perspective
        "approach": "I tried [some approach] but maybe start fresh with a better way? {problem}",
    }

    if conflict_type in conflict_augmentations:
        template = conflict_augmentations[conflict_type]
        return template.format(problem=sample.problem)

    return sample.problem


if __name__ == "__main__":
    # Smoke test: try to load a handful of samples from every registered
    # dataset and report success/failure per dataset.
    print("Testing dataset loading...")

    for name, cls in DATASET_REGISTRY.items():
        try:
            ds = cls(eval_size=5)
            samples = ds.get_testset()
            print(f"{name}: {len(samples)} samples loaded")
            if samples:
                print(f"  Sample: {samples[0].problem[:100]}...")
        except Exception as e:
            print(f"{name}: Failed - {e}")