"""
Extended datasets for challenging personalization evaluation.

New datasets added:
- GPQA: PhD-level science questions
- TheoremQA: Theorem-based math proofs
- LiveCodeBench: Recent competitive programming
- AIME: American Invitational Mathematics Examination
- SciCode: Scientific computing problems

All datasets encourage step-by-step problem solving for longer sessions.
"""

from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
import json
import random
from pathlib import Path

try:
    from datasets import load_dataset
    HF_AVAILABLE = True
except ImportError:
    # Every loader degrades to an empty sample list when the HF `datasets`
    # package is missing, so the module stays importable without it.
    HF_AVAILABLE = False
    print("Warning: huggingface datasets not available")


@dataclass
class DatasetSample:
    """A single sample from a dataset."""
    problem: str                      # full problem statement shown to the user
    solution: str                     # reference answer (letter, number, text, or code)
    problem_id: str                   # stable id, unique within one dataset
    domain: str                       # e.g. "math", "code", "reasoning", "science"
    difficulty: Optional[str] = None  # dataset-specific difficulty label, if any
    metadata: Optional[Dict] = None   # extra per-dataset fields (subject, tags, ...)


class BaseDataset(ABC):
    """Base class for all datasets.

    Subclasses implement :meth:`_load_data` and the ``name`` / ``domain`` /
    ``task_description`` properties.  Loading is lazy: the first call to
    :meth:`get_testset` / :meth:`get_trainset` loads and caches the split.

    NOTE(review): several subclasses ignore the ``split`` argument and always
    load the same HF split, so train and test sets may overlap for those
    datasets -- confirm before using them for train/eval separation.
    """

    def __init__(self, eval_size: int = 100, train_size: int = 100,
                 cache_dir: Optional[str] = None):
        """
        Args:
            eval_size: Maximum number of samples returned by get_testset().
            train_size: Maximum number of samples returned by get_trainset().
            cache_dir: Reserved for a local dataset cache.  NOTE(review):
                currently unused -- it is never forwarded to load_dataset().
        """
        self.eval_size = eval_size
        self.train_size = train_size
        self.cache_dir = cache_dir
        # Lazily-populated caches for the two splits.
        self._test_data: Optional[List[DatasetSample]] = None
        self._train_data: Optional[List[DatasetSample]] = None

    @property
    @abstractmethod
    def name(self) -> str:
        """Registry key / short identifier of the dataset."""

    @property
    @abstractmethod
    def domain(self) -> str:
        """Domain tag applied to every sample (e.g. "math", "code")."""

    @property
    @abstractmethod
    def task_description(self) -> str:
        """Description of the task for user simulator."""

    @abstractmethod
    def _load_data(self, split: str) -> List[DatasetSample]:
        """Load and return all samples for ``split`` ("train" or "test")."""

    def get_testset(self) -> List[DatasetSample]:
        """Return (and cache) up to ``eval_size`` test samples."""
        if self._test_data is None:
            self._test_data = self._load_data("test")[:self.eval_size]
        return self._test_data

    def get_trainset(self) -> List[DatasetSample]:
        """Return (and cache) up to ``train_size`` train samples."""
        if self._train_data is None:
            self._train_data = self._load_data("train")[:self.train_size]
        return self._train_data


# =============================================================================
# Existing Datasets (Enhanced with step-by-step prompts)
# =============================================================================

class MATH500Dataset(BaseDataset):
    """MATH-500 dataset with step-by-step encouragement."""

    @property
    def name(self) -> str:
        return "math-500"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are trying to solve a mathematics problem. The problem requires careful
reasoning and step-by-step work. You will collaborate with an AI assistant to understand
and solve the problem. Break the problem into parts and work through each step carefully.
Ask the assistant to explain their reasoning at each step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        # MATH-500 only publishes a "test" split, so ``split`` is ignored here
        # (train and test therefore share the same underlying pool).
        ds = load_dataset("HuggingFaceH4/MATH-500", split="test")
        samples = []

        for i, item in enumerate(ds):
            samples.append(DatasetSample(
                problem=item["problem"],
                solution=item["answer"],
                problem_id=f"math500_{i}",
                domain="math",
                difficulty=item.get("level"),
                metadata={"type": item.get("type")}
            ))

        return samples


class MATHHardDataset(BaseDataset):
    """MATH-Hard (Level 4-5 only)."""

    @property
    def name(self) -> str:
        return "math-hard"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are working on a challenging mathematics competition problem. These problems
require deep mathematical insight and careful reasoning. Work through the problem step by step,
explaining your approach clearly. Don't hesitate to ask for hints or verification of your reasoning."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        ds = load_dataset("lighteval/MATH-Hard", split="test" if split == "test" else "train")
        samples = []

        for i, item in enumerate(ds):
            # Keep only the two hardest difficulty tiers.
            level = item.get("level", "")
            if level not in ["Level 4", "Level 5"]:
                continue

            samples.append(DatasetSample(
                problem=item["problem"],
                solution=item.get("answer", item.get("solution", "")),
                problem_id=f"mathhard_{i}",
                domain="math",
                difficulty=level,
                metadata={"type": item.get("type")}
            ))

        return samples


class HumanEvalDataset(BaseDataset):
    """HumanEval code generation."""

    @property
    def name(self) -> str:
        return "humaneval"

    @property
    def domain(self) -> str:
        return "code"

    @property
    def task_description(self) -> str:
        return """You are implementing a Python function. Think through the problem carefully,
consider edge cases, and implement the solution step by step. Ask for clarification on any
ambiguous requirements. Discuss your approach before writing code."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        # HumanEval ships a single "test" split; ``split`` is ignored.
        ds = load_dataset("openai/openai_humaneval", split="test")
        samples = []

        for item in ds:
            samples.append(DatasetSample(
                problem=item["prompt"],
                solution=item["canonical_solution"],
                problem_id=item["task_id"],
                domain="code",
                # entry_point + test are kept so an evaluator can execute the
                # canonical unit tests against generated code.
                metadata={"entry_point": item["entry_point"], "test": item["test"]}
            ))

        return samples


class BigCodeBenchDataset(BaseDataset):
    """BigCodeBench - harder code generation."""

    @property
    def name(self) -> str:
        return "bigcodebench"

    @property
    def domain(self) -> str:
        return "code"

    @property
    def task_description(self) -> str:
        return """You are working on a complex programming task that requires multiple libraries
and careful implementation. Break down the problem, discuss the approach, and implement step by step.
Ask about library choices and implementation details."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        # BigCodeBench versions its releases as split names; pin v0.1.2.
        ds = load_dataset("bigcode/bigcodebench", split="v0.1.2")
        samples = []

        for item in ds:
            samples.append(DatasetSample(
                problem=item["instruct_prompt"],
                solution=item["canonical_solution"],
                problem_id=item["task_id"],
                domain="code",
                difficulty="hard",
                metadata={"libs": item.get("libs", [])}
            ))

        return samples


class LogiQADataset(BaseDataset):
    """LogiQA logical reasoning."""

    @property
    def name(self) -> str:
        return "logiqa"

    @property
    def domain(self) -> str:
        return "reasoning"

    @property
    def task_description(self) -> str:
        return """You are solving a logical reasoning problem. Read the passage carefully,
analyze each answer choice, and reason through the logic step by step. Explain your
reasoning process clearly."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        ds = load_dataset("lucasmccabe/logiqa", split=split if split in ["train", "test"] else "test")
        samples = []

        for i, item in enumerate(ds):
            # Render options as "A. ...", "B. ..." and encode the answer as a letter.
            options = item["options"]
            options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(options)])
            problem = f"{item['context']}\n\nQuestion: {item['query']}\n\nOptions:\n{options_str}"
            answer = chr(65 + item["correct_option"])

            samples.append(DatasetSample(
                problem=problem,
                solution=answer,
                problem_id=f"logiqa_{i}",
                domain="reasoning"
            ))

        return samples


class MMLUDataset(BaseDataset):
    """MMLU multi-domain knowledge (hard STEM/logic subjects only)."""

    @property
    def name(self) -> str:
        return "mmlu"

    @property
    def domain(self) -> str:
        return "knowledge"

    @property
    def task_description(self) -> str:
        return """You are answering a knowledge question that requires domain expertise.
Think through the question carefully, consider what you know about the topic, and
reason to the correct answer. Explain your thought process."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        # Load hard subjects only; ``split`` is ignored (always "test").
        hard_subjects = ["abstract_algebra", "college_mathematics", "college_physics",
                         "formal_logic", "high_school_physics", "machine_learning"]

        samples = []
        for subject in hard_subjects:
            try:
                ds = load_dataset("cais/mmlu", subject, split="test")
                for i, item in enumerate(ds):
                    choices = item["choices"]
                    options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(choices)])
                    problem = f"{item['question']}\n\nOptions:\n{options_str}"
                    # MMLU stores the answer as an index; convert to a letter.
                    answer = chr(65 + item["answer"])

                    samples.append(DatasetSample(
                        problem=problem,
                        solution=answer,
                        problem_id=f"mmlu_{subject}_{i}",
                        domain="knowledge",
                        metadata={"subject": subject}
                    ))
            except Exception:
                # Best-effort: skip any subject that fails to download/parse.
                continue

        return samples


class MedQADataset(BaseDataset):
    """MedQA medical knowledge."""

    @property
    def name(self) -> str:
        return "medqa"

    @property
    def domain(self) -> str:
        return "medical"

    @property
    def task_description(self) -> str:
        return """You are answering a medical knowledge question. Consider the clinical
scenario carefully, think through the pathophysiology, and reason to the correct answer.
Explain your medical reasoning step by step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        ds = load_dataset("bigbio/med_qa", "med_qa_en_source", split=split if split in ["train", "test"] else "test")
        samples = []

        for i, item in enumerate(ds):
            # Options may be a {letter: text} dict or a plain list depending on
            # the dataset version; handle both.
            options = item["options"]
            if isinstance(options, dict):
                options_str = "\n".join([f"{k}. {v}" for k, v in options.items()])
            else:
                options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(options)])

            problem = f"{item['question']}\n\nOptions:\n{options_str}"

            samples.append(DatasetSample(
                problem=problem,
                solution=item["answer_idx"],
                problem_id=f"medqa_{i}",
                domain="medical"
            ))

        return samples


# =============================================================================
# NEW Challenging Datasets
# =============================================================================

class GPQADataset(BaseDataset):
    """GPQA - Graduate-level PhD science questions.

    Extremely challenging questions that require deep domain expertise.
    Perfect for testing complex, multi-step reasoning preferences.
    """

    @property
    def name(self) -> str:
        return "gpqa"

    @property
    def domain(self) -> str:
        return "science"

    @property
    def task_description(self) -> str:
        return """You are working on a PhD-level science question that requires deep domain expertise.
These questions are extremely challenging and require careful, methodical reasoning.
Break the problem into parts, discuss the relevant concepts, and work through each step.
Don't hesitate to ask for clarification or verification of your reasoning at each step.
Consider multiple approaches before committing to an answer."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        # GPQA diamond is the hardest subset; it only publishes a "train" split.
        try:
            ds = load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train")
        except Exception:
            return []

        samples = []
        for i, item in enumerate(ds):
            # BUGFIX: the published GPQA schema stores answers in
            # "Correct Answer" / "Incorrect Answer 1..3" columns (see the
            # dataset card) -- the previous "choice_A..choice_D" keys do not
            # exist, which silently produced empty option lists and a
            # hard-coded "A" solution.  Fall back to the legacy layout if the
            # expected columns are absent.
            correct = item.get("Correct Answer") or item.get("correct_answer", "")
            wrong = [item.get(f"Incorrect Answer {k}") for k in (1, 2, 3)]
            wrong = [w for w in wrong if w]

            if correct and wrong:
                choices = wrong + [correct]
                # Deterministic per-item shuffle: the answer letter is stable
                # across runs but not always in the same position.
                random.Random(i).shuffle(choices)
                answer = chr(65 + choices.index(correct))
            else:
                # Legacy lettered-choice layout (original behavior).
                choices = [item.get(f"choice_{c}", "") for c in ["A", "B", "C", "D"]
                           if item.get(f"choice_{c}")]
                answer = item.get("correct_answer", "A")

            options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(choices)])
            question = item.get("Question", item.get("question", ""))
            problem = f"{question}\n\nOptions:\n{options_str}"

            samples.append(DatasetSample(
                problem=problem,
                solution=answer,
                problem_id=f"gpqa_{i}",
                domain="science",
                difficulty="phd",
                metadata={"subdomain": item.get("Subdomain", item.get("subdomain", "unknown"))}
            ))

        return samples


class TheoremQADataset(BaseDataset):
    """TheoremQA - Theorem-based mathematical reasoning.

    Requires applying mathematical theorems to solve problems.
    Tests formal mathematical reasoning and explanation preferences.
    """

    @property
    def name(self) -> str:
        return "theoremqa"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are solving a theorem-based mathematics problem. This requires identifying
the relevant mathematical theorems, understanding their conditions, and applying them correctly.
Work through the problem step by step:
1. Identify what theorems might apply
2. Verify the conditions are met
3. Apply the theorem carefully
4. Verify the result
Discuss your reasoning at each step and ask for verification when needed."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        try:
            ds = load_dataset("TIGER-Lab/TheoremQA", split="test")
        except Exception:
            return []

        samples = []
        for i, item in enumerate(ds):
            samples.append(DatasetSample(
                problem=item["question"],
                # Answers may be numeric; normalize to str for the dataclass.
                solution=str(item.get("answer", "")),
                problem_id=f"theoremqa_{i}",
                domain="math",
                difficulty="hard",
                metadata={
                    "theorem": item.get("theorem", ""),
                    "field": item.get("field", "")
                }
            ))

        return samples


class AIMEDataset(BaseDataset):
    """AIME - American Invitational Mathematics Examination.

    Competition-level math problems requiring creative problem-solving.
    Answers are integers from 0-999.
    """

    @property
    def name(self) -> str:
        return "aime"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are working on an AIME (American Invitational Mathematics Examination) problem.
These are competition math problems that require creative problem-solving approaches.
The answer is always an integer from 000 to 999.
Work through the problem systematically:
1. Understand what the problem is asking
2. Explore different approaches
3. Calculate carefully
4. Verify your answer
Discuss your thought process and ask for hints if you're stuck."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        # Try to load AIME from available sources.
        try:
            ds = load_dataset("AI-MO/aimo-validation-aime", split="train")
        except Exception:
            # Fallback: filter AIME problems out of the MATH competition set.
            try:
                ds = load_dataset("hendrycks/competition_math", split="test")
                ds = [item for item in ds if "AIME" in item.get("source", "")]
            except Exception:
                return []

        samples = []
        for i, item in enumerate(ds):
            # Field names differ between the two sources; probe both.
            samples.append(DatasetSample(
                problem=item.get("problem", item.get("question", "")),
                solution=str(item.get("answer", item.get("solution", ""))),
                problem_id=f"aime_{i}",
                domain="math",
                difficulty="competition",
                metadata={"year": item.get("year", ""), "problem_num": item.get("problem_number", "")}
            ))

        return samples


class LiveCodeBenchDataset(BaseDataset):
    """LiveCodeBench - Recent competitive programming problems.

    Problems from recent programming contests (post-training cutoff).
    Tests code generation on truly novel problems.
    """

    @property
    def name(self) -> str:
        return "livecodebench"

    @property
    def domain(self) -> str:
        return "code"

    @property
    def task_description(self) -> str:
        return """You are solving a competitive programming problem from recent contests.
These problems require careful algorithm design and implementation.
Approach systematically:
1. Understand the problem constraints
2. Identify the algorithm pattern (DP, graphs, greedy, etc.)
3. Design the solution approach
4. Implement carefully with attention to edge cases
5. Analyze time/space complexity
Discuss your approach before coding and verify your logic at each step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        try:
            ds = load_dataset("livecodebench/livecodebench", split="test")
        except Exception:
            return []

        samples = []
        for i, item in enumerate(ds):
            samples.append(DatasetSample(
                problem=item.get("question_content", item.get("problem", "")),
                solution=item.get("solution", ""),
                problem_id=item.get("question_id", f"lcb_{i}"),
                domain="code",
                difficulty=item.get("difficulty", "unknown"),
                metadata={
                    "contest": item.get("contest_name", ""),
                    "date": item.get("contest_date", ""),
                    "tags": item.get("tags", [])
                }
            ))

        return samples


class SciCodeDataset(BaseDataset):
    """SciCode - Scientific computing problems.

    Requires domain knowledge + coding ability.
    Tests both scientific reasoning and implementation preferences.
    """

    @property
    def name(self) -> str:
        return "scicode"

    @property
    def domain(self) -> str:
        return "science-code"

    @property
    def task_description(self) -> str:
        return """You are implementing a scientific computing solution. This requires both
domain knowledge (physics, chemistry, biology, etc.) and programming expertise.
Approach the problem by:
1. Understanding the scientific concepts involved
2. Formulating the mathematical model
3. Designing the computational approach
4. Implementing with proper numerical methods
5. Validating the results make scientific sense
Discuss the science and the code at each step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        try:
            ds = load_dataset("xlangai/SciCode", split="test")
        except Exception:
            return []

        samples = []
        for i, item in enumerate(ds):
            samples.append(DatasetSample(
                problem=item.get("problem", ""),
                solution=item.get("solution", ""),
                problem_id=f"scicode_{i}",
                domain="science-code",
                difficulty="hard",
                metadata={
                    "discipline": item.get("discipline", ""),
                    "libraries": item.get("libraries", [])
                }
            ))

        return samples


# =============================================================================
# Dataset Registry
# =============================================================================

DATASET_REGISTRY = {
    # Existing (enhanced)
    "math-500": MATH500Dataset,
    "math-hard": MATHHardDataset,
    "humaneval": HumanEvalDataset,
    "bigcodebench": BigCodeBenchDataset,
    "logiqa": LogiQADataset,
    "mmlu": MMLUDataset,
    "medqa": MedQADataset,
    # New challenging datasets
    "gpqa": GPQADataset,
    "theoremqa": TheoremQADataset,
    "aime": AIMEDataset,
    "livecodebench": LiveCodeBenchDataset,
    "scicode": SciCodeDataset,
}


def get_dataset(name: str, **kwargs) -> BaseDataset:
    """Get a dataset by name.

    Args:
        name: Registry key (see ``DATASET_REGISTRY``).
        **kwargs: Forwarded to the dataset constructor
            (eval_size, train_size, cache_dir).

    Raises:
        ValueError: If ``name`` is not a registered dataset.
    """
    if name not in DATASET_REGISTRY:
        raise ValueError(f"Unknown dataset: {name}. Available: {list(DATASET_REGISTRY.keys())}")
    return DATASET_REGISTRY[name](**kwargs)


def get_all_datasets(**kwargs) -> Dict[str, BaseDataset]:
    """Get all available datasets, keyed by registry name."""
    return {name: cls(**kwargs) for name, cls in DATASET_REGISTRY.items()}


def get_challenging_datasets(**kwargs) -> Dict[str, BaseDataset]:
    """Get only the new challenging datasets."""
    challenging = ["gpqa", "theoremqa", "aime", "livecodebench", "scicode"]
    return {name: DATASET_REGISTRY[name](**kwargs) for name in challenging}


# =============================================================================
# Step-by-Step Query Wrapper
# =============================================================================

def wrap_with_step_by_step_prompt(sample: DatasetSample) -> str:
    """Wrap a problem with prompts encouraging step-by-step interaction.

    This makes sessions longer and creates more opportunities for
    preference expression/violation.

    Args:
        sample: The problem to wrap; its ``domain`` selects the template.

    Returns:
        The problem embedded in a domain-appropriate step-by-step prompt.
    """
    domain_prompts = {
        "math": """Let's solve this step by step. Please:
1. First, help me understand what the problem is asking
2. Then, let's identify the key concepts/formulas needed
3. Work through the solution one step at a time
4. Verify our answer at the end

Problem:
{problem}

Let's start by understanding the problem. What is it asking?""",

        "code": """Let's implement this systematically. Please:
1. First, clarify the requirements and edge cases
2. Discuss the algorithm approach before coding
3. Implement step by step, explaining each part
4. Test with examples

Problem:
{problem}

Let's start by understanding the requirements. What are the inputs, outputs, and edge cases?""",

        "reasoning": """Let's think through this carefully. Please:
1. Break down the key information in the passage
2. Analyze each answer choice
3. Eliminate wrong answers with clear reasoning
4. Verify the correct answer

Problem:
{problem}

Let's start by identifying the key facts in this passage.""",

        "science": """Let's approach this PhD-level problem systematically. Please:
1. Identify the domain and key concepts involved
2. Recall relevant theories/equations
3. Work through the reasoning step by step
4. Verify our conclusion is scientifically sound

Problem:
{problem}

Let's start by identifying what field this question is from and what concepts we'll need.""",

        "science-code": """This combines scientific knowledge with coding. Let's:
1. Understand the scientific concepts first
2. Formulate the mathematical approach
3. Design the algorithm
4. Implement and validate

Problem:
{problem}

Let's start by understanding the science behind this problem.""",
    }

    # Get the appropriate prompt or default to generic.  Note: str.format only
    # substitutes the template's {problem} slot; braces inside the problem
    # text itself are inserted verbatim and are not re-interpreted.
    domain = sample.domain
    if domain in domain_prompts:
        template = domain_prompts[domain]
    else:
        template = """Let's work through this step by step:
1. Understand the problem
2. Plan our approach
3. Execute carefully
4. Verify our answer

Problem:
{problem}

Let's start by understanding what we need to do."""

    return template.format(problem=sample.problem)


# =============================================================================
# Conflict-Inducing Query Augmentation
# =============================================================================

def augment_for_conflict_testing(sample: DatasetSample, conflict_type: str) -> str:
    """Augment a query to trigger specific preference conflicts.

    Args:
        sample: The base problem
        conflict_type: Type of conflict to trigger (see keys below);
            unknown types return the problem unchanged.

    Returns:
        Augmented query that triggers the conflict
    """
    conflict_augmentations = {
        # Verbosity conflict: "quick" + complex problem
        "verbosity": "Quick question - {problem}",

        # Format conflict: asks for both structure types
        "format": "Can you explain this with examples and also give me a summary? {problem}",

        # Tone conflict: frustrated + learning context
        "tone": "I'm so frustrated with this! But I really want to understand it properly. {problem}",

        # Code style conflict: multi-language context
        "code_style": "I need this in Python first, then JavaScript. {problem}",

        # Detail conflict: overview + specifics requested
        "detail": "Give me the big picture but also the specific details. {problem}",

        # Guidance conflict: incremental + full solution
        "guidance": "Walk me through this but also just show me the answer if it's simple. {problem}",

        # Rushed + thorough
        "time_pressure": "I'm in a hurry but this is important so don't skip anything. {problem}",

        # My attempt + fresh perspective
        "approach": "I tried [some approach] but maybe start fresh with a better way? {problem}",
    }

    if conflict_type in conflict_augmentations:
        template = conflict_augmentations[conflict_type]
        return template.format(problem=sample.problem)

    return sample.problem


if __name__ == "__main__":
    # Smoke test: try to load a handful of samples from every registered
    # dataset and report success/failure per dataset.
    print("Testing dataset loading...")

    for name, cls in DATASET_REGISTRY.items():
        try:
            ds = cls(eval_size=5)
            samples = ds.get_testset()
            print(f"{name}: {len(samples)} samples loaded")
            if samples:
                print(f"  Sample: {samples[0].problem[:100]}...")
        except Exception as e:
            print(f"{name}: Failed - {e}")