""" Extended datasets for challenging personalization evaluation. New datasets added: - GPQA: PhD-level science questions - TheoremQA: Theorem-based math proofs - LiveCodeBench: Recent competitive programming - AIME: American Invitational Mathematics Examination - SciCode: Scientific computing problems All datasets encourage step-by-step problem solving for longer sessions. """ from abc import ABC, abstractmethod from typing import List, Dict, Any, Optional from dataclasses import dataclass import json from pathlib import Path try: from datasets import load_dataset HF_AVAILABLE = True except ImportError: HF_AVAILABLE = False print("Warning: huggingface datasets not available") @dataclass class DatasetSample: """A single sample from a dataset.""" problem: str solution: str problem_id: str domain: str difficulty: Optional[str] = None metadata: Optional[Dict] = None class BaseDataset(ABC): """Base class for all datasets.""" def __init__(self, eval_size: int = 100, train_size: int = 100, cache_dir: str = None): self.eval_size = eval_size self.train_size = train_size self.cache_dir = cache_dir self._test_data: Optional[List[DatasetSample]] = None self._train_data: Optional[List[DatasetSample]] = None @property @abstractmethod def name(self) -> str: pass @property @abstractmethod def domain(self) -> str: pass @property @abstractmethod def task_description(self) -> str: """Description of the task for user simulator.""" pass @abstractmethod def _load_data(self, split: str) -> List[DatasetSample]: pass def get_testset(self) -> List[DatasetSample]: if self._test_data is None: self._test_data = self._load_data("test")[:self.eval_size] return self._test_data def get_trainset(self) -> List[DatasetSample]: if self._train_data is None: self._train_data = self._load_data("train")[:self.train_size] return self._train_data # ============================================================================= # Existing Datasets (Enhanced with step-by-step prompts) # ============================================================================= class MATH500Dataset(BaseDataset): """MATH-500 dataset with step-by-step encouragement.""" @property def name(self) -> str: return "math-500" @property def domain(self) -> str: return "math" @property def task_description(self) -> str: return """You are trying to solve a mathematics problem. The problem requires careful reasoning and step-by-step work. You will collaborate with an AI assistant to understand and solve the problem. Break the problem into parts and work through each step carefully. Ask the assistant to explain their reasoning at each step.""" def _load_data(self, split: str) -> List[DatasetSample]: if not HF_AVAILABLE: return [] ds = load_dataset("HuggingFaceH4/MATH-500", split="test") samples = [] for i, item in enumerate(ds): samples.append(DatasetSample( problem=item["problem"], solution=item["answer"], problem_id=f"math500_{i}", domain="math", difficulty=item.get("level"), metadata={"type": item.get("type")} )) return samples class MATHHardDataset(BaseDataset): """MATH-Hard (Level 4-5 only).""" @property def name(self) -> str: return "math-hard" @property def domain(self) -> str: return "math" @property def task_description(self) -> str: return """You are working on a challenging mathematics competition problem. These problems require deep mathematical insight and careful reasoning. Work through the problem step by step, explaining your approach clearly. Don't hesitate to ask for hints or verification of your reasoning.""" def _load_data(self, split: str) -> List[DatasetSample]: if not HF_AVAILABLE: return [] ds = load_dataset("lighteval/MATH-Hard", split="test" if split == "test" else "train") samples = [] for i, item in enumerate(ds): level = item.get("level", "") if level not in ["Level 4", "Level 5"]: continue samples.append(DatasetSample( problem=item["problem"], solution=item.get("answer", item.get("solution", "")), problem_id=f"mathhard_{i}", domain="math", difficulty=level, metadata={"type": item.get("type")} )) return samples class HumanEvalDataset(BaseDataset): """HumanEval code generation.""" @property def name(self) -> str: return "humaneval" @property def domain(self) -> str: return "code" @property def task_description(self) -> str: return """You are implementing a Python function. Think through the problem carefully, consider edge cases, and implement the solution step by step. Ask for clarification on any ambiguous requirements. Discuss your approach before writing code.""" def _load_data(self, split: str) -> List[DatasetSample]: if not HF_AVAILABLE: return [] ds = load_dataset("openai/openai_humaneval", split="test") samples = [] for item in ds: samples.append(DatasetSample( problem=item["prompt"], solution=item["canonical_solution"], problem_id=item["task_id"], domain="code", metadata={"entry_point": item["entry_point"], "test": item["test"]} )) return samples class BigCodeBenchDataset(BaseDataset): """BigCodeBench - harder code generation.""" @property def name(self) -> str: return "bigcodebench" @property def domain(self) -> str: return "code" @property def task_description(self) -> str: return """You are working on a complex programming task that requires multiple libraries and careful implementation. Break down the problem, discuss the approach, and implement step by step. Ask about library choices and implementation details.""" def _load_data(self, split: str) -> List[DatasetSample]: if not HF_AVAILABLE: return [] ds = load_dataset("bigcode/bigcodebench", split="v0.1.2") samples = [] for item in ds: samples.append(DatasetSample( problem=item["instruct_prompt"], solution=item["canonical_solution"], problem_id=item["task_id"], domain="code", difficulty="hard", metadata={"libs": item.get("libs", [])} )) return samples class LogiQADataset(BaseDataset): """LogiQA logical reasoning.""" @property def name(self) -> str: return "logiqa" @property def domain(self) -> str: return "reasoning" @property def task_description(self) -> str: return """You are solving a logical reasoning problem. Read the passage carefully, analyze each answer choice, and reason through the logic step by step. Explain your reasoning process clearly.""" def _load_data(self, split: str) -> List[DatasetSample]: if not HF_AVAILABLE: return [] ds = load_dataset("lucasmccabe/logiqa", split=split if split in ["train", "test"] else "test") samples = [] for i, item in enumerate(ds): options = item["options"] options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(options)]) problem = f"{item['context']}\n\nQuestion: {item['query']}\n\nOptions:\n{options_str}" answer = chr(65 + item["correct_option"]) samples.append(DatasetSample( problem=problem, solution=answer, problem_id=f"logiqa_{i}", domain="reasoning" )) return samples class MMLUDataset(BaseDataset): """MMLU multi-domain knowledge.""" @property def name(self) -> str: return "mmlu" @property def domain(self) -> str: return "knowledge" @property def task_description(self) -> str: return """You are answering a knowledge question that requires domain expertise. Think through the question carefully, consider what you know about the topic, and reason to the correct answer. Explain your thought process.""" def _load_data(self, split: str) -> List[DatasetSample]: if not HF_AVAILABLE: return [] # Load hard subjects hard_subjects = ["abstract_algebra", "college_mathematics", "college_physics", "formal_logic", "high_school_physics", "machine_learning"] samples = [] for subject in hard_subjects: try: ds = load_dataset("cais/mmlu", subject, split="test") for i, item in enumerate(ds): choices = item["choices"] options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(choices)]) problem = f"{item['question']}\n\nOptions:\n{options_str}" answer = chr(65 + item["answer"]) samples.append(DatasetSample( problem=problem, solution=answer, problem_id=f"mmlu_{subject}_{i}", domain="knowledge", metadata={"subject": subject} )) except Exception: continue return samples class MedQADataset(BaseDataset): """MedQA medical knowledge.""" @property def name(self) -> str: return "medqa" @property def domain(self) -> str: return "medical" @property def task_description(self) -> str: return """You are answering a medical knowledge question. Consider the clinical scenario carefully, think through the pathophysiology, and reason to the correct answer. Explain your medical reasoning step by step.""" def _load_data(self, split: str) -> List[DatasetSample]: if not HF_AVAILABLE: return [] ds = load_dataset("bigbio/med_qa", "med_qa_en_source", split=split if split in ["train", "test"] else "test") samples = [] for i, item in enumerate(ds): options = item["options"] if isinstance(options, dict): options_str = "\n".join([f"{k}. {v}" for k, v in options.items()]) else: options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(options)]) problem = f"{item['question']}\n\nOptions:\n{options_str}" samples.append(DatasetSample( problem=problem, solution=item["answer_idx"], problem_id=f"medqa_{i}", domain="medical" )) return samples # ============================================================================= # NEW Challenging Datasets # ============================================================================= class GPQADataset(BaseDataset): """GPQA - Graduate-level PhD science questions. Extremely challenging questions that require deep domain expertise. Perfect for testing complex, multi-step reasoning preferences. """ @property def name(self) -> str: return "gpqa" @property def domain(self) -> str: return "science" @property def task_description(self) -> str: return """You are working on a PhD-level science question that requires deep domain expertise. These questions are extremely challenging and require careful, methodical reasoning. Break the problem into parts, discuss the relevant concepts, and work through each step. Don't hesitate to ask for clarification or verification of your reasoning at each step. Consider multiple approaches before committing to an answer.""" def _load_data(self, split: str) -> List[DatasetSample]: if not HF_AVAILABLE: return [] # GPQA diamond is the hardest subset try: ds = load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train") except Exception: return [] samples = [] for i, item in enumerate(ds): # Format the multiple choice choices = [item.get(f"choice_{c}", "") for c in ["A", "B", "C", "D"] if item.get(f"choice_{c}")] options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(choices)]) problem = f"{item['question']}\n\nOptions:\n{options_str}" samples.append(DatasetSample( problem=problem, solution=item.get("correct_answer", "A"), problem_id=f"gpqa_{i}", domain="science", difficulty="phd", metadata={"subdomain": item.get("subdomain", "unknown")} )) return samples class TheoremQADataset(BaseDataset): """TheoremQA - Theorem-based mathematical reasoning. Requires applying mathematical theorems to solve problems. Tests formal mathematical reasoning and explanation preferences. """ @property def name(self) -> str: return "theoremqa" @property def domain(self) -> str: return "math" @property def task_description(self) -> str: return """You are solving a theorem-based mathematics problem. This requires identifying the relevant mathematical theorems, understanding their conditions, and applying them correctly. Work through the problem step by step: 1. Identify what theorems might apply 2. Verify the conditions are met 3. Apply the theorem carefully 4. Verify the result Discuss your reasoning at each step and ask for verification when needed.""" def _load_data(self, split: str) -> List[DatasetSample]: if not HF_AVAILABLE: return [] try: ds = load_dataset("TIGER-Lab/TheoremQA", split="test") except Exception: return [] samples = [] for i, item in enumerate(ds): samples.append(DatasetSample( problem=item["question"], solution=str(item.get("answer", "")), problem_id=f"theoremqa_{i}", domain="math", difficulty="hard", metadata={ "theorem": item.get("theorem", ""), "field": item.get("field", "") } )) return samples class AIMEDataset(BaseDataset): """AIME - American Invitational Mathematics Examination. Competition-level math problems requiring creative problem-solving. Answers are integers from 0-999. """ @property def name(self) -> str: return "aime" @property def domain(self) -> str: return "math" @property def task_description(self) -> str: return """You are working on an AIME (American Invitational Mathematics Examination) problem. These are competition math problems that require creative problem-solving approaches. The answer is always an integer from 000 to 999. Work through the problem systematically: 1. Understand what the problem is asking 2. Explore different approaches 3. Calculate carefully 4. Verify your answer Discuss your thought process and ask for hints if you're stuck.""" def _load_data(self, split: str) -> List[DatasetSample]: if not HF_AVAILABLE: return [] # Try to load AIME from available sources try: ds = load_dataset("AI-MO/aimo-validation-aime", split="train") except Exception: # Fallback to MATH competition problems try: ds = load_dataset("hendrycks/competition_math", split="test") ds = [item for item in ds if "AIME" in item.get("source", "")] except Exception: return [] samples = [] for i, item in enumerate(ds): samples.append(DatasetSample( problem=item.get("problem", item.get("question", "")), solution=str(item.get("answer", item.get("solution", ""))), problem_id=f"aime_{i}", domain="math", difficulty="competition", metadata={"year": item.get("year", ""), "problem_num": item.get("problem_number", "")} )) return samples class LiveCodeBenchDataset(BaseDataset): """LiveCodeBench - Recent competitive programming problems. Problems from recent programming contests (post-training cutoff). Tests code generation on truly novel problems. """ @property def name(self) -> str: return "livecodebench" @property def domain(self) -> str: return "code" @property def task_description(self) -> str: return """You are solving a competitive programming problem from recent contests. These problems require careful algorithm design and implementation. Approach systematically: 1. Understand the problem constraints 2. Identify the algorithm pattern (DP, graphs, greedy, etc.) 3. Design the solution approach 4. Implement carefully with attention to edge cases 5. Analyze time/space complexity Discuss your approach before coding and verify your logic at each step.""" def _load_data(self, split: str) -> List[DatasetSample]: if not HF_AVAILABLE: return [] try: ds = load_dataset("livecodebench/livecodebench", split="test") except Exception: return [] samples = [] for i, item in enumerate(ds): samples.append(DatasetSample( problem=item.get("question_content", item.get("problem", "")), solution=item.get("solution", ""), problem_id=item.get("question_id", f"lcb_{i}"), domain="code", difficulty=item.get("difficulty", "unknown"), metadata={ "contest": item.get("contest_name", ""), "date": item.get("contest_date", ""), "tags": item.get("tags", []) } )) return samples class SciCodeDataset(BaseDataset): """SciCode - Scientific computing problems. Requires domain knowledge + coding ability. Tests both scientific reasoning and implementation preferences. """ @property def name(self) -> str: return "scicode" @property def domain(self) -> str: return "science-code" @property def task_description(self) -> str: return """You are implementing a scientific computing solution. This requires both domain knowledge (physics, chemistry, biology, etc.) and programming expertise. Approach the problem by: 1. Understanding the scientific concepts involved 2. Formulating the mathematical model 3. Designing the computational approach 4. Implementing with proper numerical methods 5. Validating the results make scientific sense Discuss the science and the code at each step.""" def _load_data(self, split: str) -> List[DatasetSample]: if not HF_AVAILABLE: return [] try: ds = load_dataset("xlangai/SciCode", split="test") except Exception: return [] samples = [] for i, item in enumerate(ds): samples.append(DatasetSample( problem=item.get("problem", ""), solution=item.get("solution", ""), problem_id=f"scicode_{i}", domain="science-code", difficulty="hard", metadata={ "discipline": item.get("discipline", ""), "libraries": item.get("libraries", []) } )) return samples # ============================================================================= # Dataset Registry # ============================================================================= DATASET_REGISTRY = { # Existing (enhanced) "math-500": MATH500Dataset, "math-hard": MATHHardDataset, "humaneval": HumanEvalDataset, "bigcodebench": BigCodeBenchDataset, "logiqa": LogiQADataset, "mmlu": MMLUDataset, "medqa": MedQADataset, # New challenging datasets "gpqa": GPQADataset, "theoremqa": TheoremQADataset, "aime": AIMEDataset, "livecodebench": LiveCodeBenchDataset, "scicode": SciCodeDataset, } def get_dataset(name: str, **kwargs) -> BaseDataset: """Get a dataset by name.""" if name not in DATASET_REGISTRY: raise ValueError(f"Unknown dataset: {name}. Available: {list(DATASET_REGISTRY.keys())}") return DATASET_REGISTRY[name](**kwargs) def get_all_datasets(**kwargs) -> Dict[str, BaseDataset]: """Get all available datasets.""" return {name: cls(**kwargs) for name, cls in DATASET_REGISTRY.items()} def get_challenging_datasets(**kwargs) -> Dict[str, BaseDataset]: """Get only the new challenging datasets.""" challenging = ["gpqa", "theoremqa", "aime", "livecodebench", "scicode"] return {name: DATASET_REGISTRY[name](**kwargs) for name in challenging} # ============================================================================= # Step-by-Step Query Wrapper # ============================================================================= def wrap_with_step_by_step_prompt(sample: DatasetSample) -> str: """Wrap a problem with prompts encouraging step-by-step interaction. This makes sessions longer and creates more opportunities for preference expression/violation. """ domain_prompts = { "math": """Let's solve this step by step. Please: 1. First, help me understand what the problem is asking 2. Then, let's identify the key concepts/formulas needed 3. Work through the solution one step at a time 4. Verify our answer at the end Problem: {problem} Let's start by understanding the problem. What is it asking?""", "code": """Let's implement this systematically. Please: 1. First, clarify the requirements and edge cases 2. Discuss the algorithm approach before coding 3. Implement step by step, explaining each part 4. Test with examples Problem: {problem} Let's start by understanding the requirements. What are the inputs, outputs, and edge cases?""", "reasoning": """Let's think through this carefully. Please: 1. Break down the key information in the passage 2. Analyze each answer choice 3. Eliminate wrong answers with clear reasoning 4. Verify the correct answer Problem: {problem} Let's start by identifying the key facts in this passage.""", "science": """Let's approach this PhD-level problem systematically. Please: 1. Identify the domain and key concepts involved 2. Recall relevant theories/equations 3. Work through the reasoning step by step 4. Verify our conclusion is scientifically sound Problem: {problem} Let's start by identifying what field this question is from and what concepts we'll need.""", "science-code": """This combines scientific knowledge with coding. Let's: 1. Understand the scientific concepts first 2. Formulate the mathematical approach 3. Design the algorithm 4. Implement and validate Problem: {problem} Let's start by understanding the science behind this problem.""", } # Get the appropriate prompt or default to generic domain = sample.domain if domain in domain_prompts: template = domain_prompts[domain] else: template = """Let's work through this step by step: 1. Understand the problem 2. Plan our approach 3. Execute carefully 4. Verify our answer Problem: {problem} Let's start by understanding what we need to do.""" return template.format(problem=sample.problem) # ============================================================================= # Conflict-Inducing Query Augmentation # ============================================================================= def augment_for_conflict_testing(sample: DatasetSample, conflict_type: str) -> str: """Augment a query to trigger specific preference conflicts. Args: sample: The base problem conflict_type: Type of conflict to trigger Returns: Augmented query that triggers the conflict """ conflict_augmentations = { # Verbosity conflict: "quick" + complex problem "verbosity": "Quick question - {problem}", # Format conflict: asks for both structure types "format": "Can you explain this with examples and also give me a summary? {problem}", # Tone conflict: frustrated + learning context "tone": "I'm so frustrated with this! But I really want to understand it properly. {problem}", # Code style conflict: multi-language context "code_style": "I need this in Python first, then JavaScript. {problem}", # Detail conflict: overview + specifics requested "detail": "Give me the big picture but also the specific details. {problem}", # Guidance conflict: incremental + full solution "guidance": "Walk me through this but also just show me the answer if it's simple. {problem}", # Rushed + thorough "time_pressure": "I'm in a hurry but this is important so don't skip anything. {problem}", # My attempt + fresh perspective "approach": "I tried [some approach] but maybe start fresh with a better way? {problem}", } if conflict_type in conflict_augmentations: template = conflict_augmentations[conflict_type] return template.format(problem=sample.problem) return sample.problem if __name__ == "__main__": # Test loading datasets print("Testing dataset loading...") for name, cls in DATASET_REGISTRY.items(): try: ds = cls(eval_size=5) samples = ds.get_testset() print(f"{name}: {len(samples)} samples loaded") if samples: print(f" Sample: {samples[0].problem[:100]}...") except Exception as e: print(f"{name}: Failed - {e}")