"""
Extended datasets for challenging personalization evaluation.

New datasets added:
- GPQA: PhD-level science questions
- TheoremQA: Theorem-based math proofs
- LiveCodeBench: Recent competitive programming
- AIME: American Invitational Mathematics Examination
- SciCode: Scientific computing problems

All datasets encourage step-by-step problem solving for longer sessions.
"""

from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
import json
from pathlib import Path

try:
    from datasets import load_dataset
    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False
    print("Warning: huggingface datasets not available")


@dataclass
class DatasetSample:
    """A single sample from a dataset."""
    problem: str
    solution: str
    problem_id: str
    domain: str
    difficulty: Optional[str] = None
    metadata: Optional[Dict] = None


class BaseDataset(ABC):
    """Base class for all datasets."""

    def __init__(self, eval_size: int = 100, train_size: int = 100, cache_dir: str = None):
        self.eval_size = eval_size
        self.train_size = train_size
        self.cache_dir = cache_dir
        self._test_data: Optional[List[DatasetSample]] = None
        self._train_data: Optional[List[DatasetSample]] = None

    @property
    @abstractmethod
    def name(self) -> str:
        pass

    @property
    @abstractmethod
    def domain(self) -> str:
        pass

    @property
    @abstractmethod
    def task_description(self) -> str:
        """Description of the task for user simulator."""
        pass

    @abstractmethod
    def _load_data(self, split: str) -> List[DatasetSample]:
        pass

    def get_testset(self) -> List[DatasetSample]:
        if self._test_data is None:
            self._test_data = self._load_data("test")[:self.eval_size]
        return self._test_data

    def get_trainset(self) -> List[DatasetSample]:
        if self._train_data is None:
            self._train_data = self._load_data("train")[:self.train_size]
        return self._train_data


# =============================================================================
# Existing Datasets (Enhanced with step-by-step prompts)
# =============================================================================

class MATH500Dataset(BaseDataset):
    """MATH-500 dataset with step-by-step encouragement."""

    @property
    def name(self) -> str:
        return "math-500"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are trying to solve a mathematics problem. The problem requires careful
reasoning and step-by-step work. You will collaborate with an AI assistant to understand
and solve the problem. Break the problem into parts and work through each step carefully.
Ask the assistant to explain their reasoning at each step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        ds = load_dataset("HuggingFaceH4/MATH-500", split="test")
        samples = []

        for i, item in enumerate(ds):
            samples.append(DatasetSample(
                problem=item["problem"],
                solution=item["answer"],
                problem_id=f"math500_{i}",
                domain="math",
                difficulty=item.get("level"),
                metadata={"type": item.get("type")}
            ))

        return samples


class MATHHardDataset(BaseDataset):
    """MATH-Hard (Level 4-5 only)."""

    @property
    def name(self) -> str:
        return "math-hard"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are working on a challenging mathematics competition problem. These problems
require deep mathematical insight and careful reasoning. Work through the problem step by step,
explaining your approach clearly. Don't hesitate to ask for hints or verification of your reasoning."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        ds = load_dataset("lighteval/MATH-Hard", split="test" if split == "test" else "train")
        samples = []

        for i, item in enumerate(ds):
            level = item.get("level", "")
            if level not in ["Level 4", "Level 5"]:
                continue

            samples.append(DatasetSample(
                problem=item["problem"],
                solution=item.get("answer", item.get("solution", "")),
                problem_id=f"mathhard_{i}",
                domain="math",
                difficulty=level,
                metadata={"type": item.get("type")}
            ))

        return samples


class HumanEvalDataset(BaseDataset):
    """HumanEval code generation."""

    @property
    def name(self) -> str:
        return "humaneval"

    @property
    def domain(self) -> str:
        return "code"

    @property
    def task_description(self) -> str:
        return """You are implementing a Python function. Think through the problem carefully,
consider edge cases, and implement the solution step by step. Ask for clarification on any
ambiguous requirements. Discuss your approach before writing code."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        ds = load_dataset("openai/openai_humaneval", split="test")
        samples = []

        for item in ds:
            samples.append(DatasetSample(
                problem=item["prompt"],
                solution=item["canonical_solution"],
                problem_id=item["task_id"],
                domain="code",
                metadata={"entry_point": item["entry_point"], "test": item["test"]}
            ))

        return samples


class BigCodeBenchDataset(BaseDataset):
    """BigCodeBench - harder code generation."""

    @property
    def name(self) -> str:
        return "bigcodebench"

    @property
    def domain(self) -> str:
        return "code"

    @property
    def task_description(self) -> str:
        return """You are working on a complex programming task that requires multiple libraries
and careful implementation. Break down the problem, discuss the approach, and implement step by step.
Ask about library choices and implementation details."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        ds = load_dataset("bigcode/bigcodebench", split="v0.1.2")
        samples = []

        for item in ds:
            samples.append(DatasetSample(
                problem=item["instruct_prompt"],
                solution=item["canonical_solution"],
                problem_id=item["task_id"],
                domain="code",
                difficulty="hard",
                metadata={"libs": item.get("libs", [])}
            ))

        return samples


class LogiQADataset(BaseDataset):
    """LogiQA logical reasoning."""

    @property
    def name(self) -> str:
        return "logiqa"

    @property
    def domain(self) -> str:
        return "reasoning"

    @property
    def task_description(self) -> str:
        return """You are solving a logical reasoning problem. Read the passage carefully,
analyze each answer choice, and reason through the logic step by step. Explain your
reasoning process clearly."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        ds = load_dataset("lucasmccabe/logiqa", split=split if split in ["train", "test"] else "test")
        samples = []

        for i, item in enumerate(ds):
            options = item["options"]
            options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(options)])
            problem = f"{item['context']}\n\nQuestion: {item['query']}\n\nOptions:\n{options_str}"
            answer = chr(65 + item["correct_option"])

            samples.append(DatasetSample(
                problem=problem,
                solution=answer,
                problem_id=f"logiqa_{i}",
                domain="reasoning"
            ))

        return samples


class MMLUDataset(BaseDataset):
    """MMLU multi-domain knowledge."""

    @property
    def name(self) -> str:
        return "mmlu"

    @property
    def domain(self) -> str:
        return "knowledge"

    @property
    def task_description(self) -> str:
        return """You are answering a knowledge question that requires domain expertise.
Think through the question carefully, consider what you know about the topic, and
reason to the correct answer. Explain your thought process."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        # Load hard subjects
        hard_subjects = ["abstract_algebra", "college_mathematics", "college_physics",
                        "formal_logic", "high_school_physics", "machine_learning"]

        samples = []
        for subject in hard_subjects:
            try:
                ds = load_dataset("cais/mmlu", subject, split="test")
                for i, item in enumerate(ds):
                    choices = item["choices"]
                    options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(choices)])
                    problem = f"{item['question']}\n\nOptions:\n{options_str}"
                    answer = chr(65 + item["answer"])

                    samples.append(DatasetSample(
                        problem=problem,
                        solution=answer,
                        problem_id=f"mmlu_{subject}_{i}",
                        domain="knowledge",
                        metadata={"subject": subject}
                    ))
            except Exception:
                continue

        return samples


class MedQADataset(BaseDataset):
    """MedQA medical knowledge."""

    @property
    def name(self) -> str:
        return "medqa"

    @property
    def domain(self) -> str:
        return "medical"

    @property
    def task_description(self) -> str:
        return """You are answering a medical knowledge question. Consider the clinical
scenario carefully, think through the pathophysiology, and reason to the correct answer.
Explain your medical reasoning step by step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        ds = load_dataset("bigbio/med_qa", "med_qa_en_source", split=split if split in ["train", "test"] else "test")
        samples = []

        for i, item in enumerate(ds):
            options = item["options"]
            if isinstance(options, dict):
                options_str = "\n".join([f"{k}. {v}" for k, v in options.items()])
            else:
                options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(options)])

            problem = f"{item['question']}\n\nOptions:\n{options_str}"

            samples.append(DatasetSample(
                problem=problem,
                solution=item["answer_idx"],
                problem_id=f"medqa_{i}",
                domain="medical"
            ))

        return samples


# =============================================================================
# NEW Challenging Datasets
# =============================================================================

class GPQADataset(BaseDataset):
    """GPQA - Graduate-level PhD science questions.

    Extremely challenging questions that require deep domain expertise.
    Perfect for testing complex, multi-step reasoning preferences.
    """

    @property
    def name(self) -> str:
        return "gpqa"

    @property
    def domain(self) -> str:
        return "science"

    @property
    def task_description(self) -> str:
        return """You are working on a PhD-level science question that requires deep domain expertise.
These questions are extremely challenging and require careful, methodical reasoning.
Break the problem into parts, discuss the relevant concepts, and work through each step.
Don't hesitate to ask for clarification or verification of your reasoning at each step.
Consider multiple approaches before committing to an answer."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        # GPQA diamond is the hardest subset
        try:
            ds = load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train")
        except Exception:
            return []

        samples = []
        for i, item in enumerate(ds):
            # Format the multiple choice
            choices = [item.get(f"choice_{c}", "") for c in ["A", "B", "C", "D"] if item.get(f"choice_{c}")]
            options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(choices)])
            problem = f"{item['question']}\n\nOptions:\n{options_str}"

            samples.append(DatasetSample(
                problem=problem,
                solution=item.get("correct_answer", "A"),
                problem_id=f"gpqa_{i}",
                domain="science",
                difficulty="phd",
                metadata={"subdomain": item.get("subdomain", "unknown")}
            ))

        return samples


class TheoremQADataset(BaseDataset):
    """TheoremQA - Theorem-based mathematical reasoning.

    Requires applying mathematical theorems to solve problems.
    Tests formal mathematical reasoning and explanation preferences.
    """

    @property
    def name(self) -> str:
        return "theoremqa"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are solving a theorem-based mathematics problem. This requires identifying
the relevant mathematical theorems, understanding their conditions, and applying them correctly.
Work through the problem step by step:
1. Identify what theorems might apply
2. Verify the conditions are met
3. Apply the theorem carefully
4. Verify the result
Discuss your reasoning at each step and ask for verification when needed."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        try:
            ds = load_dataset("TIGER-Lab/TheoremQA", split="test")
        except Exception:
            return []

        samples = []
        for i, item in enumerate(ds):
            samples.append(DatasetSample(
                problem=item["question"],
                solution=str(item.get("answer", "")),
                problem_id=f"theoremqa_{i}",
                domain="math",
                difficulty="hard",
                metadata={
                    "theorem": item.get("theorem", ""),
                    "field": item.get("field", "")
                }
            ))

        return samples


class AIMEDataset(BaseDataset):
    """AIME - American Invitational Mathematics Examination.

    Competition-level math problems requiring creative problem-solving.
    Answers are integers from 0-999.
    """

    @property
    def name(self) -> str:
        return "aime"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are working on an AIME (American Invitational Mathematics Examination) problem.
These are competition math problems that require creative problem-solving approaches.
The answer is always an integer from 000 to 999.
Work through the problem systematically:
1. Understand what the problem is asking
2. Explore different approaches
3. Calculate carefully
4. Verify your answer
Discuss your thought process and ask for hints if you're stuck."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        # Try to load AIME from available sources
        try:
            ds = load_dataset("AI-MO/aimo-validation-aime", split="train")
        except Exception:
            # Fallback to MATH competition problems
            try:
                ds = load_dataset("hendrycks/competition_math", split="test")
                ds = [item for item in ds if "AIME" in item.get("source", "")]
            except Exception:
                return []

        samples = []
        for i, item in enumerate(ds):
            samples.append(DatasetSample(
                problem=item.get("problem", item.get("question", "")),
                solution=str(item.get("answer", item.get("solution", ""))),
                problem_id=f"aime_{i}",
                domain="math",
                difficulty="competition",
                metadata={"year": item.get("year", ""), "problem_num": item.get("problem_number", "")}
            ))

        return samples


class LiveCodeBenchDataset(BaseDataset):
    """LiveCodeBench - Recent competitive programming problems.

    Problems from recent programming contests (post-training cutoff).
    Tests code generation on truly novel problems.
    """

    @property
    def name(self) -> str:
        return "livecodebench"

    @property
    def domain(self) -> str:
        return "code"

    @property
    def task_description(self) -> str:
        return """You are solving a competitive programming problem from recent contests.
These problems require careful algorithm design and implementation.
Approach systematically:
1. Understand the problem constraints
2. Identify the algorithm pattern (DP, graphs, greedy, etc.)
3. Design the solution approach
4. Implement carefully with attention to edge cases
5. Analyze time/space complexity
Discuss your approach before coding and verify your logic at each step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        try:
            ds = load_dataset("livecodebench/livecodebench", split="test")
        except Exception:
            return []

        samples = []
        for i, item in enumerate(ds):
            samples.append(DatasetSample(
                problem=item.get("question_content", item.get("problem", "")),
                solution=item.get("solution", ""),
                problem_id=item.get("question_id", f"lcb_{i}"),
                domain="code",
                difficulty=item.get("difficulty", "unknown"),
                metadata={
                    "contest": item.get("contest_name", ""),
                    "date": item.get("contest_date", ""),
                    "tags": item.get("tags", [])
                }
            ))

        return samples


class SciCodeDataset(BaseDataset):
    """SciCode - Scientific computing problems.

    Requires domain knowledge + coding ability.
    Tests both scientific reasoning and implementation preferences.
    """

    @property
    def name(self) -> str:
        return "scicode"

    @property
    def domain(self) -> str:
        return "science-code"

    @property
    def task_description(self) -> str:
        return """You are implementing a scientific computing solution. This requires both
domain knowledge (physics, chemistry, biology, etc.) and programming expertise.
Approach the problem by:
1. Understanding the scientific concepts involved
2. Formulating the mathematical model
3. Designing the computational approach
4. Implementing with proper numerical methods
5. Validating the results make scientific sense
Discuss the science and the code at each step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []

        try:
            ds = load_dataset("xlangai/SciCode", split="test")
        except Exception:
            return []

        samples = []
        for i, item in enumerate(ds):
            samples.append(DatasetSample(
                problem=item.get("problem", ""),
                solution=item.get("solution", ""),
                problem_id=f"scicode_{i}",
                domain="science-code",
                difficulty="hard",
                metadata={
                    "discipline": item.get("discipline", ""),
                    "libraries": item.get("libraries", [])
                }
            ))

        return samples


# =============================================================================
# Dataset Registry
# =============================================================================

DATASET_REGISTRY = {
    # Existing (enhanced)
    "math-500": MATH500Dataset,
    "math-hard": MATHHardDataset,
    "humaneval": HumanEvalDataset,
    "bigcodebench": BigCodeBenchDataset,
    "logiqa": LogiQADataset,
    "mmlu": MMLUDataset,
    "medqa": MedQADataset,
    # New challenging datasets
    "gpqa": GPQADataset,
    "theoremqa": TheoremQADataset,
    "aime": AIMEDataset,
    "livecodebench": LiveCodeBenchDataset,
    "scicode": SciCodeDataset,
}


def get_dataset(name: str, **kwargs) -> BaseDataset:
    """Get a dataset by name."""
    if name not in DATASET_REGISTRY:
        raise ValueError(f"Unknown dataset: {name}. Available: {list(DATASET_REGISTRY.keys())}")
    return DATASET_REGISTRY[name](**kwargs)


def get_all_datasets(**kwargs) -> Dict[str, BaseDataset]:
    """Get all available datasets."""
    return {name: cls(**kwargs) for name, cls in DATASET_REGISTRY.items()}


def get_challenging_datasets(**kwargs) -> Dict[str, BaseDataset]:
    """Get only the new challenging datasets."""
    challenging = ["gpqa", "theoremqa", "aime", "livecodebench", "scicode"]
    return {name: DATASET_REGISTRY[name](**kwargs) for name in challenging}


# =============================================================================
# Step-by-Step Query Wrapper
# =============================================================================

def wrap_with_step_by_step_prompt(sample: DatasetSample) -> str:
    """Wrap a problem with prompts encouraging step-by-step interaction.

    This makes sessions longer and creates more opportunities for
    preference expression/violation.
    """
    domain_prompts = {
        "math": """Let's solve this step by step. Please:
1. First, help me understand what the problem is asking
2. Then, let's identify the key concepts/formulas needed
3. Work through the solution one step at a time
4. Verify our answer at the end

Problem:
{problem}

Let's start by understanding the problem. What is it asking?""",

        "code": """Let's implement this systematically. Please:
1. First, clarify the requirements and edge cases
2. Discuss the algorithm approach before coding
3. Implement step by step, explaining each part
4. Test with examples

Problem:
{problem}

Let's start by understanding the requirements. What are the inputs, outputs, and edge cases?""",

        "reasoning": """Let's think through this carefully. Please:
1. Break down the key information in the passage
2. Analyze each answer choice
3. Eliminate wrong answers with clear reasoning
4. Verify the correct answer

Problem:
{problem}

Let's start by identifying the key facts in this passage.""",

        "science": """Let's approach this PhD-level problem systematically. Please:
1. Identify the domain and key concepts involved
2. Recall relevant theories/equations
3. Work through the reasoning step by step
4. Verify our conclusion is scientifically sound

Problem:
{problem}

Let's start by identifying what field this question is from and what concepts we'll need.""",

        "science-code": """This combines scientific knowledge with coding. Let's:
1. Understand the scientific concepts first
2. Formulate the mathematical approach
3. Design the algorithm
4. Implement and validate

Problem:
{problem}

Let's start by understanding the science behind this problem.""",
    }

    # Get the appropriate prompt or default to generic
    domain = sample.domain
    if domain in domain_prompts:
        template = domain_prompts[domain]
    else:
        template = """Let's work through this step by step:
1. Understand the problem
2. Plan our approach
3. Execute carefully
4. Verify our answer

Problem:
{problem}

Let's start by understanding what we need to do."""

    return template.format(problem=sample.problem)


# =============================================================================
# Conflict-Inducing Query Augmentation
# =============================================================================

def augment_for_conflict_testing(sample: DatasetSample, conflict_type: str) -> str:
    """Augment a query to trigger specific preference conflicts.

    Args:
        sample: The base problem
        conflict_type: Type of conflict to trigger

    Returns:
        Augmented query that triggers the conflict
    """
    conflict_augmentations = {
        # Verbosity conflict: "quick" + complex problem
        "verbosity": "Quick question - {problem}",

        # Format conflict: asks for both structure types
        "format": "Can you explain this with examples and also give me a summary? {problem}",

        # Tone conflict: frustrated + learning context
        "tone": "I'm so frustrated with this! But I really want to understand it properly. {problem}",

        # Code style conflict: multi-language context
        "code_style": "I need this in Python first, then JavaScript. {problem}",

        # Detail conflict: overview + specifics requested
        "detail": "Give me the big picture but also the specific details. {problem}",

        # Guidance conflict: incremental + full solution
        "guidance": "Walk me through this but also just show me the answer if it's simple. {problem}",

        # Rushed + thorough
        "time_pressure": "I'm in a hurry but this is important so don't skip anything. {problem}",

        # My attempt + fresh perspective
        "approach": "I tried [some approach] but maybe start fresh with a better way? {problem}",
    }

    if conflict_type in conflict_augmentations:
        template = conflict_augmentations[conflict_type]
        return template.format(problem=sample.problem)

    return sample.problem


if __name__ == "__main__":
    # Test loading datasets
    print("Testing dataset loading...")

    for name, cls in DATASET_REGISTRY.items():
        try:
            ds = cls(eval_size=5)
            samples = ds.get_testset()
            print(f"{name}: {len(samples)} samples loaded")
            if samples:
                print(f"  Sample: {samples[0].problem[:100]}...")
        except Exception as e:
            print(f"{name}: Failed - {e}")