#!/usr/bin/env python3
"""
Prepare REAL datasets for RLVR floating-point precision experiments.

Downloads from HuggingFace:
- Training: GSM8K train (7473 samples)
- Evaluation: GSM8K test, MATH-500, AIME, AMC, MMLU-STEM, HumanEval

Usage:
    python scripts/prepare_data.py
"""

import json
import os
import random
from pathlib import Path
from datasets import load_dataset
from tqdm import tqdm

# Folder (relative to the current working directory) that receives every
# JSON file written by this script.
DATA_DIR = Path("data")
# Created as an import-time side effect so the writers below can assume the
# folder exists; an already-existing folder is not an error.
DATA_DIR.mkdir(parents=False, exist_ok=True)


def save_json(data: list, path: Path) -> None:
    """Write *data* to *path* as indented JSON and print a one-line summary.

    Args:
        data: JSON-serialisable list of records.
        path: Destination file; an existing file is overwritten.

    Raises:
        OSError: If *path* cannot be opened for writing.
        TypeError: If *data* contains values ``json`` cannot serialise.
    """
    # Pin the encoding instead of inheriting the platform/locale default so
    # the file is written the same way everywhere. json.dump escapes non-ASCII
    # characters by default, so the bytes on disk are unchanged.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    print(f"  Saved {len(data)} samples to {path}")


def prepare_gsm8k_train():
    """Load the GSM8K ``train`` split and write it to ``dm_train.json``.

    Every record stores the question as ``prompt``, the stripped text after
    the last ``####`` marker as ``answer`` and the unmodified reference
    answer as ``solution``.

    Returns:
        The list of records that was written.
    """
    print("\n=== Downloading GSM8K Train ===")
    train_split = load_dataset("openai/gsm8k", "main", split="train")

    records = [
        {
            "id": f"gsm8k_train_{idx}",
            "prompt": row["question"],
            # Final answer = whatever follows the last "####" marker.
            "answer": row["answer"].rpartition("####")[2].strip(),
            "solution": row["answer"],
            "source": "gsm8k_train",
        }
        for idx, row in enumerate(tqdm(train_split, desc="Processing"))
    ]

    save_json(records, DATA_DIR / "dm_train.json")
    return records


def prepare_gsm8k_test():
    """Load the GSM8K ``test`` split and write the evaluation files.

    Writes the full split to ``gsm8k.json`` and its first 500 records to
    ``dm_val.json`` (the on-task evaluation subset).

    Returns:
        The full list of test records.
    """
    print("\n=== Downloading GSM8K Test ===")
    test_split = load_dataset("openai/gsm8k", "main", split="test")

    def to_record(idx, row):
        # Final answer = stripped text after the last "####" marker.
        reference = row["answer"]
        final_answer = reference.split("####")[-1].strip()
        return {
            "id": f"gsm8k_test_{idx}",
            "prompt": row["question"],
            "answer": final_answer,
            "solution": reference,
            "source": "gsm8k",
        }

    records = [
        to_record(idx, row)
        for idx, row in enumerate(tqdm(test_split, desc="Processing"))
    ]

    save_json(records, DATA_DIR / "gsm8k.json")

    # dm_val is simply the leading 500 test records.
    validation_subset = records[:500]
    save_json(validation_subset, DATA_DIR / "dm_val.json")
    return records


def prepare_math500():
    """Load the MATH-500 ``test`` split and write it to ``math500.json``.

    ``subject`` and ``level`` are copied when the sample provides them and
    default to an empty string otherwise.

    Returns:
        The list of records that was written.
    """
    print("\n=== Downloading MATH-500 ===")
    math_split = load_dataset("HuggingFaceH4/MATH-500", split="test")

    records = [
        {
            "id": f"math500_{idx}",
            "prompt": row["problem"],
            "answer": row["answer"],
            "solution": row["solution"],
            "subject": row.get("subject", ""),
            "level": row.get("level", ""),
            "source": "math500",
        }
        for idx, row in enumerate(tqdm(math_split, desc="Processing"))
    ]

    save_json(records, DATA_DIR / "math500.json")
    return records


def _aime_contest_year(url):
    """Return the 4-digit year directly preceding ``_AIME`` in *url*, else ``None``."""
    if not isinstance(url, str):
        return None
    marker = url.find("_AIME")
    year_text = url[marker - 4:marker] if marker >= 4 else ""
    return int(year_text) if year_text.isdecimal() else None


def prepare_aime():
    """Load the AI-MO AIME validation set and write the AIME evaluation files.

    Writes three files into ``DATA_DIR``:

    * ``aime_all.json`` - every problem in the split.
    * ``aime24.json`` / ``aime25.json`` - the problems whose ``url`` names
      the 2024 / 2025 contest (e.g. ``.../2024_AIME_I_Problems/...``).

    When no problem can be attributed to a year, the file falls back to the
    historical positional slice (``data[:30]`` / ``data[30:60]``) and a
    warning is printed, so downstream code still finds the file but the
    mislabelling is no longer silent.

    NOTE(review): the dataset card appears to list AIME 2022-2024 only, in
    which case ``aime25.json`` always takes the fallback path - confirm and
    consider sourcing real AIME 2025 problems.

    Returns:
        The full list of AIME records.
    """
    print("\n=== Downloading AIME ===")
    ds = load_dataset("AI-MO/aimo-validation-aime", split="train")

    data = []
    for i, sample in enumerate(tqdm(ds, desc="Processing")):
        data.append({
            "id": f"aime_{i}",
            "prompt": sample["problem"],
            "answer": str(sample["answer"]),
            "solution": sample.get("solution", ""),
            "url": sample.get("url", ""),
            "source": "aime"
        })

    # Real AIME has 15 problems per contest, 2 contests per year = 30/year.
    # Select by the year in the URL rather than by row position, which
    # depends entirely on how the upstream split happens to be ordered.
    yearly_files = (
        (2024, "aime24.json", data[:30]),
        (2025, "aime25.json", data[30:60]),
    )
    for year, filename, positional_slice in yearly_files:
        subset = [row for row in data if _aime_contest_year(row["url"]) == year]
        if not subset:
            print(
                f"  Warning: no AIME {year} problems identified via 'url'; "
                f"{filename} falls back to a positional slice and may "
                f"contain problems from other years"
            )
            subset = positional_slice
        save_json(subset, DATA_DIR / filename)

    save_json(data, DATA_DIR / "aime_all.json")
    return data


def prepare_amc():
    """Load the AI-MO AMC validation set and write it to ``amc23.json``.

    Answers are converted to strings; ``solution`` defaults to an empty
    string when the sample does not provide one.

    Returns:
        The list of records that was written.
    """
    print("\n=== Downloading AMC ===")
    amc_split = load_dataset("AI-MO/aimo-validation-amc", split="train")

    records = [
        {
            "id": f"amc_{idx}",
            "prompt": row["problem"],
            "answer": str(row["answer"]),
            "solution": row.get("solution", ""),
            "source": "amc",
        }
        for idx, row in enumerate(tqdm(amc_split, desc="Processing"))
    ]

    save_json(records, DATA_DIR / "amc23.json")
    return records


def prepare_mmlu_stem():
    """Build a 500-question MMLU-STEM evaluation set and write ``mmlu_stem.json``.

    Each listed STEM subject is loaded from the ``test`` split of
    ``cais/mmlu`` and rendered as a lettered multiple-choice prompt
    (``(A) ...``, ``(B) ...``); the answer is the letter of the correct
    choice. A subject that fails to load or convert is skipped with a
    warning and contributes no rows. If more than 500 questions were
    collected, a reproducible random subset of 500 is kept.

    Returns:
        The list of records that was written.
    """
    print("\n=== Downloading MMLU-STEM ===")

    stem_subjects = [
        "abstract_algebra", "astronomy", "college_biology", "college_chemistry",
        "college_computer_science", "college_mathematics", "college_physics",
        "computer_security", "conceptual_physics", "electrical_engineering",
        "elementary_mathematics", "high_school_biology", "high_school_chemistry",
        "high_school_computer_science", "high_school_mathematics", "high_school_physics",
        "high_school_statistics", "machine_learning"
    ]

    data = []
    for subject in tqdm(stem_subjects, desc="Loading subjects"):
        # Deliberately best-effort: one unavailable subject must not abort
        # the whole preparation run.
        try:
            ds = load_dataset("cais/mmlu", subject, split="test")
            subject_rows = []
            for i, sample in enumerate(ds):
                # One "(A) choice" line per option, lettered from 'A' (65).
                options = "".join(
                    f"({chr(65 + j)}) {choice}\n"
                    for j, choice in enumerate(sample["choices"])
                )
                subject_rows.append({
                    "id": f"mmlu_{subject}_{i}",
                    "prompt": f"{sample['question']}\n{options}",
                    "answer": chr(65 + sample["answer"]),
                    "subject": subject,
                    "source": "mmlu_stem"
                })
        except Exception as e:
            print(f"  Warning: Skipping {subject}: {e}")
        else:
            # Commit only fully converted subjects, so a subject reported as
            # skipped never leaves partial rows behind.
            data.extend(subject_rows)

    # Take a random subset of 500. A private generator seeded with 42 draws
    # the same sample as seeding the module-level RNG, without reseeding
    # global state as a side effect.
    if len(data) > 500:
        data = random.Random(42).sample(data, 500)

    save_json(data, DATA_DIR / "mmlu_stem.json")
    return data


def prepare_humaneval():
    """Load the HumanEval ``test`` split and write it to ``humaneval.json``.

    The canonical solution is stored as ``answer``; ``entry_point`` and
    ``test`` are copied through unchanged.

    Returns:
        The list of records that was written.
    """
    print("\n=== Downloading HumanEval ===")
    humaneval_split = load_dataset("openai/openai_humaneval", split="test")

    records = [
        {
            "id": f"humaneval_{idx}",
            "prompt": row["prompt"],
            "answer": row["canonical_solution"],
            "entry_point": row["entry_point"],
            "test": row["test"],
            "source": "humaneval",
        }
        for idx, row in enumerate(tqdm(humaneval_split, desc="Processing"))
    ]

    save_json(records, DATA_DIR / "humaneval.json")
    return records


def verify_data():
    """Print a duplicate-prompt report for every ``*.json`` file in ``DATA_DIR``.

    For each file (in sorted order) the sample count, the number of distinct
    prompts and an ``OK`` / ``WARN`` status are printed, followed by the
    first 60 characters of the first prompt when the file is non-empty.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Verifying Data Quality")
    print(rule)

    for json_path in sorted(DATA_DIR.glob("*.json")):
        with open(json_path) as handle:
            records = json.load(handle)

        # Duplicate detection: compare the total with the distinct prompts.
        total = len(records)
        distinct = len({record["prompt"] for record in records})

        if distinct == total:
            status = "OK"
        else:
            status = f"WARN: {total - distinct} duplicates"
        print(f"  {json_path.name}: {total} samples, {distinct} unique [{status}]")

        # Preview of the first prompt, skipped for empty files.
        if records:
            print(f"    Example: {records[0]['prompt'][:60]}...")


def main():
    """Run the whole pipeline: back up old files, build datasets, verify.

    On the first run only (while ``DATA_DIR/backup_synthetic`` does not
    exist yet), any ``*.json`` files already in ``DATA_DIR`` are moved into
    that backup folder. The training set and all evaluation sets are then
    prepared and the resulting files are verified.
    """
    banner = "=" * 60
    print(banner)
    print("RLVR Real Data Preparation")
    print(banner)

    # Backup old data
    backup_dir = DATA_DIR / "backup_synthetic"
    if not backup_dir.exists():
        # Snapshot the matches once, before moving anything: this avoids
        # scanning the folder twice and avoids renaming files out of a
        # directory that is still being iterated.
        stale_files = list(DATA_DIR.glob("*.json"))
        if stale_files:
            backup_dir.mkdir(exist_ok=True)
            for stale in stale_files:
                stale.rename(backup_dir / stale.name)
            print(f"Backed up synthetic data to {backup_dir}")

    # Training data
    prepare_gsm8k_train()

    # Evaluation data
    prepare_gsm8k_test()
    prepare_math500()
    prepare_aime()
    prepare_amc()
    prepare_mmlu_stem()
    prepare_humaneval()

    # Verify
    verify_data()

    print("\n" + banner)
    print("Data preparation complete!")
    print(banner)

if __name__ == "__main__":
    main()