#!/usr/bin/env python3
"""
Prepare REAL datasets for RLVR floating-point precision experiments.

Downloads from HuggingFace:
- Training: GSM8K train (7473 samples)
- Evaluation: GSM8K test, MATH-500, AIME, AMC, MMLU-STEM, HumanEval

Usage:
    python scripts/prepare_data.py
"""

import json
import os
import random
from pathlib import Path

from datasets import load_dataset
from tqdm import tqdm

# All output files are written below this directory (relative to the CWD).
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)


def save_json(data: list, path: Path) -> None:
    """Write *data* to *path* as indented JSON and print a one-line summary.

    Args:
        data: JSON-serializable list of records.
        path: Destination file; its parent directory is created if missing.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    # json.dump escapes non-ASCII by default, so the encoding only pins the
    # file format explicitly instead of relying on the platform default.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    print(f" Saved {len(data)} samples to {path}")


def _load_gsm8k_split(split: str, id_prefix: str, source: str) -> list:
    """Download one GSM8K split and convert it to the common record format.

    Args:
        split: Dataset split name passed to ``load_dataset``.
        id_prefix: Prefix for the per-record ``id`` (``f"{id_prefix}_{i}"``).
        source: Value stored in each record's ``source`` field.

    Returns:
        List of dicts with keys ``id``, ``prompt``, ``answer``, ``solution``
        and ``source``.
    """
    ds = load_dataset("openai/gsm8k", "main", split=split)

    records = []
    for i, sample in enumerate(tqdm(ds, desc="Processing")):
        solution = sample["answer"]
        records.append({
            "id": f"{id_prefix}_{i}",
            "prompt": sample["question"],
            # The final answer is whatever follows the last "####" marker.
            "answer": solution.split("####")[-1].strip(),
            "solution": solution,
            "source": source,
        })
    return records


def prepare_gsm8k_train() -> list:
    """Prepare GSM8K training data and save it as ``dm_train.json``.

    Returns:
        The list of converted training records.
    """
    print("\n=== Downloading GSM8K Train ===")
    data = _load_gsm8k_split("train", "gsm8k_train", "gsm8k_train")

    save_json(data, DATA_DIR / "dm_train.json")
    return data


def prepare_gsm8k_test() -> list:
    """Prepare GSM8K test data for evaluation.

    Writes the full test split to ``gsm8k.json`` and its first 500 records to
    ``dm_val.json``.

    Returns:
        The list of converted test records (the full split).
    """
    print("\n=== Downloading GSM8K Test ===")
    data = _load_gsm8k_split("test", "gsm8k_test", "gsm8k")

    save_json(data, DATA_DIR / "gsm8k.json")

    # Also create dm_val as a subset (first 500 for on-task eval)
    save_json(data[:500], DATA_DIR / "dm_val.json")
    return data


def prepare_math500() -> list:
    """Prepare the MATH-500 dataset and save it as ``math500.json``.

    Returns:
        The list of converted records.
    """
    print("\n=== Downloading MATH-500 ===")
    ds = load_dataset("HuggingFaceH4/MATH-500", split="test")

    data = []
    for i, sample in enumerate(tqdm(ds, desc="Processing")):
        data.append({
            "id": f"math500_{i}",
            "prompt": sample["problem"],
            "answer": sample["answer"],
            "solution": sample["solution"],
            "subject": sample.get("subject", ""),
            "level": sample.get("level", ""),
            "source": "math500",
        })

    save_json(data, DATA_DIR / "math500.json")
    return data


def prepare_aime() -> list:
    """Prepare the AIME dataset from AI-MO.

    Writes ``aime24.json`` (records 0-29), ``aime25.json`` (records 30-59)
    and ``aime_all.json`` (everything).

    Returns:
        The list of all converted records.
    """
    print("\n=== Downloading AIME ===")
    ds = load_dataset("AI-MO/aimo-validation-aime", split="train")

    data = []
    for i, sample in enumerate(tqdm(ds, desc="Processing")):
        data.append({
            "id": f"aime_{i}",
            "prompt": sample["problem"],
            "answer": str(sample["answer"]),
            "solution": sample.get("solution", ""),
            "url": sample.get("url", ""),
            "source": "aime",
        })

    # Split into aime24 and aime25
    # Real AIME has 15 problems per contest, 2 contests per year = 30/year
    # NOTE(review): the split is purely positional. Nothing here checks that
    # the first 30 rows are the 2024 contests or that the dataset contains
    # any 2025 problems at all -- confirm against the "url" field / dataset
    # card before reporting per-year numbers.
    save_json(data[:30], DATA_DIR / "aime24.json")
    save_json(data[30:60], DATA_DIR / "aime25.json")
    save_json(data, DATA_DIR / "aime_all.json")
    return data


def prepare_amc() -> list:
    """Prepare the AMC dataset from AI-MO and save it as ``amc23.json``.

    Returns:
        The list of converted records.
    """
    print("\n=== Downloading AMC ===")
    ds = load_dataset("AI-MO/aimo-validation-amc", split="train")

    data = []
    for i, sample in enumerate(tqdm(ds, desc="Processing")):
        data.append({
            "id": f"amc_{i}",
            "prompt": sample["problem"],
            "answer": str(sample["answer"]),
            "solution": sample.get("solution", ""),
            "source": "amc",
        })

    save_json(data, DATA_DIR / "amc23.json")
    return data


def prepare_mmlu_stem() -> list:
    """Prepare a 500-question MMLU-STEM subset and save ``mmlu_stem.json``.

    Each question is rendered as a multiple-choice prompt with lettered
    options ``(A)``, ``(B)``, ...; the answer is the letter of the correct
    option. Subjects that fail to load are skipped with a warning. If more
    than 500 questions were collected, a fixed-seed random sample of 500 is
    kept.

    Returns:
        The list of records that was written to disk.
    """
    print("\n=== Downloading MMLU-STEM ===")

    stem_subjects = [
        "abstract_algebra", "astronomy", "college_biology", "college_chemistry",
        "college_computer_science", "college_mathematics", "college_physics",
        "computer_security", "conceptual_physics", "electrical_engineering",
        "elementary_mathematics", "high_school_biology", "high_school_chemistry",
        "high_school_computer_science", "high_school_mathematics",
        "high_school_physics", "high_school_statistics", "machine_learning",
    ]

    data = []
    for subject in tqdm(stem_subjects, desc="Loading subjects"):
        # Deliberately best-effort: one unavailable subject must not abort
        # the whole preparation run.
        try:
            ds = load_dataset("cais/mmlu", subject, split="test")
            for i, sample in enumerate(ds):
                # Format as multiple choice: chr(65) == "A".
                options = "".join(
                    f"({chr(65 + j)}) {choice}\n"
                    for j, choice in enumerate(sample["choices"])
                )
                data.append({
                    "id": f"mmlu_{subject}_{i}",
                    "prompt": f"{sample['question']}\n{options}",
                    "answer": chr(65 + sample["answer"]),
                    "subject": subject,
                    "source": "mmlu_stem",
                })
        except Exception as e:
            print(f" Warning: Skipping {subject}: {e}")

    # Take a random subset of 500. A private generator seeded with 42 draws
    # the same sample as seeding the module-level RNG would, without
    # clobbering global random state.
    if len(data) > 500:
        data = random.Random(42).sample(data, 500)

    save_json(data, DATA_DIR / "mmlu_stem.json")
    return data


def prepare_humaneval() -> list:
    """Prepare the HumanEval code dataset and save it as ``humaneval.json``.

    Returns:
        The list of converted records.
    """
    print("\n=== Downloading HumanEval ===")
    ds = load_dataset("openai/openai_humaneval", split="test")

    data = []
    for i, sample in enumerate(tqdm(ds, desc="Processing")):
        data.append({
            "id": f"humaneval_{i}",
            "prompt": sample["prompt"],
            "answer": sample["canonical_solution"],
            "entry_point": sample["entry_point"],
            "test": sample["test"],
            "source": "humaneval",
        })

    save_json(data, DATA_DIR / "humaneval.json")
    return data


def verify_data() -> None:
    """Print a per-file summary of every ``*.json`` file in ``DATA_DIR``.

    For each file the number of records and of distinct prompts is reported,
    followed by the first 60 characters of the first prompt. Files that are
    not a JSON list, and records without a ``prompt`` key, are reported as
    warnings instead of raising.
    """
    print("\n" + "=" * 60)
    print("Verifying Data Quality")
    print("=" * 60)

    for json_path in sorted(DATA_DIR.glob("*.json")):
        with open(json_path, encoding="utf-8") as fp:
            data = json.load(fp)

        if not isinstance(data, list):
            print(
                f" {json_path.name}: WARN: expected a JSON list, "
                f"got {type(data).__name__}"
            )
            continue

        # Check for unique prompts
        prompts = [
            d["prompt"] for d in data if isinstance(d, dict) and "prompt" in d
        ]
        unique = len(set(prompts))

        problems = []
        if unique != len(prompts):
            problems.append(f"{len(prompts)-unique} duplicates")
        if len(prompts) != len(data):
            problems.append(f"{len(data)-len(prompts)} without prompt")
        status = "OK" if not problems else "WARN: " + ", ".join(problems)

        print(f" {json_path.name}: {len(data)} samples, {unique} unique [{status}]")

        # Show first example
        if prompts:
            print(f" Example: {prompts[0][:60]}...")


def main() -> None:
    """Back up existing JSON files once, then download and verify all datasets."""
    print("=" * 60)
    print("RLVR Real Data Preparation")
    print("=" * 60)

    # Backup old data. This happens only on the first run, i.e. while the
    # backup directory does not exist yet; later runs overwrite in place.
    backup_dir = DATA_DIR / "backup_synthetic"
    # Materialize the listing before renaming so the directory is not
    # modified while it is still being iterated.
    existing = sorted(DATA_DIR.glob("*.json"))
    if existing and not backup_dir.exists():
        backup_dir.mkdir(exist_ok=True)
        for json_path in existing:
            json_path.rename(backup_dir / json_path.name)
        print(f"Backed up synthetic data to {backup_dir}")

    # Training data
    prepare_gsm8k_train()

    # Evaluation data
    prepare_gsm8k_test()
    prepare_math500()
    prepare_aime()
    prepare_amc()
    prepare_mmlu_stem()
    prepare_humaneval()

    # Verify
    verify_data()

    print("\n" + "=" * 60)
    print("Data preparation complete!")
    print("=" * 60)


if __name__ == "__main__":
    main()