# config.py """ Configuration definitions for RLVR floating-point precision experiments. This module defines configurations for: - Training parameters (DAPO algorithm, hyperparameters) - Precision settings (FP32 vs bf16) - Evaluation task specifications """ from dataclasses import dataclass, field from typing import List, Dict, Any, Optional import os @dataclass class TrainingConfig: """Configuration for RLVR training with DAPO algorithm.""" # Model specification model_name: str = "Qwen/Qwen2.5-Math-7B" # Precision mode: "fp32" for high precision, "bf16" for default RLVR precision_mode: str = "bf16" # Batch configuration (sized for single H200 GPU with gradient checkpointing) global_batch_size: int = 32 # Reduced from 256 for single GPU micro_batch_size: int = 4 # Reduced further for fp32 memory safety grad_accumulation_steps: int = 8 # Increased to maintain effective batch size # Rollout configuration num_rollouts_per_prompt: int = 4 # Reduced from 16 for speed max_seq_len: int = 2048 # Reduced from 8192 (GSM8K answers are short) # Training steps and checkpointing # Note: With sequential generation, each step takes ~18 min on H200 # 150 steps ≈ 45 hours, fits in 2-day limit with buffer num_steps: int = 150 checkpoint_steps: List[int] = field(default_factory=lambda: [0, 50, 100, 150]) # Optimizer configuration (AdamW) learning_rate: float = 1e-6 beta1: float = 0.9 beta2: float = 0.999 weight_decay: float = 0.01 # RL algorithm rl_algorithm: str = "dapo" clip_ratio: float = 0.2 # DAPO clip parameter kl_coef: float = 0.0 # DAPO uses clip-only, no explicit KL penalty # Reproducibility # IMPORTANT: Keep these constant across precision modes to isolate precision effects # We maximize determinism so the ONLY variance source is floating-point precision seed: int = 1 use_dropout: bool = False # Disabled to reduce stochasticity use_deterministic_algorithms: bool = True # Enabled for reproducibility # Paths output_dir: str = "./results/train_logs" train_dataset_path: str = "./data/dm_train.json" # GPU configuration (single GPU for this implementation) num_gpus: int = 1 # Current implementation is single-GPU def __post_init__(self): """ Note: We intentionally keep dropout and determinism settings CONSTANT across precision modes to isolate the effect of floating-point precision. Previously this coupled dropout=False with fp32 and dropout=True with bf16, which confounded precision effects with stochasticity effects. To study pure precision effects: - Both modes use the SAME dropout setting (default: False) - Both modes use the SAME determinism setting (default: False for speed) The only difference between fp32 and bf16 should be param_dtype. """ # Don't modify settings based on precision_mode - keep them independent pass @dataclass class PrecisionConfig: """Configuration for floating-point precision settings.""" # Parameter storage dtype param_dtype: str = "bfloat16" # "float32" or "bfloat16" # Automatic mixed precision use_amp: bool = True amp_dtype: str = "bfloat16" # "float16" or "bfloat16" # Gradient and optimizer state always in FP32 grad_dtype: str = "float32" optimizer_dtype: str = "float32" # Deterministic algorithms deterministic: bool = False # CUDNN settings cudnn_benchmark: bool = True cudnn_deterministic: bool = False @dataclass class EvalTaskConfig: """Configuration for a single evaluation task.""" # Task identification name: str = "" task_type: str = "math" # "math", "code", "qa", "general" # Dataset dataset_path: str = "" num_samples: int = -1 # -1 means use all samples # Whether task has verifiable answers (math problems) is_verifiable: bool = True # Metric type for non-verifiable tasks metric_type: str = "accuracy" # "accuracy", "bleu", "rouge", "score" # Generation parameters max_gen_len: int = 2048 temperature: float = 0.7 top_p: float = 0.8 num_samples_per_prompt: int = 1 @dataclass class ExperimentConfig: """Master configuration for the entire experiment.""" # Experiment identification experiment_name: str = "fp_precision_rlvr" # Seeds for multiple runs seeds: List[int] = field(default_factory=lambda: [1, 2, 3, 4, 5]) # Precision modes to compare precision_modes: List[str] = field(default_factory=lambda: ["fp32", "bf16"]) # Base model checkpoint (shared starting point) base_model_path: str = "Qwen/Qwen2.5-Math-7B" # Output directories base_output_dir: str = "./results" train_logs_dir: str = "./results/train_logs" checkpoints_dir: str = "./results/checkpoints" eval_metrics_dir: str = "./results/eval_metrics" # Evaluation configuration eval_tasks_config_path: str = "./configs/eval_tasks_config.json" # bf16 sparsity analysis bf16_sparsity_eta: float = 1e-3 def make_precision_config(precision_mode: str) -> PrecisionConfig: """ Create precision configuration based on mode. IMPORTANT: Only the precision-related settings differ between modes. All other settings (determinism, cudnn) are kept CONSTANT to isolate the effect of floating-point precision on training outcomes. Args: precision_mode: "fp32" for high precision, "bf16" for default RLVR Returns: PrecisionConfig with appropriate settings """ # Common settings for both modes (to avoid confounds) # Maximize determinism so precision is the ONLY source of variance common_settings = { "grad_dtype": "float32", "optimizer_dtype": "float32", "deterministic": True, # Enable deterministic algorithms "cudnn_benchmark": False, # Disable for reproducibility "cudnn_deterministic": True, # Enable for reproducibility } if precision_mode == "fp32": return PrecisionConfig( param_dtype="float32", use_amp=False, # No AMP needed for fp32 amp_dtype="float32", **common_settings ) elif precision_mode == "bf16": return PrecisionConfig( param_dtype="bfloat16", use_amp=True, amp_dtype="bfloat16", **common_settings ) else: raise ValueError(f"Unknown precision_mode: {precision_mode}") def make_training_config( precision_mode: str, seed: int, output_dir: str, train_dataset_path: str, model_name: str = "Qwen/Qwen2.5-Math-7B" ) -> TrainingConfig: """ Create training configuration for a specific run. Args: precision_mode: "fp32" or "bf16" seed: Random seed for this run output_dir: Directory to save outputs train_dataset_path: Path to training data model_name: HuggingFace model identifier Returns: TrainingConfig with all parameters set """ config = TrainingConfig( model_name=model_name, precision_mode=precision_mode, seed=seed, output_dir=output_dir, train_dataset_path=train_dataset_path ) return config def get_run_output_dir( base_dir: str, precision_mode: str, seed: int ) -> str: """Get output directory for a specific run.""" return os.path.join(base_dir, f"{precision_mode}_seed{seed}") def get_checkpoint_path( output_dir: str, step: Optional[int] = None ) -> str: """Get checkpoint path for a specific step (None = final).""" if step is None: return os.path.join(output_dir, "final_model") return os.path.join(output_dir, f"checkpoint_step{step}") # Default evaluation tasks for the experiment DEFAULT_EVAL_TASKS = [ # On-task: Training distribution EvalTaskConfig( name="dm_val", task_type="math", dataset_path="./data/dm_val.json", is_verifiable=True, metric_type="accuracy", max_gen_len=2048, temperature=0.7, top_p=0.8 ), # In-domain OOD: Math benchmarks EvalTaskConfig( name="aime24", task_type="math", dataset_path="./data/aime24.json", is_verifiable=True, metric_type="accuracy", max_gen_len=4096, temperature=0.7, top_p=0.8 ), EvalTaskConfig( name="aime25", task_type="math", dataset_path="./data/aime25.json", is_verifiable=True, metric_type="accuracy", max_gen_len=4096, temperature=0.7, top_p=0.8 ), EvalTaskConfig( name="amc23", task_type="math", dataset_path="./data/amc23.json", is_verifiable=True, metric_type="accuracy", max_gen_len=2048, temperature=0.7, top_p=0.8 ), EvalTaskConfig( name="math500", task_type="math", dataset_path="./data/math500.json", is_verifiable=True, metric_type="accuracy", max_gen_len=2048, temperature=0.7, top_p=0.8 ), # Off-domain: General tasks EvalTaskConfig( name="mmlu_stem", task_type="qa", dataset_path="./data/mmlu_stem.json", is_verifiable=True, metric_type="accuracy", max_gen_len=512, temperature=0.3, top_p=0.9 ), EvalTaskConfig( name="humaneval", task_type="code", dataset_path="./data/humaneval.json", is_verifiable=True, metric_type="accuracy", max_gen_len=1024, temperature=0.2, top_p=0.95 ), ]