#!/bin/bash
#SBATCH --job-name=rlvr_fp_exp
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=16
#SBATCH --gres=gpu:h200:4
#SBATCH --mem=200G
#SBATCH --time=2-00:00:00
# NOTE: results/slurm_logs must already exist when the job starts, or SLURM cannot
# open these log files (the mkdir further below runs too late for this).
#SBATCH --output=results/slurm_logs/%x_%j.out
#SBATCH --error=results/slurm_logs/%x_%j.err
#SBATCH --mail-type=BEGIN,END,FAIL
# NOTE: replace with a real address; variables such as $USER are not expanded in #SBATCH lines
#SBATCH --mail-user=$USER@example.com

# Propagate exit codes through pipes so that $? after "cmd | tee" reflects cmd, not tee
set -o pipefail

# ============================================
# RLVR Floating-Point Precision Experiment
# H200x8 SLURM Job Script
# ============================================

# Configuration - modify these as needed
PRECISION_MODE=${PRECISION_MODE:-"bf16"}
SEED=${SEED:-1}
NUM_STEPS=${NUM_STEPS:-150}  # ~45 hours on H200 with sequential generation

# Paths
PROJECT_DIR="/projects/bfqt/users/yurenh2/ml-projects/rl-floating-noise"
CONDA_ENV="rlvr-fp"  # Change to your conda env name
MODEL_NAME="Qwen/Qwen2.5-Math-7B"
TRAIN_DATA="${PROJECT_DIR}/data/dm_train.json"

# ============================================
# HuggingFace cache configuration
# Use shared HDD storage to avoid home directory quota issues
# This cache is shared across all projects
# ============================================
export HF_HOME="/work/hdd/bfqt/yurenh2/huggingface_cache"
export HF_HUB_CACHE="/work/hdd/bfqt/yurenh2/huggingface_cache/hub"
mkdir -p "$HF_HOME" "$HF_HUB_CACHE"

# Print job info
echo "============================================"
echo "SLURM Job ID: $SLURM_JOB_ID"
echo "Running on: $(hostname)"
echo "Start time: $(date)"
echo "============================================"
echo "Precision Mode: $PRECISION_MODE"
echo "Seed: $SEED"
echo "Num Steps: $NUM_STEPS"
echo "GPUs: $CUDA_VISIBLE_DEVICES"
echo "HF Cache: $HF_HOME"
echo "============================================"

# Setup environment
cd "$PROJECT_DIR"
mkdir -p results/slurm_logs

# Activate conda environment (assumes conda is initialized in ~/.bashrc)
source ~/.bashrc

# Check if the conda environment exists
if conda env list | grep -q "^${CONDA_ENV} "; then
    echo "Activating existing conda environment: $CONDA_ENV"
    conda activate "$CONDA_ENV"
else
    echo "ERROR: Conda environment '$CONDA_ENV' does not exist!"
    echo "Please create it first by running:"
    echo "  conda create -n $CONDA_ENV python=3.10 -y"
    echo "  conda activate $CONDA_ENV"
    echo "  pip install -r requirements.txt"
    exit 1
fi

# Verify activation succeeded
if [[ "$CONDA_DEFAULT_ENV" != "$CONDA_ENV" ]]; then
    echo "ERROR: Failed to activate conda environment '$CONDA_ENV'"
    exit 1
fi
echo "Conda environment activated: $CONDA_DEFAULT_ENV"

# Check GPU availability
nvidia-smi
echo "CUDA devices: $(python -c 'import torch; print(torch.cuda.device_count())')"

# Output directory (use /work for large checkpoints - /projects is limited)
OUTPUT_DIR="/work/hdd/bfqt/yurenh2/rlvr_results/${PRECISION_MODE}_seed${SEED}"
mkdir -p "$OUTPUT_DIR"

# DeepSpeed config (ZeRO-3 for full sharding of model/optimizer/gradients)
DEEPSPEED_CONFIG="${PROJECT_DIR}/configs/deepspeed_zero3.json"

# Number of GPUs for DeepSpeed (all GPUs; the ref model sits on the same GPU as training per rank)
NUM_GPUS=$(python -c 'import torch; print(torch.cuda.device_count())')
echo "Using $NUM_GPUS GPUs for DeepSpeed training (ref model on each rank's GPU)"

# Use a random port to avoid conflicts with other jobs on the same node
MASTER_PORT=$((29500 + RANDOM % 1000))
echo "Using master port: $MASTER_PORT"
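# Sanity-check that the DeepSpeed config exists before launching. For reference only,
# a minimal ZeRO-3 sketch (not the project's tuned file) typically sets keys like
#   {"zero_optimization": {"stage": 3},
#    "train_micro_batch_size_per_gpu": "auto",
#    "gradient_accumulation_steps": "auto"}
# The authoritative settings live in configs/deepspeed_zero3.json.
if [ ! -f "$DEEPSPEED_CONFIG" ]; then
    echo "ERROR: DeepSpeed config not found at $DEEPSPEED_CONFIG"
    exit 1
fi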
echo "============================================" deepspeed --num_gpus=$NUM_GPUS --master_port=$MASTER_PORT train_rlvr.py \ --precision_mode "$PRECISION_MODE" \ --seed "$SEED" \ --output_dir "$OUTPUT_DIR" \ --train_dataset_path "$TRAIN_DATA" \ --model_name "$MODEL_NAME" \ --num_steps "$NUM_STEPS" \ --deepspeed "$DEEPSPEED_CONFIG" \ 2>&1 | tee "${OUTPUT_DIR}/training_slurm.log" # IMPORTANT: With set -o pipefail, $? now captures python's exit code, not tee's TRAIN_EXIT_CODE=$? echo "" echo "============================================" echo "Training completed with exit code: $TRAIN_EXIT_CODE" echo "End time: $(date)" echo "============================================" # If training succeeded, run evaluation if [ $TRAIN_EXIT_CODE -eq 0 ]; then echo "" echo "Starting evaluation..." echo "============================================" python eval_policy.py \ --base_ckpt "$MODEL_NAME" \ --ft_ckpt "${OUTPUT_DIR}/final_model" \ --eval_tasks_config configs/eval_tasks_config.json \ --output_path "results/eval_metrics/${PRECISION_MODE}_seed${SEED}.json" \ --eval_base \ --use_amp \ 2>&1 | tee "${OUTPUT_DIR}/eval_slurm.log" fi echo "" echo "Job completed at: $(date)"