| author | YurenHao0426 <blackhao0426@gmail.com> | 2026-02-04 18:59:35 -0600 |
|---|---|---|
| committer | YurenHao0426 <blackhao0426@gmail.com> | 2026-02-04 18:59:35 -0600 |
| commit | f1c2cc22d46a6976df3555391e667c7e61592fad (patch) | |
| tree | 0b37b52c8ff91042a742d3b3ec54542cb6d6e2f6 /scripts/slurm_train.sh | |
Diffstat (limited to 'scripts/slurm_train.sh')
| -rwxr-xr-x | scripts/slurm_train.sh | 145 |
1 files changed, 145 insertions, 0 deletions
diff --git a/scripts/slurm_train.sh b/scripts/slurm_train.sh
new file mode 100755
index 0000000..36bd5b1
--- /dev/null
+++ b/scripts/slurm_train.sh
@@ -0,0 +1,145 @@
+#!/bin/bash
+#SBATCH --job-name=rlvr_fp_exp
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=16
+#SBATCH --gres=gpu:h200:4
+#SBATCH --mem=200G
+#SBATCH --time=2-00:00:00
+#SBATCH --output=results/slurm_logs/%x_%j.out
+#SBATCH --error=results/slurm_logs/%x_%j.err
+#SBATCH --mail-type=BEGIN,END,FAIL
+#SBATCH --mail-user=$USER@example.com
+
+# Propagate failures through pipes so tee cannot mask the training exit code
+set -o pipefail
+
+# ============================================
+# RLVR Floating-Point Precision Experiment
+# H200x8 SLURM Job Script
+# ============================================
+
+# Configuration - modify these as needed
+PRECISION_MODE=${PRECISION_MODE:-"bf16"}
+SEED=${SEED:-1}
+NUM_STEPS=${NUM_STEPS:-150}  # ~45 hours on H200 with sequential generation
+
+# Paths
+PROJECT_DIR="/projects/bfqt/users/yurenh2/ml-projects/rl-floating-noise"
+CONDA_ENV="rlvr-fp"  # Change to your conda env name
+MODEL_NAME="Qwen/Qwen2.5-Math-7B"
+TRAIN_DATA="${PROJECT_DIR}/data/dm_train.json"
+
+# ============================================
+# HuggingFace cache configuration
+# Use shared HDD storage to avoid home directory quota issues
+# This cache is shared across all projects
+# ============================================
+export HF_HOME="/work/hdd/bfqt/yurenh2/huggingface_cache"
+export HF_HUB_CACHE="/work/hdd/bfqt/yurenh2/huggingface_cache/hub"
+mkdir -p "$HF_HOME" "$HF_HUB_CACHE"
+
+# Print job info
+echo "============================================"
+echo "SLURM Job ID: $SLURM_JOB_ID"
+echo "Running on: $(hostname)"
+echo "Start time: $(date)"
+echo "============================================"
+echo "Precision Mode: $PRECISION_MODE"
+echo "Seed: $SEED"
+echo "Num Steps: $NUM_STEPS"
+echo "GPUs: $CUDA_VISIBLE_DEVICES"
+echo "HF Cache: $HF_HOME"
+echo "============================================"
+
+# Setup environment
+cd "$PROJECT_DIR"
+mkdir -p results/slurm_logs
+
+# Activate conda environment
+source ~/.bashrc
+
+# Check if conda environment exists
+if conda env list | grep -q "^${CONDA_ENV} "; then
+    echo "Activating existing conda environment: $CONDA_ENV"
+    conda activate "$CONDA_ENV"
+else
+    echo "ERROR: Conda environment '$CONDA_ENV' does not exist!"
+    echo "Please create it first by running:"
+    echo "  conda create -n $CONDA_ENV python=3.10 -y"
+    echo "  conda activate $CONDA_ENV"
+    echo "  pip install -r requirements.txt"
+    exit 1
+fi
+
+# Verify activation succeeded
+if [[ "$CONDA_DEFAULT_ENV" != "$CONDA_ENV" ]]; then
+    echo "ERROR: Failed to activate conda environment '$CONDA_ENV'"
+    exit 1
+fi
+echo "Conda environment activated: $CONDA_DEFAULT_ENV"
+
+# Check GPU availability
+nvidia-smi
+echo "CUDA devices: $(python -c 'import torch; print(torch.cuda.device_count())')"
+
+# Output directory (use /work for large checkpoints - /projects is limited)
+OUTPUT_DIR="/work/hdd/bfqt/yurenh2/rlvr_results/${PRECISION_MODE}_seed${SEED}"
+mkdir -p "$OUTPUT_DIR"
+
+# DeepSpeed config (ZeRO-3 for full sharding of model/optimizer/gradients)
+DEEPSPEED_CONFIG="${PROJECT_DIR}/configs/deepspeed_zero3.json"
+
+# Number of GPUs for DeepSpeed (all GPUs, ref model on same GPU as training per rank)
+NUM_GPUS=$(python -c 'import torch; print(torch.cuda.device_count())')
+echo "Using $NUM_GPUS GPUs for DeepSpeed training (ref model on each rank's GPU)"
+
+# Use random port to avoid conflicts with other jobs
+MASTER_PORT=$((29500 + RANDOM % 1000))
+echo "Using master port: $MASTER_PORT"
+
+# Run training with DeepSpeed
+echo ""
+echo "Starting training with DeepSpeed ZeRO-3..."
+echo "============================================"
+
+deepspeed --num_gpus=$NUM_GPUS --master_port=$MASTER_PORT train_rlvr.py \
+    --precision_mode "$PRECISION_MODE" \
+    --seed "$SEED" \
+    --output_dir "$OUTPUT_DIR" \
+    --train_dataset_path "$TRAIN_DATA" \
+    --model_name "$MODEL_NAME" \
+    --num_steps "$NUM_STEPS" \
+    --deepspeed "$DEEPSPEED_CONFIG" \
+    2>&1 | tee "${OUTPUT_DIR}/training_slurm.log"
+
+# IMPORTANT: With set -o pipefail, $? captures the training command's exit code, not tee's
+TRAIN_EXIT_CODE=$?
+
+echo ""
+echo "============================================"
+echo "Training completed with exit code: $TRAIN_EXIT_CODE"
+echo "End time: $(date)"
+echo "============================================"
+
+# If training succeeded, run evaluation
+if [ $TRAIN_EXIT_CODE -eq 0 ]; then
+    echo ""
+    echo "Starting evaluation..."
+    echo "============================================"
+
+    python eval_policy.py \
+        --base_ckpt "$MODEL_NAME" \
+        --ft_ckpt "${OUTPUT_DIR}/final_model" \
+        --eval_tasks_config configs/eval_tasks_config.json \
+        --output_path "results/eval_metrics/${PRECISION_MODE}_seed${SEED}.json" \
+        --eval_base \
+        --use_amp \
+        2>&1 | tee "${OUTPUT_DIR}/eval_slurm.log"
+fi
+
+echo ""
+echo "Job completed at: $(date)"

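The launch command references configs/deepspeed_zero3.json, which is not included in this commit. Purely as a sketch of what such a file might contain (every key below is an assumption, not the repository's actual config; the "auto" placeholders only resolve when the training script goes through the Hugging Face Trainer's DeepSpeed integration, otherwise concrete numbers are required):

# Hypothetical ZeRO-3 config matching the "full sharding" comment above;
# the real configs/deepspeed_zero3.json in the repository may differ.
mkdir -p configs
cat > configs/deepspeed_zero3.json <<'EOF'
{
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "zero_optimization": {
    "stage": 3,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "stage3_gather_16bit_weights_on_model_save": true
  },
  "bf16": { "enabled": "auto" }
}
EOF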