#!/bin/bash
#SBATCH --job-name=rlvr_test
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=16
#SBATCH --gres=gpu:h200:4
#SBATCH --mem=200G
#SBATCH --time=04:00:00
#SBATCH --output=results/slurm_logs/test_%j.out
#SBATCH --error=results/slurm_logs/test_%j.err

# Quick smoke test for train_rlvr.py under DeepSpeed.
#
# Runs 3 training steps in fp32 and, only if that succeeds, 3 steps in bf16.
# The script exits non-zero (with the failing launcher's exit code) when either
# run fails, so the job's final state reflects the test outcome.
#
# NOTE(review): the --output/--error paths above are relative; presumably the
# results/slurm_logs directory has to exist in the submission directory before
# `sbatch` is invoked -- confirm on this cluster.

# Keep pipefail only: -e/-u are deliberately not enabled because ~/.bashrc and
# the conda activation hooks are sourced below and are outside our control.
set -o pipefail

PROJECT_DIR="/projects/bfqt/users/yurenh2/ml-projects/rl-floating-noise"
RESULTS_ROOT="/work/hdd/bfqt/yurenh2/rlvr_results"

# All script/data/config paths below are relative to the project directory,
# so there is no point continuing if we cannot get there.
cd "$PROJECT_DIR" || {
  echo "ERROR: cannot cd to $PROJECT_DIR" >&2
  exit 1
}

# shellcheck disable=SC1090 -- user rc file, provides the `conda` function.
source ~/.bashrc
conda activate rlvr-fp || {
  echo "ERROR: failed to activate conda environment rlvr-fp" >&2
  exit 1
}

export HF_HOME="/work/hdd/bfqt/yurenh2/huggingface_cache"
export HF_HUB_CACHE="/work/hdd/bfqt/yurenh2/huggingface_cache/hub"

echo "============================================"
echo "Quick test on $(hostname)"
echo "SLURM Job ID: $SLURM_JOB_ID"
nvidia-smi
echo "============================================"

# Ask torch how many GPUs are visible and refuse to launch with a bogus count
# (empty output if the import fails, or 0 if no device is visible).
NUM_GPUS=$(python -c 'import torch; print(torch.cuda.device_count())')
if ! [[ "$NUM_GPUS" =~ ^[1-9][0-9]*$ ]]; then
  echo "ERROR: could not detect any CUDA GPUs (got '${NUM_GPUS}')" >&2
  exit 1
fi
echo "Using $NUM_GPUS GPUs for DeepSpeed"

# Use random port to avoid conflicts
MASTER_PORT=$((29500 + RANDOM % 1000))
echo "Using master port: $MASTER_PORT"

#######################################
# Launch a 3-step training run for one precision mode.
# Globals:   NUM_GPUS, MASTER_PORT, RESULTS_ROOT (read)
# Arguments: $1 - precision mode passed to --precision_mode (fp32 or bf16);
#                 also used as the suffix of the output directory.
# Outputs:   progress and the launcher's exit code on stdout
# Returns:   the exit status of the deepspeed launcher
#######################################
run_precision_test() {
  local mode=$1
  local rc

  echo "Testing ${mode}..."
  deepspeed --num_gpus="$NUM_GPUS" --master_port="$MASTER_PORT" train_rlvr.py \
    --precision_mode "$mode" \
    --seed 1 \
    --output_dir "${RESULTS_ROOT}/test_${mode}" \
    --train_dataset_path data/dm_train.json \
    --model_name Qwen/Qwen2.5-Math-7B \
    --num_steps 3 \
    --deepspeed configs/deepspeed_zero3.json
  rc=$?

  echo "${mode} test exit code: $rc"
  return "$rc"
}

# Test fp32 with just 3 steps
run_precision_test fp32
FP32_EXIT=$?
if [[ "$FP32_EXIT" -ne 0 ]]; then
  echo "fp32 test FAILED"
  # Propagate the failure so the job is not reported as successful.
  exit "$FP32_EXIT"
fi
echo "fp32 test PASSED"

# Also test bf16
run_precision_test bf16
BF16_EXIT=$?
if [[ "$BF16_EXIT" -ne 0 ]]; then
  echo "bf16 test FAILED"
  exit "$BF16_EXIT"
fi

echo "============================================"
echo "ALL TESTS PASSED!"
echo "============================================"