diff options
Diffstat (limited to 'scripts/test_quick.sh')
| -rw-r--r-- | scripts/test_quick.sh | 78 |
1 files changed, 78 insertions, 0 deletions
#!/bin/bash
#SBATCH --job-name=rlvr_test
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=16
#SBATCH --gres=gpu:h200:4
#SBATCH --mem=200G
#SBATCH --time=04:00:00
#SBATCH --output=results/slurm_logs/test_%j.out
#SBATCH --error=results/slurm_logs/test_%j.err
#
# scripts/test_quick.sh
#
# Quick smoke test: runs train_rlvr.py under DeepSpeed for 3 steps in fp32
# and, if that succeeds, again in bf16.
#
# Exit status: 0 only when both runs succeed; otherwise the exit code of the
# first failing step, so the failure is visible to whoever launched the
# script instead of being hidden behind a trailing `echo`.
#
# NOTE(review): the --output/--error paths above are relative, so the
# results/slurm_logs directory presumably has to exist in the submission
# directory before `sbatch` is called -- confirm against the cluster setup.

# No `set -eu` on purpose: ~/.bashrc and conda's activation scripts are
# sourced below and are not guaranteed to be clean under those options.
set -o pipefail

readonly PROJECT_DIR="/projects/bfqt/users/yurenh2/ml-projects/rl-floating-noise"
readonly RESULTS_ROOT="/work/hdd/bfqt/yurenh2/rlvr_results"
readonly SEPARATOR="============================================"

# Print a message to stderr and abort the job with a non-zero status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Run a 3-step training smoke test for one precision mode.
# Globals:   NUM_GPUS, MASTER_PORT, RESULTS_ROOT (read)
# Arguments: $1 - precision mode passed to --precision_mode (fp32 | bf16)
# Returns:   the exit status of the deepspeed launcher
#######################################
run_precision_test() {
  local mode=$1
  deepspeed --num_gpus="$NUM_GPUS" --master_port="$MASTER_PORT" train_rlvr.py \
    --precision_mode "$mode" \
    --seed 1 \
    --output_dir "${RESULTS_ROOT}/test_${mode}" \
    --train_dataset_path data/dm_train.json \
    --model_name Qwen/Qwen2.5-Math-7B \
    --num_steps 3 \
    --deepspeed configs/deepspeed_zero3.json
}

# All paths passed to deepspeed below are relative to the project root, so a
# failed cd must stop the job rather than run from the wrong directory.
cd "$PROJECT_DIR" || die "Cannot cd to project directory: $PROJECT_DIR"

# shellcheck disable=SC1090
source ~/.bashrc
# Kept non-fatal: the environment may already be inherited from the
# submitting shell, but make the failure visible in the error log.
conda activate rlvr-fp \
  || printf 'WARNING: "conda activate rlvr-fp" failed; using current environment\n' >&2

export HF_HOME="/work/hdd/bfqt/yurenh2/huggingface_cache"
export HF_HUB_CACHE="/work/hdd/bfqt/yurenh2/huggingface_cache/hub"

echo "$SEPARATOR"
echo "Quick test on $(hostname)"
echo "SLURM Job ID: $SLURM_JOB_ID"
nvidia-smi
echo "$SEPARATOR"

NUM_GPUS=$(python -c 'import torch; print(torch.cuda.device_count())') \
  || die "Failed to query the GPU count through torch"
# Reject empty / zero / non-numeric output before handing it to deepspeed.
[[ "$NUM_GPUS" =~ ^[1-9][0-9]*$ ]] \
  || die "No usable GPUs detected (torch reported: '${NUM_GPUS}')"
echo "Using $NUM_GPUS GPUs for DeepSpeed"

# Use random port to avoid conflicts
MASTER_PORT=$((29500 + RANDOM % 1000))
echo "Using master port: $MASTER_PORT"

# Test fp32 with just 3 steps
echo "Testing fp32..."
run_precision_test fp32
FP32_EXIT=$?
echo "fp32 test exit code: $FP32_EXIT"

if [[ "$FP32_EXIT" -ne 0 ]]; then
  echo "fp32 test FAILED"
  exit "$FP32_EXIT"
fi
echo "fp32 test PASSED"

# Also test bf16
echo "Testing bf16..."
run_precision_test bf16
BF16_EXIT=$?
echo "bf16 test exit code: $BF16_EXIT"

if [[ "$BF16_EXIT" -ne 0 ]]; then
  echo "bf16 test FAILED"
  exit "$BF16_EXIT"
fi

echo "$SEPARATOR"
echo "ALL TESTS PASSED!"
echo "$SEPARATOR"
