summary refs log tree commit diff
path: root/scripts/test_quick.sh
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/test_quick.sh')
-rw-r--r-- scripts/test_quick.sh 78
1 files changed, 78 insertions, 0 deletions
diff --git a/scripts/test_quick.sh b/scripts/test_quick.sh
new file mode 100644
index 0000000..f66e73b
--- /dev/null
+++ b/scripts/test_quick.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#SBATCH --job-name=rlvr_test
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=16
+#SBATCH --gres=gpu:h200:4
+#SBATCH --mem=200G
+#SBATCH --time=04:00:00
+#SBATCH --output=results/slurm_logs/test_%j.out
+#SBATCH --error=results/slurm_logs/test_%j.err
+
+# Quick smoke test: 3 training steps in fp32, then bf16; exits non-zero on the
+# first failure. No `set -e`: each run's exit code is captured and reported.
+set -o pipefail
+
+die() { printf '%s\n' "$*" >&2; exit 1; }
+
+PROJECT_DIR="/projects/bfqt/users/yurenh2/ml-projects/rl-floating-noise"
+# train_rlvr.py, data/ and configs/ are given relative to the project root.
+cd "$PROJECT_DIR" || die "Cannot cd to $PROJECT_DIR"
+
+source ~/.bashrc
+conda activate rlvr-fp || die "Cannot activate conda environment rlvr-fp"
+
+export HF_HOME="/work/hdd/bfqt/yurenh2/huggingface_cache"
+export HF_HUB_CACHE="/work/hdd/bfqt/yurenh2/huggingface_cache/hub"
+
+echo "============================================"
+echo "Quick test on $(hostname)"
+echo "SLURM Job ID: $SLURM_JOB_ID"
+nvidia-smi
+echo "============================================"
+
+NUM_GPUS=$(python -c 'import torch; print(torch.cuda.device_count())')
+# Guard against a failed probe (empty output) or a node with no visible GPU.
+[[ "$NUM_GPUS" =~ ^[1-9][0-9]*$ ]] || die "No usable GPUs detected (got '$NUM_GPUS')"
+echo "Using $NUM_GPUS GPUs for DeepSpeed"
+
+# Use random port to avoid conflicts
+MASTER_PORT=$((29500 + RANDOM % 1000))
+echo "Using master port: $MASTER_PORT"
+
+# Run a 3-step training job in one precision mode and report its exit code.
+# Arguments: $1 - precision mode (fp32 or bf16)
+# Returns:   the exit status of the deepspeed launcher
+run_test() {
+  local mode=$1 status
+  echo "Testing ${mode}..."
+  deepspeed --num_gpus="$NUM_GPUS" --master_port="$MASTER_PORT" train_rlvr.py \
+    --precision_mode "$mode" \
+    --seed 1 \
+    --output_dir "/work/hdd/bfqt/yurenh2/rlvr_results/test_${mode}" \
+    --train_dataset_path data/dm_train.json \
+    --model_name Qwen/Qwen2.5-Math-7B \
+    --num_steps 3 \
+    --deepspeed configs/deepspeed_zero3.json
+  status=$?
+  echo "${mode} test exit code: $status"
+  return "$status"
+}
+
+if ! run_test fp32; then
+  echo "fp32 test FAILED"
+  exit 1
+fi
+echo "fp32 test PASSED"
+
+# Also test bf16
+if ! run_test bf16; then
+  echo "bf16 test FAILED"
+  exit 1
+fi
+
+echo "============================================"
+echo "ALL TESTS PASSED!"
+echo "============================================"