blob: f66e73bcfe74343de7f6a8a900b8329252c3f0c5 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
|
#!/bin/bash
#SBATCH --job-name=rlvr_test
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=16
#SBATCH --gres=gpu:h200:4
#SBATCH --mem=200G
#SBATCH --time=04:00:00
#SBATCH --output=results/slurm_logs/test_%j.out
#SBATCH --error=results/slurm_logs/test_%j.err
# ---------------------------------------------------------------------------
# Quick smoke test for train_rlvr.py.
#
# Launches train_rlvr.py through the DeepSpeed launcher for 3 steps in fp32
# and, only if that succeeds, again in bf16. The script exits with the status
# of the first failing step so the scheduler sees a failed test as a failed
# job rather than a successful one.
#
# Note: `set -e` / `set -u` are deliberately not enabled here because
# ~/.bashrc and conda's activation hooks are sourced into this shell and are
# not guaranteed to be clean under those options; critical steps are checked
# explicitly instead.
# ---------------------------------------------------------------------------
set -o pipefail

PROJECT_DIR="/projects/bfqt/users/yurenh2/ml-projects/rl-floating-noise"
RESULTS_ROOT="/work/hdd/bfqt/yurenh2/rlvr_results"

# Print a message to stderr and abort the job with a non-zero status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# All script, data and config paths below are relative to the project root,
# so continuing after a failed cd would only produce confusing errors later.
cd "$PROJECT_DIR" || die "Cannot cd to project directory: $PROJECT_DIR"

source ~/.bashrc
# Without the environment, python/deepspeed would resolve to whatever happens
# to be on PATH (or to nothing at all).
conda activate rlvr-fp || die "Failed to activate conda environment: rlvr-fp"

export HF_HOME="/work/hdd/bfqt/yurenh2/huggingface_cache"
export HF_HUB_CACHE="/work/hdd/bfqt/yurenh2/huggingface_cache/hub"

echo "============================================"
echo "Quick test on $(hostname)"
echo "SLURM Job ID: $SLURM_JOB_ID"
nvidia-smi
echo "============================================"

# Ask torch how many GPUs are visible to this job and refuse to continue
# unless the answer is a positive integer.
NUM_GPUS=$(python -c 'import torch; print(torch.cuda.device_count())') \
  || die "Could not query the GPU count through torch"
[[ "$NUM_GPUS" =~ ^[1-9][0-9]*$ ]] \
  || die "No usable GPUs detected (torch reported: '${NUM_GPUS}')"
echo "Using $NUM_GPUS GPUs for DeepSpeed"

# Use random port to avoid conflicts
MASTER_PORT=$((29500 + RANDOM % 1000))
echo "Using master port: $MASTER_PORT"

#######################################
# Run a 3-step training smoke test in one precision mode.
# Globals:   NUM_GPUS, MASTER_PORT, RESULTS_ROOT (read)
# Arguments: $1 - value passed to --precision_mode (e.g. fp32, bf16);
#                 also used as the suffix of the output directory
# Outputs:   progress lines and the launcher's exit code on stdout
# Returns:   the exit status of the deepspeed launcher
#######################################
run_precision_test() {
  local mode=$1
  local status

  echo "Testing ${mode}..."
  deepspeed --num_gpus="$NUM_GPUS" --master_port="$MASTER_PORT" train_rlvr.py \
    --precision_mode "$mode" \
    --seed 1 \
    --output_dir "${RESULTS_ROOT}/test_${mode}" \
    --train_dataset_path data/dm_train.json \
    --model_name Qwen/Qwen2.5-Math-7B \
    --num_steps 3 \
    --deepspeed configs/deepspeed_zero3.json
  status=$?

  echo "${mode} test exit code: $status"
  return "$status"
}

# Test fp32 with just 3 steps
run_precision_test fp32
FP32_EXIT=$?
if (( FP32_EXIT != 0 )); then
  echo "fp32 test FAILED"
  # Propagate the failure; otherwise the script's status would be that of the
  # final echo (0) and the job would be reported as successful.
  exit "$FP32_EXIT"
fi
echo "fp32 test PASSED"

# Also test bf16
run_precision_test bf16
BF16_EXIT=$?
if (( BF16_EXIT != 0 )); then
  echo "bf16 test FAILED"
  exit "$BF16_EXIT"
fi

echo "============================================"
echo "ALL TESTS PASSED!"
echo "============================================"
|