summaryrefslogtreecommitdiff
path: root/scripts/test_quick.sh
blob: f66e73bcfe74343de7f6a8a900b8329252c3f0c5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/bin/bash
#SBATCH --job-name=rlvr_test
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=16
#SBATCH --gres=gpu:h200:4
#SBATCH --mem=200G
#SBATCH --time=04:00:00
#SBATCH --output=results/slurm_logs/test_%j.out
#SBATCH --error=results/slurm_logs/test_%j.err
#
# Quick smoke test for train_rlvr.py.
#
# Requests a single node with 4 H200 GPUs for up to 4 hours, then launches a
# 3-step DeepSpeed run in fp32 and, only if that succeeds, a 3-step run in
# bf16. Both runs use seed 1 and the same dataset, model and DeepSpeed config.
#
# Usage: submit with sbatch.
#
# NOTE(review): the --output/--error paths above are relative, so they
# presumably resolve against the directory the job is submitted from, and
# results/slurm_logs/ has to exist beforehand -- confirm for this cluster.

# Make a pipeline fail when any of its stages fails, not only the last one.
# -e / -u are intentionally not enabled here: ~/.bashrc and the conda
# activation hooks are outside this script's control and may not be written
# for strict mode, and the test section inspects $? explicitly after each run.
set -o pipefail

PROJECT_DIR="/projects/bfqt/users/yurenh2/ml-projects/rl-floating-noise"

# Every relative path used later (train_rlvr.py, data/, configs/) is resolved
# from the project root, so continuing after a failed cd would run the wrong
# files or none at all.
cd "$PROJECT_DIR" || {
    echo "ERROR: cannot cd to $PROJECT_DIR" >&2
    exit 1
}

# ~/.bashrc is expected to define the `conda` shell function.
# shellcheck source=/dev/null
source ~/.bashrc

# Abort early when the environment is missing (or `conda` was never defined),
# instead of failing later with a confusing Python/DeepSpeed error.
conda activate rlvr-fp || {
    echo "ERROR: failed to activate conda environment rlvr-fp" >&2
    exit 1
}

# Point the Hugging Face caches at the work filesystem.
export HF_HOME="/work/hdd/bfqt/yurenh2/huggingface_cache"
export HF_HUB_CACHE="${HF_HOME}/hub"

# Job banner: record where the job runs and what the GPUs look like.
echo "============================================"
echo "Quick test on $(hostname)"
echo "SLURM Job ID: $SLURM_JOB_ID"
nvidia-smi
echo "============================================"

# Ask torch how many CUDA devices are visible; this count is handed to the
# DeepSpeed launcher.
NUM_GPUS=$(python -c 'import torch; print(torch.cuda.device_count())')

# The probe prints nothing when python/torch fails and "0" when no device is
# visible. Either value would only produce a confusing launcher error later,
# so stop here with a clear message.
if ! [[ "$NUM_GPUS" =~ ^[1-9][0-9]*$ ]]; then
    echo "ERROR: torch did not report any usable GPU (got '${NUM_GPUS}')" >&2
    exit 1
fi
echo "Using $NUM_GPUS GPUs for DeepSpeed"

# Use random port to avoid conflicts
# (picked from 29500-30499).
MASTER_PORT=$((29500 + RANDOM % 1000))
echo "Using master port: $MASTER_PORT"

#######################################
# Run a 3-step training smoke test in one precision mode.
# Globals:   NUM_GPUS (read), MASTER_PORT (read)
# Arguments: $1 - precision mode passed to --precision_mode (fp32 or bf16);
#                 also used as the suffix of the output directory
# Outputs:   progress lines and the launcher's exit code on stdout
# Returns:   the exit status of the deepspeed launcher
#######################################
run_precision_test() {
    local mode=$1
    local rc

    echo "Testing ${mode}..."
    deepspeed --num_gpus="$NUM_GPUS" --master_port="$MASTER_PORT" train_rlvr.py \
        --precision_mode "$mode" \
        --seed 1 \
        --output_dir "/work/hdd/bfqt/yurenh2/rlvr_results/test_${mode}" \
        --train_dataset_path data/dm_train.json \
        --model_name Qwen/Qwen2.5-Math-7B \
        --num_steps 3 \
        --deepspeed configs/deepspeed_zero3.json
    rc=$?

    echo "${mode} test exit code: $rc"
    return "$rc"
}

# Test fp32 with just 3 steps
run_precision_test fp32
FP32_EXIT=$?

if (( FP32_EXIT != 0 )); then
    echo "fp32 test FAILED"
    # Propagate the failure so the scheduler does not record the job as
    # successful; bf16 is skipped, as before.
    exit "$FP32_EXIT"
fi
echo "fp32 test PASSED"

# Also test bf16
run_precision_test bf16
BF16_EXIT=$?

if (( BF16_EXIT != 0 )); then
    echo "bf16 test FAILED"
    exit "$BF16_EXIT"
fi

echo "============================================"
echo "ALL TESTS PASSED!"
echo "============================================"
exit 0