blob: 38b2fc897d271d71c5a41ded772116f92748b6ef (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
|
#!/bin/bash
# run_training.sh
# Script to run one RLVR training experiment for a given precision mode and seed.
#
# Usage:
#   ./run_training.sh [PRECISION_MODE] [SEED]
#
# Positional arguments:
#   PRECISION_MODE  Forwarded to train_rlvr.py as --precision_mode (default: bf16)
#   SEED            Forwarded to train_rlvr.py as --seed (default: 1)
#
# Environment overrides (each falls back to the default shown):
#   CUDA_VISIBLE_DEVICES  0,1
#   TRAIN_DATA            ./data/dm_train.json
#   OUTPUT_BASE           ./results/train_logs
#   MODEL_NAME            Qwen/Qwen2.5-Math-7B
#   NUM_STEPS             300
#   PYTHONUNBUFFERED      1
#
# Results are written to ${OUTPUT_BASE}/${PRECISION_MODE}_seed${SEED}/, which
# also receives training.log, a copy of everything the trainer prints.
# train_rlvr.py is resolved relative to the current working directory.
# On failure the script exits with the status of the training pipeline.

# -e: stop at the first failing command.
# -u: treat a reference to an unset variable as an error.
# pipefail: a failure of the trainer is not masked by the exit status of tee.
set -euo pipefail

# Configuration
export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0,1"}

# The trainer's output is piped into tee, so Python would otherwise
# block-buffer stdout: progress would show up late, out of order with stderr,
# and could be lost if the job is killed. Default to unbuffered output.
export PYTHONUNBUFFERED=${PYTHONUNBUFFERED:-1}

# HuggingFace cache - use shared HDD storage to avoid quota issues
export HF_HOME="/work/hdd/bfqt/yurenh2/huggingface_cache"
export HF_HUB_CACHE="/work/hdd/bfqt/yurenh2/huggingface_cache/hub"
mkdir -p "$HF_HOME" "$HF_HUB_CACHE"

# Default values
PRECISION_MODE=${1:-"bf16"}
SEED=${2:-1}
TRAIN_DATA=${TRAIN_DATA:-"./data/dm_train.json"}
OUTPUT_BASE=${OUTPUT_BASE:-"./results/train_logs"}
MODEL_NAME=${MODEL_NAME:-"Qwen/Qwen2.5-Math-7B"}
NUM_STEPS=${NUM_STEPS:-300}

# Create output directory
OUTPUT_DIR="${OUTPUT_BASE}/${PRECISION_MODE}_seed${SEED}"
LOG_FILE="${OUTPUT_DIR}/training.log"
mkdir -p "$OUTPUT_DIR"

# Summary banner of the effective settings for this run.
printf '%s\n' \
  "==============================================" \
  "RLVR Training" \
  "==============================================" \
  "Precision Mode: $PRECISION_MODE" \
  "Seed: $SEED" \
  "Model: $MODEL_NAME" \
  "Training Data: $TRAIN_DATA" \
  "Output: $OUTPUT_DIR" \
  "Num Steps: $NUM_STEPS" \
  "=============================================="

# Build the command as an array so every argument stays a single word even if
# a path or model name contains whitespace.
train_cmd=(
  python train_rlvr.py
  --precision_mode "$PRECISION_MODE"
  --seed "$SEED"
  --output_dir "$OUTPUT_DIR"
  --train_dataset_path "$TRAIN_DATA"
  --model_name "$MODEL_NAME"
  --num_steps "$NUM_STEPS"
)

# Run training, mirroring stdout and stderr to both the console and the log.
# Capture the pipeline status instead of letting `set -e` abort silently, so
# the failure is reported and the original exit code is preserved.
train_status=0
"${train_cmd[@]}" 2>&1 | tee "$LOG_FILE" || train_status=$?

if (( train_status != 0 )); then
  printf 'Training failed with exit code %d. See log: %s\n' \
    "$train_status" "$LOG_FILE" >&2
  exit "$train_status"
fi

echo "Training complete. Output saved to: $OUTPUT_DIR"
|