summaryrefslogtreecommitdiff
path: root/scripts/submit_all_jobs.sh
blob: 86c0f5d6b04ac2c5e442aeed30e6eeca47bd0074 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/bin/bash
# submit_all_jobs.sh
# Submit all experiment jobs to SLURM queue
# Jobs will run automatically when resources become available

# Fail fast: abort on the first failing command (-e) and treat a pipeline as
# failed when ANY of its stages fails (pipefail). With plain `set -e` only the
# exit status of the last stage counts, so a failing command whose output is
# piped into a filter would go unnoticed and the script would carry on.
set -eo pipefail

# Absolute path of the project checkout. Every relative path used later in
# this script (log directory, SLURM batch script) is resolved against it.
PROJECT_DIR="/projects/bfqt/users/yurenh2/ml-projects/rl-floating-noise"
cd "$PROJECT_DIR"

# Create log directory (no-op if it already exists)
mkdir -p results/slurm_logs

# Experiment grid: one job is submitted for every (precision mode, seed) pair.
SEEDS=(1 2 3 4 5)
PRECISION_MODES=("fp32" "bf16")

# Announce the planned sweep; the job total is the size of the full grid.
printf '%s\n' \
    "============================================" \
    "Submitting RLVR Experiment Jobs" \
    "============================================" \
    "Seeds: ${SEEDS[*]}" \
    "Precision Modes: ${PRECISION_MODES[*]}" \
    "Total jobs: $(( ${#SEEDS[@]} * ${#PRECISION_MODES[@]} ))" \
    "============================================"

# Track submitted job IDs
declare -a JOB_IDS=()

#######################################
# Submit one training job to SLURM and print its job ID.
# Arguments:
#   $1 - precision mode, exported to the job as PRECISION_MODE
#   $2 - random seed, exported to the job as SEED
# Outputs:
#   The numeric job ID on stdout; diagnostics on stderr.
# Returns:
#   0 on success, 1 if sbatch fails or does not report a numeric job ID.
#######################################
submit_job() {
    local precision=$1
    local seed=$2
    local job_name="rlvr_${precision}_s${seed}"
    local sbatch_out job_id

    # --parsable makes sbatch print just "jobid[;cluster]" instead of a
    # human-readable sentence. Capturing the output directly (no pipe) keeps
    # sbatch's own exit status visible to the `if`.
    if ! sbatch_out=$(sbatch --parsable \
        --job-name="$job_name" \
        --export=ALL,PRECISION_MODE="$precision",SEED="$seed" \
        scripts/slurm_train.sh); then
        echo "ERROR: sbatch failed for $job_name" >&2
        return 1
    fi

    # Drop the optional ";cluster" suffix and insist on a numeric ID so that
    # an empty or malformed value is never recorded as a submitted job.
    job_id=${sbatch_out%%;*}
    if [[ ! "$job_id" =~ ^[0-9]+$ ]]; then
        echo "ERROR: unexpected sbatch output for $job_name: '$sbatch_out'" >&2
        return 1
    fi

    printf '%s\n' "$job_id"
}

# Submit the full grid. Stop at the first failure rather than continuing with
# a partial sweep, and report what is already queued so it can be cancelled.
for precision in "${PRECISION_MODES[@]}"; do
    for seed in "${SEEDS[@]}"; do
        JOB_NAME="rlvr_${precision}_s${seed}"

        echo "Submitting: $JOB_NAME"

        if ! JOB_ID=$(submit_job "$precision" "$seed"); then
            echo "Aborting: could not submit $JOB_NAME" >&2
            echo "Jobs already submitted: ${JOB_IDS[*]:-none}" >&2
            exit 1
        fi

        JOB_IDS+=("$JOB_ID")
        echo "  -> Job ID: $JOB_ID"
    done
done

# Final report: list every submitted job ID and print ready-to-paste commands
# for monitoring the queue, following the logs, and cancelling the sweep.
# squeue -j expects a comma-separated list, hence the IFS=, join in a subshell.
printf '%s\n' \
    "" \
    "============================================" \
    "All jobs submitted!" \
    "Job IDs: ${JOB_IDS[*]}" \
    "============================================" \
    "" \
    "Monitor with:" \
    "  squeue -u $USER" \
    "  squeue -j $(IFS=,; echo "${JOB_IDS[*]}")" \
    "" \
    "View logs:" \
    "  tail -f results/slurm_logs/rlvr_*.out" \
    "" \
    "Cancel all:" \
    "  scancel ${JOB_IDS[*]}" \
    "============================================"

# Persist the space-separated job IDs so they can be looked up later.
ids_file="results/slurm_logs/submitted_jobs.txt"
echo "${JOB_IDS[*]}" > "$ids_file"
echo "Job IDs saved to: $ids_file"