blob: 1f4e177d8c2a2b3db91b906b2213f40b9468bf7c (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
#!/bin/bash
#SBATCH --job-name=model_servers
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:h200:8
#SBATCH --mem=400G
#SBATCH --time=24:00:00
#SBATCH --output=logs/model_servers_%j.out
#SBATCH --error=logs/model_servers_%j.err
#
# Start sglang model servers for experiments:
#   - Port 8004: Llama-3.3-70B-Instruct (user simulator + judge) - 4 GPUs (TP=4)
#   - Port 8003: Llama-3.1-8B-Instruct  (agent)                  - 2 GPUs (TP=2)
# Publishes connection info to collaborativeagents/slurm/server_info.txt for
# the experiment runner once both servers answer health checks, then blocks
# until both servers exit.
set -euo pipefail

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model

# Create log directories. 'logs/' (relative to the project root after the cd)
# is where the per-server logs below go; it was previously never created, so
# the redirects would have failed. NOTE(review): the #SBATCH --output/--error
# paths resolve relative to the *submission* directory, which may differ.
mkdir -p logs collaborativeagents/slurm/logs

echo "Starting model servers at $(date)"
echo "Job ID: ${SLURM_JOB_ID}"
echo "Node: ${SLURMD_NODENAME}"
echo "GPUs: ${CUDA_VISIBLE_DEVICES:-unset}"

# Activate environment with sglang
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

# Check GPU availability
nvidia-smi

#######################################
# Poll an sglang server's /health endpoint until it responds or a timeout
# elapses. Polling replaces the old fixed 'sleep 120': model load time varies
# with cache state and filesystem load, so a fixed sleep is both too short
# (70B cold start) and too long (warm start).
# Arguments: $1 - port, $2 - label for logging, $3 - timeout seconds (default 1800)
# Returns:   0 once healthy, 1 on timeout (message on stderr)
#######################################
wait_for_server() {
  local port=$1 name=$2 timeout=${3:-1800} elapsed=0
  until curl -sf "http://localhost:${port}/health" >/dev/null 2>&1; do
    if (( elapsed >= timeout )); then
      echo "ERROR: ${name} server on port ${port} not healthy after ${timeout}s" >&2
      return 1
    fi
    sleep 10
    (( elapsed += 10 ))
  done
  echo "${name} server is up on port ${port} after ${elapsed}s"
}

# Start 70B model server (user simulator + judge) - needs 4 GPUs for TP.
# Redirect straight to the log file instead of '| tee' so that $! is the
# server's PID rather than tee's — previously server_info.txt recorded the
# tee PID, which is useless for monitoring or killing the server.
echo "Starting Llama-3.3-70B-Instruct server on port 8004..."
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m sglang.launch_server \
    --model-path meta-llama/Llama-3.3-70B-Instruct \
    --port 8004 \
    --tp-size 4 \
    --context-length 16384 \
    --mem-fraction-static 0.85 \
    > "logs/server_70b_${SLURM_JOB_ID}.log" 2>&1 &
SERVER_70B_PID=$!

# Start 8B model server (agent) - needs 2 GPUs. Launched immediately: the two
# servers use disjoint GPU sets, so they can initialize in parallel.
echo "Starting Llama-3.1-8B-Instruct server on port 8003..."
CUDA_VISIBLE_DEVICES=4,5 python -m sglang.launch_server \
    --model-path models/llama-3.1-8b-instruct \
    --served-model-name meta-llama/Llama-3.1-8B-Instruct \
    --port 8003 \
    --tp-size 2 \
    --context-length 16384 \
    --mem-fraction-static 0.85 \
    > "logs/server_8b_${SLURM_JOB_ID}.log" 2>&1 &
SERVER_8B_PID=$!

echo "Servers starting..."
echo "70B server PID: ${SERVER_70B_PID}"
echo "8B server PID: ${SERVER_8B_PID}"

# Only publish server_info.txt after both servers answer health checks, so
# the experiment runner never reads endpoints that are not yet ready.
# Under 'set -e' a timeout here aborts the job.
wait_for_server 8004 "70B"
wait_for_server 8003 "8B"

# Save server info for experiment runner
cat > collaborativeagents/slurm/server_info.txt << EOF
NODE=${SLURMD_NODENAME}
JOB_ID=${SLURM_JOB_ID}
SERVER_70B_PID=${SERVER_70B_PID}
SERVER_8B_PID=${SERVER_8B_PID}
USER_API_BASE=http://${SLURMD_NODENAME}:8004/v1
AGENT_API_BASE=http://${SLURMD_NODENAME}:8003/v1
JUDGE_API_BASE=http://${SLURMD_NODENAME}:8004/v1
EOF
echo "Server info saved to collaborativeagents/slurm/server_info.txt"

# Block until both servers exit; 'wait' propagates a non-zero status if a
# server crashes, which fails the job under 'set -e'.
wait "$SERVER_70B_PID" "$SERVER_8B_PID"
|