summaryrefslogtreecommitdiff
path: root/collaborativeagents/scripts/test_parallel_vllm.sh
blob: 0cd0f1fb0a3e2ca12f3d949192ec123021c6377e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/bin/bash
# Test parallel vLLM processing on H200x8-interactive
# Usage: Run this on an interactive H200 node
#
# srun --account=bfqt-delta-gpu --partition=gpuH200x8-interactive \
#      --nodes=1 --gpus-per-node=4 --time=02:00:00 --mem=200G --pty bash
# cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
# bash scripts/test_parallel_vllm.sh

# Strict mode: abort on command failure and on mid-pipeline failures.
set -eo pipefail

# Always reap any vLLM servers this script spawned, even on early failure
# (matches the unconditional pkill done at startup below).
trap 'pkill -f "vllm.entrypoints" 2>/dev/null || true' EXIT

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

# Enable -u only after conda activation: conda's shell hooks have historically
# referenced unset variables and would trip 'set -u'.
set -u

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
# ${PYTHONPATH:-} keeps this safe under 'set -u' when PYTHONPATH is unset.
export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH:-}"

# Configuration — constants, never reassigned below.
readonly MODEL_70B_AWQ="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
readonly MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
readonly PORT_70B=8004
readonly PORT_8B=8003

printf '%s\n' "============================================"
printf '%s\n' "Parallel vLLM Experiment Test"
printf '%s\n' "============================================"
printf 'Date: %s\n' "$(date)"
printf 'Node: %s\n' "$(hostname)"
printf '\n'

printf '%s\n' "=== GPU Info ==="
nvidia-smi --query-gpu=index,name,memory.total,memory.free --format=csv
printf '\n'

# Make sure no stale vLLM servers hold the GPUs before we claim them.
printf '%s\n' "Cleaning up any existing vLLM servers..."
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

echo "============================================"
echo "Starting vLLM Servers"
echo "============================================"

# Start 70B AWQ server on GPU 0-1 (needs 2 GPUs for tensor parallelism).
# All expansions are quoted (SC2086); the values are constant here, but
# quoting keeps the script safe if the config ever gains spaces.
echo ""
echo "Starting 70B AWQ vLLM Server (GPU 0-1, TP=2)..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_70B_AWQ" \
    --port "$PORT_70B" \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 \
    --max-model-len 4096 \
    --disable-log-requests \
    --quantization awq \
    --dtype float16 &
SERVER_70B_PID=$!
echo "70B Server PID: $SERVER_70B_PID"

# Start 8B server on GPU 2 (single GPU, full-precision bf16).
echo ""
echo "Starting 8B vLLM Server (GPU 2)..."
CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_8B" \
    --port "$PORT_8B" \
    --gpu-memory-utilization 0.90 \
    --max-model-len 4096 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_8B_PID=$!
echo "8B Server PID: $SERVER_8B_PID"

echo ""
echo "Waiting for servers to start..."

# Poll a vLLM /health endpoint until it answers or the attempt budget runs out.
# Deliberately does NOT abort on timeout: the definitive health check (with
# error exit) happens right after both waits, so a timeout just falls through.
#   $1 - label used in log messages (e.g. "70B")
#   $2 - server port
#   $3 - max polling attempts
#   $4 - seconds between attempts
#   $5 - print a progress line every N attempts (0 = no progress lines)
wait_for_server() {
    local label=$1 port=$2 attempts=$3 interval=$4 progress_every=$5
    local i
    for ((i = 1; i <= attempts; i++)); do
        if curl -s "http://localhost:${port}/health" > /dev/null 2>&1; then
            echo "${label} Server ready after $((i * interval)) seconds"
            return 0
        fi
        if (( progress_every > 0 && i % progress_every == 0 )); then
            echo "  Waiting for ${label}... ($((i * interval)) seconds)"
        fi
        sleep "$interval"
    done
}

# Wait for 70B (may take 3-5 minutes): up to 120 x 3s = 6 min, progress every 60s.
wait_for_server "70B" "$PORT_70B" 120 3 20

# Wait for 8B: up to 60 x 2s = 2 min, no progress chatter.
wait_for_server "8B" "$PORT_8B" 60 2 0

# Check both servers; bail out (and reap both) if either failed to come up.
echo ""
if ! curl -s "http://localhost:${PORT_70B}/health" > /dev/null 2>&1; then
    echo "ERROR: 70B server failed to start" >&2
    # '|| true': if a server already died, kill fails — without the guard,
    # 'set -e' would exit with kill's status instead of the intended 1.
    kill "$SERVER_70B_PID" "$SERVER_8B_PID" 2>/dev/null || true
    exit 1
fi
echo "✓ 70B server healthy"

if ! curl -s "http://localhost:${PORT_8B}/health" > /dev/null 2>&1; then
    echo "ERROR: 8B server failed to start" >&2
    kill "$SERVER_70B_PID" "$SERVER_8B_PID" 2>/dev/null || true
    exit 1
fi
echo "✓ 8B server healthy"

echo ""
echo "=== vLLM Server Info ==="
echo "70B model:"
curl -s "http://localhost:${PORT_70B}/v1/models" | python -m json.tool 2>/dev/null | head -10
echo ""
echo "8B model:"
curl -s "http://localhost:${PORT_8B}/v1/models" | python -m json.tool 2>/dev/null | head -10

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/scripts

# All four timing tests share one command line; only the banner, the profile
# count (used for both --n-profiles and --parallel-profiles), and the output
# directory suffix differ — so drive them from a single helper.
#   $1 - banner label printed between separators
#   $2 - number of profiles (also the parallelism level)
#   $3 - output-dir suffix (results land in ../results/parallel_test_<suffix>)
run_timed_test() {
    local label=$1 profiles=$2 suffix=$3

    echo ""
    echo "============================================"
    echo "$label"
    echo "============================================"

    time python run_experiments.py \
        --methods vanilla \
        --datasets mmlu \
        --n-profiles "$profiles" \
        --n-sessions 3 \
        --use-vllm \
        --vllm-user-url "http://localhost:${PORT_70B}/v1" \
        --vllm-agent-url "http://localhost:${PORT_8B}/v1" \
        --parallel-profiles "$profiles" \
        --output-dir "../results/parallel_test_${suffix}" \
        --profile-path ../data/complex_profiles_v2/profiles_100.jsonl
}

run_timed_test "Test 1: Sequential Processing (1 profile)"   1  seq
run_timed_test "Test 2: Parallel Processing (4 profiles)"    4  4
run_timed_test "Test 3: Parallel Processing (8 profiles)"    8  8
run_timed_test "Test 4: Parallel Processing (16 profiles)"  16  16

# Cleanup
echo ""
echo "Cleaning up..."
# '|| true' on both: if a server already exited, kill/wait return non-zero and
# 'set -e' would abort here, skipping the completion banner below.
kill "$SERVER_70B_PID" "$SERVER_8B_PID" 2>/dev/null || true
wait "$SERVER_70B_PID" "$SERVER_8B_PID" 2>/dev/null || true

echo ""
echo "============================================"
echo "TEST COMPLETE!"
echo "============================================"
echo ""
echo "Compare the timing results above to estimate optimal parallelism."
echo "Expected scaling: Higher parallelism → Higher throughput (until bottleneck)"
echo ""
date