1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
|
#!/bin/bash
# Quick test of parallel vLLM processing on A100x4-interactive
#
# Launches two 8B vLLM OpenAI-compatible servers (user simulator + agent)
# on separate GPU pairs, then measures session throughput at 1x/4x/8x
# profile parallelism.
#
# -u catches typo'd/unset variables; pipefail makes the
# 'python ... | tail' pipelines below fail loudly instead of silently.
set -euo pipefail

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
# ${PYTHONPATH:-} keeps 'set -u' happy when PYTHONPATH is not already set.
export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH:-}"

# Configuration - using only 8B model for user sim to fit in A100
# (70B AWQ needs TP=2 which leaves only 2 GPUs for 8B)
readonly MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
readonly PORT_USER=8004
readonly PORT_AGENT=8003

echo "============================================"
echo "Quick Parallel vLLM Test (A100x4)"
echo "============================================"
date
echo "Node: $(hostname)"
nvidia-smi --query-gpu=index,name,memory.total --format=csv
echo ""

# Kill any existing vLLM servers (intentionally best-effort: no match is fine)
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2
# For A100 test, use 8B for both user and agent (to test parallelism)
# In production, user would be 70B AWQ with TP=2

# Start user simulator server (8B) on GPU 0-1
echo "Starting 8B user simulator server (GPU 0-1)..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_8B" \
  --port "$PORT_USER" \
  --tensor-parallel-size 2 \
  --gpu-memory-utilization 0.85 \
  --max-model-len 4096 \
  --disable-log-requests \
  --dtype bfloat16 &
SERVER_USER_PID=$!

# Start agent server (8B) on GPU 2-3
echo "Starting 8B agent server (GPU 2-3)..."
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_8B" \
  --port "$PORT_AGENT" \
  --tensor-parallel-size 2 \
  --gpu-memory-utilization 0.85 \
  --max-model-len 4096 \
  --disable-log-requests \
  --dtype bfloat16 &
SERVER_AGENT_PID=$!

# Make sure the servers never outlive the script: 'set -e' can abort at any
# point below (failed curl, failed experiment), and without this trap both
# vLLM processes would be orphaned on the node, holding all four GPUs.
cleanup_servers() {
  kill "$SERVER_USER_PID" "$SERVER_AGENT_PID" 2>/dev/null || true
}
trap cleanup_servers EXIT
echo "Waiting for servers..."
# Wait for servers (up to 5 minutes: 100 polls x 3s - A100 needs more time
# than H200). The '&& echo 1 || echo 0' form keeps a failed curl from
# tripping 'set -e' while still recording readiness.
for i in {1..100}; do
  READY_USER=$(curl -s "http://localhost:${PORT_USER}/health" > /dev/null 2>&1 && echo 1 || echo 0)
  READY_AGENT=$(curl -s "http://localhost:${PORT_AGENT}/health" > /dev/null 2>&1 && echo 1 || echo 0)
  if [[ "$READY_USER" == "1" && "$READY_AGENT" == "1" ]]; then
    echo "Both servers ready after $((i * 3)) seconds"
    break
  fi
  # Progress heartbeat roughly every minute so the log shows we're alive.
  if [[ $((i % 20)) -eq 0 ]]; then
    echo "  Still waiting... user=$READY_USER, agent=$READY_AGENT ($((i * 3))s)"
  fi
  sleep 3
done

# Final hard check: the poll loop above can fall through after the timeout,
# so fail loudly here if either server never came up.
if ! curl -s "http://localhost:${PORT_USER}/health" > /dev/null; then
  echo "ERROR: User server not healthy" >&2
  # '|| true' so a failed kill cannot preempt the intended 'exit 1' under set -e.
  kill "$SERVER_USER_PID" "$SERVER_AGENT_PID" 2>/dev/null || true
  exit 1
fi
if ! curl -s "http://localhost:${PORT_AGENT}/health" > /dev/null; then
  echo "ERROR: Agent server not healthy" >&2
  kill "$SERVER_USER_PID" "$SERVER_AGENT_PID" 2>/dev/null || true
  exit 1
fi
echo "✓ Both servers healthy"
cd scripts

echo ""
echo "============================================"
echo "Running throughput tests..."
echo "============================================"
echo "Note: Using 8B for both user and agent (parallelism test)"
echo ""

# Every test run uses 3 sessions per profile.
SESSIONS_PER_PROFILE=3

#######################################
# Run one vanilla/mmlu throughput test and report its timing.
# Globals:   PORT_USER, PORT_AGENT, SESSIONS_PER_PROFILE (read)
#            ELAPSED (written: wall-clock seconds for the run)
# Arguments: $1 - number of profiles (also used as --parallel-profiles)
#            $2 - results subdirectory name under ../results/
# Outputs:   last 30 lines of the experiment log plus timing/throughput
#######################################
run_throughput_test() {
  local n_profiles=$1
  local out_suffix=$2
  local total_sessions=$((n_profiles * SESSIONS_PER_PROFILE))
  local start end

  echo "--- Test: ${n_profiles} profile(s), ${SESSIONS_PER_PROFILE} sessions each = ${total_sessions} total ---"
  start=$(date +%s)
  python run_experiments.py \
    --methods vanilla \
    --datasets mmlu \
    --n-profiles "$n_profiles" \
    --n-sessions "$SESSIONS_PER_PROFILE" \
    --use-vllm \
    --vllm-user-url "http://localhost:${PORT_USER}/v1" \
    --vllm-agent-url "http://localhost:${PORT_AGENT}/v1" \
    --parallel-profiles "$n_profiles" \
    --output-dir "../results/${out_suffix}" \
    --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -30
  end=$(date +%s)
  ELAPSED=$((end - start))

  echo ""
  echo "Time for ${n_profiles} profile(s) (${total_sessions} sessions): ${ELAPSED} seconds"
  # Guard the bc division: a sub-second run would otherwise divide by zero.
  if [[ "$ELAPSED" -gt 0 ]]; then
    echo "Throughput: ~$(echo "scale=1; ${total_sessions} * 3600 / ${ELAPSED}" | bc) sessions/hr"
  else
    echo "Throughput: n/a (run took < 1s)"
  fi
}

# Test 1: Sequential baseline (1 profile)
run_throughput_test 1 a100_test_1
ELAPSED_1=$ELAPSED

# Test 2: 4-way parallel (12 sessions total)
echo ""
run_throughput_test 4 a100_test_4
ELAPSED_4=$ELAPSED

# Test 3: 8-way parallel (24 sessions total)
echo ""
run_throughput_test 8 a100_test_8
ELAPSED_8=$ELAPSED
# Cleanup
echo ""
echo "Cleaning up..."
kill $SERVER_USER_PID $SERVER_AGENT_PID 2>/dev/null || true
echo ""
echo "============================================"
echo "RESULTS SUMMARY"
echo "============================================"
echo ""
echo "1 profile (3 sessions): ${ELAPSED_1}s -> $(echo "scale=0; 3 * 3600 / $ELAPSED_1" | bc) sessions/hr"
echo "4 profiles (12 sessions): ${ELAPSED_4}s -> $(echo "scale=0; 12 * 3600 / $ELAPSED_4" | bc) sessions/hr"
echo "8 profiles (24 sessions): ${ELAPSED_8}s -> $(echo "scale=0; 24 * 3600 / $ELAPSED_8" | bc) sessions/hr"
echo ""
echo "Speedup 4x parallel: $(echo "scale=2; ($ELAPSED_1 * 4) / $ELAPSED_4" | bc)x"
echo "Speedup 8x parallel: $(echo "scale=2; ($ELAPSED_1 * 8) / $ELAPSED_8" | bc)x"
echo ""
date
|