#!/bin/bash
# Quick test of parallel vLLM processing on H200x8-interactive
# Simplified version for 1 hour time limit
#
# Launches a 70B AWQ vLLM server (GPUs 0-1, TP=2) and an 8B server (GPU 2),
# then times three run_experiments.py configurations (1/4/8 parallel profiles).
set -eo pipefail
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval
# Enable nounset only after conda activation: conda's activation scripts may
# reference unset variables and would abort under 'set -u'.
set -u
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
# ':-' guards the case where PYTHONPATH is not set in the batch environment.
export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH:-}"
# Configuration
MODEL_70B_AWQ="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PORT_70B=8004
PORT_8B=8003
readonly MODEL_70B_AWQ MODEL_8B PORT_70B PORT_8B
echo "============================================"
echo "Quick Parallel vLLM Test (H200)"
echo "============================================"
date
echo "Node: $(hostname)"
nvidia-smi --query-gpu=index,name,memory.total --format=csv
echo ""
# Ensure the servers are torn down on EVERY exit path — without this, any
# failure under 'set -e' would leave two vLLM servers holding the GPUs.
trap 'pkill -f "vllm.entrypoints" 2>/dev/null || true' EXIT
# Kill any existing vLLM servers; pkill exits 1 when nothing matched.
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2
# Start 70B AWQ server on GPU 0-1 (TP=2). Quote all expansions (SC2086);
# the model path contains no spaces today, but unquoted vars are fragile.
echo "Starting 70B AWQ server (GPU 0-1, TP=2)..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_70B_AWQ" \
  --port "$PORT_70B" \
  --tensor-parallel-size 2 \
  --gpu-memory-utilization 0.90 \
  --max-model-len 4096 \
  --disable-log-requests \
  --quantization awq \
  --dtype float16 &
SERVER_70B_PID=$!
# Start 8B server on GPU 2 (single GPU, bf16 full precision).
echo "Starting 8B server (GPU 2)..."
CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_8B" \
  --port "$PORT_8B" \
  --gpu-memory-utilization 0.90 \
  --max-model-len 4096 \
  --disable-log-requests \
  --dtype bfloat16 &
SERVER_8B_PID=$!
echo "Waiting for servers (up to 5 min)..."
# Poll both /health endpoints every 3s, up to 100 attempts (~5 min).
# The '&& echo 1 || echo 0' form keeps a failing curl from tripping 'set -e'.
for ((i = 1; i <= 100; i++)); do
  READY_70B=$(curl -s "http://localhost:$PORT_70B/health" > /dev/null 2>&1 && echo 1 || echo 0)
  READY_8B=$(curl -s "http://localhost:$PORT_8B/health" > /dev/null 2>&1 && echo 1 || echo 0)
  if [ "$READY_70B" = "1" ] && [ "$READY_8B" = "1" ]; then
    # sleep runs AFTER the check, so elapsed time is (i-1)*3, not i*3.
    echo "Both servers ready after $(( (i - 1) * 3 )) seconds"
    break
  fi
  if (( i % 20 == 0 )); then
    echo "  Still waiting... 70B=$READY_70B, 8B=$READY_8B ($(( (i - 1) * 3 ))s)"
  fi
  sleep 3
done
# Final health check: fail fast if either server never came up, tearing down
# whatever did start. '|| true' on kill matters under 'set -e' — if the PIDs
# are already dead, kill's non-zero status would otherwise preempt 'exit 1'.
if ! curl -s "http://localhost:$PORT_70B/health" > /dev/null; then
  echo "ERROR: 70B server not healthy" >&2
  kill "$SERVER_70B_PID" "$SERVER_8B_PID" 2>/dev/null || true
  exit 1
fi
if ! curl -s "http://localhost:$PORT_8B/health" > /dev/null; then
  echo "ERROR: 8B server not healthy" >&2
  kill "$SERVER_70B_PID" "$SERVER_8B_PID" 2>/dev/null || true
  exit 1
fi
echo "✓ Both servers healthy"
cd scripts
echo ""
echo "============================================"
echo "Running throughput tests..."
echo "============================================"

#######################################
# Run one run_experiments.py configuration and print its wall-clock time.
# The three original tests differed only in profile count, parallelism and
# output dir; everything else is shared here to avoid triplicated blocks.
# Globals:   PORT_70B, PORT_8B (read)
# Arguments: $1 - test label, $2 - n profiles, $3 - parallel profiles,
#            $4 - output directory
# Outputs:   last 20 lines of the run plus "Time: N seconds"
#######################################
run_throughput_test() {
  local label=$1 n_profiles=$2 parallel=$3 output_dir=$4
  local start end
  echo ""
  echo "--- $label ---"
  start=$(date +%s)
  python run_experiments.py \
    --methods vanilla \
    --datasets mmlu \
    --n-profiles "$n_profiles" \
    --n-sessions 2 \
    --use-vllm \
    --vllm-user-url "http://localhost:$PORT_70B/v1" \
    --vllm-agent-url "http://localhost:$PORT_8B/v1" \
    --parallel-profiles "$parallel" \
    --output-dir "$output_dir" \
    --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -20
  end=$(date +%s)
  echo "Time: $((end - start)) seconds"
}

# Test 1: Sequential (1 profile, 2 sessions)
run_throughput_test "Test 1: Sequential (1 profile)" 1 1 ../results/quick_test_1
# Test 2: Parallel (4 profiles, 2 sessions each)
run_throughput_test "Test 2: Parallel (4 profiles)" 4 4 ../results/quick_test_4
# Test 3: Parallel (8 profiles, 2 sessions each)
run_throughput_test "Test 3: Parallel (8 profiles)" 8 8 ../results/quick_test_8
# Shut down both vLLM servers and print the closing summary.
printf '\n'
echo "Cleaning up..."
# '|| true' keeps 'set -e' happy if the servers already exited.
kill "$SERVER_70B_PID" "$SERVER_8B_PID" 2>/dev/null || true
printf '\n'
echo "============================================"
echo "TEST COMPLETE!"
echo "============================================"
printf '\n'
echo "Summary: Compare timing above"
echo " - Sequential (1 profile): baseline"
echo " - Parallel (4 profiles): should be faster per profile"
echo " - Parallel (8 profiles): should show more speedup"
printf '\n'
date