1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
|
#!/bin/bash
# Test vLLM inference speed on interactive node
#
# Usage:
# 1. Get an interactive node:
#    srun --partition=gpu --gres=gpu:4 --time=2:00:00 --pty bash
#
# 2. Run this script:
#    bash scripts/test_vllm_interactive.sh
#
# This script will:
# 1. Start vLLM server for 8B model (agent)
# 2. Start vLLM server for 70B AWQ model (user simulator)
# 3. Run benchmarks
# 4. Compare with paper's 2000 conv/hr target

set -e

# Paths (cluster-specific; adjust for your environment).
readonly MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
readonly MODEL_70B="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
readonly HF_CACHE="/projects/bfqt/users/yurenh2/hf_cache/huggingface"

# Ports for the two OpenAI-compatible vLLM servers.
readonly PORT_8B=8003
readonly PORT_70B=8004

# ANSI colors for readable progress output.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # No Color

echo -e "${GREEN}======================================${NC}"
echo -e "${GREEN} vLLM Inference Speed Test${NC}"
echo -e "${GREEN}======================================${NC}"
echo ""

# Check GPU availability. Fail with a clear message when nvidia-smi is
# missing (e.g. accidentally run on a login node) instead of letting
# 'set -e' abort with a bare "command not found".
echo -e "${YELLOW}Checking GPUs...${NC}"
if ! command -v nvidia-smi >/dev/null 2>&1; then
  echo -e "${RED}nvidia-smi not found - are you on a GPU node?${NC}"
  exit 1
fi
nvidia-smi --query-gpu=index,name,memory.total --format=csv
NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
echo -e "Found ${GREEN}${NUM_GPUS}${NC} GPUs"
echo ""

if [ "$NUM_GPUS" -lt 4 ]; then
  echo -e "${RED}WARNING: Less than 4 GPUs available. 70B model may not fit.${NC}"
fi

# Point HuggingFace at the shared cache (quoted defensively).
# TRANSFORMERS_CACHE is deprecated in newer transformers releases but is
# kept alongside HF_HOME for backward compatibility.
export HF_HOME="$HF_CACHE"
export TRANSFORMERS_CACHE="$HF_CACHE"

# Activate conda environment if needed
# source /path/to/conda/etc/profile.d/conda.sh
# conda activate your_env

# Fail fast with an explicit status if the repo path is wrong.
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents || exit 1
# Start a vLLM OpenAI-compatible server in the background.
#
# Arguments:
#   $1 - model path or HF repo id
#   $2 - port to serve on
#   $3 - CUDA_VISIBLE_DEVICES value (e.g. "0" or "1,2,3")
#   $4 - extra vLLM CLI args (intentionally word-split, may be empty)
#   $5 - log file for server stdout/stderr
# Outputs:
#   Writes ONLY the server PID to stdout; callers capture it via $(...).
#
# BUG FIX: the original echoed its status messages to stdout, so
# PID=$(start_vllm_server ...) captured the status text *plus* the PID,
# and every later 'kill $PID' received garbage. Status now goes to stderr.
start_vllm_server() {
  local model=$1
  local port=$2
  local gpus=$3
  local extra_args=$4
  local logfile=$5

  # Progress messages -> stderr: stdout is reserved for the PID.
  echo -e "${YELLOW}Starting vLLM server on port $port with GPUs $gpus...${NC}" >&2
  echo "Model: $model" >&2

  # shellcheck disable=SC2086 -- $extra_args is deliberately unquoted so
  # e.g. "--tensor-parallel-size 4" splits into two arguments.
  CUDA_VISIBLE_DEVICES=$gpus python -m vllm.entrypoints.openai.api_server \
    --model "$model" \
    --port "$port" \
    --gpu-memory-utilization 0.9 \
    --max-model-len 8192 \
    $extra_args \
    > "$logfile" 2>&1 &

  echo $!
}
# Poll a server's /health endpoint until it answers or the timeout expires.
#
# Arguments:
#   $1 - port to probe on localhost
# Returns:
#   0 once the endpoint responds, 1 after ~5 minutes without success.
wait_for_server() {
  local port=$1
  local -i limit=300 # seconds; large models can take minutes to load
  local -i elapsed=0

  echo -n "Waiting for server on port $port"
  while (( elapsed < limit )); do
    if curl -s "http://localhost:$port/health" > /dev/null 2>&1; then
      echo -e " ${GREEN}Ready!${NC}"
      return 0
    fi
    # Not up yet: print a progress dot and retry in 5 seconds.
    echo -n "."
    sleep 5
    elapsed+=5
  done
  echo -e " ${RED}Timeout!${NC}"
  return 1
}
# Kill any vLLM servers this script started. Wired to the EXIT trap so it
# runs on every exit path (normal end, 'set -e' abort, or Ctrl-C).
# Reads globals PID_8B and PID_70B; either may be empty/unset.
cleanup() {
  local pid
  echo -e "\n${YELLOW}Cleaning up...${NC}"
  for pid in "${PID_8B:-}" "${PID_70B:-}"; do
    # Best-effort: the server may already be dead, so ignore kill errors.
    [[ -n "$pid" ]] && kill $pid 2>/dev/null || true
  done
  echo "Done."
}
# Run cleanup on every exit path so stray vLLM servers don't hold GPUs.
trap cleanup EXIT
# Create log directory
mkdir -p logs
# ============================================
# Test 1: 8B model only (single GPU)
# ============================================
echo -e "\n${GREEN}=== Test 1: 8B Model Benchmark ===${NC}"
# NOTE(review): $(...) captures *everything* start_vllm_server writes to
# stdout, including its status echoes -- PID_8B may therefore contain more
# than the bare PID. Confirm the function prints only the PID to stdout.
PID_8B=$(start_vllm_server "$MODEL_8B" $PORT_8B "0" "" "logs/vllm_8b.log")
echo "Server PID: $PID_8B"
# Latency run (sequential) followed by a throughput run (concurrent).
if wait_for_server $PORT_8B; then
echo -e "\n${YELLOW}Running 8B benchmark (20 requests)...${NC}"
python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_8B/v1 -n 20
echo -e "\n${YELLOW}Running 8B benchmark with concurrency...${NC}"
python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_8B/v1 -n 50 --concurrent
else
echo -e "${RED}Failed to start 8B server${NC}"
fi
# Stop 8B server
kill $PID_8B 2>/dev/null || true
# Give the process time to release GPU memory before the next test.
sleep 5
# ============================================
# Test 2: 70B AWQ model (4 GPUs with tensor parallelism)
# ============================================
echo -e "\n${GREEN}=== Test 2: 70B AWQ Model Benchmark ===${NC}"
# The 70B AWQ-INT4 model is sharded across all 4 GPUs via vLLM tensor
# parallelism; skip entirely when fewer than 4 GPUs are visible.
if [ "$NUM_GPUS" -ge 4 ]; then
# NOTE(review): as in Test 1, $(...) captures any stdout chatter from
# start_vllm_server along with the PID -- verify stdout carries only the PID.
PID_70B=$(start_vllm_server "$MODEL_70B" $PORT_70B "0,1,2,3" "--tensor-parallel-size 4" "logs/vllm_70b.log")
echo "Server PID: $PID_70B"
# Same latency + concurrency pair of runs as the 8B test.
if wait_for_server $PORT_70B; then
echo -e "\n${YELLOW}Running 70B benchmark (20 requests)...${NC}"
python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_70B/v1 -n 20
echo -e "\n${YELLOW}Running 70B benchmark with concurrency...${NC}"
python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_70B/v1 -n 50 --concurrent
else
echo -e "${RED}Failed to start 70B server${NC}"
echo "Check logs/vllm_70b.log for errors"
fi
# Stop 70B server
kill $PID_70B 2>/dev/null || true
# Give the process time to release GPU memory before the next test.
sleep 5
else
echo -e "${YELLOW}Skipping 70B test (need 4 GPUs)${NC}"
fi
# ============================================
# Test 3: Full conversation simulation
# ============================================
echo -e "\n${GREEN}=== Test 3: Full Conversation Simulation ===${NC}"
if [ "$NUM_GPUS" -ge 4 ]; then
  # Run both servers side by side: 8B agent on GPU 0, 70B user simulator
  # sharded across GPUs 1-3 (tensor parallel 3). Adjust the split if either
  # model runs out of memory.
  echo "Starting 8B server on GPU 0..."
  PID_8B=$(start_vllm_server "$MODEL_8B" $PORT_8B "0" "" "logs/vllm_8b_conv.log")
  echo "Starting 70B server on GPUs 1,2,3..."
  PID_70B=$(start_vllm_server "$MODEL_70B" $PORT_70B "1,2,3" "--tensor-parallel-size 3" "logs/vllm_70b_conv.log")

  # BUG FIX: the original ran both waits as bare statements and then tested
  # $?, which (a) only reflected the *70B* wait, silently ignoring an 8B
  # failure, and (b) let 'set -e' abort the whole script on the first wait
  # timeout. Testing both calls inside the condition fixes both problems.
  if wait_for_server $PORT_8B && wait_for_server $PORT_70B; then
    echo -e "\n${YELLOW}Running full conversation benchmark (10 conversations)...${NC}"
    python scripts/benchmark_inference.py --mode conversation \
      --url-8b http://localhost:$PORT_8B/v1 \
      --url-70b http://localhost:$PORT_70B/v1 \
      -n 10
  else
    echo -e "${RED}One or both servers failed to start${NC}"
  fi
else
  echo -e "${YELLOW}Skipping full conversation test (need 4 GPUs)${NC}"
fi
# ============================================
# Summary
# ============================================
# Closing banner plus pointers for investigating slow results.
printf '\n'
for banner_line in "======================================" " Test Complete!" "======================================"; do
  echo -e "${GREEN}${banner_line}${NC}"
done
cat <<'EOF'

Target: 2000 conversations/hour (paper on H100x8)

Check the benchmark results above to see how close we are.
If throughput is still low, check:
 1. GPU utilization during tests (nvidia-smi dmon -s u)
 2. vLLM logs in logs/*.log
 3. Network latency if using remote servers
EOF
|