#!/bin/bash
# Test vLLM inference speed on interactive node
#
# Usage:
#   1. Get an interactive node:
#      srun --partition=gpu --gres=gpu:4 --time=2:00:00 --pty bash
#
#   2. Run this script:
#      bash scripts/test_vllm_interactive.sh
#
# This script will:
#   1. Start vLLM server for 8B model (agent)
#   2. Start vLLM server for 70B AWQ model (user simulator)
#   3. Run benchmarks
#   4. Compare with paper's 2000 conv/hr target

set -euo pipefail

# Paths
MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
MODEL_70B="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
HF_CACHE="/projects/bfqt/users/yurenh2/hf_cache/huggingface"

# Ports
PORT_8B=8003
PORT_70B=8004

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Server PIDs; pre-initialized so cleanup is safe under `set -u`
# even when a test is skipped and its PID is never assigned.
PID_8B=""
PID_70B=""

echo -e "${GREEN}======================================${NC}"
echo -e "${GREEN}  vLLM Inference Speed Test${NC}"
echo -e "${GREEN}======================================${NC}"
echo ""

# Check GPU availability
echo -e "${YELLOW}Checking GPUs...${NC}"
nvidia-smi --query-gpu=index,name,memory.total --format=csv
NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
echo -e "Found ${GREEN}${NUM_GPUS}${NC} GPUs"
echo ""

if [[ "$NUM_GPUS" -lt 4 ]]; then
    echo -e "${RED}WARNING: Less than 4 GPUs available. 70B model may not fit.${NC}"
fi

# Setup environment
export HF_HOME=$HF_CACHE
export TRANSFORMERS_CACHE=$HF_CACHE

# Activate conda environment if needed
# source /path/to/conda/etc/profile.d/conda.sh
# conda activate your_env

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents

#######################################
# Start a vLLM OpenAI-compatible server in the background.
# Arguments: $1 model, $2 port, $3 CUDA_VISIBLE_DEVICES list,
#            $4 extra vLLM args (word-split on purpose), $5 log file
# Outputs:   the server PID on stdout (ONLY the PID — status banners go to
#            stderr; the original sent them to stdout, so the callers'
#            PID=$(start_vllm_server ...) captured the banner lines too and
#            later `kill $PID` received garbage arguments)
#######################################
start_vllm_server() {
    local model=$1
    local port=$2
    local gpus=$3
    local extra_args=$4
    local logfile=$5

    echo -e "${YELLOW}Starting vLLM server on port $port with GPUs $gpus...${NC}" >&2
    echo "Model: $model" >&2

    # shellcheck disable=SC2086 -- $extra_args is intentionally unquoted so
    # flags like "--tensor-parallel-size 4" split into separate words.
    CUDA_VISIBLE_DEVICES=$gpus python -m vllm.entrypoints.openai.api_server \
        --model "$model" \
        --port "$port" \
        --gpu-memory-utilization 0.9 \
        --max-model-len 8192 \
        $extra_args \
        > "$logfile" 2>&1 &

    echo $!
}

#######################################
# Poll the server's /health endpoint until it responds or we time out.
# Arguments: $1 port
# Returns:   0 when ready, 1 on timeout (5 minutes)
#######################################
wait_for_server() {
    local port=$1
    local max_wait=300  # 5 minutes
    local waited=0

    echo -n "Waiting for server on port $port"
    while [[ "$waited" -lt "$max_wait" ]]; do
        if curl -s "http://localhost:$port/health" > /dev/null 2>&1; then
            echo -e " ${GREEN}Ready!${NC}"
            return 0
        fi
        echo -n "."
        sleep 5
        waited=$((waited + 5))
    done
    echo -e " ${RED}Timeout!${NC}"
    return 1
}

#######################################
# Kill any servers we started; runs on every exit path via the EXIT trap.
#######################################
cleanup() {
    echo -e "\n${YELLOW}Cleaning up...${NC}"
    if [[ -n "${PID_8B:-}" ]]; then
        kill "$PID_8B" 2>/dev/null || true
    fi
    if [[ -n "${PID_70B:-}" ]]; then
        kill "$PID_70B" 2>/dev/null || true
    fi
    echo "Done."
}
trap cleanup EXIT

# Create log directory
mkdir -p logs

# ============================================
# Test 1: 8B model only (single GPU)
# ============================================
echo -e "\n${GREEN}=== Test 1: 8B Model Benchmark ===${NC}"

PID_8B=$(start_vllm_server "$MODEL_8B" "$PORT_8B" "0" "" "logs/vllm_8b.log")
echo "Server PID: $PID_8B"

if wait_for_server "$PORT_8B"; then
    echo -e "\n${YELLOW}Running 8B benchmark (20 requests)...${NC}"
    python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT_8B/v1" -n 20

    echo -e "\n${YELLOW}Running 8B benchmark with concurrency...${NC}"
    python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT_8B/v1" -n 50 --concurrent
else
    echo -e "${RED}Failed to start 8B server${NC}"
fi

# Stop 8B server
kill "$PID_8B" 2>/dev/null || true
sleep 5

# ============================================
# Test 2: 70B AWQ model (4 GPUs with tensor parallelism)
# ============================================
echo -e "\n${GREEN}=== Test 2: 70B AWQ Model Benchmark ===${NC}"

if [[ "$NUM_GPUS" -ge 4 ]]; then
    PID_70B=$(start_vllm_server "$MODEL_70B" "$PORT_70B" "0,1,2,3" "--tensor-parallel-size 4" "logs/vllm_70b.log")
    echo "Server PID: $PID_70B"

    if wait_for_server "$PORT_70B"; then
        echo -e "\n${YELLOW}Running 70B benchmark (20 requests)...${NC}"
        python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT_70B/v1" -n 20

        echo -e "\n${YELLOW}Running 70B benchmark with concurrency...${NC}"
        python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT_70B/v1" -n 50 --concurrent
    else
        echo -e "${RED}Failed to start 70B server${NC}"
        echo "Check logs/vllm_70b.log for errors"
    fi

    # Stop 70B server
    kill "$PID_70B" 2>/dev/null || true
    sleep 5
else
    echo -e "${YELLOW}Skipping 70B test (need 4 GPUs)${NC}"
fi

# ============================================
# Test 3: Full conversation simulation
# ============================================
echo -e "\n${GREEN}=== Test 3: Full Conversation Simulation ===${NC}"

if [[ "$NUM_GPUS" -ge 4 ]]; then
    # Start both servers
    # 8B on GPU 0, 70B on GPUs 1,2,3 (tensor parallel 3)
    # Or split differently based on memory

    echo "Starting 8B server on GPU 0..."
    PID_8B=$(start_vllm_server "$MODEL_8B" "$PORT_8B" "0" "" "logs/vllm_8b_conv.log")

    echo "Starting 70B server on GPUs 1,2,3..."
    PID_70B=$(start_vllm_server "$MODEL_70B" "$PORT_70B" "1,2,3" "--tensor-parallel-size 3" "logs/vllm_70b_conv.log")

    # BUG FIX: the original called wait_for_server bare (a timeout aborted
    # the script under `set -e`) and then tested `$?`, which checked only
    # the 70B wait. Require BOTH servers to be ready before benchmarking.
    if wait_for_server "$PORT_8B" && wait_for_server "$PORT_70B"; then
        echo -e "\n${YELLOW}Running full conversation benchmark (10 conversations)...${NC}"
        python scripts/benchmark_inference.py --mode conversation \
            --url-8b "http://localhost:$PORT_8B/v1" \
            --url-70b "http://localhost:$PORT_70B/v1" \
            -n 10
    fi
else
    echo -e "${YELLOW}Skipping full conversation test (need 4 GPUs)${NC}"
fi

# ============================================
# Summary
# ============================================
echo -e "\n${GREEN}======================================${NC}"
echo -e "${GREEN}  Test Complete!${NC}"
echo -e "${GREEN}======================================${NC}"
echo ""
echo "Target: 2000 conversations/hour (paper on H100x8)"
echo ""
echo "Check the benchmark results above to see how close we are."
echo "If throughput is still low, check:"
echo "  1. GPU utilization during tests (nvidia-smi dmon -s u)"
echo "  2. vLLM logs in logs/*.log"
echo "  3. Network latency if using remote servers"