summaryrefslogtreecommitdiff
path: root/collaborativeagents/scripts/test_vllm_interactive.sh
diff options
context:
space:
mode:
Diffstat (limited to 'collaborativeagents/scripts/test_vllm_interactive.sh')
-rwxr-xr-xcollaborativeagents/scripts/test_vllm_interactive.sh212
1 file changed, 212 insertions, 0 deletions
diff --git a/collaborativeagents/scripts/test_vllm_interactive.sh b/collaborativeagents/scripts/test_vllm_interactive.sh
new file mode 100755
index 0000000..5da73b4
--- /dev/null
+++ b/collaborativeagents/scripts/test_vllm_interactive.sh
@@ -0,0 +1,212 @@
#!/bin/bash
# Test vLLM inference speed on interactive node
#
# Usage:
#   1. Get an interactive node:
#      srun --partition=gpu --gres=gpu:4 --time=2:00:00 --pty bash
#   2. Run this script:
#      bash scripts/test_vllm_interactive.sh
#
# This script will:
#   1. Start vLLM server for 8B model (agent)
#   2. Start vLLM server for 70B AWQ model (user simulator)
#   3. Run benchmarks
#   4. Compare with paper's 2000 conv/hr target

set -e

# Paths (readonly: constants that must not be reassigned below).
readonly MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
readonly MODEL_70B="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
readonly HF_CACHE="/projects/bfqt/users/yurenh2/hf_cache/huggingface"

# Ports for the two OpenAI-compatible vLLM servers.
readonly PORT_8B=8003
readonly PORT_70B=8004

# ANSI colors for progress output.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # No Color

echo -e "${GREEN}======================================${NC}"
echo -e "${GREEN} vLLM Inference Speed Test${NC}"
echo -e "${GREEN}======================================${NC}"
echo ""

# Check GPU availability. Fail early with a clear message if nvidia-smi
# is missing (e.g. the script was run on a login node, not a GPU node);
# without this guard, set -e would kill the script with a cryptic
# "command not found".
if ! command -v nvidia-smi > /dev/null 2>&1; then
    echo -e "${RED}nvidia-smi not found — run this on a GPU node.${NC}" >&2
    exit 1
fi
echo -e "${YELLOW}Checking GPUs...${NC}"
nvidia-smi --query-gpu=index,name,memory.total --format=csv
NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
echo -e "Found ${GREEN}${NUM_GPUS}${NC} GPUs"
echo ""

if [ "$NUM_GPUS" -lt 4 ]; then
    echo -e "${RED}WARNING: Less than 4 GPUs available. 70B model may not fit.${NC}"
fi

# Point HuggingFace caches at project storage. TRANSFORMERS_CACHE is
# kept alongside HF_HOME for compatibility with older transformers.
export HF_HOME="$HF_CACHE"
export TRANSFORMERS_CACHE="$HF_CACHE"

# Activate conda environment if needed
# source /path/to/conda/etc/profile.d/conda.sh
# conda activate your_env

# set -e aborts here if the project directory is missing.
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
# Start a vLLM OpenAI-compatible server in the background.
#
# Arguments:
#   $1 - model path or HuggingFace repo id
#   $2 - port to listen on
#   $3 - CUDA_VISIBLE_DEVICES value (e.g. "0" or "0,1,2,3")
#   $4 - extra vLLM args as one string (word-split on purpose; may be empty)
#   $5 - log file path (server stdout+stderr are redirected there)
# Outputs:
#   The background server's PID on stdout — and ONLY the PID. Callers
#   capture it with $(...), so progress messages must go to stderr.
start_vllm_server() {
    local model=$1
    local port=$2
    local gpus=$3
    local extra_args=$4
    local logfile=$5

    # BUG FIX: these status lines previously went to stdout, so
    # PID_8B=$(start_vllm_server ...) captured the status text along
    # with the PID, and the later `kill $PID_8B` got garbage. They must
    # go to stderr.
    echo -e "${YELLOW}Starting vLLM server on port $port with GPUs $gpus...${NC}" >&2
    echo "Model: $model" >&2

    # shellcheck disable=SC2086 — $extra_args is intentionally unquoted
    # so it expands into separate flags (e.g. "--tensor-parallel-size 4").
    CUDA_VISIBLE_DEVICES=$gpus python -m vllm.entrypoints.openai.api_server \
        --model "$model" \
        --port "$port" \
        --gpu-memory-utilization 0.9 \
        --max-model-len 8192 \
        $extra_args \
        > "$logfile" 2>&1 &

    echo $!
}
+
# Poll a server's /health endpoint on localhost until it answers or we
# time out.
#
# Arguments:
#   $1 - port to probe
#   $2 - (optional) max seconds to wait; default 300 (5 minutes)
#   $3 - (optional) seconds between probes; default 5
# Returns:
#   0 once the server responds, 1 on timeout.
wait_for_server() {
    local port=$1
    local max_wait=${2:-300}
    local interval=${3:-5}
    local waited=0

    echo -n "Waiting for server on port $port"
    while [ "$waited" -lt "$max_wait" ]; do
        # curl -s exits non-zero while the server is still loading.
        if curl -s "http://localhost:$port/health" > /dev/null 2>&1; then
            echo -e " ${GREEN}Ready!${NC}"
            return 0
        fi
        echo -n "."
        sleep "$interval"
        waited=$((waited + interval))
    done

    echo -e " ${RED}Timeout!${NC}"
    return 1
}
+
# Kill any vLLM servers we started. Safe to call when no server is
# running: empty/unset PIDs are skipped and kill failures are ignored.
cleanup() {
    echo -e "\n${YELLOW}Cleaning up...${NC}"
    local pid
    for pid in "$PID_8B" "$PID_70B"; do
        [ -n "$pid" ] && kill "$pid" 2>/dev/null || true
    done
    echo "Done."
}
+
# Ensure servers are killed on every exit path (normal end, set -e
# abort, Ctrl-C) — the benchmarks below can fail midway.
trap cleanup EXIT

# Create log directory
mkdir -p logs

# ============================================
# Test 1: 8B model only (single GPU)
# ============================================
echo -e "\n${GREEN}=== Test 1: 8B Model Benchmark ===${NC}"

# PID is captured from the function's stdout; server logs go to the file.
PID_8B=$(start_vllm_server "$MODEL_8B" $PORT_8B "0" "" "logs/vllm_8b.log")
echo "Server PID: $PID_8B"

if wait_for_server $PORT_8B; then
    echo -e "\n${YELLOW}Running 8B benchmark (20 requests)...${NC}"
    python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_8B/v1 -n 20

    echo -e "\n${YELLOW}Running 8B benchmark with concurrency...${NC}"
    python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_8B/v1 -n 50 --concurrent
else
    echo -e "${RED}Failed to start 8B server${NC}"
fi

# Stop 8B server
# (|| true: the server may already be dead; don't trip set -e.
#  sleep gives the GPU time to release memory before the next test.)
kill $PID_8B 2>/dev/null || true
sleep 5
+
# ============================================
# Test 2: 70B AWQ model (4 GPUs with tensor parallelism)
# ============================================
echo -e "\n${GREEN}=== Test 2: 70B AWQ Model Benchmark ===${NC}"

if [ "$NUM_GPUS" -ge 4 ]; then
    # AWQ INT4 quantized 70B, sharded across all 4 GPUs.
    PID_70B=$(start_vllm_server "$MODEL_70B" $PORT_70B "0,1,2,3" "--tensor-parallel-size 4" "logs/vllm_70b.log")
    echo "Server PID: $PID_70B"

    if wait_for_server $PORT_70B; then
        echo -e "\n${YELLOW}Running 70B benchmark (20 requests)...${NC}"
        python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_70B/v1 -n 20

        echo -e "\n${YELLOW}Running 70B benchmark with concurrency...${NC}"
        python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_70B/v1 -n 50 --concurrent
    else
        echo -e "${RED}Failed to start 70B server${NC}"
        echo "Check logs/vllm_70b.log for errors"
    fi

    # Stop 70B server
    # (|| true: may already be dead; don't trip set -e.)
    kill $PID_70B 2>/dev/null || true
    sleep 5
else
    echo -e "${YELLOW}Skipping 70B test (need 4 GPUs)${NC}"
fi
+
# ============================================
# Test 3: Full conversation simulation
# ============================================
echo -e "\n${GREEN}=== Test 3: Full Conversation Simulation ===${NC}"

if [ "$NUM_GPUS" -ge 4 ]; then
    # Run both servers at once: 8B on GPU 0, 70B on GPUs 1,2,3
    # (tensor parallel 3). Adjust the split if memory is tight.

    echo "Starting 8B server on GPU 0..."
    PID_8B=$(start_vllm_server "$MODEL_8B" $PORT_8B "0" "" "logs/vllm_8b_conv.log")

    echo "Starting 70B server on GPUs 1,2,3..."
    PID_70B=$(start_vllm_server "$MODEL_70B" $PORT_70B "1,2,3" "--tensor-parallel-size 3" "logs/vllm_70b_conv.log")

    # BUG FIX: the original called wait_for_server twice as bare
    # statements and then tested $?, which (a) only checked the second
    # server and (b) aborted the whole script under `set -e` when the
    # first wait timed out. Check both inside the `if` instead.
    if wait_for_server $PORT_8B && wait_for_server $PORT_70B; then
        echo -e "\n${YELLOW}Running full conversation benchmark (10 conversations)...${NC}"
        python scripts/benchmark_inference.py --mode conversation \
            --url-8b http://localhost:$PORT_8B/v1 \
            --url-70b http://localhost:$PORT_70B/v1 \
            -n 10
    else
        echo -e "${RED}Servers did not become ready; skipping conversation benchmark${NC}"
        echo "Check logs/vllm_8b_conv.log and logs/vllm_70b_conv.log for errors"
    fi
else
    echo -e "${YELLOW}Skipping full conversation test (need 4 GPUs)${NC}"
fi
+
# ============================================
# Summary
# ============================================
# Closing banner plus pointers for diagnosing low throughput.
# printf '%b' interprets escapes in the argument, matching `echo -e`.
printf '\n%b\n' "${GREEN}======================================${NC}"
printf '%b\n' "${GREEN} Test Complete!${NC}"
printf '%b\n' "${GREEN}======================================${NC}"
printf '\n'
printf '%s\n' "Target: 2000 conversations/hour (paper on H100x8)"
printf '\n'
printf '%s\n' "Check the benchmark results above to see how close we are."
printf '%s\n' "If throughput is still low, check:"
printf '%s\n' "  1. GPU utilization during tests (nvidia-smi dmon -s u)"
printf '%s\n' "  2. vLLM logs in logs/*.log"
printf '%s\n' "  3. Network latency if using remote servers"