diff options
| author | YurenHao0426 <blackhao0426@gmail.com> | 2026-01-27 09:57:37 -0600 |
|---|---|---|
| committer | YurenHao0426 <blackhao0426@gmail.com> | 2026-01-27 09:57:37 -0600 |
| commit | dc801c07cf38b0c495686463e6ca6f871a64440e (patch) | |
| tree | 599f03114775921dbc472403c701f4a3a8ea188a /collaborativeagents/scripts/test_vllm_interactive.sh | |
| parent | e43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 (diff) | |
Add collaborativeagents module and update gitignore
- Add collaborativeagents subproject with adapters, agents, and evaluation modules
- Update .gitignore to exclude large binary files (.whl, .tar), wandb logs, and results
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Diffstat (limited to 'collaborativeagents/scripts/test_vllm_interactive.sh')
| -rwxr-xr-x | collaborativeagents/scripts/test_vllm_interactive.sh | 212 |
1 files changed, 212 insertions, 0 deletions
#!/bin/bash
# Test vLLM inference speed on an interactive SLURM node.
#
# Usage:
#   1. Get an interactive node:
#        srun --partition=gpu --gres=gpu:4 --time=2:00:00 --pty bash
#   2. Run this script:
#        bash scripts/test_vllm_interactive.sh
#
# This script will:
#   1. Start a vLLM server for the 8B model (agent)
#   2. Start a vLLM server for the 70B AWQ model (user simulator)
#   3. Run benchmarks
#   4. Compare with the paper's 2000 conv/hr target

set -euo pipefail

# Paths
MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
MODEL_70B="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
HF_CACHE="/projects/bfqt/users/yurenh2/hf_cache/huggingface"

# Ports
PORT_8B=8003
PORT_70B=8004

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Server PIDs — initialized empty so the EXIT trap is safe under `set -u`
# even if we die before either server is launched.
PID_8B=""
PID_70B=""

echo -e "${GREEN}======================================${NC}"
echo -e "${GREEN} vLLM Inference Speed Test${NC}"
echo -e "${GREEN}======================================${NC}"
echo ""

# Check GPU availability
echo -e "${YELLOW}Checking GPUs...${NC}"
nvidia-smi --query-gpu=index,name,memory.total --format=csv
NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
echo -e "Found ${GREEN}${NUM_GPUS}${NC} GPUs"
echo ""

if [ "$NUM_GPUS" -lt 4 ]; then
  echo -e "${RED}WARNING: Less than 4 GPUs available. 70B model may not fit.${NC}"
fi

# Setup environment
export HF_HOME="$HF_CACHE"
export TRANSFORMERS_CACHE="$HF_CACHE"

# Activate conda environment if needed
# source /path/to/conda/etc/profile.d/conda.sh
# conda activate your_env

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents

#######################################
# Start a vLLM OpenAI-compatible server in the background.
# Arguments:
#   $1 - model path or HF repo id
#   $2 - port
#   $3 - CUDA_VISIBLE_DEVICES value (e.g. "0" or "0,1,2,3")
#   $4 - extra CLI args (intentionally word-split; may be empty)
#   $5 - logfile path
# Outputs:
#   Prints ONLY the server PID to stdout. Status messages go to stderr so
#   callers can safely capture the PID with $(...); the original version
#   echoed status to stdout too, which corrupted the captured PID.
#######################################
start_vllm_server() {
  local model=$1
  local port=$2
  local gpus=$3
  local extra_args=$4
  local logfile=$5

  echo -e "${YELLOW}Starting vLLM server on port $port with GPUs $gpus...${NC}" >&2
  echo "Model: $model" >&2

  # shellcheck disable=SC2086 — $extra_args is deliberately unquoted so
  # multi-word options like "--tensor-parallel-size 4" split into args.
  CUDA_VISIBLE_DEVICES=$gpus python -m vllm.entrypoints.openai.api_server \
    --model "$model" \
    --port "$port" \
    --gpu-memory-utilization 0.9 \
    --max-model-len 8192 \
    $extra_args \
    > "$logfile" 2>&1 &

  echo $!
}

#######################################
# Poll the server's /health endpoint until it responds or we time out.
# Arguments:
#   $1 - port to probe
# Returns:
#   0 when the server answers, 1 after a 5-minute timeout.
#######################################
wait_for_server() {
  local port=$1
  local max_wait=300 # 5 minutes
  local waited=0

  echo -n "Waiting for server on port $port"
  while [ "$waited" -lt "$max_wait" ]; do
    if curl -s "http://localhost:$port/health" > /dev/null 2>&1; then
      echo -e " ${GREEN}Ready!${NC}"
      return 0
    fi
    echo -n "."
    sleep 5
    waited=$((waited + 5))
  done

  echo -e " ${RED}Timeout!${NC}"
  return 1
}

#######################################
# Kill any servers we started. Runs on every exit path via the EXIT trap.
#######################################
cleanup() {
  echo -e "\n${YELLOW}Cleaning up...${NC}"
  if [ -n "$PID_8B" ]; then
    kill "$PID_8B" 2>/dev/null || true
  fi
  if [ -n "$PID_70B" ]; then
    kill "$PID_70B" 2>/dev/null || true
  fi
  echo "Done."
}

trap cleanup EXIT

# Create log directory
mkdir -p logs

# ============================================
# Test 1: 8B model only (single GPU)
# ============================================
echo -e "\n${GREEN}=== Test 1: 8B Model Benchmark ===${NC}"

PID_8B=$(start_vllm_server "$MODEL_8B" "$PORT_8B" "0" "" "logs/vllm_8b.log")
echo "Server PID: $PID_8B"

if wait_for_server "$PORT_8B"; then
  echo -e "\n${YELLOW}Running 8B benchmark (20 requests)...${NC}"
  python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT_8B/v1" -n 20

  echo -e "\n${YELLOW}Running 8B benchmark with concurrency...${NC}"
  python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT_8B/v1" -n 50 --concurrent
else
  echo -e "${RED}Failed to start 8B server${NC}"
fi

# Stop 8B server
kill "$PID_8B" 2>/dev/null || true
PID_8B=""
sleep 5

# ============================================
# Test 2: 70B AWQ model (4 GPUs with tensor parallelism)
# ============================================
echo -e "\n${GREEN}=== Test 2: 70B AWQ Model Benchmark ===${NC}"

if [ "$NUM_GPUS" -ge 4 ]; then
  PID_70B=$(start_vllm_server "$MODEL_70B" "$PORT_70B" "0,1,2,3" "--tensor-parallel-size 4" "logs/vllm_70b.log")
  echo "Server PID: $PID_70B"

  if wait_for_server "$PORT_70B"; then
    echo -e "\n${YELLOW}Running 70B benchmark (20 requests)...${NC}"
    python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT_70B/v1" -n 20

    echo -e "\n${YELLOW}Running 70B benchmark with concurrency...${NC}"
    python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT_70B/v1" -n 50 --concurrent
  else
    echo -e "${RED}Failed to start 70B server${NC}"
    echo "Check logs/vllm_70b.log for errors"
  fi

  # Stop 70B server
  kill "$PID_70B" 2>/dev/null || true
  PID_70B=""
  sleep 5
else
  echo -e "${YELLOW}Skipping 70B test (need 4 GPUs)${NC}"
fi

# ============================================
# Test 3: Full conversation simulation
# ============================================
echo -e "\n${GREEN}=== Test 3: Full Conversation Simulation ===${NC}"

if [ "$NUM_GPUS" -ge 4 ]; then
  # Start both servers:
  # 8B on GPU 0, 70B on GPUs 1,2,3 (tensor parallel 3).
  # Or split differently based on memory.

  echo "Starting 8B server on GPU 0..."
  PID_8B=$(start_vllm_server "$MODEL_8B" "$PORT_8B" "0" "" "logs/vllm_8b_conv.log")

  echo "Starting 70B server on GPUs 1,2,3..."
  PID_70B=$(start_vllm_server "$MODEL_70B" "$PORT_70B" "1,2,3" "--tensor-parallel-size 3" "logs/vllm_70b_conv.log")

  # Require BOTH servers to be ready. The original checked only $? of the
  # second wait_for_server call (and a failed first call would have aborted
  # the script under `set -e` before the check even ran).
  if wait_for_server "$PORT_8B" && wait_for_server "$PORT_70B"; then
    echo -e "\n${YELLOW}Running full conversation benchmark (10 conversations)...${NC}"
    python scripts/benchmark_inference.py --mode conversation \
      --url-8b "http://localhost:$PORT_8B/v1" \
      --url-70b "http://localhost:$PORT_70B/v1" \
      -n 10
  else
    echo -e "${RED}One or both servers failed to start; skipping conversation benchmark${NC}"
  fi
else
  echo -e "${YELLOW}Skipping full conversation test (need 4 GPUs)${NC}"
fi

# ============================================
# Summary
# ============================================
echo -e "\n${GREEN}======================================${NC}"
echo -e "${GREEN} Test Complete!${NC}"
echo -e "${GREEN}======================================${NC}"
echo ""
echo "Target: 2000 conversations/hour (paper on H100x8)"
echo ""
echo "Check the benchmark results above to see how close we are."
echo "If throughput is still low, check:"
echo " 1. GPU utilization during tests (nvidia-smi dmon -s u)"
echo " 2. vLLM logs in logs/*.log"
echo " 3. Network latency if using remote servers"
