summaryrefslogtreecommitdiff
path: root/collaborativeagents/scripts/test_vllm_interactive.sh
diff options
context:
space:
mode:
Diffstat (limited to 'collaborativeagents/scripts/test_vllm_interactive.sh')
-rwxr-xr-xcollaborativeagents/scripts/test_vllm_interactive.sh212
1 file changed, 212 insertions, 0 deletions
diff --git a/collaborativeagents/scripts/test_vllm_interactive.sh b/collaborativeagents/scripts/test_vllm_interactive.sh
new file mode 100755
index 0000000..5da73b4
--- /dev/null
+++ b/collaborativeagents/scripts/test_vllm_interactive.sh
@@ -0,0 +1,212 @@
#!/bin/bash
# Test vLLM inference speed on interactive node
#
# Usage:
#   1. Get an interactive node:
#      srun --partition=gpu --gres=gpu:4 --time=2:00:00 --pty bash
#   2. Run this script:
#      bash scripts/test_vllm_interactive.sh
#
# This script will:
#   1. Start vLLM server for 8B model (agent)
#   2. Start vLLM server for 70B AWQ model (user simulator)
#   3. Run benchmarks
#   4. Compare with paper's 2000 conv/hr target

set -e

# Paths (readonly: constants that must not be reassigned below).
readonly MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
readonly MODEL_70B="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
readonly HF_CACHE="/projects/bfqt/users/yurenh2/hf_cache/huggingface"

# Ports for the two OpenAI-compatible vLLM servers.
readonly PORT_8B=8003
readonly PORT_70B=8004

# ANSI colors for progress output.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # No Color

echo -e "${GREEN}======================================${NC}"
echo -e "${GREEN} vLLM Inference Speed Test${NC}"
echo -e "${GREEN}======================================${NC}"
echo ""

# Check GPU availability. Fail early with a clear message if nvidia-smi
# is missing (e.g. the script was run on a login node, not a GPU node);
# without this guard, set -e would kill the script with a cryptic
# "command not found".
if ! command -v nvidia-smi > /dev/null 2>&1; then
    echo -e "${RED}nvidia-smi not found — run this on a GPU node.${NC}" >&2
    exit 1
fi
echo -e "${YELLOW}Checking GPUs...${NC}"
nvidia-smi --query-gpu=index,name,memory.total --format=csv
NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
echo -e "Found ${GREEN}${NUM_GPUS}${NC} GPUs"
echo ""

if [ "$NUM_GPUS" -lt 4 ]; then
    echo -e "${RED}WARNING: Less than 4 GPUs available. 70B model may not fit.${NC}"
fi

# Point HuggingFace caches at project storage. TRANSFORMERS_CACHE is
# kept alongside HF_HOME for compatibility with older transformers.
export HF_HOME="$HF_CACHE"
export TRANSFORMERS_CACHE="$HF_CACHE"

# Activate conda environment if needed
# source /path/to/conda/etc/profile.d/conda.sh
# conda activate your_env

# set -e aborts here if the project directory is missing.
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
# Start a vLLM OpenAI-compatible server in the background.
#
# Arguments:
#   $1 - model path or HuggingFace repo id
#   $2 - port to listen on
#   $3 - CUDA_VISIBLE_DEVICES value (e.g. "0" or "0,1,2,3")
#   $4 - extra vLLM args as one string (word-split on purpose; may be empty)
#   $5 - log file path (server stdout+stderr are redirected there)
# Outputs:
#   The background server's PID on stdout — and ONLY the PID. Callers
#   capture it with $(...), so progress messages must go to stderr.
start_vllm_server() {
    local model=$1
    local port=$2
    local gpus=$3
    local extra_args=$4
    local logfile=$5

    # BUG FIX: these status lines previously went to stdout, so
    # PID_8B=$(start_vllm_server ...) captured the status text along
    # with the PID, and the later `kill $PID_8B` got garbage. They must
    # go to stderr.
    echo -e "${YELLOW}Starting vLLM server on port $port with GPUs $gpus...${NC}" >&2
    echo "Model: $model" >&2

    # shellcheck disable=SC2086 — $extra_args is intentionally unquoted
    # so it expands into separate flags (e.g. "--tensor-parallel-size 4").
    CUDA_VISIBLE_DEVICES=$gpus python -m vllm.entrypoints.openai.api_server \
        --model "$model" \
        --port "$port" \
        --gpu-memory-utilization 0.9 \
        --max-model-len 8192 \
        $extra_args \
        > "$logfile" 2>&1 &

    echo $!
}
+
# Poll a server's /health endpoint on localhost until it answers or we
# time out.
#
# Arguments:
#   $1 - port to probe
#   $2 - (optional) max seconds to wait; default 300 (5 minutes)
#   $3 - (optional) seconds between probes; default 5
# Returns:
#   0 once the server responds, 1 on timeout.
wait_for_server() {
    local port=$1
    local max_wait=${2:-300}
    local interval=${3:-5}
    local waited=0

    echo -n "Waiting for server on port $port"
    while [ "$waited" -lt "$max_wait" ]; do
        # curl -s exits non-zero while the server is still loading.
        if curl -s "http://localhost:$port/health" > /dev/null 2>&1; then
            echo -e " ${GREEN}Ready!${NC}"
            return 0
        fi
        echo -n "."
        sleep "$interval"
        waited=$((waited + interval))
    done

    echo -e " ${RED}Timeout!${NC}"
    return 1
}
+
# Kill any vLLM servers we started. Safe to call when no server is
# running: empty/unset PIDs are skipped and kill failures are ignored.
cleanup() {
    echo -e "\n${YELLOW}Cleaning up...${NC}"
    local pid
    for pid in "$PID_8B" "$PID_70B"; do
        [ -n "$pid" ] && kill "$pid" 2>/dev/null || true
    done
    echo "Done."
}
+
# Ensure servers are killed on every exit path (normal end, set -e
# abort, Ctrl-C) — the benchmarks below can fail midway.
trap cleanup EXIT

# Create log directory
mkdir -p logs

# ============================================
# Test 1: 8B model only (single GPU)
# ============================================
echo -e "\n${GREEN}=== Test 1: 8B Model Benchmark ===${NC}"

# PID is captured from the function's stdout; server logs go to the file.
PID_8B=$(start_vllm_server "$MODEL_8B" $PORT_8B "0" "" "logs/vllm_8b.log")
echo "Server PID: $PID_8B"

if wait_for_server $PORT_8B; then
    echo -e "\n${YELLOW}Running 8B benchmark (20 requests)...${NC}"
    python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_8B/v1 -n 20

    echo -e "\n${YELLOW}Running 8B benchmark with concurrency...${NC}"
    python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_8B/v1 -n 50 --concurrent
else
    echo -e "${RED}Failed to start 8B server${NC}"
fi

# Stop 8B server
# (|| true: the server may already be dead; don't trip set -e.
#  sleep gives the GPU time to release memory before the next test.)
kill $PID_8B 2>/dev/null || true
sleep 5
+
# ============================================
# Test 2: 70B AWQ model (4 GPUs with tensor parallelism)
# ============================================
echo -e "\n${GREEN}=== Test 2: 70B AWQ Model Benchmark ===${NC}"

if [ "$NUM_GPUS" -ge 4 ]; then
    # AWQ INT4 quantized 70B, sharded across all 4 GPUs.
    PID_70B=$(start_vllm_server "$MODEL_70B" $PORT_70B "0,1,2,3" "--tensor-parallel-size 4" "logs/vllm_70b.log")
    echo "Server PID: $PID_70B"

    if wait_for_server $PORT_70B; then
        echo -e "\n${YELLOW}Running 70B benchmark (20 requests)...${NC}"
        python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_70B/v1 -n 20

        echo -e "\n${YELLOW}Running 70B benchmark with concurrency...${NC}"
        python scripts/benchmark_inference.py --mode vllm --url http://localhost:$PORT_70B/v1 -n 50 --concurrent
    else
        echo -e "${RED}Failed to start 70B server${NC}"
        echo "Check logs/vllm_70b.log for errors"
    fi

    # Stop 70B server
    # (|| true: may already be dead; don't trip set -e.)
    kill $PID_70B 2>/dev/null || true
    sleep 5
else
    echo -e "${YELLOW}Skipping 70B test (need 4 GPUs)${NC}"
fi
+
# ============================================
# Test 3: Full conversation simulation
# ============================================
echo -e "\n${GREEN}=== Test 3: Full Conversation Simulation ===${NC}"

if [ "$NUM_GPUS" -ge 4 ]; then
    # Run both servers at once: 8B on GPU 0, 70B on GPUs 1,2,3
    # (tensor parallel 3). Adjust the split if memory is tight.

    echo "Starting 8B server on GPU 0..."
    PID_8B=$(start_vllm_server "$MODEL_8B" $PORT_8B "0" "" "logs/vllm_8b_conv.log")

    echo "Starting 70B server on GPUs 1,2,3..."
    PID_70B=$(start_vllm_server "$MODEL_70B" $PORT_70B "1,2,3" "--tensor-parallel-size 3" "logs/vllm_70b_conv.log")

    # BUG FIX: the original called wait_for_server twice as bare
    # statements and then tested $?, which (a) only checked the second
    # server and (b) aborted the whole script under `set -e` when the
    # first wait timed out. Check both inside the `if` instead.
    if wait_for_server $PORT_8B && wait_for_server $PORT_70B; then
        echo -e "\n${YELLOW}Running full conversation benchmark (10 conversations)...${NC}"
        python scripts/benchmark_inference.py --mode conversation \
            --url-8b http://localhost:$PORT_8B/v1 \
            --url-70b http://localhost:$PORT_70B/v1 \
            -n 10
    else
        echo -e "${RED}Servers did not become ready; skipping conversation benchmark${NC}"
        echo "Check logs/vllm_8b_conv.log and logs/vllm_70b_conv.log for errors"
    fi
else
    echo -e "${YELLOW}Skipping full conversation test (need 4 GPUs)${NC}"
fi
+
# ============================================
# Summary
# ============================================
# Closing banner plus pointers for diagnosing low throughput.
# printf '%b' interprets escapes in the argument, matching `echo -e`.
printf '\n%b\n' "${GREEN}======================================${NC}"
printf '%b\n' "${GREEN} Test Complete!${NC}"
printf '%b\n' "${GREEN}======================================${NC}"
printf '\n'
printf '%s\n' "Target: 2000 conversations/hour (paper on H100x8)"
printf '\n'
printf '%s\n' "Check the benchmark results above to see how close we are."
printf '%s\n' "If throughput is still low, check:"
printf '%s\n' "  1. GPU utilization during tests (nvidia-smi dmon -s u)"
printf '%s\n' "  2. vLLM logs in logs/*.log"
printf '%s\n' "  3. Network latency if using remote servers"