#!/bin/bash
# Test vLLM inference speed on interactive node
#
# Usage:
#   1. Get an interactive node:
#      srun --partition=gpu --gres=gpu:4 --time=2:00:00 --pty bash
#
#   2. Run this script:
#      bash scripts/test_vllm_interactive.sh
#
# This script will:
#   1. Start vLLM server for 8B model (agent)
#   2. Start vLLM server for 70B AWQ model (user simulator)
#   3. Run benchmarks
#   4. Compare with paper's 2000 conv/hr target

set -euo pipefail

# Paths
MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
MODEL_70B="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
HF_CACHE="/projects/bfqt/users/yurenh2/hf_cache/huggingface"

# Ports
PORT_8B=8003
PORT_70B=8004

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Server PIDs; pre-initialized so cleanup is safe under `set -u`
# even when a test is skipped and its PID is never assigned.
PID_8B=""
PID_70B=""

echo -e "${GREEN}======================================${NC}"
echo -e "${GREEN}  vLLM Inference Speed Test${NC}"
echo -e "${GREEN}======================================${NC}"
echo ""

# Check GPU availability
echo -e "${YELLOW}Checking GPUs...${NC}"
nvidia-smi --query-gpu=index,name,memory.total --format=csv
NUM_GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
echo -e "Found ${GREEN}${NUM_GPUS}${NC} GPUs"
echo ""

if [[ "$NUM_GPUS" -lt 4 ]]; then
    echo -e "${RED}WARNING: Less than 4 GPUs available. 70B model may not fit.${NC}"
fi

# Setup environment
export HF_HOME=$HF_CACHE
export TRANSFORMERS_CACHE=$HF_CACHE

# Activate conda environment if needed
# source /path/to/conda/etc/profile.d/conda.sh
# conda activate your_env

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents

#######################################
# Start a vLLM OpenAI-compatible server in the background.
# Arguments: $1 model, $2 port, $3 CUDA_VISIBLE_DEVICES list,
#            $4 extra vLLM args (word-split on purpose), $5 log file
# Outputs:   the server PID on stdout (ONLY the PID — status banners go to
#            stderr; the original sent them to stdout, so the callers'
#            PID=$(start_vllm_server ...) captured the banner lines too and
#            later `kill $PID` received garbage arguments)
#######################################
start_vllm_server() {
    local model=$1
    local port=$2
    local gpus=$3
    local extra_args=$4
    local logfile=$5

    echo -e "${YELLOW}Starting vLLM server on port $port with GPUs $gpus...${NC}" >&2
    echo "Model: $model" >&2

    # shellcheck disable=SC2086 -- $extra_args is intentionally unquoted so
    # flags like "--tensor-parallel-size 4" split into separate words.
    CUDA_VISIBLE_DEVICES=$gpus python -m vllm.entrypoints.openai.api_server \
        --model "$model" \
        --port "$port" \
        --gpu-memory-utilization 0.9 \
        --max-model-len 8192 \
        $extra_args \
        > "$logfile" 2>&1 &

    echo $!
}

#######################################
# Poll the server's /health endpoint until it responds or we time out.
# Arguments: $1 port
# Returns:   0 when ready, 1 on timeout (5 minutes)
#######################################
wait_for_server() {
    local port=$1
    local max_wait=300  # 5 minutes
    local waited=0

    echo -n "Waiting for server on port $port"
    while [[ "$waited" -lt "$max_wait" ]]; do
        if curl -s "http://localhost:$port/health" > /dev/null 2>&1; then
            echo -e " ${GREEN}Ready!${NC}"
            return 0
        fi
        echo -n "."
        sleep 5
        waited=$((waited + 5))
    done
    echo -e " ${RED}Timeout!${NC}"
    return 1
}

#######################################
# Kill any servers we started; runs on every exit path via the EXIT trap.
#######################################
cleanup() {
    echo -e "\n${YELLOW}Cleaning up...${NC}"
    if [[ -n "${PID_8B:-}" ]]; then
        kill "$PID_8B" 2>/dev/null || true
    fi
    if [[ -n "${PID_70B:-}" ]]; then
        kill "$PID_70B" 2>/dev/null || true
    fi
    echo "Done."
}
trap cleanup EXIT

# Create log directory
mkdir -p logs

# ============================================
# Test 1: 8B model only (single GPU)
# ============================================
echo -e "\n${GREEN}=== Test 1: 8B Model Benchmark ===${NC}"

PID_8B=$(start_vllm_server "$MODEL_8B" "$PORT_8B" "0" "" "logs/vllm_8b.log")
echo "Server PID: $PID_8B"

if wait_for_server "$PORT_8B"; then
    echo -e "\n${YELLOW}Running 8B benchmark (20 requests)...${NC}"
    python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT_8B/v1" -n 20

    echo -e "\n${YELLOW}Running 8B benchmark with concurrency...${NC}"
    python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT_8B/v1" -n 50 --concurrent
else
    echo -e "${RED}Failed to start 8B server${NC}"
fi

# Stop 8B server
kill "$PID_8B" 2>/dev/null || true
sleep 5

# ============================================
# Test 2: 70B AWQ model (4 GPUs with tensor parallelism)
# ============================================
echo -e "\n${GREEN}=== Test 2: 70B AWQ Model Benchmark ===${NC}"

if [[ "$NUM_GPUS" -ge 4 ]]; then
    PID_70B=$(start_vllm_server "$MODEL_70B" "$PORT_70B" "0,1,2,3" "--tensor-parallel-size 4" "logs/vllm_70b.log")
    echo "Server PID: $PID_70B"

    if wait_for_server "$PORT_70B"; then
        echo -e "\n${YELLOW}Running 70B benchmark (20 requests)...${NC}"
        python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT_70B/v1" -n 20

        echo -e "\n${YELLOW}Running 70B benchmark with concurrency...${NC}"
        python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT_70B/v1" -n 50 --concurrent
    else
        echo -e "${RED}Failed to start 70B server${NC}"
        echo "Check logs/vllm_70b.log for errors"
    fi

    # Stop 70B server
    kill "$PID_70B" 2>/dev/null || true
    sleep 5
else
    echo -e "${YELLOW}Skipping 70B test (need 4 GPUs)${NC}"
fi

# ============================================
# Test 3: Full conversation simulation
# ============================================
echo -e "\n${GREEN}=== Test 3: Full Conversation Simulation ===${NC}"

if [[ "$NUM_GPUS" -ge 4 ]]; then
    # Start both servers
    # 8B on GPU 0, 70B on GPUs 1,2,3 (tensor parallel 3)
    # Or split differently based on memory

    echo "Starting 8B server on GPU 0..."
    PID_8B=$(start_vllm_server "$MODEL_8B" "$PORT_8B" "0" "" "logs/vllm_8b_conv.log")

    echo "Starting 70B server on GPUs 1,2,3..."
    PID_70B=$(start_vllm_server "$MODEL_70B" "$PORT_70B" "1,2,3" "--tensor-parallel-size 3" "logs/vllm_70b_conv.log")

    # BUG FIX: the original called wait_for_server bare (a timeout aborted
    # the script under `set -e`) and then tested `$?`, which checked only
    # the 70B wait. Require BOTH servers to be ready before benchmarking.
    if wait_for_server "$PORT_8B" && wait_for_server "$PORT_70B"; then
        echo -e "\n${YELLOW}Running full conversation benchmark (10 conversations)...${NC}"
        python scripts/benchmark_inference.py --mode conversation \
            --url-8b "http://localhost:$PORT_8B/v1" \
            --url-70b "http://localhost:$PORT_70B/v1" \
            -n 10
    fi
else
    echo -e "${YELLOW}Skipping full conversation test (need 4 GPUs)${NC}"
fi

# ============================================
# Summary
# ============================================
echo -e "\n${GREEN}======================================${NC}"
echo -e "${GREEN}  Test Complete!${NC}"
echo -e "${GREEN}======================================${NC}"
echo ""
echo "Target: 2000 conversations/hour (paper on H100x8)"
echo ""
echo "Check the benchmark results above to see how close we are."
echo "If throughput is still low, check:"
echo "  1. GPU utilization during tests (nvidia-smi dmon -s u)"
echo "  2. vLLM logs in logs/*.log"
echo "  3. Network latency if using remote servers"