diff options
Diffstat (limited to 'collaborativeagents/slurm/test_vllm_only.sh')
| -rw-r--r-- | collaborativeagents/slurm/test_vllm_only.sh | 117 |
1 files changed, 117 insertions, 0 deletions
#!/bin/bash
#SBATCH --job-name=vllm_only
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuA100x4
#SBATCH --nodes=1
#SBATCH --gpus-per-node=1
#SBATCH --time=00:45:00
#SBATCH --mem=64G
#SBATCH --output=slurm/logs/vllm_only_%j.out
#SBATCH --error=slurm/logs/vllm_only_%j.err

# Test vLLM inference speed ONLY (skip transformers which OOMs).
# Starts a vLLM OpenAI-compatible server on one A100, waits for its
# /health endpoint, runs three benchmark passes against it, and is
# guaranteed (via an EXIT trap) to shut the server down on any exit path.
#
# NOTE: not using `set -u` — conda's activate scripts reference unset
# variables and would abort the job under nounset.
set -eo pipefail

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents

# Activate conda environment
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

# Install vLLM if not already installed (requires GPU node for CUDA compilation)
if ! python -c "import vllm" 2>/dev/null; then
  echo "Installing vLLM..."
  pip install vllm --quiet
fi

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface

echo "=== Job Info ==="
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
date

echo ""
echo "=== GPU Info ==="
nvidia-smi --query-gpu=index,name,memory.total,memory.free --format=csv

readonly MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
readonly PORT=8003

# Kill the background server on ANY exit (success, benchmark failure under
# `set -e`, or scancel). Without this trap, a mid-script failure skips the
# final kill and leaves vLLM holding the GPU until walltime expires.
SERVER_PID=""
cleanup() {
  if [[ -n "$SERVER_PID" ]]; then
    kill "$SERVER_PID" 2>/dev/null || true
    wait "$SERVER_PID" 2>/dev/null || true
  fi
}
trap cleanup EXIT

echo ""
echo "============================================"
echo "Starting vLLM Server for 8B Model"
echo "============================================"

# Start vLLM server with memory optimization
python -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_8B" \
  --port "$PORT" \
  --gpu-memory-utilization 0.85 \
  --max-model-len 4096 \
  --disable-log-requests \
  --dtype bfloat16 &
SERVER_PID=$!
echo "vLLM Server PID: $SERVER_PID"

# Wait up to 3 minutes (90 x 2s) for the health endpoint. Bail out of the
# loop early if the server process itself has already died (e.g. CUDA OOM
# while loading weights) instead of sleeping the full timeout.
echo "Waiting for server to start..."
for i in {1..90}; do
  if ! kill -0 "$SERVER_PID" 2>/dev/null; then
    echo "ERROR: vLLM server process exited during startup"
    break
  fi
  if curl -s "http://localhost:$PORT/health" > /dev/null 2>&1; then
    echo "Server ready after $((i * 2)) seconds"
    break
  fi
  sleep 2
done

# Final readiness check; the EXIT trap handles the kill/wait on failure.
if ! curl -s "http://localhost:$PORT/health" > /dev/null 2>&1; then
  echo "ERROR: vLLM server failed to start"
  tail -50 "slurm/logs/vllm_only_${SLURM_JOB_ID}.err"
  exit 1
fi

# Get model info
echo ""
echo "=== vLLM Server Info ==="
curl -s "http://localhost:$PORT/v1/models" | python -m json.tool 2>/dev/null || echo "Could not get model info"

echo ""
echo "============================================"
echo "Test 1: vLLM Sequential (20 requests)"
echo "============================================"
python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT/v1" -n 20

echo ""
echo "============================================"
echo "Test 2: vLLM Sequential (50 requests)"
echo "============================================"
python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT/v1" -n 50

echo ""
echo "============================================"
echo "Test 3: vLLM Concurrent 4 workers (50 req)"
echo "============================================"
python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT/v1" -n 50 --concurrent

# Explicit cleanup on the success path (the EXIT trap remains as the
# safety net for every failure path above).
echo ""
echo "Cleaning up..."
cleanup
trap - EXIT

echo ""
echo "============================================"
echo "BENCHMARK COMPLETE!"
echo "============================================"
echo ""
echo "Key metrics to compare with paper:"
echo "  - Paper: 2000 conversations/hour on H100x8"
echo "  - Expected A100x1: ~200-500 conv/hr"
echo "  - Our old code: ~20 conv/hr (100x slower)"
echo ""
echo "If vLLM shows good throughput, we need to update"
echo "our experiment code to use vLLM instead of transformers."
echo ""
date
