diff options
| author | YurenHao0426 <blackhao0426@gmail.com> | 2026-01-27 09:57:37 -0600 |
|---|---|---|
| committer | YurenHao0426 <blackhao0426@gmail.com> | 2026-01-27 09:57:37 -0600 |
| commit | dc801c07cf38b0c495686463e6ca6f871a64440e (patch) | |
| tree | 599f03114775921dbc472403c701f4a3a8ea188a /collaborativeagents/slurm/test_vllm_benchmark.sh | |
| parent | e43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 (diff) | |
Add collaborativeagents module and update gitignore
- Add collaborativeagents subproject with adapters, agents, and evaluation modules
- Update .gitignore to exclude large binary files (.whl, .tar), wandb logs, and results
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Diffstat (limited to 'collaborativeagents/slurm/test_vllm_benchmark.sh')
| -rw-r--r-- | collaborativeagents/slurm/test_vllm_benchmark.sh | 102 |
1 file changed, 102 insertions, 0 deletions
#!/bin/bash
#SBATCH --job-name=vllm_bench
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuA100x4
#SBATCH --nodes=1
#SBATCH --gpus-per-node=1
#SBATCH --time=00:30:00
#SBATCH --output=slurm/logs/vllm_bench_%j.out
#SBATCH --error=slurm/logs/vllm_bench_%j.err

# Benchmark vLLM vs plain transformers inference speed on one A100.
#
# Runs three measurements via scripts/benchmark_inference.py:
#   1. transformers baseline        (10 sequential requests)
#   2. vLLM OpenAI-compatible API   (20 sequential requests)
#   3. vLLM OpenAI-compatible API   (50 concurrent requests)
#
# NOTE(review): slurm/logs/ must exist at submission time — Slurm opens the
# --output/--error files before this script runs.
set -euo pipefail

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
# TRANSFORMERS_CACHE is deprecated in newer transformers releases; kept for
# compatibility with older versions that still read it.
export TRANSFORMERS_CACHE=/projects/bfqt/users/yurenh2/hf_cache/huggingface

echo "=== Job Info ==="
echo "Job ID: ${SLURM_JOB_ID:-n/a}"
echo "Node: ${SLURM_NODELIST:-n/a}"
echo "GPUs: ${SLURM_GPUS_ON_NODE:-n/a}"
date

echo ""
echo "=== GPU Info ==="
nvidia-smi --query-gpu=index,name,memory.total,memory.free --format=csv

readonly MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
readonly PORT=8003

# Always reap the background vLLM server, even when a benchmark step fails
# under `set -e` — otherwise the server keeps the GPU busy until the job's
# time limit expires.
SERVER_PID=""
cleanup() {
  if [[ -n "$SERVER_PID" ]]; then
    kill "$SERVER_PID" 2>/dev/null || true
    wait "$SERVER_PID" 2>/dev/null || true
  fi
}
trap cleanup EXIT

# ============================================
# Test 1: Transformers baseline
# ============================================
echo ""
echo "============================================"
echo "Test 1: Transformers Baseline (10 requests)"
echo "============================================"
python scripts/benchmark_inference.py --mode transformers --model "$MODEL_8B" -n 10

# ============================================
# Test 2: vLLM server
# ============================================
echo ""
echo "============================================"
echo "Test 2: Starting vLLM Server"
echo "============================================"

# Start the OpenAI-compatible vLLM server in the background.
python -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_8B" \
  --port "$PORT" \
  --gpu-memory-utilization 0.9 \
  --max-model-len 4096 \
  --disable-log-requests &
SERVER_PID=$!
echo "vLLM Server PID: $SERVER_PID"

# Poll the health endpoint for up to 5 minutes; bail out early if the server
# process has already died (e.g. OOM at model load) instead of waiting the
# full timeout.
echo "Waiting for server to start..."
ready=0
for i in {1..60}; do
  if curl -s "http://localhost:$PORT/health" > /dev/null 2>&1; then
    echo "Server ready after $((i * 5)) seconds"
    ready=1
    break
  fi
  if ! kill -0 "$SERVER_PID" 2>/dev/null; then
    echo "ERROR: vLLM server process exited during startup" >&2
    break
  fi
  sleep 5
done

if (( ready == 0 )); then
  echo "ERROR: vLLM server failed to start" >&2
  exit 1   # trap cleanup kills/reaps the server if it is still around
fi

echo ""
echo "============================================"
echo "Test 2a: vLLM Sequential (20 requests)"
echo "============================================"
python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT/v1" -n 20

echo ""
echo "============================================"
echo "Test 2b: vLLM Concurrent (50 requests)"
echo "============================================"
python scripts/benchmark_inference.py --mode vllm --url "http://localhost:$PORT/v1" -n 50 --concurrent

# Explicit cleanup on the success path (the EXIT trap is then a no-op re-kill).
echo ""
echo "Cleaning up..."
cleanup
SERVER_PID=""

echo ""
echo "============================================"
echo "Benchmark Complete!"
echo "============================================"
echo ""
echo "Target: 2000 conversations/hour (paper on H100x8)"
echo "Our A100x4 should achieve ~500-1000 conv/hr with vLLM"
echo ""
date
