#!/bin/bash
#SBATCH --job-name=test_vllm_a100
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuA100x4
#SBATCH --gres=gpu:4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --mem=128G
#SBATCH --time=00:30:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_vllm_a100-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/test_vllm_a100-%j.err

set -e

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}:${PWD}/scripts:${PWD}/../src:${PYTHONPATH}"

MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"

echo "=== vLLM Speed Test (A100) ==="
date
nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv

pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

echo ""
echo "=== Test 1: ContextualAdapter with vLLM (2 servers) ==="

CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_8B" --port 8004 --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 --max-model-len 8192 \
    --disable-log-requests --dtype bfloat16 &

CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_8B" --port 8003 --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 --max-model-len 8192 \
    --disable-log-requests --dtype bfloat16 &

for i in $(seq 1 120); do
    u=$(curl -s http://localhost:8004/health > /dev/null 2>&1 && echo 1 || echo 0)
    a=$(curl -s http://localhost:8003/health > /dev/null 2>&1 && echo 1 || echo 0)
    if [ "$u" = "1" ] && [ "$a" = "1" ]; then
        echo "Both servers ready after $((i*2))s"
        break
    fi
    sleep 2
done

python -c "
import time
import sys
sys.path.insert(0, '.')
from adapters.contextual_adapter import ContextualAdapter

print('Testing ContextualAdapter with vLLM...')
adapter = ContextualAdapter(vllm_url='http://localhost:8003/v1')
adapter.initialize()
adapter.start_session('test_user')

# Warm up
adapter.generate_response('Hello')

# Benchmark
n_requests = 20
start = time.time()
for i in range(n_requests):
    resp = adapter.generate_response(f'Solve: What is {i*7} + {i*3}? Give a brief answer.')
elapsed = time.time() - start

print(f'ContextualAdapter (vLLM): {n_requests} requests in {elapsed:.2f}s')
print(f'Throughput: {n_requests/elapsed:.2f} req/s = {n_requests/elapsed*3600:.0f} requests/hr')
print(f'Estimated sessions/hr (5 turns/session): {n_requests/elapsed*3600/5:.0f}')
"

pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 5

echo ""
echo "=== Test 2: PersonalizedLLMAdapter (vLLM user + transformers) ==="

CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_8B" --port 8004 --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 --max-model-len 8192 \
    --disable-log-requests --dtype bfloat16 &

for i in $(seq 1 120); do
    if curl -s http://localhost:8004/health > /dev/null 2>&1; then
        echo "Server ready after $((i*2))s"
        break
    fi
    sleep 2
done

CUDA_VISIBLE_DEVICES=2,3 python -c "
import time
import sys
sys.path.insert(0, '.')
from adapters.personalized_llm_adapter import create_baseline_adapter

print('Testing PersonalizedLLMAdapter (all_memory)...')
adapter = create_baseline_adapter('all_memory')
adapter.initialize()
adapter.start_session('test_user')

# Warm up
adapter.generate_response('Hello')

# Benchmark
n_requests = 10
start = time.time()
for i in range(n_requests):
    resp = adapter.generate_response(f'Solve: What is {i*7} + {i*3}? Give a brief answer.')
elapsed = time.time() - start

print(f'PersonalizedLLMAdapter (transformers): {n_requests} requests in {elapsed:.2f}s')
print(f'Throughput: {n_requests/elapsed:.2f} req/s = {n_requests/elapsed*3600:.0f} requests/hr')
print(f'Estimated sessions/hr (5 turns/session): {n_requests/elapsed*3600/5:.0f}')
"

pkill -f "vllm.entrypoints" 2>/dev/null || true

echo ""
echo "=== Test Complete ==="
date
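
# ---------------------------------------------------------------------------
# Optional hardening sketch, not exercised by the run above. The readiness
# loops fall through after ~240s even when a server never comes up, and with
# `set -e` an early failure can leave vLLM servers running. An EXIT trap plus
# a hypothetical wait_ready helper (same /health endpoint the loops above
# poll) would cover both; to take effect, these lines would go near the top
# of the script, before the servers are launched.
#
# trap 'pkill -f "vllm.entrypoints" 2>/dev/null || true' EXIT
#
# wait_ready() {
#     local port=$1
#     for i in $(seq 1 120); do
#         if curl -s "http://localhost:${port}/health" > /dev/null 2>&1; then
#             echo "Server on port ${port} ready after $((i*2))s"
#             return 0
#         fi
#         sleep 2
#     done
#     echo "Server on port ${port} not ready after 240s" >&2
#     return 1
# }
#
# wait_ready 8004 && wait_ready 8003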
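
# ---------------------------------------------------------------------------
# Debugging sketch: a direct request against one of the servers started
# above, via the OpenAI-compatible /v1/completions route that vLLM's
# api_server exposes. The model field must match the --model path passed at
# launch. Handy for separating server problems from adapter problems; not
# part of the benchmark itself.
#
# curl -s http://localhost:8003/v1/completions \
#     -H 'Content-Type: application/json' \
#     -d "{\"model\": \"${MODEL_8B}\", \"prompt\": \"2+2=\", \"max_tokens\": 8}"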