#!/bin/bash
# Quick test of parallel vLLM processing on H200x8-interactive.
# Simplified version for 1 hour time limit.
#
# Starts two vLLM OpenAI-compatible servers (70B AWQ on GPUs 0-1 with TP=2,
# 8B on GPU 2), waits for both /health endpoints, then runs three throughput
# tests (1 / 4 / 8 parallel profiles) and prints the wall-clock time of each
# so the parallel speedup can be compared.
#
# -u catches unset vars; pipefail makes `python … | tail` propagate python's
# failure instead of silently reporting tail's success.
set -euo pipefail

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
# ${PYTHONPATH:-} so an unset PYTHONPATH doesn't trip set -u.
export PYTHONPATH="${PWD}:${PWD}/../src:${PYTHONPATH:-}"

# Configuration
readonly MODEL_70B_AWQ="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
readonly MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
readonly PORT_70B=8004
readonly PORT_8B=8003

SERVER_70B_PID=""
SERVER_8B_PID=""

# Kill both background vLLM servers. Registered on EXIT so the servers are
# never leaked on the node, even when a test aborts mid-run under set -e.
cleanup() {
  kill "${SERVER_70B_PID:-}" "${SERVER_8B_PID:-}" 2>/dev/null || true
}
trap cleanup EXIT

#######################################
# Probe a server's /health endpoint.
# Arguments: $1 - port
# Returns:   0 if healthy, non-zero otherwise
#######################################
healthy() {
  curl -s "http://localhost:$1/health" > /dev/null 2>&1
}

#######################################
# Run one throughput test and print its wall-clock time.
# Arguments: $1 - human-readable label
#            $2 - profile count (used for --n-profiles, --parallel-profiles
#                 and the results/quick_test_<n> output dir)
#######################################
run_test() {
  local label=$1
  local n=$2
  local start end
  echo ""
  echo "--- ${label} ---"
  start=$(date +%s)
  python run_experiments.py \
    --methods vanilla \
    --datasets mmlu \
    --n-profiles "$n" \
    --n-sessions 2 \
    --use-vllm \
    --vllm-user-url "http://localhost:${PORT_70B}/v1" \
    --vllm-agent-url "http://localhost:${PORT_8B}/v1" \
    --parallel-profiles "$n" \
    --output-dir "../results/quick_test_${n}" \
    --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -20
  end=$(date +%s)
  echo "Time: $((end - start)) seconds"
}

echo "============================================"
echo "Quick Parallel vLLM Test (H200)"
echo "============================================"
date
echo "Node: $(hostname)"
nvidia-smi --query-gpu=index,name,memory.total --format=csv
echo ""

# Kill any existing vLLM servers (best-effort; ok if none are running).
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

# Start 70B AWQ server on GPU 0-1 (TP=2)
echo "Starting 70B AWQ server (GPU 0-1, TP=2)..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_70B_AWQ" \
  --port "$PORT_70B" \
  --tensor-parallel-size 2 \
  --gpu-memory-utilization 0.90 \
  --max-model-len 4096 \
  --disable-log-requests \
  --quantization awq \
  --dtype float16 &
SERVER_70B_PID=$!

# Start 8B server on GPU 2
echo "Starting 8B server (GPU 2)..."
CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_8B" \
  --port "$PORT_8B" \
  --gpu-memory-utilization 0.90 \
  --max-model-len 4096 \
  --disable-log-requests \
  --dtype bfloat16 &
SERVER_8B_PID=$!

echo "Waiting for servers (up to 5 min)..."
# Poll both health endpoints every 3s, up to 100 tries (~5 min).
ready_70b=0
ready_8b=0
for ((i = 1; i <= 100; i++)); do
  healthy "$PORT_70B" && ready_70b=1 || ready_70b=0
  healthy "$PORT_8B" && ready_8b=1 || ready_8b=0
  if [[ "$ready_70b" == 1 && "$ready_8b" == 1 ]]; then
    echo "Both servers ready after $((i * 3)) seconds"
    break
  fi
  if (( i % 20 == 0 )); then
    echo " Still waiting... 70B=${ready_70b}, 8B=${ready_8b} ($((i * 3))s)"
  fi
  sleep 3
done

# Final health check — the EXIT trap kills the servers on failure.
if ! healthy "$PORT_70B"; then
  echo "ERROR: 70B server not healthy" >&2
  exit 1
fi
if ! healthy "$PORT_8B"; then
  echo "ERROR: 8B server not healthy" >&2
  exit 1
fi
echo "✓ Both servers healthy"

cd scripts

echo ""
echo "============================================"
echo "Running throughput tests..."
echo "============================================"

run_test "Test 1: Sequential (1 profile)" 1
run_test "Test 2: Parallel (4 profiles)" 4
run_test "Test 3: Parallel (8 profiles)" 8

# Cleanup (the EXIT trap would also handle this; kill twice is harmless).
echo ""
echo "Cleaning up..."
cleanup

echo ""
echo "============================================"
echo "TEST COMPLETE!"
echo "============================================"
echo ""
echo "Summary: Compare timing above"
echo " - Sequential (1 profile): baseline"
echo " - Parallel (4 profiles): should be faster per profile"
echo " - Parallel (8 profiles): should show more speedup"
echo ""
date