summaryrefslogtreecommitdiff
path: root/collaborativeagents/scripts/test_parallel_quick.sh
diff options
context:
space:
mode:
Diffstat (limited to 'collaborativeagents/scripts/test_parallel_quick.sh')
-rwxr-xr-xcollaborativeagents/scripts/test_parallel_quick.sh158
1 file changed, 158 insertions, 0 deletions
diff --git a/collaborativeagents/scripts/test_parallel_quick.sh b/collaborativeagents/scripts/test_parallel_quick.sh
new file mode 100755
index 0000000..8429da7
--- /dev/null
+++ b/collaborativeagents/scripts/test_parallel_quick.sh
@@ -0,0 +1,158 @@
#!/bin/bash
# Quick test of parallel vLLM processing on H200x8-interactive
# Simplified version for 1 hour time limit

set -e

# Work from the project root; activate the evaluation conda environment.
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

# Shared HF model cache on the project filesystem (avoids re-downloads).
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
# Prepend project dirs to PYTHONPATH. ${PYTHONPATH:+:${PYTHONPATH}} appends
# the existing value only when it is set and non-empty, so we never produce a
# trailing ":" (an empty PYTHONPATH entry would add the CWD to sys.path).
export PYTHONPATH="${PWD}:${PWD}/../src${PYTHONPATH:+:${PYTHONPATH}}"
+
# Configuration — constants for the whole run, so mark them readonly.
readonly MODEL_70B_AWQ="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
readonly MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
readonly PORT_70B=8004
readonly PORT_8B=8003

echo "============================================"
echo "Quick Parallel vLLM Test (H200)"
echo "============================================"
date
echo "Node: $(hostname)"
# Log the GPU inventory so throughput numbers can be matched to hardware.
nvidia-smi --query-gpu=index,name,memory.total --format=csv
echo ""

# Kill any vLLM servers left over from a previous run; "|| true" keeps
# set -e from aborting when no matching process exists.
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2
+
# Start 70B AWQ server on GPUs 0-1 (tensor parallel = 2). Quoting the
# expansions guards against word-splitting (ShellCheck SC2086).
echo "Starting 70B AWQ server (GPU 0-1, TP=2)..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_70B_AWQ" \
    --port "$PORT_70B" \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 \
    --max-model-len 4096 \
    --disable-log-requests \
    --quantization awq \
    --dtype float16 &
SERVER_70B_PID=$!

# Start 8B server on GPU 2 (single GPU, bf16, no quantization).
echo "Starting 8B server (GPU 2)..."
CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_8B" \
    --port "$PORT_8B" \
    --gpu-memory-utilization 0.90 \
    --max-model-len 4096 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_8B_PID=$!
+
echo "Waiting for servers (up to 5 min)..."

# Poll both /health endpoints: 100 iterations * 3 s = up to 300 s.
# Also check process liveness (kill -0) each round so a server that crashed
# during startup aborts the run immediately instead of burning the full 5 min.
for i in {1..100}; do
    if ! kill -0 "$SERVER_70B_PID" 2>/dev/null; then
        echo "ERROR: 70B server process exited during startup"
        kill "$SERVER_8B_PID" 2>/dev/null || true
        exit 1
    fi
    if ! kill -0 "$SERVER_8B_PID" 2>/dev/null; then
        echo "ERROR: 8B server process exited during startup"
        kill "$SERVER_70B_PID" 2>/dev/null || true
        exit 1
    fi

    READY_70B=$(curl -s "http://localhost:$PORT_70B/health" > /dev/null 2>&1 && echo 1 || echo 0)
    READY_8B=$(curl -s "http://localhost:$PORT_8B/health" > /dev/null 2>&1 && echo 1 || echo 0)

    if [ "$READY_70B" = "1" ] && [ "$READY_8B" = "1" ]; then
        echo "Both servers ready after $((i*3)) seconds"
        break
    fi
    # Progress heartbeat every 20 iterations (~60 s).
    if [ $((i % 20)) -eq 0 ]; then
        echo " Still waiting... 70B=$READY_70B, 8B=$READY_8B ($((i*3))s)"
    fi
    sleep 3
done

# Final health verification; on failure reap both servers and exit non-zero.
# "|| true" on kill keeps set -e from masking the intended exit status when a
# PID is already gone.
if ! curl -s "http://localhost:$PORT_70B/health" > /dev/null; then
    echo "ERROR: 70B server not healthy"
    kill "$SERVER_70B_PID" "$SERVER_8B_PID" 2>/dev/null || true
    exit 1
fi
if ! curl -s "http://localhost:$PORT_8B/health" > /dev/null; then
    echo "ERROR: 8B server not healthy"
    kill "$SERVER_70B_PID" "$SERVER_8B_PID" 2>/dev/null || true
    exit 1
fi
echo "✓ Both servers healthy"
+
cd scripts

echo ""
echo "============================================"
echo "Running throughput tests..."
echo "============================================"

#######################################
# Run one vanilla/mmlu throughput experiment against both vLLM servers
# and report wall-clock time.
# Globals:   PORT_70B, PORT_8B (read)
# Arguments: $1 - test label for the banner
#            $2 - number of profiles
#            $3 - profiles to run in parallel
#            $4 - output directory
# Outputs:   last 20 lines of experiment output + elapsed seconds
#######################################
run_test() {
    local label=$1 n_profiles=$2 parallel=$3 out_dir=$4
    local start end

    echo ""
    echo "--- $label ---"
    start=$(date +%s)
    # tail -20 keeps the log readable; note it also masks the python exit
    # status (intentional here: later tests still run if one fails).
    python run_experiments.py \
        --methods vanilla \
        --datasets mmlu \
        --n-profiles "$n_profiles" \
        --n-sessions 2 \
        --use-vllm \
        --vllm-user-url "http://localhost:$PORT_70B/v1" \
        --vllm-agent-url "http://localhost:$PORT_8B/v1" \
        --parallel-profiles "$parallel" \
        --output-dir "$out_dir" \
        --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -20
    end=$(date +%s)
    echo "Time: $((end-start)) seconds"
}

# Test 1: Sequential baseline (1 profile, 2 sessions)
run_test "Test 1: Sequential (1 profile)" 1 1 ../results/quick_test_1

# Test 2: Parallel (4 profiles, 2 sessions each)
run_test "Test 2: Parallel (4 profiles)" 4 4 ../results/quick_test_4

# Test 3: Parallel (8 profiles, 2 sessions each)
run_test "Test 3: Parallel (8 profiles)" 8 8 ../results/quick_test_8
+
# Cleanup: stop both vLLM servers; ignore errors if they already exited.
echo ""
echo "Cleaning up..."
kill "$SERVER_70B_PID" "$SERVER_8B_PID" 2>/dev/null || true

# Final summary banner (quoted heredoc: no expansion needed).
cat <<'EOF'

============================================
TEST COMPLETE!
============================================

Summary: Compare timing above
 - Sequential (1 profile): baseline
 - Parallel (4 profiles): should be faster per profile
 - Parallel (8 profiles): should show more speedup

EOF
date