collaborativeagents/scripts/test_batch_vs_parallel.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151

#!/bin/bash
# Compare batch processing vs parallel profile processing on A100x4
#
# Expected result: Batch should be significantly faster because:
# - Turn-synchronous: ALL conversations processed at same turn together
# - Maximizes vLLM continuous batching
# - Fewer total HTTP requests

# Stop at the first failing command; every later step assumes the previous
# one succeeded.
set -e

# Work from the collaborativeagents checkout: later steps use paths relative
# to it (agents/..., scripts/...).
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
# Put the checkout and ../src on the module search path. The caller's
# PYTHONPATH is appended only when it is non-empty: an unconditional
# ":${PYTHONPATH}" leaves a trailing ':' when the variable is unset, and that
# empty entry puts the current working directory on the search path.
export PYTHONPATH="${PWD}:${PWD}/../src${PYTHONPATH:+:${PYTHONPATH}}"

# Model served by both vLLM servers, and the ports they listen on
# (PORT_USER: user simulator, PORT_AGENT: agent).
MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PORT_USER=8004
PORT_AGENT=8003

# Record when and where the comparison ran, and which GPUs were visible.
echo "============================================"
echo "Batch vs Parallel Processing Comparison"
echo "============================================"
date
echo "Node: $(hostname)"
nvidia-smi --query-gpu=index,name,memory.total --format=csv
echo ""

# Kill any existing vLLM servers (leftovers from an earlier run would
# presumably still hold the ports/GPUs used below). pkill exits non-zero when
# nothing matched, which is expected, hence the `|| true`.
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

# Stop both servers started by this script. Safe to call repeatedly and
# before either PID has been recorded.
cleanup_servers() {
    local pid
    for pid in "${SERVER_USER_PID:-}" "${SERVER_AGENT_PID:-}"; do
        if [ -n "$pid" ]; then
            kill "$pid" 2>/dev/null || true
        fi
    done
}
# Without this, any failing command later in the script (which aborts it when
# `set -e` is active) would leave both servers running on the GPUs.
trap cleanup_servers EXIT

# Launch an OpenAI-compatible vLLM server for $MODEL_8B in the background.
# Arguments: $1 - value for CUDA_VISIBLE_DEVICES (e.g. "0,1")
#            $2 - port to listen on
# The caller reads the server's PID from $! immediately after the call.
start_vllm_server() {
    CUDA_VISIBLE_DEVICES="$1" python -m vllm.entrypoints.openai.api_server \
        --model "$MODEL_8B" \
        --port "$2" \
        --tensor-parallel-size 2 \
        --gpu-memory-utilization 0.85 \
        --max-model-len 4096 \
        --disable-log-requests \
        --dtype bfloat16 &
}

# Start user simulator server (8B) on GPU 0-1
echo "Starting 8B user simulator server (GPU 0-1, TP=2)..."
start_vllm_server 0,1 "$PORT_USER"
SERVER_USER_PID=$!

# Start agent server (8B) on GPU 2-3
echo "Starting 8B agent server (GPU 2-3, TP=2)..."
start_vllm_server 2,3 "$PORT_AGENT"
SERVER_AGENT_PID=$!

echo "Waiting for servers (up to 5 min)..."

# Succeed only when the server on port $1 answers /health with a non-error
# HTTP status within 5 seconds.
#   -f          : make HTTP 4xx/5xx responses count as failure (plain `curl -s`
#                 exits 0 for any HTTP response)
#   --max-time  : keep a hung connection from stalling the poll loop
server_healthy() {
    curl -sf --max-time 5 -o /dev/null "http://localhost:$1/health"
}

# Poll both servers: 100 attempts, 3 seconds apart.
WAIT_START=$SECONDS
READY_USER=0
READY_AGENT=0
for ((i = 1; i <= 100; i++)); do
    if server_healthy "$PORT_USER"; then READY_USER=1; else READY_USER=0; fi
    if server_healthy "$PORT_AGENT"; then READY_AGENT=1; else READY_AGENT=0; fi

    if [ "$READY_USER" = "1" ] && [ "$READY_AGENT" = "1" ]; then
        echo "Both servers ready after $((SECONDS - WAIT_START)) seconds"
        break
    fi

    # A server process that has already exited will never become healthy;
    # stop polling and let the checks below report which one failed.
    if ! kill -0 "$SERVER_USER_PID" 2>/dev/null ||
        ! kill -0 "$SERVER_AGENT_PID" 2>/dev/null; then
        echo "  A server process exited during startup; giving up early"
        break
    fi

    if [ $((i % 20)) -eq 0 ]; then
        echo "  Still waiting... user=$READY_USER, agent=$READY_AGENT ($((SECONDS - WAIT_START))s)"
    fi
    sleep 3
done

# Final check: covers both a timed-out wait and an early exit from the loop.
# On failure, stop whatever is still running before aborting.
if ! server_healthy "$PORT_USER"; then
    echo "ERROR: User server not healthy"
    kill "$SERVER_USER_PID" "$SERVER_AGENT_PID" 2>/dev/null || true
    exit 1
fi
if ! server_healthy "$PORT_AGENT"; then
    echo "ERROR: Agent server not healthy"
    kill "$SERVER_USER_PID" "$SERVER_AGENT_PID" 2>/dev/null || true
    exit 1
fi
echo "✓ Both servers healthy"

# ---- Test 1: wall-clock time of the batch runner ----
# The quoted heredoc prints the banner verbatim, including the blank line
# before and after it.
cat <<'EOF'

============================================
Test 1: NEW Batch Processing (20 samples)
============================================
This batches ALL user requests together, then ALL agent requests.

EOF

# Positional arguments: user-simulator endpoint, agent endpoint, and 20
# (the sample count announced in the banner).
batch_t0=$(date +%s)
python agents/batch_vllm_agent.py \
    "http://localhost:${PORT_USER}/v1" \
    "http://localhost:${PORT_AGENT}/v1" \
    20
batch_t1=$(date +%s)

# Whole seconds elapsed; read again by the results summary at the end.
ELAPSED_BATCH=$((batch_t1 - batch_t0))
printf '\nBatch processing time: %s seconds\n' "${ELAPSED_BATCH}"

echo ""
echo "============================================"
echo "Test 2: OLD Parallel Profile Processing (20 samples)"
echo "============================================"
echo "This runs 20 profiles in parallel, but each makes separate requests."
echo ""

# Run from scripts/ inside a subshell so the directory change and pipefail
# stay local to this step. pipefail matters because the output is piped
# through `tail -30`: without it the pipeline reports tail's exit status, so a
# failed run would be timed and compared as if it had succeeded.
PARALLEL_STATUS=0
START=$(date +%s)
(
    set -o pipefail
    cd scripts &&
        python run_experiments.py \
            --methods vanilla \
            --datasets mmlu \
            --n-profiles 20 \
            --n-sessions 1 \
            --use-vllm \
            --vllm-user-url "http://localhost:$PORT_USER/v1" \
            --vllm-agent-url "http://localhost:$PORT_AGENT/v1" \
            --parallel-profiles 20 \
            --output-dir ../results/batch_compare_parallel \
            --profile-path ../data/complex_profiles_v2/profiles_100.jsonl 2>&1 | tail -30
) || PARALLEL_STATUS=$?
END=$(date +%s)

# A timing from a failed run is meaningless; stop the servers and abort
# instead of reporting a bogus speedup.
if [ "$PARALLEL_STATUS" -ne 0 ]; then
    echo "ERROR: parallel profile run failed (exit status $PARALLEL_STATUS)"
    kill "$SERVER_USER_PID" "$SERVER_AGENT_PID" 2>/dev/null || true
    exit 1
fi

ELAPSED_PARALLEL=$((END-START))
echo ""
echo "Parallel profile processing time: ${ELAPSED_PARALLEL} seconds"

# Print parallel/batch as a decimal with two fractional digits (truncated,
# like `bc` with scale=2), using only shell integer arithmetic so the summary
# does not depend on an external calculator being installed.
# Arguments: $1 - parallel elapsed seconds, $2 - batch elapsed seconds (> 0;
#            the caller guards against zero)
# Outputs:   e.g. "2.50" or "0.50" on stdout
format_speedup() {
    local parallel=$1 batch=$2 scaled
    scaled=$((parallel * 100 / batch))
    printf '%d.%02d\n' "$((scaled / 100))" "$((scaled % 100))"
}

# Cleanup: stop both vLLM servers. Failures are ignored because a server may
# already have exited.
echo ""
echo "Cleaning up..."
for pid in "${SERVER_USER_PID:-}" "${SERVER_AGENT_PID:-}"; do
    if [ -n "$pid" ]; then
        kill "$pid" 2>/dev/null || true
    fi
done

echo ""
echo "============================================"
echo "RESULTS COMPARISON"
echo "============================================"
echo ""
echo "NEW Batch processing (20 conv):    ${ELAPSED_BATCH}s"
echo "OLD Parallel profiles (20 conv):   ${ELAPSED_PARALLEL}s"
echo ""
# Timings are whole seconds, so a sub-second batch run shows up as 0 and the
# ratio is undefined.
if [ "${ELAPSED_BATCH:-0}" -gt 0 ]; then
    SPEEDUP=$(format_speedup "${ELAPSED_PARALLEL:-0}" "${ELAPSED_BATCH}")
    echo "Speedup with batch processing: ${SPEEDUP}x"
else
    echo "Speedup with batch processing: n/a (batch time is 0s)"
fi
echo ""
echo "Expected: Batch should be 5-10x faster due to:"
echo "  - Turn-synchronous processing (all convs at same turn batched)"
echo "  - Fewer HTTP request overhead"
echo "  - Better vLLM continuous batching utilization"
echo ""
date