collaborativeagents/slurm/test_vllm_70b_8b.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167

#!/bin/bash
#SBATCH --job-name=vllm_bench
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuA100x4
#SBATCH --nodes=1
#SBATCH --gpus-per-node=2
#SBATCH --time=02:00:00
#SBATCH --mem=128G
#SBATCH --output=slurm/logs/vllm_bench_70b_8b_%j.out
#SBATCH --error=slurm/logs/vllm_bench_70b_8b_%j.err

# Purpose: realistic throughput benchmark that pairs a 70B AWQ user simulator
# with an 8B agent and measures conversation throughput with both models.
#
# Abort on the first command that fails.
set -e

# Run from the project checkout and switch to the "eval" conda environment.
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

# Hugging Face cache root; exported so every child process inherits it.
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface

# Record which job and node produced this log, followed by a timestamp.
cat <<EOF
=== Job Info ===
Job ID: $SLURM_JOB_ID
Node: $SLURM_NODELIST
EOF
date

# List the GPUs visible to the job and their total/free memory as CSV.
printf '\n%s\n' "=== GPU Info ==="
nvidia-smi --query-gpu=index,name,memory.total,memory.free --format=csv

# Download AWQ 70B model if not complete
echo ""
echo "=== Ensuring AWQ 70B Model is Downloaded ==="
# The quoted heredoc delimiter stops the shell from expanding anything inside
# the Python source.
python - <<'PY'
import os

# HF_HOME has to be in the environment before huggingface_hub is imported:
# the library resolves its cache paths at import time, so assigning the
# variable after the import has no effect.
os.environ['HF_HOME'] = '/projects/bfqt/users/yurenh2/hf_cache/huggingface'

from huggingface_hub import snapshot_download

MODEL_ID = 'hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4'

print(f'Checking/downloading {MODEL_ID}...')
path = snapshot_download(MODEL_ID)
print(f'Model ready at: {path}')
PY

MODEL_70B_AWQ="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
PORT_70B=8004
PORT_8B=8003

# Terminate whichever vLLM servers have been started so far.
# Installed as an EXIT trap so the background servers are also stopped when a
# later command fails under `set -e`, rather than being left running.
# Killing a PID that has already exited is harmless here: the error is
# silenced and ignored.
stop_vllm_servers() {
    local pid
    for pid in "${SERVER_70B_PID:-}" "${SERVER_8B_PID:-}"; do
        if [[ -n "$pid" ]]; then
            kill "$pid" 2>/dev/null || true
        fi
    done
}
trap stop_vllm_servers EXIT

echo ""
echo "============================================"
echo "Starting 70B AWQ vLLM Server (GPU 0)"
echo "============================================"
# NOTE(review): confirm the 70B AWQ weights plus KV cache fit on a single GPU
# of this partition at 0.90 memory utilization.
CUDA_VISIBLE_DEVICES=0 python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_70B_AWQ" \
    --port "$PORT_70B" \
    --gpu-memory-utilization 0.90 \
    --max-model-len 4096 \
    --disable-log-requests \
    --quantization awq \
    --dtype float16 &
SERVER_70B_PID=$!
echo "70B Server PID: $SERVER_70B_PID"

echo ""
echo "============================================"
echo "Starting 8B vLLM Server (GPU 1)"
echo "============================================"
CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_8B" \
    --port "$PORT_8B" \
    --gpu-memory-utilization 0.90 \
    --max-model-len 4096 \
    --disable-log-requests \
    --dtype bfloat16 &
SERVER_8B_PID=$!
echo "8B Server PID: $SERVER_8B_PID"

echo ""
echo "Waiting for servers to start..."

#######################################
# Poll a server's /health endpoint until it answers successfully.
# Arguments:
#   $1 - label used in log messages (e.g. "70B")
#   $2 - TCP port the server listens on (localhost)
#   $3 - PID of the server process
#   $4 - maximum number of polls
#   $5 - seconds to sleep between polls
# Outputs:
#   Progress and readiness messages on stdout; early-exit error on stderr.
# Returns:
#   0 once the endpoint is healthy; non-zero if the process exits first or
#   the endpoint is still not healthy after the last poll.
#######################################
wait_for_server() {
    local label=$1 port=$2 pid=$3 max_polls=$4 interval=$5
    local url="http://localhost:${port}/health"
    local started=$SECONDS
    local attempt

    for ((attempt = 1; attempt <= max_polls; attempt++)); do
        # -f makes curl fail on HTTP error statuses, so only a successful
        # response counts as healthy.
        if curl -sf "$url" > /dev/null 2>&1; then
            # SECONDS gives real elapsed time, including time spent in curl.
            echo "${label} Server ready after $((SECONDS - started)) seconds"
            return 0
        fi
        # Stop polling right away if the server process is gone; waiting out
        # the full timeout cannot succeed.
        if ! kill -0 "$pid" 2>/dev/null; then
            echo "ERROR: ${label} server process (PID ${pid}) exited during startup" >&2
            return 1
        fi
        if ((attempt % 20 == 0)); then
            echo "  Waiting for ${label}... ($((SECONDS - started)) seconds)"
        fi
        sleep "$interval"
    done

    # One last probe after the final sleep.
    curl -sf "$url" > /dev/null 2>&1
}

# Report a startup failure for the server labelled $1, stop both servers and
# exit with status 1.
abort_startup() {
    echo "ERROR: $1 server failed to start"
    # `|| true`: kill fails when a server has already exited, and under
    # `set -e` that would pre-empt the explicit exit below.
    kill "$SERVER_70B_PID" "$SERVER_8B_PID" 2>/dev/null || true
    exit 1
}

# Wait for 70B (may take 3-5 minutes)
wait_for_server "70B" "$PORT_70B" "$SERVER_70B_PID" 120 3 || abort_startup "70B"

# Wait for 8B
wait_for_server "8B" "$PORT_8B" "$SERVER_8B_PID" 60 2 || abort_startup "8B"

# Both servers answered their health checks
echo ""
echo "✓ 70B server healthy"
echo "✓ 8B server healthy"

# OpenAI-compatible base URLs of the two local servers.
api_70b="http://localhost:$PORT_70B/v1"
api_8b="http://localhost:$PORT_8B/v1"

# Show the first ten lines of each server's pretty-printed model listing.
printf '%s\n' "" "=== vLLM Server Info ===" "70B model:"
curl -s "$api_70b/models" | python -m json.tool 2>/dev/null | head -10
printf '%s\n' "" "8B model:"
curl -s "$api_8b/models" | python -m json.tool 2>/dev/null | head -10

printf '%s\n' \
    "" \
    "============================================" \
    "Test 1: Individual Model Throughput" \
    "============================================"

# Sequential requests against the 70B server alone.
printf '%s\n' "" "--- 70B AWQ Sequential (10 requests) ---"
python scripts/benchmark_inference.py --mode vllm --url "$api_70b" -n 10

# Sequential requests against the 8B server alone.
printf '%s\n' "" "--- 8B Sequential (20 requests) ---"
python scripts/benchmark_inference.py --mode vllm --url "$api_8b" -n 20

printf '%s\n' \
    "" \
    "============================================" \
    "Test 2: Full Conversation Benchmark" \
    "============================================" \
    "Running 10 conversations with 70B user simulator + 8B agent..."

# Conversation mode is given both server URLs.
python scripts/benchmark_inference.py \
    --mode conversation \
    --url-70b "$api_70b" \
    --url-8b "$api_8b" \
    -n 10

# Cleanup
echo ""
echo "Cleaning up..."
# Both commands are guarded with `|| true` because the script runs under
# `set -e`:
#   - `kill` fails if a server has already exited;
#   - `wait` returns the exit status of the last process waited for, which is
#     typically non-zero for a server that was just terminated by a signal.
# Without the guards either one would abort the script here, skipping the
# summary below and marking a successful benchmark job as failed.
kill "$SERVER_70B_PID" "$SERVER_8B_PID" 2>/dev/null || true
wait "$SERVER_70B_PID" "$SERVER_8B_PID" 2>/dev/null || true

# Final summary with reference throughput figures for comparison.
echo ""
echo "============================================"
echo "BENCHMARK COMPLETE!"
echo "============================================"
echo ""
echo "Key metrics to compare with paper:"
echo "  - Paper: 2000 conversations/hour on H100x8"
echo "  - Expected A100x2 with 70B AWQ + 8B: ~100-300 conv/hr"
echo "  - Our old code: ~20 conv/hr"
echo ""
echo "If throughput is good, update experiment code to use vLLM."
echo ""
date