diff options
Diffstat (limited to 'collaborativeagents/slurm/fullscale/test_local_user.sh')
| -rw-r--r-- | collaborativeagents/slurm/fullscale/test_local_user.sh | 94 |
1 file changed, 94 insertions, 0 deletions
#!/bin/bash
#SBATCH --job-name=test_local_user
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuA100x4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:nvidia_a100:4
#SBATCH --mem=200G
#SBATCH --time=1:00:00
#SBATCH --output=test_local_user_%j.out
#SBATCH --error=test_local_user_%j.err

# Test with LOCAL vLLM user simulator (70B AWQ) instead of OpenAI
# Expected: ~2000+ sessions/hr (vs ~60 with OpenAI API latency)
#
# GPU Layout:
#   GPU 0-1: 70B user simulator (AWQ INT4, TP=2)
#   GPU 2:   8B agent (~24GB)
#   GPU 3:   Available for embedding/reranker if needed

# NOTE: -u is deliberately omitted — conda's activation hooks reference
# unset variables and would abort under nounset.
set -eo pipefail

die() { printf 'ERROR: %s\n' "$*" >&2; exit 1; }

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model \
  || die "project directory not found"
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}/src:${PWD}/collaborativeagents:${PYTHONPATH:-}"
export NCCL_P2P_DISABLE=1

# Export everything defined in .env (API keys etc.) into the environment.
set -a
source .env
set +a

# Models
MODEL_70B="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"

# Kill any existing vLLM servers, and make sure the ones we launch below are
# torn down on EVERY exit path (the original only pkill'd on the success path,
# leaving orphaned servers on the allocation if the experiment crashed).
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 3
trap 'pkill -f "vllm.entrypoints" 2>/dev/null || true' EXIT

echo "=== Starting 70B User Simulator (GPU 0-1, TP=2) ==="
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_70B" \
  --port 8004 \
  --tensor-parallel-size 2 \
  --gpu-memory-utilization 0.90 \
  --max-model-len 4096 \
  --quantization awq \
  --dtype float16 \
  --disable-log-requests \
  --guided-decoding-backend outlines &

echo "=== Starting 8B Agent (GPU 2) ==="
CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_8B" \
  --port 8003 \
  --tensor-parallel-size 1 \
  --gpu-memory-utilization 0.50 \
  --max-model-len 8192 \
  --dtype bfloat16 \
  --disable-log-requests &

# Wait for both servers. Fail hard on timeout instead of printing "ready"
# unconditionally (the original loop reported success even after 120 failed
# health checks and then ran the experiment against dead servers).
echo "Waiting for vLLM servers..."
for port in 8004 8003; do
  ready=0
  for _ in $(seq 1 120); do
    if curl -s "http://localhost:${port}/health" > /dev/null 2>&1; then
      ready=1
      break
    fi
    sleep 2
  done
  (( ready )) || die "vLLM server on port ${port} failed to become healthy"
  echo "  Port ${port} ready."
done

cd collaborativeagents/scripts || die "scripts directory not found"

echo ""
echo "=== Running Test: 10 profiles × 2 sessions with LOCAL user simulator ==="
python run_experiments.py \
  --methods vanilla \
  --datasets math-hard \
  --n-profiles 10 \
  --n-sessions 2 \
  --max-turns 8 \
  --use-vllm \
  --vllm-user-url http://localhost:8004/v1 \
  --vllm-agent-url http://localhost:8003/v1 \
  --reward-mode llm \
  --parallel-profiles 10 \
  --profile-path ../data/complex_profiles_v2/profiles_200.jsonl \
  --output-dir "../results/test_local_user_$(date +%Y%m%d_%H%M%S)"

echo ""
echo "=== Test Complete ==="
# Server teardown is handled by the EXIT trap above.
