#!/bin/bash
#SBATCH --job-name=test_local_user
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuA100x4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:nvidia_a100:4
#SBATCH --mem=200G
#SBATCH --time=1:00:00
#SBATCH --output=test_local_user_%j.out
#SBATCH --error=test_local_user_%j.err

# Test with a LOCAL vLLM user simulator (70B AWQ) instead of the OpenAI API.
# Expected: ~2000+ sessions/hr (vs ~60 when bottlenecked by OpenAI API latency).
#
# GPU layout:
#   GPU 0-1: 70B user simulator (AWQ INT4, TP=2)
#   GPU 2:   8B agent (~24 GB)
#   GPU 3:   available for an embedding/reranker model if needed

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="${PWD}/src:${PWD}/collaborativeagents:${PYTHONPATH}"
export NCCL_P2P_DISABLE=1

# Load API keys and other settings from .env into the environment.
set -a
source .env
set +a

# Models
MODEL_70B="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"

# Kill any existing vLLM servers
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 3

echo "=== Starting 70B User Simulator (GPU 0-1, TP=2) ==="
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_70B" \
    --port 8004 \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 \
    --max-model-len 4096 \
    --quantization awq \
    --dtype float16 \
    --disable-log-requests \
    --guided-decoding-backend outlines &

echo "=== Starting 8B Agent (GPU 2) ==="
CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_8B" \
    --port 8003 \
    --tensor-parallel-size 1 \
    --gpu-memory-utilization 0.50 \
    --max-model-len 8192 \
    --dtype bfloat16 \
    --disable-log-requests &

# Wait for both servers to report healthy (up to ~4 minutes each).
echo "Waiting for vLLM servers..."
for port in 8004 8003; do
    ready=0
    for i in $(seq 1 120); do
        if curl -s http://localhost:$port/health > /dev/null 2>&1; then
            ready=1
            break
        fi
        sleep 2
    done
    if [ "$ready" -eq 1 ]; then
        echo "  Port $port ready."
    else
        echo "  WARNING: port $port did not become healthy in time." >&2
    fi
done

cd collaborativeagents/scripts

echo ""
echo "=== Running Test: 10 profiles × 2 sessions with LOCAL user simulator ==="
python run_experiments.py \
    --methods vanilla \
    --datasets math-hard \
    --n-profiles 10 \
    --n-sessions 2 \
    --max-turns 8 \
    --use-vllm \
    --vllm-user-url http://localhost:8004/v1 \
    --vllm-agent-url http://localhost:8003/v1 \
    --reward-mode llm \
    --parallel-profiles 10 \
    --profile-path ../data/complex_profiles_v2/profiles_200.jsonl \
    --output-dir ../results/test_local_user_$(date +%Y%m%d_%H%M%S)

echo ""
echo "=== Test Complete ==="
pkill -f "vllm.entrypoints" 2>/dev/null || true
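
# Usage note (comments only, not executed as part of the job): a minimal
# sketch of how this job is typically submitted and monitored. The filename
# below is an assumption; substitute whatever name this script is saved under.
#
#   sbatch test_local_user.sbatch
#   tail -f test_local_user_<jobid>.out
#
# While the job is running, the two vLLM servers can be sanity-checked from
# the compute node via the standard OpenAI-compatible routes:
#
#   curl -s http://localhost:8004/v1/models   # 70B user simulator
#   curl -s http://localhost:8003/v1/models   # 8B agent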