#!/bin/bash
#SBATCH --job-name=test_local_reward
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuA100x4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --gres=gpu:nvidia_a100:1
#SBATCH --mem=48G
#SBATCH --time=0:30:00
#SBATCH --output=test_local_reward_%j.out
#SBATCH --error=test_local_reward_%j.err

# Test LocalLLMRewardClient against a locally launched vLLM OpenAI-compatible
# server. Starts the server in the background, waits for its /health endpoint,
# runs the batch test script, then shuts the server down.
#
# Fail fast: abort on any unhandled error, unset variable, or pipeline failure
# so a broken test run exits non-zero instead of silently "completing".
set -euo pipefail

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
# ${PYTHONPATH:-} guards against 'set -u' when PYTHONPATH is not inherited.
export PYTHONPATH="${PWD}/src:${PYTHONPATH:-}"

readonly REWARD_MODEL="models/llama-3.1-8b-instruct"
readonly REWARD_PORT=8005

echo "=== Local LLM Reward Model Batch Test ==="
echo "Model: $REWARD_MODEL"
echo "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)"
echo ""

# Start vLLM server for reward model in the background.
echo "Starting vLLM server on port $REWARD_PORT..."
python -m vllm.entrypoints.openai.api_server \
    --model "$REWARD_MODEL" \
    --port "$REWARD_PORT" \
    --tensor-parallel-size 1 \
    --dtype bfloat16 \
    --max-model-len 4096 \
    --gpu-memory-utilization 0.85 \
    --disable-log-requests \
    &
VLLM_PID=$!

# Always stop the server on exit — including when the test script fails or the
# job is cancelled — so the GPU process does not leak past this job step.
cleanup() {
    kill "$VLLM_PID" 2>/dev/null || true
}
trap cleanup EXIT

# Wait for server to be ready (model loading can take 2-3 minutes).
# Polls /health up to 180 times at ~1s intervals (elapsed time is approximate
# since each curl attempt adds latency on top of the 1s sleep).
echo "Waiting for vLLM server to start..."
for i in {1..180}; do
    if curl -s "http://localhost:$REWARD_PORT/health" > /dev/null 2>&1; then
        echo "vLLM server ready after ${i}s"
        break
    fi
    sleep 1
done

# Final check: fail the job if the server never came up (trap kills the PID).
if ! curl -s "http://localhost:$REWARD_PORT/health" > /dev/null 2>&1; then
    echo "ERROR: vLLM server failed to start" >&2
    exit 1
fi

echo ""
echo "Running batch test..."
python scripts/test_local_reward_batch.py \
    --vllm-url "http://localhost:$REWARD_PORT/v1" \
    --batch-size 12

echo ""
echo "=== Test Complete ==="