From 680513b7771a29f27cbbb3ffb009a69a913de6f9 Mon Sep 17 00:00:00 2001
From: YurenHao0426
Date: Tue, 27 Jan 2026 12:15:45 -0600
Subject: local reward model

---
 scripts/test_local_reward_batch.sh | 71 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100755 scripts/test_local_reward_batch.sh

diff --git a/scripts/test_local_reward_batch.sh b/scripts/test_local_reward_batch.sh
new file mode 100755
index 0000000..675ab76
--- /dev/null
+++ b/scripts/test_local_reward_batch.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+#SBATCH --job-name=test_local_reward
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuA100x4
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=8
+#SBATCH --gres=gpu:nvidia_a100:1
+#SBATCH --mem=48G
+#SBATCH --time=0:30:00
+#SBATCH --output=test_local_reward_%j.out
+#SBATCH --error=test_local_reward_%j.err
+
+# Test LocalLLMRewardClient with vLLM server
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+
+export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
+export PYTHONPATH="${PWD}/src:${PYTHONPATH}"
+
+REWARD_MODEL="models/llama-3.1-8b-instruct"
+REWARD_PORT=8005
+
+echo "=== Local LLM Reward Model Batch Test ==="
+echo "Model: $REWARD_MODEL"
+echo "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)"
+echo ""
+
+# Start vLLM server for reward model
+echo "Starting vLLM server on port $REWARD_PORT..."
+python -m vllm.entrypoints.openai.api_server \
+    --model $REWARD_MODEL \
+    --port $REWARD_PORT \
+    --tensor-parallel-size 1 \
+    --dtype bfloat16 \
+    --max-model-len 4096 \
+    --gpu-memory-utilization 0.85 \
+    --disable-log-requests \
+    &
+VLLM_PID=$!
+
+# Wait for server to be ready (model loading can take 2-3 minutes)
+echo "Waiting for vLLM server to start..."
+for i in {1..180}; do
+    if curl -s http://localhost:$REWARD_PORT/health > /dev/null 2>&1; then
+        echo "vLLM server ready after ${i}s"
+        break
+    fi
+    sleep 1
+done
+
+# Check if server started
+if ! curl -s http://localhost:$REWARD_PORT/health > /dev/null 2>&1; then
+    echo "ERROR: vLLM server failed to start"
+    kill $VLLM_PID 2>/dev/null
+    exit 1
+fi
+
+echo ""
+echo "Running batch test..."
+python scripts/test_local_reward_batch.py \
+    --vllm-url http://localhost:$REWARD_PORT/v1 \
+    --batch-size 12
+
+echo ""
+echo "=== Test Complete ==="
+
+# Cleanup
+kill $VLLM_PID 2>/dev/null
--
cgit v1.2.3
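
Note: this patch adds only the sbatch wrapper; the scripts/test_local_reward_batch.py driver it invokes is not part of the diff. For orientation, below is a minimal, hypothetical sketch of what such a driver could look like against vLLM's OpenAI-compatible /v1 endpoint. The prompt text, the 1-10 scoring scheme, and the use of plain requests instead of the repo's LocalLLMRewardClient are assumptions, not the actual implementation; only the --vllm-url and --batch-size flags come from the patch.

#!/usr/bin/env python3
# Hypothetical sketch only: the real scripts/test_local_reward_batch.py is not
# included in this patch. Prompt, scoring scheme, and model name are assumed;
# --vllm-url and --batch-size mirror the flags passed by the sbatch script.
import argparse
import requests

def score_batch(vllm_url, batch_size):
    """POST a batch of toy scoring prompts to the vLLM OpenAI-compatible server."""
    scores = []
    for i in range(batch_size):
        resp = requests.post(
            f"{vllm_url}/chat/completions",
            json={
                # vLLM serves the model under the path given to --model.
                "model": "models/llama-3.1-8b-instruct",
                "messages": [
                    {"role": "system",
                     "content": "Rate the reply from 1 to 10. Answer with the number only."},
                    {"role": "user",
                     "content": f"Reply #{i}: The capital of France is Paris."},
                ],
                "max_tokens": 8,
                "temperature": 0.0,
            },
            timeout=120,
        )
        resp.raise_for_status()
        scores.append(resp.json()["choices"][0]["message"]["content"].strip())
    return scores

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--vllm-url", default="http://localhost:8005/v1")
    parser.add_argument("--batch-size", type=int, default=12)
    args = parser.parse_args()
    for i, score in enumerate(score_batch(args.vllm_url, args.batch_size)):
        print(f"sample {i}: score={score}")

The sequential loop keeps the sketch simple, but it would not exercise vLLM's continuous batching; a real batch test would more plausibly issue the 12 requests concurrently (e.g. with asyncio or a thread pool).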