| author | YurenHao0426 <blackhao0426@gmail.com> | 2026-02-10 20:16:36 +0000 |
|---|---|---|
| committer | YurenHao0426 <blackhao0426@gmail.com> | 2026-02-10 20:16:36 +0000 |
| commit | 5626080ca4c4219aec4888d6b9406d0d3349fb55 (patch) | |
| tree | 86287d9fd5833e11ccd78566992540f2664fd195 /scripts/start_vllm_servers.sh | |
| parent | a2036838807428424bbbaff507a6563749a83145 (diff) | |
Add RAG rewrite, 60-session experiment scripts, and analysis tools
- RAG rewrite adapter and vector preference pipeline in personalized_llm
- 60-session experiment queue scripts (reflection, rag, rag_vector, rag_rewrite)
- Vector-preference correlation analysis and visualization scripts
- Local reward model batch processing improvements
- Updated CLAUDE.md with full experiment documentation and notes
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Diffstat (limited to 'scripts/start_vllm_servers.sh')
| -rwxr-xr-x | scripts/start_vllm_servers.sh | 80 |
1 file changed, 80 insertions, 0 deletions
diff --git a/scripts/start_vllm_servers.sh b/scripts/start_vllm_servers.sh
new file mode 100755
index 0000000..44e211b
--- /dev/null
+++ b/scripts/start_vllm_servers.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+# Start vLLM servers for personalization experiments
+# GPU Layout (4x H200):
+#   GPU 0-1: 70B user simulator (TP=2)
+#   GPU 2:   8B agent
+#   GPU 3:   8B reward model
+
+set -e
+
+PROJECT_ROOT="/workspace/personalization-user-model"
+MODEL_8B="${PROJECT_ROOT}/models/llama-3.1-8b-instruct"
+MODEL_70B="${PROJECT_ROOT}/models/llama-3.1-70b-instruct"
+
+mkdir -p "${PROJECT_ROOT}/logs"
+
+# Kill any existing vLLM servers
+pkill -f "vllm.entrypoints" 2>/dev/null || true
+sleep 2
+
+echo "Starting vLLM servers..."
+
+# GPU 0-1: 70B User Simulator (TP=2)
+echo "Starting 70B user simulator on GPU 0-1 (port 8004)..."
+CUDA_VISIBLE_DEVICES=0,1 python3 -m vllm.entrypoints.openai.api_server \
+    --model "${MODEL_70B}" \
+    --port 8004 \
+    --tensor-parallel-size 2 \
+    --dtype bfloat16 \
+    --max-model-len 4096 \
+    --gpu-memory-utilization 0.90 \
+    --disable-log-requests \
+    > "${PROJECT_ROOT}/logs/vllm_user_70b.log" 2>&1 &
+USER_PID=$!
+echo "70B user simulator PID: $USER_PID"
+
+# GPU 2: 8B Agent
+echo "Starting 8B agent on GPU 2 (port 8003)..."
+CUDA_VISIBLE_DEVICES=2 python3 -m vllm.entrypoints.openai.api_server \
+    --model "${MODEL_8B}" \
+    --port 8003 \
+    --tensor-parallel-size 1 \
+    --dtype bfloat16 \
+    --max-model-len 8192 \
+    --gpu-memory-utilization 0.90 \
+    --disable-log-requests \
+    > "${PROJECT_ROOT}/logs/vllm_agent_8b.log" 2>&1 &
+AGENT_PID=$!
+echo "8B agent PID: $AGENT_PID"
+
+# GPU 3: 8B Reward Model
+echo "Starting 8B reward model on GPU 3 (port 8005)..."
+CUDA_VISIBLE_DEVICES=3 python3 -m vllm.entrypoints.openai.api_server \
+    --model "${MODEL_8B}" \
+    --port 8005 \
+    --tensor-parallel-size 1 \
+    --dtype bfloat16 \
+    --max-model-len 4096 \
+    --gpu-memory-utilization 0.50 \
+    --disable-log-requests \
+    > "${PROJECT_ROOT}/logs/vllm_reward_8b.log" 2>&1 &
+REWARD_PID=$!
+echo "8B reward model PID: $REWARD_PID"
+
+echo ""
+echo "Waiting for servers to initialize (60s)..."
+sleep 60
+
+# Health checks
+echo "Checking server health..."
+for port in 8003 8004 8005; do
+    if curl -s "http://localhost:${port}/health" > /dev/null 2>&1; then
+        echo "  Port ${port}: OK"
+    else
+        echo "  Port ${port}: WAITING..."
+    fi
+done
+
+echo ""
+echo "Server PIDs: User=$USER_PID, Agent=$AGENT_PID, Reward=$REWARD_PID"
+echo "Logs: ${PROJECT_ROOT}/logs/vllm_*.log"
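Note that the script sleeps a fixed 60 seconds before its one-shot health checks, so a slow 70B load can still report "WAITING...". A polling helper could replace the fixed sleep; this is a sketch under the same assumptions as the script (curl available, vLLM's `/health` endpoint), and the `wait_for_server` function name is my own, not part of the committed script:

```shell
#!/bin/bash
# Sketch: poll a server's /health endpoint until it responds (HTTP 2xx),
# instead of sleeping a fixed 60 seconds. Returns 1 on timeout.
wait_for_server() {
  local url="$1" timeout="${2:-180}" elapsed=0
  # curl -sf: silent, and fail (nonzero exit) on HTTP errors
  until curl -sf "$url" > /dev/null 2>&1; do
    if [ "$elapsed" -ge "$timeout" ]; then
      echo "Timed out after ${timeout}s waiting for ${url}" >&2
      return 1
    fi
    sleep 2
    elapsed=$((elapsed + 2))
  done
  echo "${url} is up"
}

# Possible usage with the ports from the script above:
# for port in 8003 8004 8005; do
#   wait_for_server "http://localhost:${port}/health" 600 || exit 1
# done
```

The 70B tensor-parallel load is the slow path, so a generous per-server timeout (several minutes) is safer than the fixed sleep; the loop exits as soon as each endpoint actually answers.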
