From dc801c07cf38b0c495686463e6ca6f871a64440e Mon Sep 17 00:00:00 2001
From: YurenHao0426
Date: Tue, 27 Jan 2026 09:57:37 -0600
Subject: Add collaborativeagents module and update gitignore

- Add collaborativeagents subproject with adapters, agents, and evaluation modules
- Update .gitignore to exclude large binary files (.whl, .tar), wandb logs, and results

Co-Authored-By: Claude Opus 4.5
---
 collaborativeagents/slurm/start_model_servers.sh | 82 ++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 collaborativeagents/slurm/start_model_servers.sh

(limited to 'collaborativeagents/slurm/start_model_servers.sh')

diff --git a/collaborativeagents/slurm/start_model_servers.sh b/collaborativeagents/slurm/start_model_servers.sh
new file mode 100644
index 0000000..1f4e177
--- /dev/null
+++ b/collaborativeagents/slurm/start_model_servers.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+#SBATCH --job-name=model_servers
+#SBATCH --account=bfqt-delta-gpu
+#SBATCH --partition=gpuH200x8
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=32
+#SBATCH --gres=gpu:h200:8
+#SBATCH --mem=400G
+#SBATCH --time=24:00:00
+#SBATCH --output=logs/model_servers_%j.out
+#SBATCH --error=logs/model_servers_%j.err
+
+# Start vLLM/sglang model servers for experiments
+# - Port 8004: Llama-3.3-70B-Instruct (user simulator + judge) - 4 GPUs
+# - Port 8003: Llama-3.1-8B-Instruct (agent) - 2 GPUs
+
+set -eo pipefail
+
+cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model
+
+# Create logs directory
+mkdir -p collaborativeagents/slurm/logs
+
+echo "Starting model servers at $(date)"
+echo "Job ID: $SLURM_JOB_ID"
+echo "Node: $SLURMD_NODENAME"
+echo "GPUs: $CUDA_VISIBLE_DEVICES"
+
+# Activate environment with sglang
+source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
+conda activate eval
+
+# Check GPU availability
+nvidia-smi
+
+# Start 70B model server (user simulator + judge) - needs 4 GPUs for TP
+echo "Starting Llama-3.3-70B-Instruct server on port 8004..."
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m sglang.launch_server \
+  --model-path meta-llama/Llama-3.3-70B-Instruct \
+  --port 8004 \
+  --tp-size 4 \
+  --context-length 16384 \
+  --mem-fraction-static 0.85 \
+  2>&1 | tee logs/server_70b_$SLURM_JOB_ID.log &
+SERVER_70B_PID=$!  # NB: $! after a backgrounded pipeline is the tee PID, not python's
+
+# Wait for 70B server to come up: poll the OpenAI-compatible endpoint (up to 15 min)
+echo "Waiting for 70B server to initialize..."
+timeout 900 bash -c 'until curl -sf http://localhost:8004/v1/models >/dev/null; do sleep 10; done'
+
+# Start 8B model server (agent) - needs 2 GPUs
+echo "Starting Llama-3.1-8B-Instruct server on port 8003..."
+CUDA_VISIBLE_DEVICES=4,5 python -m sglang.launch_server \
+  --model-path models/llama-3.1-8b-instruct \
+  --served-model-name meta-llama/Llama-3.1-8B-Instruct \
+  --port 8003 \
+  --tp-size 2 \
+  --context-length 16384 \
+  --mem-fraction-static 0.85 \
+  2>&1 | tee logs/server_8b_$SLURM_JOB_ID.log &
+SERVER_8B_PID=$!
+
+echo "Servers starting..."
+echo "70B server PID: $SERVER_70B_PID"
+echo "8B server PID: $SERVER_8B_PID"
+
+# Save server info for experiment runner
+cat > collaborativeagents/slurm/server_info.txt << EOF
+NODE=$SLURMD_NODENAME
+JOB_ID=$SLURM_JOB_ID
+SERVER_70B_PID=$SERVER_70B_PID
+SERVER_8B_PID=$SERVER_8B_PID
+USER_API_BASE=http://$SLURMD_NODENAME:8004/v1
+AGENT_API_BASE=http://$SLURMD_NODENAME:8003/v1
+JUDGE_API_BASE=http://$SLURMD_NODENAME:8004/v1
+EOF
+
+echo "Server info saved to collaborativeagents/slurm/server_info.txt"
+
+# Wait for both servers; their exit status becomes this job's exit status
+wait "$SERVER_70B_PID" "$SERVER_8B_PID"
-- 
cgit v1.2.3