#!/bin/bash
#SBATCH --job-name=refl_v2
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:h200:4
#SBATCH --mem=200G
#SBATCH --time=4:00:00
#SBATCH --output=refl_v2_%j.out
#SBATCH --error=refl_v2_%j.err

# Reflection experiment v2 - with proper_scaffolding enabled (LLM-based retrieval)
# Uses original CollaborativeAgents prompts for fair reproduction
# H200 node, 5 profiles, 15 sessions

# Abort on the first failing command so that a broken environment (missing
# project directory, failed conda activation) stops the job immediately.
set -e

# Work from the project root: the PYTHONPATH entries below and the relative
# paths used later in this script are resolved from here.
cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model
mkdir -p collaborativeagents/slurm/logs collaborativeagents/results

# Load conda's shell integration, then switch to the "eval" environment.
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
# Prepend the project sources to the module search path. The ${VAR:+...} form
# appends the previous PYTHONPATH only when it is non-empty; a bare trailing
# ":" would add an empty entry, which Python treats as the current directory.
export PYTHONPATH="${PWD}/src:${PWD}/collaborativeagents${PYTHONPATH:+:${PYTHONPATH}}"
# NCCL setting: turns off the peer-to-peer GPU transport (see NCCL docs).
export NCCL_P2P_DISABLE=1

# Local ports: one OpenAI-compatible vLLM endpoint per model.
PORT_AGENT=8003
PORT_USER=8004

# Model locations. The 8B agent is a local checkpoint directory; the 70B user
# simulator is given as a model identifier (looks like a Hugging Face repo id).
MODEL_8B="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
MODEL_70B="meta-llama/Llama-3.1-70B-Instruct"

# Startup banner: one line per argument, followed by a timestamp.
printf '%s\n' \
    "=== Starting vLLM servers ===" \
    "Method: reflection (with proper_scaffolding)" \
    "User simulator: ${MODEL_70B} (70B full-precision)" \
    "Agent: ${MODEL_8B} (8B)"
date

# Stop whichever vLLM servers this script has started so far.
# Installed as an EXIT trap so the servers are torn down on every exit path,
# including `set -e` aborting the script when a later command fails.
# Globals: SERVER_USER_PID, SERVER_AGENT_PID (read; may be unset or empty)
stop_vllm_servers() {
    local pid
    for pid in "${SERVER_USER_PID:-}" "${SERVER_AGENT_PID:-}"; do
        if [[ -n "$pid" ]]; then
            # The process may already be gone; that is not an error here.
            kill "$pid" 2>/dev/null || true
        fi
    done
}
trap stop_vllm_servers EXIT

# Kill any existing vLLM servers (pkill returns non-zero when nothing matched,
# hence the `|| true` under `set -e`), then give them a moment to exit.
pkill -f "vllm.entrypoints" 2>/dev/null || true
sleep 2

# Start 70B user simulator on GPU 0-1 (TP=2)
echo "Starting 70B user simulator on GPU 0-1..."
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_70B" \
    --port "$PORT_USER" \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.95 \
    --max-model-len 16384 \
    --download-dir "$HF_HOME" \
    --dtype bfloat16 \
    --disable-log-requests &
SERVER_USER_PID=$!

# Start 8B agent on GPU 2-3 (TP=2)
echo "Starting 8B agent on GPU 2-3..."
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_8B" \
    --port "$PORT_AGENT" \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.90 \
    --max-model-len 16384 \
    --dtype bfloat16 \
    --disable-log-requests &
SERVER_AGENT_PID=$!

echo "Waiting for vLLM servers to be ready..."

# Succeed only when the server on local port $1 answers /health with a
# success status.
#   -f          make curl exit non-zero on HTTP error responses (>= 400);
#               without it any HTTP response counts as success
#   --max-time  bound a single probe so a hung connection cannot stall the loop
# Arguments: $1 - port number
vllm_healthy() {
    curl -sf --max-time 5 -o /dev/null "http://localhost:$1/health" 2>/dev/null
}

# Print an error, stop both servers and abort the job.
# Arguments: $1 - message printed after "ERROR: "
abort_startup() {
    echo "ERROR: $1"
    # `|| true`: a failing kill must not let `set -e` pre-empt the exit below.
    kill "$SERVER_USER_PID" "$SERVER_AGENT_PID" 2>/dev/null || true
    exit 1
}

# Poll both servers: up to 120 attempts with a 3 second pause between them.
# SECONDS is bash's built-in elapsed-time counter, used to report the real
# waiting time rather than an estimate derived from the attempt number.
WAIT_START=$SECONDS
for i in $(seq 1 120); do
    READY_USER=0
    READY_AGENT=0
    if vllm_healthy "$PORT_USER"; then READY_USER=1; fi
    if vllm_healthy "$PORT_AGENT"; then READY_AGENT=1; fi

    if [[ "$READY_USER" = "1" && "$READY_AGENT" = "1" ]]; then
        echo "Both servers ready after $((SECONDS - WAIT_START)) seconds"
        break
    fi

    # A server process that has already exited will never become healthy;
    # fail right away instead of polling until the timeout.
    if ! kill -0 "$SERVER_USER_PID" 2>/dev/null; then
        abort_startup "User server process exited during startup"
    fi
    if ! kill -0 "$SERVER_AGENT_PID" 2>/dev/null; then
        abort_startup "Agent server process exited during startup"
    fi

    if (( i % 20 == 0 )); then
        echo "  Still waiting... user=$READY_USER, agent=$READY_AGENT ($((SECONDS - WAIT_START))s)"
    fi
    sleep 3
done

# Verify health (also catches the case where the loop above timed out).
if ! vllm_healthy "$PORT_USER"; then
    abort_startup "User server not healthy"
fi
if ! vllm_healthy "$PORT_AGENT"; then
    abort_startup "Agent server not healthy"
fi
echo "Both vLLM servers healthy!"

# Experiment size, matching the configuration documented in this script's
# header (5 profiles, 15 sessions). Kept in variables so the banner and the
# actual command line cannot disagree.
N_PROFILES=5
N_SESSIONS=15

echo ""
echo "=== Running reflection experiment with proper_scaffolding ==="
echo "Settings: ${N_PROFILES} profiles, ${N_SESSIONS} sessions, math-hard dataset"
date

# run_experiments.py is invoked from its own directory; the ../data and
# ../results paths below are relative to it.
cd collaborativeagents/scripts

# Timestamped output directory so repeated runs do not overwrite each other.
OUTPUT_DIR="../results/reflection_v2_$(date +%Y%m%d_%H%M%S)"

# Run reflection against the two local vLLM endpoints.
python run_experiments.py \
    --methods reflection \
    --datasets math-hard \
    --n-profiles "$N_PROFILES" \
    --n-sessions "$N_SESSIONS" \
    --use-vllm \
    --vllm-user-url "http://localhost:$PORT_USER/v1" \
    --vllm-agent-url "http://localhost:$PORT_AGENT/v1" \
    --parallel-profiles 5 \
    --profile-path ../data/complex_profiles_v2/profiles_200.jsonl \
    --output-dir "$OUTPUT_DIR"

echo ""
echo "=== Experiment completed ==="
date

# Cleanup: stop both vLLM servers; ignore failures if they already exited.
kill "$SERVER_USER_PID" "$SERVER_AGENT_PID" 2>/dev/null || true