1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
|
#!/bin/bash
#SBATCH --job-name=rag_empty
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:4
#SBATCH --mem=250G
#SBATCH --time=03:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_empty-%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/rag_empty-%j.err
# Test RAG with an EMPTY memory store: start fresh and let memories accumulate.
# 5 profiles, 15 sessions each (more sessions to test accumulation).
# Methods compared: vanilla, rag, rag_vector.

# Root of the project checkout; the working directory, PYTHONPATH, profile
# file and agent model path below are all derived from it.
PROJECT_ROOT="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model"

# Fail fast: later steps use relative paths (scripts/..., ../results/...) and
# the "eval" environment's python, so continuing after a failed cd or a failed
# activation would only burn the GPU allocation on a broken run.
cd "$PROJECT_ROOT/collaborativeagents" || {
  echo "ERROR: cannot cd to $PROJECT_ROOT/collaborativeagents" >&2
  exit 1
}
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh || {
  echo "ERROR: cannot source conda.sh" >&2
  exit 1
}
conda activate eval || {
  echo "ERROR: cannot activate conda environment 'eval'" >&2
  exit 1
}

# Environment consumed by the Python processes started later in the script.
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export PYTHONPATH="$PROJECT_ROOT/src:$PYTHONPATH"
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Inputs for the experiment: user profiles, local agent checkpoint, and the
# model identifier used for the user simulator.
PROFILE_PATH="$PROJECT_ROOT/collaborativeagents/data/complex_profiles_v2/profiles_200.jsonl"
AGENT_MODEL="$PROJECT_ROOT/models/llama-3.1-8b-instruct"
USER_MODEL="meta-llama/Llama-3.1-70B-Instruct"
# Run banner: describe the experiment, then log the start time and the GPUs
# visible to the job. One printf emits every banner line (one argument per line).
printf '%s\n' \
  "=== RAG Empty Memory Store Test ===" \
  "Key change: Starting with EMPTY memory store" \
  " - RAG will accumulate memories during evaluation" \
  " - Each user builds their own memory basket from scratch" \
  "" \
  "Settings: 5 profiles, 15 sessions each" \
  "User simulator: $USER_MODEL (70B)" \
  "Agent: $AGENT_MODEL (8B)"
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv
# Clear the empty store before the run to ensure a fresh start.
# The cards file is truncated to zero length (so it exists afterwards) and the
# embeddings file is removed.
EMPTY_STORE_DIR="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store"
echo ""
echo "Clearing empty memory store..."
# Abort if the store cannot be reset: reporting "cleared" while old cards are
# still on disk would invalidate the from-scratch comparison. No servers have
# been started at this point, so a plain exit is safe.
if ! mkdir -p "$EMPTY_STORE_DIR" || ! : > "$EMPTY_STORE_DIR/memory_cards.jsonl"; then
  echo "ERROR: could not reset $EMPTY_STORE_DIR/memory_cards.jsonl" >&2
  exit 1
fi
rm -f "$EMPTY_STORE_DIR/memory_embeddings.npy"
echo "Memory store cleared."
# Start both vLLM OpenAI-compatible API servers in the background and record
# their PIDs in USER_PID / AGENT_PID for later health checks and shutdown.
echo ""
echo "Starting vLLM servers..."

# User simulator on GPUs 0,1 (70B, TP=2), listening on port 8004.
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
  --model "$USER_MODEL" \
  --port 8004 --tensor-parallel-size 2 --gpu-memory-utilization 0.85 \
  --max-model-len 16384 --dtype bfloat16 --download-dir "$HF_HOME" \
  --disable-log-requests &
USER_PID=$!

# Agent on GPUs 2,3 (8B, TP=2), listening on port 8003 - reduced memory
# utilization to leave headroom for the embedding/reranker models.
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server \
  --model "$AGENT_MODEL" \
  --port 8003 --tensor-parallel-size 2 --gpu-memory-utilization 0.40 \
  --max-model-len 16384 --dtype bfloat16 \
  --disable-log-requests &
AGENT_PID=$!

# Make sure the background servers are signalled on every exit path (error
# exits, interrupts), not only the ones that clean up explicitly. Single quotes
# defer expansion to exit time; the trap does not alter the script's exit code.
trap 'kill "$USER_PID" "$AGENT_PID" 2>/dev/null' EXIT
# Wait for both vLLM servers to report healthy, then continue; on failure,
# stop both servers and exit 1.

# True when the server on port $1 answers /health with a success status.
# -f makes curl fail on HTTP error responses (e.g. 5xx), which plain -s would
# have counted as "healthy".
server_healthy() {
  curl -sf "http://localhost:$1/health" > /dev/null 2>&1
}

# Poll a server until it is healthy.
# Arguments: $1 - label for log output
#            $2 - port to probe
#            $3 - PID of the server process
#            $4 - maximum number of probes (5 seconds apart)
# Returns:   0 once healthy; 1 on timeout or if the process has exited.
wait_for_server() {
  local label=$1 port=$2 pid=$3 max_tries=$4
  local attempt
  for (( attempt = 1; attempt <= max_tries; attempt++ )); do
    if server_healthy "$port"; then
      echo "${label} (${port}) ready after $((attempt * 5)) seconds"
      return 0
    fi
    # Stop polling as soon as the process is gone; it can never become healthy.
    if ! kill -0 "$pid" 2>/dev/null; then
      echo "${label} (${port}) process exited before becoming healthy"
      return 1
    fi
    sleep 5
  done
  return 1
}

# Report an unhealthy server ($1 - name used in the message), stop both
# servers, and abort the job.
abort_unhealthy() {
  echo "ERROR: $1 server not healthy"
  kill "$USER_PID" "$AGENT_PID" 2>/dev/null
  exit 1
}

echo "Waiting for vLLM servers (may take 5-10 min)..."
# Up to 200 probes (1000 s) for the 70B user simulator, 60 probes (300 s) for
# the agent.
wait_for_server "User simulator" 8004 "$USER_PID" 200 || abort_unhealthy "User"
wait_for_server "Agent" 8003 "$AGENT_PID" 60 || abort_unhealthy "Agent"

# Re-probe both: the user simulator may have died while the agent was loading.
server_healthy 8004 || abort_unhealthy "User"
server_healthy 8003 || abort_unhealthy "Agent"

echo "Both vLLM servers ready"
sleep 5
# All methods of this run write into one timestamped results directory.
OUTPUT_DIR="../results/rag_empty_test_$(date +%Y%m%d_%H%M%S)"
MEMORY_STORE_DIR="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/data/corpora/empty_store"

# Run methods sequentially (each starts with a fresh, empty memory store).
for METHOD in vanilla rag rag_vector; do
  echo ""
  echo "============================================"
  echo "Testing method: $METHOD"
  echo "============================================"

  # Clear memory store before each method for fair comparison.
  : > "$MEMORY_STORE_DIR/memory_cards.jsonl"
  rm -f "$MEMORY_STORE_DIR/memory_embeddings.npy"
  echo "Memory store cleared for $METHOD"
  date

  START=$(date +%s)
  python scripts/run_experiments.py --methods "$METHOD" \
    --datasets math-hard --n-profiles 5 --n-sessions 15 --max-turns 15 \
    --use-vllm --no-batch-processing --parallel-profiles 5 \
    --output-dir "$OUTPUT_DIR" --profile-path "$PROFILE_PATH"
  # Capture the experiment's exit status right away: every command after this
  # line (date, wc, echo, the if-blocks) overwrites $?.
  RUN_STATUS=$?
  END=$(date +%s)
  ELAPSED=$((END - START))

  # Show memory accumulation stats for RAG methods (one card per line).
  if [[ "$METHOD" == "rag" || "$METHOD" == "rag_vector" ]]; then
    CARD_COUNT=$(wc -l < "$MEMORY_STORE_DIR/memory_cards.jsonl" 2>/dev/null || echo 0)
    echo "Memory cards accumulated: $CARD_COUNT"
  fi

  if (( RUN_STATUS == 0 )); then
    echo "Method $METHOD: SUCCESS (${ELAPSED}s)"
  else
    echo "Method $METHOD: FAILED after ${ELAPSED}s"
  fi
done
# Closing banner: mark completion, point at the results directory, and log
# the finish time.
printf '%s\n' \
  "" \
  "============================================" \
  "RAG Empty Memory Test Complete" \
  "============================================" \
  "Results saved to: $OUTPUT_DIR"
date

# Cleanup: signal every process whose command line matches the vLLM
# entrypoint; a non-match (nothing left to kill) is not treated as an error.
pkill -f "vllm.entrypoints" 2>/dev/null || :
|