diff options
Diffstat (limited to 'collaborativeagents/slurm/run_experiments.sh')
| -rw-r--r-- | collaborativeagents/slurm/run_experiments.sh | 66 |
1 files changed, 66 insertions, 0 deletions
#!/bin/bash
#SBATCH --job-name=run_expts
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:h200:8
#SBATCH --mem=400G
#SBATCH --time=48:00:00
#SBATCH --output=logs/run_expts_%j.out
#SBATCH --error=logs/run_expts_%j.err

# collaborativeagents/slurm/run_experiments.sh
#
# Run experiments with models loaded locally.
# This job requests 8 GPUs:
#   - 4 GPUs for the 70B judge model
#   - 2 GPUs for PersonalizedLLM (embedder, reranker, extractor, main LLM)
#   - the rest is reserved as headroom
#
# NOTE: the --output/--error paths above are relative ("logs/..."), while the
# `mkdir -p` further down only runs once the job has already started. It can
# therefore not create the directory that receives this job's own log files:
# make sure `logs/` exists in the directory you submit from (the mkdir below
# matches it when submitting from collaborativeagents/slurm) before `sbatch`.
#
# Optional environment overrides (defaults reproduce the original setup):
#   EXPT_PROJECT_ROOT  project checkout to run from
#   EXPT_CONDA_SH      conda shell-initialisation script to source
#   EXPT_CONDA_ENV     conda environment to activate

# Abort on the first failing command. `-u`/`pipefail` are intentionally not
# enabled: the sourced conda scripts are outside this file's control.
set -e

project_root="${EXPT_PROJECT_ROOT:-/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model}"
conda_init="${EXPT_CONDA_SH:-/u/yurenh2/miniforge3/etc/profile.d/conda.sh}"
conda_env="${EXPT_CONDA_ENV:-eval}"

cd "${project_root}"

# Create logs and results directories (relative to the project root).
mkdir -p collaborativeagents/slurm/logs collaborativeagents/results

echo "Starting experiments at $(date)"
echo "Job ID: ${SLURM_JOB_ID}"
echo "Node: ${SLURMD_NODENAME}"
echo "GPUs: ${CUDA_VISIBLE_DEVICES}"

# Activate environment
# shellcheck source=/dev/null
source "${conda_init}"
conda activate "${conda_env}"

# Check GPU availability (under `set -e` a failing nvidia-smi stops the job).
nvidia-smi

# Add project to path. `${PYTHONPATH:+:...}` appends the previous value only
# when it is non-empty, so no trailing ':' (an empty path entry) is produced.
export PYTHONPATH="${PWD}/src:${PWD}/collaborativeagents${PYTHONPATH:+:${PYTHONPATH}}"

# Run experiments; the relative ../data and ../results paths below are
# resolved from this directory.
cd collaborativeagents/scripts

# One timestamp per job, used to name the output directory.
run_stamp="$(date +%Y%m%d_%H%M%S)"

# Quick test first (2 profiles, 2 sessions)
echo "Running quick test..."
python run_experiments.py \
  --methods rag_vector \
  --datasets math-500 \
  --n-profiles 2 \
  --n-sessions 2 \
  --profile-path ../data/complex_profiles_v2/profiles_100.jsonl \
  --output-dir "../results/test_${run_stamp}"

# Full run (uncomment when ready)
# echo "Running full experiments..."
# python run_experiments.py \
#   --methods vanilla,all_memory,rag,rag_vector \
#   --datasets math-500,gpqa,aime \
#   --n-profiles 100 \
#   --n-sessions 20 \
#   --profile-path ../data/complex_profiles_v2/profiles_100.jsonl \
#   --output-dir "../results/full_${run_stamp}"

echo "Experiments completed at $(date)"
