#!/bin/bash
#SBATCH --job-name=run_expts
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:h200:8
#SBATCH --mem=400G
#SBATCH --time=48:00:00
#SBATCH --output=logs/run_expts_%j.out
#SBATCH --error=logs/run_expts_%j.err

# Run experiments with models loaded locally.
# This job needs 8 GPUs:
#   - 4 GPUs for the 70B judge model
#   - 2 GPUs for PersonalizedLLM (embedder, reranker, extractor, main LLM)
#   - 2 GPUs reserved as headroom

set -e

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model

# Create logs and results directories.
# NOTE: Slurm opens the --output/--error files (relative to the submit
# directory) before this script runs, so logs/ must already exist at
# submission time; the mkdir below only helps subsequent jobs.
mkdir -p collaborativeagents/slurm/logs
mkdir -p collaborativeagents/results

echo "Starting experiments at $(date)"
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURMD_NODENAME"
echo "GPUs: $CUDA_VISIBLE_DEVICES"

# Activate environment
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

# Check GPU availability
nvidia-smi

# Add project to path
export PYTHONPATH="${PWD}/src:${PWD}/collaborativeagents:${PYTHONPATH}"

# Run experiments
cd collaborativeagents/scripts

# Quick test first (2 profiles, 2 sessions)
echo "Running quick test..."
python run_experiments.py \
    --methods rag_vector \
    --datasets math-500 \
    --n-profiles 2 \
    --n-sessions 2 \
    --profile-path ../data/complex_profiles_v2/profiles_100.jsonl \
    --output-dir ../results/test_$(date +%Y%m%d_%H%M%S)

# Full run (uncomment when ready)
# echo "Running full experiments..."
# python run_experiments.py \
#     --methods vanilla,all_memory,rag,rag_vector \
#     --datasets math-500,gpqa,aime \
#     --n-profiles 100 \
#     --n-sessions 20 \
#     --profile-path ../data/complex_profiles_v2/profiles_100.jsonl \
#     --output-dir ../results/full_$(date +%Y%m%d_%H%M%S)

echo "Experiments completed at $(date)"
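
# Optional post-run pointer: record in the job log where outputs landed.
# This is a minimal sketch, assuming results directories follow the
# test_*/full_* timestamp naming used above and that the working directory
# is still collaborativeagents/scripts; adjust the glob if
# run_experiments.py writes elsewhere.
latest_results=$(ls -td ../results/*/ 2>/dev/null | head -n 1)
echo "Newest results directory: ${latest_results:-<none found>}"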