#!/bin/bash
#SBATCH --job-name=run_expts_a100
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuA100x4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --gres=gpu:nvidia_a100:4
#SBATCH --mem=200G
#SBATCH --time=48:00:00
#SBATCH --output=logs/run_expts_a100_%j.out
#SBATCH --error=logs/run_expts_a100_%j.err

# Run experiments on 4x A100 80GB
# - 70B judge model with TP=4 (~140GB)
# - 8B PersonalizedLLM models (~16GB shared)

# Strict mode: abort on errors, unset variables, and mid-pipeline failures.
set -euo pipefail

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model

# Create logs and results directories
mkdir -p collaborativeagents/slurm/logs
mkdir -p collaborativeagents/results

# SLURM-provided vars may be absent outside a job allocation; default them
# so 'set -u' does not kill the script during interactive debugging.
echo "Starting experiments at $(date)"
echo "Job ID: ${SLURM_JOB_ID:-unknown}"
echo "Node: ${SLURMD_NODENAME:-unknown}"
echo "GPUs: ${CUDA_VISIBLE_DEVICES:-unset}"

# Activate environment
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

# Check GPU availability
nvidia-smi

# Redirect HF cache to project space (avoid home quota issues)
export HF_HOME=/projects/bfqt/users/yurenh2/.cache/huggingface
mkdir -p "$HF_HOME"

# Add project to path (PYTHONPATH may be unset under 'set -u')
export PYTHONPATH="${PWD}/src:${PWD}/collaborativeagents:${PYTHONPATH:-}"

# Run experiments
cd collaborativeagents/scripts

# Full benchmark run
echo "Running full experiments..."
python run_experiments.py \
  --methods vanilla,all_memory,rag,rag_vector \
  --datasets math-500 \
  --n-profiles 20 \
  --n-sessions 5 \
  --profile-path ../data/complex_profiles_v2/profiles_100.jsonl \
  --output-dir "../results/full_$(date +%Y%m%d_%H%M%S)"

echo "Experiments completed at $(date)"