#!/bin/bash
#SBATCH --job-name=test_70b
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuA100x4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --gres=gpu:nvidia_a100:4
#SBATCH --mem=200G
#SBATCH --time=01:00:00
#SBATCH --output=logs/test_70b_%j.out
#SBATCH --error=logs/test_70b_%j.err
#
# Pilot test for 70B AWQ user model.
# Tests that the model loads without OOM and multi-turn works.
#
# Each #SBATCH directive must sit on its own line ahead of the first
# executable command; otherwise sbatch does not see it.
#
# NOTE(review): the --output/--error paths above are relative to the
# directory the job is submitted from, and that logs/ directory has to exist
# before the job starts. The mkdir further down runs only after the job has
# started, so it cannot create the directory for this job's own log files.
# Presumably the job is submitted from collaborativeagents/slurm -- confirm.

# Abort on the first failing command.
set -e

readonly PROJECT_ROOT=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model

cd "${PROJECT_ROOT}"

# Create logs directory (relative to the project root).
mkdir -p collaborativeagents/slurm/logs

echo "Starting 70B pilot test at $(date)"
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURMD_NODENAME"
echo "GPUs: $CUDA_VISIBLE_DEVICES"

# Activate environment.
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

# Check GPU availability; the listing ends up in the job's output log.
nvidia-smi

# Set HF cache to project space.
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
mkdir -p "${HF_HOME}"

# Add project to path. The ':' separator is appended only when PYTHONPATH
# already has a value: a trailing ':' would leave an empty entry, which
# Python resolves to the current working directory.
export PYTHONPATH="${PWD}/src:${PWD}/collaborativeagents${PYTHONPATH:+:${PYTHONPATH}}"

# Run pilot test from the scripts directory.
cd collaborativeagents/scripts
echo "Running 70B pilot test..."
python test_70b_pilot.py

echo "Pilot test completed at $(date)"