summaryrefslogtreecommitdiff
path: root/collaborativeagents/slurm/test_70b_pilot.sh
blob: 3514e865c69acc59cdc2c8678bf9b8a13acdd903 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/bin/bash
# Slurm batch header for the 70B pilot test job. Requests one node with a
# single task, 16 CPUs, 4 A100 GPUs, 200G of memory and a 1-hour wall-clock
# limit on the gpuA100x4 partition, charged to the bfqt-delta-gpu account.
#SBATCH --job-name=test_70b
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuA100x4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --gres=gpu:nvidia_a100:4
#SBATCH --mem=200G
#SBATCH --time=01:00:00
# %j expands to the job ID, giving one stdout/stderr file pair per submission.
# NOTE(review): these paths are relative, so Slurm resolves them against the
# job's working directory (the submit directory unless --chdir is given), and
# Slurm does not create missing directories -- confirm that logs/ already
# exists where sbatch is invoked, since the script body runs too late to
# create it for the current job.
#SBATCH --output=logs/test_70b_%j.out
#SBATCH --error=logs/test_70b_%j.err

# Pilot test for the 70B AWQ user model.
# Checks that the model loads without OOM and that multi-turn works.
#
# Steps: enter the project root, log job metadata, activate the `eval` conda
# environment, point the Hugging Face cache at project storage, extend
# PYTHONPATH with the project sources, then run test_70b_pilot.py.

# Stop at the first failing command; pipefail extends that to pipelines.
# NOTE(review): `set -u` is intentionally left off because conda activation
# hooks are known to trip on unset variables in some versions -- confirm
# before enabling it.
set -eo pipefail

# Report where the script died so the job's .err log is self-explanatory.
trap 'echo "ERROR: command failed with status $? at ${BASH_SOURCE[0]}:${LINENO}" >&2' ERR

readonly PROJECT_ROOT=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model
readonly CONDA_PROFILE=/u/yurenh2/miniforge3/etc/profile.d/conda.sh

cd "${PROJECT_ROOT}"

# Create logs directory.
# This only helps later submissions: Slurm opens the --output/--error files
# before this script starts running.
mkdir -p -- collaborativeagents/slurm/logs

echo "Starting 70B pilot test at $(date)"
echo "Job ID: ${SLURM_JOB_ID:-}"
echo "Node: ${SLURMD_NODENAME:-}"
echo "GPUs: ${CUDA_VISIBLE_DEVICES:-}"

# Activate environment
# shellcheck disable=SC1090 -- path lives on the cluster, not in the repo
source "${CONDA_PROFILE}"
conda activate eval

# Check GPU availability (also aborts early if the driver is unreachable)
nvidia-smi

# Set HF cache to project space
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
mkdir -p -- "${HF_HOME}"

# Add project to path.
# Append the inherited PYTHONPATH only when it is non-empty; otherwise the
# value would end in ':' (an empty entry, which Python resolves to the
# current working directory).
export PYTHONPATH="${PROJECT_ROOT}/src:${PROJECT_ROOT}/collaborativeagents${PYTHONPATH:+:${PYTHONPATH}}"

# Run pilot test
cd "${PROJECT_ROOT}/collaborativeagents/scripts"
echo "Running 70B pilot test..."
python test_70b_pilot.py

echo "Pilot test completed at $(date)"