#!/bin/bash
#SBATCH --job-name=sft_train
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuA100x4
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --gres=gpu:4
#SBATCH --mem=200G
#SBATCH --time=24:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/sft_train_%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/sft_train_%j.err
#
# SFT Training only (data already exists).
#
# Submits a single-node SLURM job that:
#   1. activates the "eval" conda environment,
#   2. verifies the training data file is present and reports its size,
#   3. launches collaborativeagents/training/train_sft.py.
#
# Usage: sbatch <this script>
#
# NOTE: the #SBATCH directives above must remain one per line and precede the
# first executable command, otherwise sbatch does not pick them up.

# Abort on the first failing command, including a failure inside a pipeline.
set -eo pipefail

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model

source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

# nounset is enabled only after conda activation: activation hooks may
# reference unset variables, which would abort the job under `set -u`.
set -u

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
# `:-` keeps this expansion valid under `set -u` when PYTHONPATH is unset;
# the resulting value is the same as the plain ${PYTHONPATH} expansion.
export PYTHONPATH="${PWD}/src:${PWD}/collaborativeagents:${PWD}/collaborativeagents/scripts:${PYTHONPATH:-}"
export WANDB_PROJECT="collaborative-agent-reflection-sft"

echo "=== SFT Training ==="
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv

# Paths are relative to the project directory entered above, except the model.
DATA_PATH="collaborativeagents/training/training_data/sft_training_data.json"
MODEL_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
OUTPUT_DIR="collaborativeagents/training/outputs/sft_reflection"

echo "Data: $DATA_PATH"
echo "Model: $MODEL_PATH"
echo "Output: $OUTPUT_DIR"

# Check data exists before spending any GPU time.
if [[ ! -f "$DATA_PATH" ]]; then
  echo "ERROR: Training data not found at $DATA_PATH" >&2
  exit 1
fi

echo ""
echo "Training data size: $(wc -c < "$DATA_PATH") bytes"
# The path is handed to Python through argv rather than spliced into the
# program text, so unusual characters in the path cannot break the snippet.
python -c 'import json, sys
with open(sys.argv[1]) as fh:
    d = json.load(fh)
print(f"Training examples: {len(d)}")' "$DATA_PATH"

mkdir -p -- "$OUTPUT_DIR"

echo ""
echo "Starting SFT training..."

python collaborativeagents/training/train_sft.py \
  --model-path "$MODEL_PATH" \
  --data-path "$DATA_PATH" \
  --output-dir "$OUTPUT_DIR" \
  --num-epochs 4 \
  --learning-rate 1e-6 \
  --batch-size 1 \
  --gradient-accumulation 64

echo ""
echo "=== SFT Training Complete ==="
echo "Model saved to: $OUTPUT_DIR"
date