#!/bin/bash
#SBATCH --job-name=sft_train
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:4
#SBATCH --mem=256G
#SBATCH --time=08:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/sft_train_%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/sft_train_%j.err
#
# Slurm batch job: supervised fine-tuning (SFT) run.
#
# Activates the `eval` conda environment, prints basic GPU and dataset
# information, then starts training/train_sft.py through `accelerate launch`
# with 4 processes (matching the 4 GPUs requested via --gres above).
#
# Usage: sbatch <this-script>
#
# Each #SBATCH directive must stay on its own line, ahead of the first
# executable command, or sbatch will not pick it up.

# Abort on the first failing command or failing pipeline stage.
# `-u` is enabled only after conda activation (see below).
set -eo pipefail

cd /projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents

source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

# Enabled late on purpose: conda activation hooks may reference unset
# variables, which would abort the job under `set -u`.
set -u

export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
# Append the inherited PYTHONPATH only when it is non-empty, so the value never
# ends in a stray ':' and the expansion is safe under `set -u`.
export PYTHONPATH="${PWD}:${PWD}/../src${PYTHONPATH:+:${PYTHONPATH}}"
export WANDB_MODE=offline

echo "=== SFT Training (H200) ==="
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv

# DATA_PATH and OUTPUT_DIR are relative to the directory entered by `cd` above.
readonly MODEL_PATH="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/models/llama-3.1-8b-instruct"
readonly DATA_PATH="training/training_data/sft_training_data.json"
readonly OUTPUT_DIR="training/outputs/sft_reflection"

echo "Model: $MODEL_PATH"
echo "Data: $DATA_PATH"
echo "Output: $OUTPUT_DIR"

# Fail early with a clear message: a failing command substitution inside an
# `echo` argument does not trigger `set -e` on its own.
if [[ ! -f "$DATA_PATH" ]]; then
  printf 'error: training data not found: %s\n' "$DATA_PATH" >&2
  exit 1
fi

# Count training examples
echo ""
echo "Training data size: $(wc -c < "$DATA_PATH") bytes"
# The path is passed as an argument rather than spliced into the Python source,
# so quotes or other special characters in it cannot break the snippet.
python -c '
import json
import sys

with open(sys.argv[1]) as fh:
    data = json.load(fh)
print(f"Training examples: {len(data)}")
' "$DATA_PATH"

echo ""
echo "Starting SFT training..."

# Training with 4 GPUs: one accelerate process per GPU, bf16 mixed precision.
# NOTE(review): this step was previously described as using FSDP, but no FSDP
# option is passed here; if FSDP is intended it has to come from the accelerate
# config or from train_sft.py itself -- confirm.
accelerate launch --num_processes=4 --mixed_precision=bf16 \
  training/train_sft.py \
  --model-path "$MODEL_PATH" \
  --data-path "$DATA_PATH" \
  --output-dir "$OUTPUT_DIR" \
  --num-epochs 4 \
  --learning-rate 1e-6 \
  --batch-size 1 \
  --gradient-accumulation 16

echo ""
echo "Training complete!"
date
ls -la -- "$OUTPUT_DIR"/