#!/bin/bash
#SBATCH --job-name=sft_resume
#SBATCH --account=bfqt-delta-gpu
#SBATCH --partition=gpuH200x8
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:4
#SBATCH --mem=256G
#SBATCH --time=1:00:00
#SBATCH --output=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/sft_resume_%j.out
#SBATCH --error=/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/sft_resume_%j.err

# Resume LLaMA-Factory SFT training of llama-3.1-8b-instruct from
# checkpoint-100 on a single H200 node (4 GPUs, DeepSpeed ZeRO-3).
# NOTE: #SBATCH directives must each sit on their own line directly after
# the shebang, or SLURM silently ignores them.

# Abort on any unhandled command failure, including inside pipelines.
# -u is intentionally omitted: conda's profile.d/conda.sh references
# unset variables in some releases and would trip it.
set -eo pipefail

readonly CKPT_DIR="/work/nvme/bfqt/yurenh2/sft_checkpoints/checkpoint-100"
readonly OUT_DIR="/work/nvme/bfqt/yurenh2/sft_checkpoints"
readonly TRAIN_DIR="/projects/bfqt/users/yurenh2/ml-projects/personalization-user-model/collaborativeagents/training"

echo "=== SFT Training Resume from checkpoint-100 (H200) ==="
date
nvidia-smi --query-gpu=index,name,memory.total --format=csv

# Move into the training tree; set -e aborts if the project path is missing.
cd "$TRAIN_DIR"

# Activate the evaluation conda environment.
source /u/yurenh2/miniforge3/etc/profile.d/conda.sh
conda activate eval

# Keep HF model cache on project storage; log wandb offline (no network on
# compute nodes).
export HF_HOME=/projects/bfqt/users/yurenh2/hf_cache/huggingface
export WANDB_MODE=offline

echo "Model: llama-3.1-8b-instruct"
echo "Resuming from: $CKPT_DIR"
echo "Output: $OUT_DIR"

# Verify the resume checkpoint exists before burning GPU allocation time.
if [[ -d "$CKPT_DIR" ]]; then
  echo "Checkpoint verified!"
else
  echo "ERROR: Checkpoint not found!" >&2
  exit 1
fi

echo ""
echo "Starting LLaMA-Factory SFT training resume with DeepSpeed ZeRO-3..."

# FORCE_TORCHRUN=1 makes llamafactory-cli launch via torchrun so DeepSpeed
# ZeRO-3 shards across all 4 local GPUs.
FORCE_TORCHRUN=1 llamafactory-cli train llama_factory_resume_config.yaml

echo ""
echo "SFT Training complete!"
date