### Qwen3-1.7B Full SFT Config (H200x4 Optimized)

```yaml
model_name_or_path: models/Qwen3-1.7B

stage: sft
do_train: true
finetuning_type: full

dataset: preference_extractor_train
template: qwen
cutoff_len: 1024
overwrite_cache: true
preprocessing_num_workers: 4

output_dir: saves/qwen3-1.7b-full-sft-h200
logging_steps: 5
save_strategy: steps
save_steps: 200
plot_loss: true
overwrite_output_dir: true

# H200x4 Configuration
# Total Batch Size = 32 * 4 * 1 = 128
per_device_train_batch_size: 32
gradient_accumulation_steps: 1
learning_rate: 2.0e-5
num_train_epochs: 1.0
lr_scheduler_type: cosine
warmup_ratio: 0.05
bf16: true
flash_attn: fa2

val_size: 0.01
per_device_eval_batch_size: 32
eval_strategy: steps
eval_steps: 200
```
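
Assuming this YAML is consumed by LLaMA-Factory (the `stage`, `finetuning_type`, `template`, and `flash_attn: fa2` keys match its config schema), a 4-GPU launch might look like the sketch below. The filename `qwen3_full_sft_h200.yaml` is a placeholder for wherever this config is saved.

```bash
# Hypothetical launch command, assuming a LLaMA-Factory setup.
# FORCE_TORCHRUN=1 asks the CLI to spawn one process per visible GPU
# via torchrun; the config filename is a placeholder.
CUDA_VISIBLE_DEVICES=0,1,2,3 FORCE_TORCHRUN=1 \
    llamafactory-cli train qwen3_full_sft_h200.yaml
```

With `per_device_train_batch_size: 32` and `gradient_accumulation_steps: 1` across 4 GPUs, the effective batch size is the 128 noted in the config comment; scaling to a different GPU count would change that product accordingly.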