diff options
Diffstat (limited to 'src/training/trainer.py')
| -rw-r--r-- | src/training/trainer.py | 2 |
1 file changed, 2 insertions, 0 deletions
diff --git a/src/training/trainer.py b/src/training/trainer.py
index 6be949e..de0eb96 100644
--- a/src/training/trainer.py
+++ b/src/training/trainer.py
@@ -44,6 +44,7 @@ class TrainConfig:
     cascading_gate_k: float = 5.0
     input_norm: str = "none"
     qwen_input_prefix: str = ""
+    init_logit: float = 15.0  # bias on Z logits so A≈1 at init (dense connectivity)
 
     # Data
     dataset: str = "allenai/dolma"
@@ -185,6 +186,7 @@ class Trainer:
             rank=config.predictor_rank,
             cascading_gate_k=config.cascading_gate_k,
             qwen_input_prefix=config.qwen_input_prefix,
+            init_logit=config.init_logit,
             device=self.device,
         )
 
