summary refs log tree commit diff
path: root/src/training
diff options
context:
space:
mode:
Diffstat (limited to 'src/training')
-rw-r--r-- src/training/trainer.py 2
1 files changed, 2 insertions, 0 deletions
diff --git a/src/training/trainer.py b/src/training/trainer.py
index 6be949e..de0eb96 100644
--- a/src/training/trainer.py
+++ b/src/training/trainer.py
@@ -44,6 +44,7 @@ class TrainConfig:
cascading_gate_k: float = 5.0
input_norm: str = "none"
qwen_input_prefix: str = ""
+ init_logit: float = 15.0 # bias on Z logits so A≈1 at init (dense connectivity)
# Data
dataset: str = "allenai/dolma"
@@ -185,6 +186,7 @@ class Trainer:
rank=config.predictor_rank,
cascading_gate_k=config.cascading_gate_k,
qwen_input_prefix=config.qwen_input_prefix,
+ init_logit=config.init_logit,
device=self.device,
)