From 31ddecc9eb646b15c4ac5960c7de9346c8f7be68 Mon Sep 17 00:00:00 2001 From: YurenHao0426 Date: Tue, 7 Apr 2026 23:00:54 -0500 Subject: Protocol diagnostic (a): use max per-block growth, not max/min ratio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Old metric: max(||h||) / max(||h_0||, eps). False-positives on ViT-style architectures because the cls token at layer 0 (right after patch_embed) has anomalously small magnitude (~0.3-1.5), inflating the ratio even on healthy BP-trained ViTs. New metric: max_l(||h_{l+1}|| / ||h_l||) — the largest single-block residual amplification. Architecture-invariant. Calibration: - BP-trained, late training: <5x per block - BP ViT, early epochs (cls token resolving): 13-25x max - DFA-trained ResMLP/ViT: 100-4000x per block Threshold raised from 10 to 50 to sit cleanly between healthy-early- training (max 25) and failure-regime (min 100). Re-verifications: - smoke test (BP/DFA/EP): all 3 verdicts unchanged - random init (3 seeds): trustworthy on all 3 - 5-method audit table single-seed: identical verdicts - decision-utility ablation: identical (still 0/5 by S1, 3/5 by S_full) - temporal evolution 3-seed: (b) now fires first at ep 3-4, (a) at ep 8-11. Both well before training ends. The 'protocol fires ~92 epochs early' story still holds. - ViT temporal evolution: BP no longer false-fires; DFA fires (a) ep 1, (b) ep 3 — protocol works on the second architecture. --- protocol/report.py | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) (limited to 'protocol/report.py') diff --git a/protocol/report.py b/protocol/report.py index 15d6c34..00640eb 100644 --- a/protocol/report.py +++ b/protocol/report.py @@ -35,6 +35,19 @@ class DiagnosticThresholds: """ g_norm_floor: float = 1e-7 + # Per-block residual growth ratio threshold. The diagnostic is + # `max_l(||h_{l+1}|| / ||h_l||)` — the largest single-block residual + # amplification. We avoided `max(||h||) / ||h_0||` because it false- + # positives on ViT-style architectures where the cls token at layer 0 + # is anomalously small after patch_embed. + # + # Calibration on observed data: + # - BP-trained, late training: <5× per block (steady state) + # - BP ViT, early training (epoch 1-5): 13-25× max (cls token still + # resolving from its small init magnitude) + # - DFA-trained ResMLP / ViT: 100-4000× max per block + # Threshold 50 sits cleanly between healthy-early-training (max 25) and + # failure-regime (min 100), with margin on both sides. h_norm_explosion_ratio: float = 50.0 stability_drift_ceiling: float = 0.30 frozen_acc_margin_pp: float = 2.0 @@ -58,14 +71,23 @@ class DiagnosticReport: # Per-diagnostic verdicts # ------------------------------------------------------------------ # + @property + def max_per_block_growth(self) -> float: + """max_l (||h_{l+1}|| / ||h_l||) — the largest residual-stream + amplification by any single block. Healthy BP/EP networks have all + per-block growth < 5×; pathological networks (DFA/SB/CB on pre-LN + residuals) have at least one block with growth > 100×.""" + if len(self.residual_norms) < 2: + return 1.0 + ratios = [] + for i in range(len(self.residual_norms) - 1): + denom = max(self.residual_norms[i], 1e-30) + ratios.append(self.residual_norms[i + 1] / denom) + return max(ratios) + @property def residual_stream_exploded(self) -> bool: - if not self.residual_norms: - return False - h0 = self.residual_norms[0] - if h0 <= 0: - return False - return (max(self.residual_norms) / h0) > self.thresholds.h_norm_explosion_ratio + return self.max_per_block_growth > self.thresholds.h_norm_explosion_ratio @property def bp_grad_at_floor(self) -> bool: @@ -126,8 +148,8 @@ class DiagnosticReport: lines.append(f" h_{l}: {self.residual_norms[l]:.3e}") if self.residual_stream_exploded: lines.append( - f" FLAG: max/min ratio " - f"{max(self.residual_norms)/max(self.residual_norms[0],1e-30):.2e} " + f" FLAG: max per-block growth ‖h_{{l+1}}‖/‖h_l‖ = " + f"{self.max_per_block_growth:.2e} " f"> threshold {self.thresholds.h_norm_explosion_ratio}× — " "residual stream exploded." ) -- cgit v1.2.3