""" Pipeline pitfalls verifier: empirically demonstrate bugs 1-3 from `protocol/CHECKLIST.md` so the catalog is grounded in reproducible synthetic evidence rather than in-vivo anecdote. Bug 1: `tensor.norm(-1)` is the L_{-1} 'norm' of the entire tensor, NOT 'L_2 along dim=-1'. The correct call is `tensor.norm(dim=-1)`. Bug 2: `F.cosine_similarity(a, b)` clamps the divisor by eps=1e-8 by default. When ||b|| ~ 1e-10 (which is the regime BP grads land in on DFA-trained pre-LN ResMLPs), the divisor becomes ||a|| * 1e-8 instead of ||a|| * 1e-10, scaling the reported cosine by ~100x in the wrong direction. Bug 3: fp16 mixed precision underflows BP grads at hidden layers when they sit at ~5e-10 (well below fp16's smallest subnormal of ~6e-8). bf16 works because it has the same exponent range as fp32. This script does NOT use GPU and runs in <1 second. Run: python -m protocol.examples.verify_pitfalls """ import math import torch import torch.nn.functional as F def banner(title): print("=" * 72) print(title) print("=" * 72) def bug1_norm_minus_one(): banner("BUG 1: tensor.norm(-1) is NOT 'L_2 along dim=-1'") torch.manual_seed(0) x = torch.tensor([[3.0, 4.0], [6.0, 8.0]]) # rows have L2 norms 5 and 10 correct = x.norm(dim=-1) # this is what callers usually mean bug = x.norm(-1) # this is what `.norm(-1)` actually computes # Hand-compute the L_{-1} 'norm' of the whole tensor for clarity: # ||x||_{-1} = (sum_i |x_i|^{-1})^{-1} = harmonic-mean-like quantity flat = x.flatten() hand_neg1 = (flat.abs().pow(-1).sum()).pow(-1).item() print(f" x = {x.tolist()}") print(f" x.norm(dim=-1) (correct, L_2 along last dim): {correct.tolist()}") print(f" x.norm(-1) (bug, L_{{-1}} of whole tensor): {bug.item():.6f}") print(f" hand-computed L_{{-1}} of flat tensor: {hand_neg1:.6f}") print(f" -> the two values match: {abs(bug.item() - hand_neg1) < 1e-6}") print(f" -> the bug version is unrelated to per-row L_2 norms.") print() def bug2_cosine_eps_clamp(): banner("BUG 2: F.cosine_similarity(a, b) clamps divisor by eps=1e-8") # Construct a case where one vector has a tiny but non-zero magnitude. # We use float64 throughout to avoid confounding with fp underflow. torch.manual_seed(0) a = torch.randn(1, 100, dtype=torch.float64) direction = torch.randn(100, dtype=torch.float64) direction = direction / direction.norm() # b is just direction scaled to a tiny magnitude b_scale = 5e-10 # the magnitude DFA-trained nets give for BP grads at hidden layers b = (direction * b_scale).unsqueeze(0) # True cosine, no clamp true_cos = (a @ b.T).item() / (a.norm().item() * b.norm().item()) # PyTorch's F.cosine_similarity with default eps=1e-8 pytorch_cos = F.cosine_similarity(a, b, dim=-1).item() ratio = pytorch_cos / true_cos if abs(true_cos) > 1e-30 else float('nan') print(f" ||a|| = {a.norm().item():.4e}") print(f" ||b|| = {b.norm().item():.4e} (intentionally below eps=1e-8)") print(f" true cosine (no clamp): {true_cos:+.6f}") print(f" F.cosine_similarity (default eps): {pytorch_cos:+.6f}") print(f" ratio reported/true: {ratio:.6e} (should be 1.0)") print(f" scaling distortion: {b_scale / 1e-8:.4e}x (i.e. PyTorch divides by") print(f" ||a||*1e-8 instead of ||a||*{b_scale:.0e}, off by ~{1e-8/b_scale:.0e}x)") print() def bug3_fp16_underflow(): banner("BUG 3: fp16 mixed precision underflows BP grads at ~5e-10") # The smallest positive subnormal in fp16 is approximately 6e-8. # Anything below that becomes 0. fp16_min = torch.tensor(6e-8, dtype=torch.float16) bp_grad_magnitude = 5e-10 # typical for DFA-trained pre-LN ResMLPs # Try to represent the magnitude in fp16 val_fp16 = torch.tensor(bp_grad_magnitude, dtype=torch.float16) val_bf16 = torch.tensor(bp_grad_magnitude, dtype=torch.bfloat16) val_fp32 = torch.tensor(bp_grad_magnitude, dtype=torch.float32) print(f" BP grad magnitude on DFA-trained ResMLP: {bp_grad_magnitude:.0e}") print(f" fp16 representation: {val_fp16.item():.4e} (-> 0 = UNDERFLOW)") print(f" bf16 representation: {val_bf16.item():.4e} (works, same exp range as fp32)") print(f" fp32 representation: {val_fp32.item():.4e} (works)") # Show what happens to a downstream cosine computation a = torch.randn(100) direction = torch.randn(100); direction = direction / direction.norm() b32 = direction * bp_grad_magnitude b16 = b32.half() bbf = b32.bfloat16() print() print(" cosine of random vector with the BP-grad-magnitude direction, by precision:") print(f" fp32 cosine: {F.cosine_similarity(a.unsqueeze(0), b32.unsqueeze(0)).item():+.4f} (correct)") print(f" fp16 cosine: {F.cosine_similarity(a.half().unsqueeze(0), b16.unsqueeze(0)).item():+.4f} (corrupt — divisor underflowed)") print(f" bf16 cosine: {F.cosine_similarity(a.bfloat16().unsqueeze(0), bbf.unsqueeze(0)).float().item():+.4f} (correct)") print() def main(): bug1_norm_minus_one() bug2_cosine_eps_clamp() bug3_fp16_underflow() print("All 3 reproducers ran. Each demonstrates the documented bug from") print("protocol/CHECKLIST.md. Bugs 4-6 require a trained network and are") print("verified inside the audit_table and ablation_decision_utility scripts.") if __name__ == "__main__": main()