diff options
| author | YurenHao0426 <Blackhao0426@gmail.com> | 2026-04-07 22:52:41 -0500 |
|---|---|---|
| committer | YurenHao0426 <Blackhao0426@gmail.com> | 2026-04-07 22:52:41 -0500 |
| commit | ede7cca3e4f9048e3fc6d99077f8842e9b598ff4 (patch) | |
| tree | 720562461cfd2672a6f9c1b74bad6fc088bd6872 /protocol | |
| parent | 4420af372024ef12b28eac21678504dd75484dca (diff) | |
Add reproducers for pitfalls 1-3 in CHECKLIST.md
Each bug from the catalog has a synthetic reproducer that runs in <1 sec
without GPU:
Bug 1: x.norm(-1) on a 2x2 tensor returns 1.143 (L_{-1} of whole tensor)
instead of [5, 10] (per-row L_2 along dim=-1).
Bug 2: F.cosine_similarity(a, b) with ||b||=5e-10 returns +0.000905
instead of the true +0.018101. The clamp (eps=1e-8) inflates the
divisor 20x, so the reported cosine is 20x too small.
Bug 3: 5e-10 in fp16 -> 0 (it is below fp16's smallest subnormal, ~6e-8).
The downstream F.cosine_similarity then returns NaN. bf16 works because it
shares fp32's exponent range.
Bugs 4-6 (Bs reproducibility, aggregation, layer-0 dominance) require a
trained network and are demonstrated inside audit_table and
ablation_decision_utility.
Diffstat (limited to 'protocol')
| -rw-r--r-- | protocol/examples/verify_pitfalls.py | 126 |
1 files changed, 126 insertions, 0 deletions
diff --git a/protocol/examples/verify_pitfalls.py b/protocol/examples/verify_pitfalls.py new file mode 100644 index 0000000..d329331 --- /dev/null +++ b/protocol/examples/verify_pitfalls.py @@ -0,0 +1,126 @@ +""" +Pipeline pitfalls verifier: empirically demonstrate bugs 1-3 from +`protocol/CHECKLIST.md` so the catalog is grounded in reproducible +synthetic evidence rather than in-vivo anecdote. + +Bug 1: `tensor.norm(-1)` is the L_{-1} 'norm' of the entire tensor, + NOT 'L_2 along dim=-1'. The correct call is `tensor.norm(dim=-1)`. + +Bug 2: `F.cosine_similarity(a, b)` clamps the divisor by eps=1e-8 by + default. When ||b|| ~ 1e-10 (which is the regime BP grads land in + on DFA-trained pre-LN ResMLPs), the divisor becomes ||a|| * 1e-8 + instead of ||a|| * 1e-10, scaling the reported cosine by ~100x + in the wrong direction. + +Bug 3: fp16 mixed precision underflows BP grads at hidden layers when + they sit at ~5e-10 (well below fp16's smallest subnormal of + ~6e-8). bf16 works because it has the same exponent range as fp32. + +This script does NOT use GPU and runs in <1 second. 
+ +Run: + python -m protocol.examples.verify_pitfalls +""" +import math + +import torch +import torch.nn.functional as F + + +def banner(title): + print("=" * 72) + print(title) + print("=" * 72) + + +def bug1_norm_minus_one(): + banner("BUG 1: tensor.norm(-1) is NOT 'L_2 along dim=-1'") + torch.manual_seed(0) + x = torch.tensor([[3.0, 4.0], [6.0, 8.0]]) # rows have L2 norms 5 and 10 + correct = x.norm(dim=-1) # this is what callers usually mean + bug = x.norm(-1) # this is what `.norm(-1)` actually computes + + # Hand-compute the L_{-1} 'norm' of the whole tensor for clarity: + # ||x||_{-1} = (sum_i |x_i|^{-1})^{-1} = harmonic-mean-like quantity + flat = x.flatten() + hand_neg1 = (flat.abs().pow(-1).sum()).pow(-1).item() + + print(f" x = {x.tolist()}") + print(f" x.norm(dim=-1) (correct, L_2 along last dim): {correct.tolist()}") + print(f" x.norm(-1) (bug, L_{{-1}} of whole tensor): {bug.item():.6f}") + print(f" hand-computed L_{{-1}} of flat tensor: {hand_neg1:.6f}") + print(f" -> the two values match: {abs(bug.item() - hand_neg1) < 1e-6}") + print(f" -> the bug version is unrelated to per-row L_2 norms.") + print() + + +def bug2_cosine_eps_clamp(): + banner("BUG 2: F.cosine_similarity(a, b) clamps divisor by eps=1e-8") + # Construct a case where one vector has a tiny but non-zero magnitude. + # We use float64 throughout to avoid confounding with fp underflow. 
+ torch.manual_seed(0) + a = torch.randn(1, 100, dtype=torch.float64) + direction = torch.randn(100, dtype=torch.float64) + direction = direction / direction.norm() + # b is just direction scaled to a tiny magnitude + b_scale = 5e-10 # the magnitude DFA-trained nets give for BP grads at hidden layers + b = (direction * b_scale).unsqueeze(0) + + # True cosine, no clamp + true_cos = (a @ b.T).item() / (a.norm().item() * b.norm().item()) + # PyTorch's F.cosine_similarity with default eps=1e-8 + pytorch_cos = F.cosine_similarity(a, b, dim=-1).item() + + ratio = pytorch_cos / true_cos if abs(true_cos) > 1e-30 else float('nan') + print(f" ||a|| = {a.norm().item():.4e}") + print(f" ||b|| = {b.norm().item():.4e} (intentionally below eps=1e-8)") + print(f" true cosine (no clamp): {true_cos:+.6f}") + print(f" F.cosine_similarity (default eps): {pytorch_cos:+.6f}") + print(f" ratio reported/true: {ratio:.6e} (should be 1.0)") + print(f" scaling distortion: {b_scale / 1e-8:.4e}x (i.e. PyTorch divides by") + print(f" ||a||*1e-8 instead of ||a||*{b_scale:.0e}, off by ~{1e-8/b_scale:.0e}x)") + print() + + +def bug3_fp16_underflow(): + banner("BUG 3: fp16 mixed precision underflows BP grads at ~5e-10") + # The smallest positive subnormal in fp16 is approximately 6e-8. + # Anything below that becomes 0. 
+ fp16_min = torch.tensor(6e-8, dtype=torch.float16) + bp_grad_magnitude = 5e-10 # typical for DFA-trained pre-LN ResMLPs + + # Try to represent the magnitude in fp16 + val_fp16 = torch.tensor(bp_grad_magnitude, dtype=torch.float16) + val_bf16 = torch.tensor(bp_grad_magnitude, dtype=torch.bfloat16) + val_fp32 = torch.tensor(bp_grad_magnitude, dtype=torch.float32) + + print(f" BP grad magnitude on DFA-trained ResMLP: {bp_grad_magnitude:.0e}") + print(f" fp16 representation: {val_fp16.item():.4e} (-> 0 = UNDERFLOW)") + print(f" bf16 representation: {val_bf16.item():.4e} (works, same exp range as fp32)") + print(f" fp32 representation: {val_fp32.item():.4e} (works)") + + # Show what happens to a downstream cosine computation + a = torch.randn(100) + direction = torch.randn(100); direction = direction / direction.norm() + b32 = direction * bp_grad_magnitude + b16 = b32.half() + bbf = b32.bfloat16() + print() + print(" cosine of random vector with the BP-grad-magnitude direction, by precision:") + print(f" fp32 cosine: {F.cosine_similarity(a.unsqueeze(0), b32.unsqueeze(0)).item():+.4f} (correct)") + print(f" fp16 cosine: {F.cosine_similarity(a.half().unsqueeze(0), b16.unsqueeze(0)).item():+.4f} (corrupt — divisor underflowed)") + print(f" bf16 cosine: {F.cosine_similarity(a.bfloat16().unsqueeze(0), bbf.unsqueeze(0)).float().item():+.4f} (correct)") + print() + + +def main(): + bug1_norm_minus_one() + bug2_cosine_eps_clamp() + bug3_fp16_underflow() + print("All 3 reproducers ran. Each demonstrates the documented bug from") + print("protocol/CHECKLIST.md. Bugs 4-6 require a trained network and are") + print("verified inside the audit_table and ablation_decision_utility scripts.") + + +if __name__ == "__main__": + main() |
