summaryrefslogtreecommitdiff
path: root/protocol
diff options
context:
space:
mode:
authorYurenHao0426 <Blackhao0426@gmail.com>2026-04-07 22:52:41 -0500
committerYurenHao0426 <Blackhao0426@gmail.com>2026-04-07 22:52:41 -0500
commitede7cca3e4f9048e3fc6d99077f8842e9b598ff4 (patch)
tree720562461cfd2672a6f9c1b74bad6fc088bd6872 /protocol
parent4420af372024ef12b28eac21678504dd75484dca (diff)
Add reproducers for pitfalls 1-3 in CHECKLIST.md
Each bug from the catalog has a synthetic reproducer that runs in <1 sec without GPU: Bug 1: x.norm(-1) on a 2x2 tensor returns 1.143 (L_{-1} of whole tensor) instead of [5, 10] (per-row L_2 along dim=-1). Bug 2: F.cosine_similarity(a, b) with ||b||=5e-10 returns +0.000905 instead of the true +0.018101. The clamp (eps=1e-8) inflates the divisor 20x, so the reported cosine is 20x too small. Bug 3: 5e-10 in fp16 -> 0 (underflows below the smallest subnormal, ~6e-8). Downstream F.cosine_similarity returns NaN. bf16 works because it shares fp32's exponent range. Bugs 4-6 (Bs reproducibility, aggregation, layer-0 dominance) require a trained network and are demonstrated inside audit_table and ablation_decision_utility.
Diffstat (limited to 'protocol')
-rw-r--r--protocol/examples/verify_pitfalls.py126
1 files changed, 126 insertions, 0 deletions
diff --git a/protocol/examples/verify_pitfalls.py b/protocol/examples/verify_pitfalls.py
new file mode 100644
index 0000000..d329331
--- /dev/null
+++ b/protocol/examples/verify_pitfalls.py
@@ -0,0 +1,126 @@
+"""
+Pipeline pitfalls verifier: empirically demonstrate bugs 1-3 from
+`protocol/CHECKLIST.md` so the catalog is grounded in reproducible
+synthetic evidence rather than in-vivo anecdote.
+
+Bug 1: `tensor.norm(-1)` is the L_{-1} 'norm' of the entire tensor,
+ NOT 'L_2 along dim=-1'. The correct call is `tensor.norm(dim=-1)`.
+
+Bug 2: `F.cosine_similarity(a, b)` clamps the divisor with eps=1e-8 by
+ default. When ||b|| ~ 5e-10 (which is the regime BP grads land in
+ on DFA-trained pre-LN ResMLPs), the divisor becomes ||a|| * 1e-8
+ instead of ||a|| * 5e-10, so the reported cosine comes out ~20x
+ too small.
+
+Bug 3: fp16 mixed precision underflows BP grads at hidden layers when
+ they sit at ~5e-10 (well below fp16's smallest subnormal of
+ ~6e-8). bf16 works because it has the same exponent range as fp32.
+
+This script does NOT use GPU and runs in <1 second.
+
+Run:
+ python -m protocol.examples.verify_pitfalls
+"""
+import math
+
+import torch
+import torch.nn.functional as F
+
+
+def banner(title):
+ print("=" * 72)
+ print(title)
+ print("=" * 72)
+
+
+def bug1_norm_minus_one():
+ banner("BUG 1: tensor.norm(-1) is NOT 'L_2 along dim=-1'")
+ torch.manual_seed(0)
+ x = torch.tensor([[3.0, 4.0], [6.0, 8.0]]) # rows have L2 norms 5 and 10
+ correct = x.norm(dim=-1) # this is what callers usually mean
+ bug = x.norm(-1) # this is what `.norm(-1)` actually computes
+
+ # Hand-compute the L_{-1} 'norm' of the whole tensor for clarity:
+ # ||x||_{-1} = (sum_i |x_i|^{-1})^{-1} = harmonic-mean-like quantity
+ flat = x.flatten()
+ hand_neg1 = (flat.abs().pow(-1).sum()).pow(-1).item()
+
+ print(f" x = {x.tolist()}")
+ print(f" x.norm(dim=-1) (correct, L_2 along last dim): {correct.tolist()}")
+ print(f" x.norm(-1) (bug, L_{{-1}} of whole tensor): {bug.item():.6f}")
+ print(f" hand-computed L_{{-1}} of flat tensor: {hand_neg1:.6f}")
+ print(f" -> the two values match: {abs(bug.item() - hand_neg1) < 1e-6}")
+ print(f" -> the bug version is unrelated to per-row L_2 norms.")
+ print()
+
+
+def bug2_cosine_eps_clamp():
+ banner("BUG 2: F.cosine_similarity(a, b) clamps divisor by eps=1e-8")
+ # Construct a case where one vector has a tiny but non-zero magnitude.
+ # We use float64 throughout to avoid confounding with fp underflow.
+ torch.manual_seed(0)
+ a = torch.randn(1, 100, dtype=torch.float64)
+ direction = torch.randn(100, dtype=torch.float64)
+ direction = direction / direction.norm()
+ # b is just direction scaled to a tiny magnitude
+ b_scale = 5e-10 # the magnitude DFA-trained nets give for BP grads at hidden layers
+ b = (direction * b_scale).unsqueeze(0)
+
+ # True cosine, no clamp
+ true_cos = (a @ b.T).item() / (a.norm().item() * b.norm().item())
+ # PyTorch's F.cosine_similarity with default eps=1e-8
+ pytorch_cos = F.cosine_similarity(a, b, dim=-1).item()
+
+ ratio = pytorch_cos / true_cos if abs(true_cos) > 1e-30 else float('nan')
+ print(f" ||a|| = {a.norm().item():.4e}")
+ print(f" ||b|| = {b.norm().item():.4e} (intentionally below eps=1e-8)")
+ print(f" true cosine (no clamp): {true_cos:+.6f}")
+ print(f" F.cosine_similarity (default eps): {pytorch_cos:+.6f}")
+ print(f" ratio reported/true: {ratio:.6e} (should be 1.0)")
+ print(f" scaling distortion: {b_scale / 1e-8:.4e}x (i.e. PyTorch divides by")
+ print(f" ||a||*1e-8 instead of ||a||*{b_scale:.0e}, off by ~{1e-8/b_scale:.0e}x)")
+ print()
+
+
+def bug3_fp16_underflow():
+ banner("BUG 3: fp16 mixed precision underflows BP grads at ~5e-10")
+ # The smallest positive subnormal in fp16 is approximately 6e-8.
+ # Anything below that becomes 0.
+ fp16_min = torch.tensor(6e-8, dtype=torch.float16)
+ bp_grad_magnitude = 5e-10 # typical for DFA-trained pre-LN ResMLPs
+
+ # Try to represent the magnitude in fp16
+ val_fp16 = torch.tensor(bp_grad_magnitude, dtype=torch.float16)
+ val_bf16 = torch.tensor(bp_grad_magnitude, dtype=torch.bfloat16)
+ val_fp32 = torch.tensor(bp_grad_magnitude, dtype=torch.float32)
+
+ print(f" BP grad magnitude on DFA-trained ResMLP: {bp_grad_magnitude:.0e}")
+ print(f" fp16 representation: {val_fp16.item():.4e} (-> 0 = UNDERFLOW)")
+ print(f" bf16 representation: {val_bf16.item():.4e} (works, same exp range as fp32)")
+ print(f" fp32 representation: {val_fp32.item():.4e} (works)")
+
+ # Show what happens to a downstream cosine computation
+ a = torch.randn(100)
+ direction = torch.randn(100); direction = direction / direction.norm()
+ b32 = direction * bp_grad_magnitude
+ b16 = b32.half()
+ bbf = b32.bfloat16()
+ print()
+ print(" cosine of random vector with the BP-grad-magnitude direction, by precision:")
+ print(f" fp32 cosine: {F.cosine_similarity(a.unsqueeze(0), b32.unsqueeze(0)).item():+.4f} (correct)")
+ print(f" fp16 cosine: {F.cosine_similarity(a.half().unsqueeze(0), b16.unsqueeze(0)).item():+.4f} (corrupt — divisor underflowed)")
+ print(f" bf16 cosine: {F.cosine_similarity(a.bfloat16().unsqueeze(0), bbf.unsqueeze(0)).float().item():+.4f} (correct)")
+ print()
+
+
+def main():
+ bug1_norm_minus_one()
+ bug2_cosine_eps_clamp()
+ bug3_fp16_underflow()
+ print("All 3 reproducers ran. Each demonstrates the documented bug from")
+ print("protocol/CHECKLIST.md. Bugs 4-6 require a trained network and are")
+ print("verified inside the audit_table and ablation_decision_utility scripts.")
+
+
+if __name__ == "__main__":
+ main()