"""
Pipeline pitfalls verifier: empirically demonstrate bugs 1-3 from
`protocol/CHECKLIST.md` so the catalog is grounded in reproducible
synthetic evidence rather than in-vivo anecdote.

Bug 1: `tensor.norm(-1)` is the L_{-1} 'norm' of the entire tensor,
       NOT 'L_2 along dim=-1'. The correct call is `tensor.norm(dim=-1)`.

Bug 2: `F.cosine_similarity(a, b)` clamps the divisor by eps=1e-8 by
       default. When ||b|| ~ 1e-10 (which is the regime BP grads land in
       on DFA-trained pre-LN ResMLPs), the divisor becomes ||a|| * 1e-8
       instead of ||a|| * 1e-10, so the reported cosine comes out ~100x
       too small in magnitude.

Bug 3: fp16 mixed precision underflows BP grads at hidden layers when
       they sit at ~5e-10 (well below fp16's smallest subnormal of
       ~6e-8). bf16 works because it has the same exponent range as fp32.

This script does NOT use a GPU and runs in <1 second.

Run:
    python -m protocol.examples.verify_pitfalls
"""
import math
import torch
import torch.nn.functional as F
def banner(title):
    """Print ``title`` framed above and below by a 72-character ``=`` rule."""
    rule = "=" * 72
    for line in (rule, title, rule):
        print(line)
def bug1_norm_minus_one():
    """Show that ``x.norm(-1)`` is the p=-1 'norm' of the whole tensor.

    Prints the per-row L_2 norms from ``x.norm(dim=-1)`` next to the single
    scalar returned by ``x.norm(-1)``, then recomputes that scalar by hand as
    ``(sum_i |x_i|^{-1})^{-1}`` over the flattened tensor and reports whether
    the two agree to within 1e-6. Returns nothing; all output goes to stdout.
    """
    banner("BUG 1: tensor.norm(-1) is NOT 'L_2 along dim=-1'")
    torch.manual_seed(0)

    # Rows are scaled 3-4-5 triangles, so their L_2 norms are 5 and 10.
    x = torch.tensor([[3.0, 4.0], [6.0, 8.0]])
    per_row_l2 = x.norm(dim=-1)  # what callers usually mean
    neg_one_norm = x.norm(-1)  # what `.norm(-1)` actually computes

    # Hand-computed reference: ||x||_{-1} = (sum_i |x_i|^{-1})^{-1},
    # a harmonic-mean-like quantity over every element of the tensor.
    reciprocal_sum = x.flatten().abs().pow(-1).sum()
    hand_neg1 = reciprocal_sum.pow(-1).item()
    values_match = abs(neg_one_norm.item() - hand_neg1) < 1e-6

    print(f" x = {x.tolist()}")
    print(f" x.norm(dim=-1) (correct, L_2 along last dim): {per_row_l2.tolist()}")
    print(f" x.norm(-1) (bug, L_{{-1}} of whole tensor): {neg_one_norm.item():.6f}")
    print(f" hand-computed L_{{-1}} of flat tensor: {hand_neg1:.6f}")
    print(f" -> the two values match: {values_match}")
    print(" -> the bug version is unrelated to per-row L_2 norms.")
    print()
def bug2_cosine_eps_clamp():
    """Show how the default ``eps`` of ``F.cosine_similarity`` distorts tiny vectors.

    Builds a random float64 vector ``a`` and a unit direction scaled down to
    5e-10 as ``b``, then prints the cosine computed by hand (no clamping), the
    value returned by ``F.cosine_similarity`` with its default ``eps``, and the
    ratio between the two. Returns nothing; all output goes to stdout.
    """
    banner("BUG 2: F.cosine_similarity(a, b) clamps divisor by eps=1e-8")

    # Everything is float64 so the effect shown is the eps clamp alone and is
    # not confounded by floating-point underflow.
    torch.manual_seed(0)
    a = torch.randn(1, 100, dtype=torch.float64)
    unit_dir = torch.randn(100, dtype=torch.float64)
    unit_dir = unit_dir / unit_dir.norm()

    # b is the unit direction shrunk to the magnitude DFA-trained nets give
    # for BP grads at hidden layers: tiny, but non-zero.
    default_eps = 1e-8
    b_scale = 5e-10
    b = (unit_dir * b_scale).unsqueeze(0)

    a_norm = a.norm().item()
    b_norm = b.norm().item()

    # Reference cosine computed by hand, with no clamp on the divisor.
    dot = (a @ b.T).item()
    true_cos = dot / (a_norm * b_norm)

    # Library value with the default eps.
    pytorch_cos = F.cosine_similarity(a, b, dim=-1).item()

    # Guard the ratio against a (near-)zero reference cosine.
    if abs(true_cos) > 1e-30:
        ratio = pytorch_cos / true_cos
    else:
        ratio = float("nan")

    print(f" ||a|| = {a_norm:.4e}")
    print(f" ||b|| = {b_norm:.4e} (intentionally below eps=1e-8)")
    print(f" true cosine (no clamp): {true_cos:+.6f}")
    print(f" F.cosine_similarity (default eps): {pytorch_cos:+.6f}")
    print(f" ratio reported/true: {ratio:.6e} (should be 1.0)")
    print(f" scaling distortion: {b_scale / default_eps:.4e}x (i.e. PyTorch divides by")
    print(f" ||a||*1e-8 instead of ||a||*{b_scale:.0e}, off by ~{default_eps / b_scale:.0e}x)")
    print()
def bug3_fp16_underflow():
    """Show that a ~5e-10 gradient magnitude underflows in fp16 but not bf16/fp32.

    First prints how the scalar 5e-10 is represented in fp16, bf16 and fp32.
    Then builds a random vector and a unit direction scaled to 5e-10, casts
    both to each precision, and prints the resulting ``F.cosine_similarity``
    so the downstream effect of the underflow is visible. Returns nothing;
    all output goes to stdout.
    """
    banner("BUG 3: fp16 mixed precision underflows BP grads at ~5e-10")

    # Seed here, as the other reproducers do, so the printed cosines are
    # reproducible even when this function is run on its own.
    torch.manual_seed(0)

    # The smallest positive subnormal in fp16 is approximately 6e-8.
    # Anything below that becomes 0.
    bp_grad_magnitude = 5e-10  # typical for DFA-trained pre-LN ResMLPs

    # Represent the same magnitude in each precision.
    val_fp16 = torch.tensor(bp_grad_magnitude, dtype=torch.float16)
    val_bf16 = torch.tensor(bp_grad_magnitude, dtype=torch.bfloat16)
    val_fp32 = torch.tensor(bp_grad_magnitude, dtype=torch.float32)
    print(f" BP grad magnitude on DFA-trained ResMLP: {bp_grad_magnitude:.0e}")
    print(f" fp16 representation: {val_fp16.item():.4e} (-> 0 = UNDERFLOW)")
    print(f" bf16 representation: {val_bf16.item():.4e} (works, same exp range as fp32)")
    print(f" fp32 representation: {val_fp32.item():.4e} (works)")

    # Show what happens to a downstream cosine computation.
    a = torch.randn(100)
    direction = torch.randn(100)
    direction = direction / direction.norm()
    b32 = direction * bp_grad_magnitude
    b16 = b32.half()
    bbf = b32.bfloat16()

    cos_fp32 = F.cosine_similarity(a.unsqueeze(0), b32.unsqueeze(0)).item()
    cos_fp16 = F.cosine_similarity(a.half().unsqueeze(0), b16.unsqueeze(0)).item()
    # Upcast the bf16 result to fp32 before extracting the Python float.
    cos_bf16 = F.cosine_similarity(a.bfloat16().unsqueeze(0), bbf.unsqueeze(0)).float().item()

    print()
    print(" cosine of random vector with the BP-grad-magnitude direction, by precision:")
    print(f" fp32 cosine: {cos_fp32:+.4f} (correct)")
    print(f" fp16 cosine: {cos_fp16:+.4f} (corrupt — divisor underflowed)")
    print(f" bf16 cosine: {cos_bf16:+.4f} (correct)")
    print()
def main():
    """Run the three bug reproducers in order, then print a closing summary."""
    reproducers = (
        bug1_norm_minus_one,
        bug2_cosine_eps_clamp,
        bug3_fp16_underflow,
    )
    for reproduce in reproducers:
        reproduce()

    summary_lines = (
        "All 3 reproducers ran. Each demonstrates the documented bug from",
        "protocol/CHECKLIST.md. Bugs 4-6 require a trained network and are",
        "verified inside the audit_table and ablation_decision_utility scripts.",
    )
    for line in summary_lines:
        print(line)
# Script entry point: run every reproducer only when this module is executed
# directly (e.g. via ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()