"""
Pipeline pitfalls verifier: empirically demonstrate bugs 1-3 from
`protocol/CHECKLIST.md` so the catalog is grounded in reproducible
synthetic evidence rather than in-vivo anecdote.

Bug 1: `tensor.norm(-1)` is the L_{-1} 'norm' of the entire tensor,
       NOT 'L_2 along dim=-1'. The correct call is `tensor.norm(dim=-1)`.

Bug 2: `F.cosine_similarity(a, b)` clamps the divisor by eps=1e-8 by
       default. When ||b|| ~ 1e-10 (which is the regime BP grads land in
       on DFA-trained pre-LN ResMLPs), the divisor becomes ||a|| * 1e-8
       instead of ||a|| * 1e-10, shrinking the reported cosine toward
       zero by ~100x (~20x for the ||b|| = 5e-10 used in this script).

Bug 3: fp16 mixed precision underflows BP grads at hidden layers when
       they sit at ~5e-10 (well below fp16's smallest subnormal of
       ~6e-8). bf16 works because it has the same exponent range as fp32.

This script does NOT use GPU and runs in <1 second.

Run:
    python -m protocol.examples.verify_pitfalls
"""
import math

import torch
import torch.nn.functional as F


def banner(title):
    """Print `title` framed above and below by a 72-character line of '='."""
    rule = "=" * 72
    for line in (rule, title, rule):
        print(line)


def bug1_norm_minus_one():
    """Show that `x.norm(-1)` is a whole-tensor p=-1 'norm', not a row-wise L_2.

    For a fixed 2x2 tensor this prints the result of `x.norm(dim=-1)`,
    the result of `x.norm(-1)`, a hand-computed (sum_i |x_i|^{-1})^{-1}
    over the flattened tensor, and whether the latter two agree to
    within 1e-6.
    """
    banner("BUG 1: tensor.norm(-1) is NOT 'L_2 along dim=-1'")
    torch.manual_seed(0)

    # Rows are (3, 4) and (6, 8), so their L_2 norms are 5 and 10.
    x = torch.tensor([[3.0, 4.0], [6.0, 8.0]])

    per_row_l2 = x.norm(dim=-1)  # keyword `dim`: what callers usually intend
    whole_neg1 = x.norm(-1)      # the bare -1 is taken as the order p, not a dim

    # Reference value built straight from the definition
    # ||x||_{-1} = (sum_i |x_i|^{-1})^{-1}, over every entry of the tensor.
    inverse_magnitudes = x.flatten().abs().pow(-1)
    reference = inverse_magnitudes.sum().pow(-1).item()

    reported = whole_neg1.item()
    values_agree = abs(reported - reference) < 1e-6

    print(f"  x = {x.tolist()}")
    print(f"  x.norm(dim=-1) (correct, L_2 along last dim): {per_row_l2.tolist()}")
    print(f"  x.norm(-1)     (bug, L_{{-1}} of whole tensor): {reported:.6f}")
    print(f"  hand-computed L_{{-1}} of flat tensor:           {reference:.6f}")
    print(f"  -> the two values match: {values_agree}")
    print("  -> the bug version is unrelated to per-row L_2 norms.")
    print()


def bug2_cosine_eps_clamp():
    """Compare a hand-computed cosine with `F.cosine_similarity` for a tiny `b`.

    Builds a random float64 vector `a` and a unit direction scaled to
    5e-10 as `b`, then prints both norms, the unclamped cosine
    (a . b) / (||a|| * ||b||), the value returned by
    `F.cosine_similarity` with its default eps, and their ratio.
    """
    banner("BUG 2: F.cosine_similarity(a, b) clamps divisor by eps=1e-8")

    # Everything is float64 so that floating-point underflow cannot be
    # mistaken for the eps clamp this reproducer is about.
    torch.manual_seed(0)
    a = torch.randn(1, 100, dtype=torch.float64)
    unit_dir = torch.randn(100, dtype=torch.float64)
    unit_dir = unit_dir / unit_dir.norm()

    # Magnitude DFA-trained nets give for BP grads at hidden layers;
    # tiny but non-zero, and below the default eps.
    b_scale = 5e-10
    b = (unit_dir * b_scale).unsqueeze(0)

    norm_a = a.norm().item()
    norm_b = b.norm().item()

    # Reference cosine with no clamp on the divisor.
    dot = (a @ b.T).item()
    true_cos = dot / (norm_a * norm_b)

    # What PyTorch reports with its default eps=1e-8.
    pytorch_cos = F.cosine_similarity(a, b, dim=-1).item()

    # Guard the division in case the random draw is (numerically) orthogonal.
    if abs(true_cos) > 1e-30:
        ratio = pytorch_cos / true_cos
    else:
        ratio = float('nan')

    print(f"  ||a|| = {norm_a:.4e}")
    print(f"  ||b|| = {norm_b:.4e}  (intentionally below eps=1e-8)")
    print(f"  true cosine     (no clamp):       {true_cos:+.6f}")
    print(f"  F.cosine_similarity (default eps): {pytorch_cos:+.6f}")
    print(f"  ratio reported/true: {ratio:.6e}  (should be 1.0)")
    print(f"  scaling distortion: {b_scale / 1e-8:.4e}x  (i.e. PyTorch divides by")
    print(f"    ||a||*1e-8 instead of ||a||*{b_scale:.0e}, off by ~{1e-8/b_scale:.0e}x)")
    print()


def bug3_fp16_underflow():
    """Show that a ~5e-10 gradient magnitude underflows in fp16 but not bf16/fp32.

    Prints (1) the scalar 5e-10 as stored in fp16, bf16 and fp32, and
    (2) the cosine similarity between a random vector and a unit
    direction scaled to 5e-10, evaluated at each of the three precisions.

    The cosine calls pass an explicit ``eps`` well below 5e-10. With the
    default eps=1e-8 the divisor clamp (Bug 2) would also shrink the fp32
    and bf16 results, so they could not honestly be labelled "correct"
    and the precision effect would not be isolated.
    """
    banner("BUG 3: fp16 mixed precision underflows BP grads at ~5e-10")
    # Seed here, like the other reproducers, so the printed cosines do not
    # depend on whatever consumed the global RNG before this call.
    torch.manual_seed(0)

    # The smallest positive subnormal in fp16 is approximately 6e-8.
    # Values far enough below that round to 0.
    bp_grad_magnitude = 5e-10  # typical for DFA-trained pre-LN ResMLPs
    # Chosen below bp_grad_magnitude so the eps clamp stays out of the way
    # for the fp32 and bf16 operands.
    cos_eps = 1e-12

    # Try to represent the magnitude in each precision.
    val_fp16 = torch.tensor(bp_grad_magnitude, dtype=torch.float16)
    val_bf16 = torch.tensor(bp_grad_magnitude, dtype=torch.bfloat16)
    val_fp32 = torch.tensor(bp_grad_magnitude, dtype=torch.float32)

    print(f"  BP grad magnitude on DFA-trained ResMLP: {bp_grad_magnitude:.0e}")
    print(f"  fp16 representation:  {val_fp16.item():.4e}  (-> 0 = UNDERFLOW)")
    print(f"  bf16 representation:  {val_bf16.item():.4e}  (works, same exp range as fp32)")
    print(f"  fp32 representation:  {val_fp32.item():.4e}  (works)")

    # Show what happens to a downstream cosine computation.
    a = torch.randn(100)
    direction = torch.randn(100)
    direction = direction / direction.norm()
    b32 = direction * bp_grad_magnitude

    print()
    print("  cosine of random vector with the BP-grad-magnitude direction, by precision")
    print(f"  (eps={cos_eps:.0e} so the Bug 2 divisor clamp does not interfere):")
    # In fp16 every component of b (each at most 5e-10 in magnitude) rounds
    # to 0, so that row carries no directional information; whether it
    # prints 0 or NaN depends on how the zero norm is handled.
    for label, dtype, verdict in (
        ("fp32", torch.float32, "correct"),
        ("fp16", torch.float16, "corrupt — divisor underflowed"),
        ("bf16", torch.bfloat16, "correct"),
    ):
        cos = F.cosine_similarity(
            a.to(dtype).unsqueeze(0),
            b32.to(dtype).unsqueeze(0),
            eps=cos_eps,
        )
        print(f"    {label} cosine: {cos.float().item():+.4f}  ({verdict})")
    print()


def main():
    """Run the three reproducers in order, then print a closing summary."""
    reproducers = (
        bug1_norm_minus_one,
        bug2_cosine_eps_clamp,
        bug3_fp16_underflow,
    )
    for reproducer in reproducers:
        reproducer()

    closing_lines = (
        "All 3 reproducers ran. Each demonstrates the documented bug from",
        "protocol/CHECKLIST.md. Bugs 4-6 require a trained network and are",
        "verified inside the audit_table and ablation_decision_utility scripts.",
    )
    for line in closing_lines:
        print(line)


# Script entry point: run the reproducers only when this module is executed
# directly (e.g. `python -m protocol.examples.verify_pitfalls`), not on import.
if __name__ == "__main__":
    main()