experiments/clean_gradient_check.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126

"""
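Clean BP gradient check: report per-layer hidden-state gradient norms for one checkpoint.

Loads the checkpoint for a single method/seed, takes the first 256 CIFAR-10 test
images (no shuffling), and computes the gradient of the cross-entropy loss with
respect to each hidden state in two ways (manual forward + autograd.grad, and
retain_grad + backward), plus a full backward pass whose embedding-weight gradient
norm serves as a proxy. Results are printed and saved as JSON.
Run in an independent Python process per method. The data, checkpoint and output
paths are relative to the current working directory.
Usage: python clean_gradient_check.py --method bp --seed 42 --gpu 1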
"""
import os, sys, json, argparse, numpy as np, torch, torch.nn.functional as F
from torch.utils.data import DataLoader
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from models.residual_mlp import ResidualMLP
import torchvision, torchvision.transforms as transforms

# JSON keys emitted for every layer, in output order.
_STAT_KEYS = ('mean_norm', 'median_norm', 'max_norm', 's_1e6')


def _grad_norm_stats(grad):
    """Summarise the per-sample norms of one hidden-state gradient.

    Args:
        grad: Gradient tensor, or ``None`` when autograd left no gradient on
            the hidden state. Norms are taken over the last dimension.

    Returns:
        A dict keyed by ``_STAT_KEYS``. Values are Python floats, or all
        ``None`` when ``grad`` is ``None``. ``s_1e6`` is the fraction of
        norms strictly greater than 1e-6.
    """
    if grad is None:
        return dict.fromkeys(_STAT_KEYS)
    norms = grad.norm(dim=-1)
    return {
        'mean_norm': norms.mean().item(),
        'median_norm': norms.median().item(),
        'max_norm': norms.max().item(),
        's_1e6': (norms > 1e-6).float().mean().item(),
    }


def _print_layer_stats(layer, stats):
    """Print one layer's gradient-norm summary (or a missing-gradient notice)."""
    if stats['mean_norm'] is None:
        print(f"    layer {layer}: grad is None!", flush=True)
        return
    print(f"    layer {layer}: mean_norm={stats['mean_norm']:.2e} median={stats['median_norm']:.2e} "
          f"max={stats['max_norm']:.2e} s(1e-6)={stats['s_1e6']:.4f}", flush=True)


def main():
    """Run the gradient check for one (method, seed) checkpoint and save JSON.

    Command-line arguments:
        --method      Checkpoint method name (required); selects the file
                      ``results/confirmatory/checkpoints_A2/{method}_s{seed}.pt``.
        --seed        Checkpoint seed (default 42).
        --gpu         CUDA device index (default 1).
        --output_dir  Directory for the result JSON.

    Raises:
        FileNotFoundError: If the checkpoint file does not exist.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--method', type=str, required=True)
    p.add_argument('--seed', type=int, default=42)
    p.add_argument('--gpu', type=int, default=1)
    p.add_argument('--output_dir', type=str, default='results/confirmatory/clean_grads')
    args = p.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)
    device = torch.device(f'cuda:{args.gpu}')

    # 1. Load eval data (256 samples, first batch, no shuffle)
    tv = transforms.Compose([transforms.ToTensor(),
        transforms.Normalize((0.4914,0.4822,0.4465),(0.2470,0.2435,0.2616))])
    tel = DataLoader(torchvision.datasets.CIFAR10('./data', False, download=True, transform=tv),
                     256, False, num_workers=0)  # num_workers=0 for determinism
    # Only the first batch is needed; images are flattened to one vector each.
    x, y = next(iter(tel))
    x = x.view(x.size(0), -1).to(device)
    y = y.to(device)
    batch = x.size(0)
    print(f"[{args.method} s={args.seed}] Batch: {batch}, y[:5]={y[:5].tolist()}", flush=True)

    # 2. Create model from scratch, load checkpoint (strict=True)
    L, d, C = 4, 256, 10
    ckpt_path = f'results/confirmatory/checkpoints_A2/{args.method}_s{args.seed}.pt'
    # Explicit raise instead of `assert`: asserts are stripped under `python -O`.
    if not os.path.exists(ckpt_path):
        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")

    model = ResidualMLP(3072, d, C, L).to(device)
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints
    # (consider weights_only=True where the installed torch supports it).
    sd = torch.load(ckpt_path, map_location=device)
    model.load_state_dict(sd, strict=True)
    model.eval()

    # Verify: print first param norm and checkpoint path
    first_param = list(model.parameters())[0]
    first_param_norm = first_param.norm().item()
    print(f"  First param norm: {first_param_norm:.6f}", flush=True)
    print(f"  Checkpoint: {ckpt_path}", flush=True)

    # 3. Method A: manual forward + autograd.grad
    h0 = model.embed(x.detach())
    hs = [h0.clone().requires_grad_(True)]
    for b in model.blocks:
        hs.append(hs[-1] + b(hs[-1]))  # residual update: h_{l+1} = h_l + block(h_l)
    lo_a = model.out_head(model.out_ln(hs[-1]))
    loss_a = F.cross_entropy(lo_a, y)
    acc_a = (lo_a.argmax(1) == y).float().mean().item()
    gs_a = torch.autograd.grad(loss_a, hs)

    print(f"  Method A (manual+autograd.grad): loss={loss_a.item():.6f} acc={acc_a:.4f}", flush=True)
    # Statistics are computed once and reused for both the log and the JSON.
    stats_a = [_grad_norm_stats(gs_a[layer]) for layer in range(L)]
    for layer, stats in enumerate(stats_a):
        _print_layer_stats(layer, stats)

    # 4. Method B: retain_grad + backward
    model.zero_grad()
    for param in model.parameters():
        param.requires_grad_(True)
    lo_b, hi_b = model(x, return_hidden=True)
    for layer in range(L + 1):
        hi_b[layer].retain_grad()  # keep .grad on these intermediate tensors
    loss_b = F.cross_entropy(lo_b, y)
    acc_b = (lo_b.argmax(1) == y).float().mean().item()
    loss_b.backward()

    print(f"  Method B (retain_grad+backward): loss={loss_b.item():.6f} acc={acc_b:.4f}", flush=True)
    stats_b = [_grad_norm_stats(hi_b[layer].grad) for layer in range(L)]
    for layer, stats in enumerate(stats_b):
        _print_layer_stats(layer, stats)

    # 5. Method C: full model backward (no detach)
    model.zero_grad()
    lo_c = model(x)
    loss_c = F.cross_entropy(lo_c, y)
    loss_c.backward()
    # Get embedding gradient as proxy
    embed_grad_norm = model.embed.weight.grad.norm().item() if model.embed.weight.grad is not None else 0
    print(f"  Method C (full backward): loss={loss_c.item():.6f} embed_grad_norm={embed_grad_norm:.2e}", flush=True)

    # 6. Save results
    result = {
        'method': args.method, 'seed': args.seed, 'batch_size': batch,
        'y_first5': y[:5].tolist(),
        'first_param_norm': first_param_norm,
        'method_A': {'loss': loss_a.item(), 'acc': acc_a, 'per_layer': stats_a},
        'method_B': {'loss': loss_b.item(), 'acc': acc_b, 'per_layer': stats_b},
        'method_C_embed_grad_norm': embed_grad_norm,
    }

    out = os.path.join(args.output_dir, f'{args.method}_s{args.seed}.json')
    with open(out, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)
    print(f"  Saved to {out}", flush=True)

# Script entry point: run the check only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()