#!/usr/bin/env python3
"""§2.3 diagnostic v2 — faithful reproduction of original methodology.

Uses src.trainers.BPTrainer (the actual training stack used in the paper),
matching results/gradient_reach_20seeds/per_seed_data.json which shows
GCN L=10 weight grad norms = 0.0 for all 20 seeds × 10 layers.

Adds beyond the original:
  - pre-activation grad G_Z[l] = ||dL/dZ_l||_F and RMS-normed variant
  - forward magnitudes M[l] = ||H_l||_F and RMS-normed
  - centered dispersion D[l] = ||H_l - mean||_F / D_0
  - frozen linear probe probe_acc[l] on H_l

Backbone: GCN. Cora. 100 epochs (matches original). 20 seeds.
Depths {6, 10, 20}.

Output: results/diag_section23/diag_data_v2.json
"""
import json
import os
import sys

import numpy as np
import torch
import torch.nn.functional as F
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# The project root must be on sys.path before the `src` imports below.
sys.path.insert(0, '/home/yurenh2/graph-grape')
from src.data import load_dataset, spmm  # noqa: E402
from src.trainers import BPTrainer  # noqa: E402

DEVICE = 'cuda:0'   # CUDA_VISIBLE_DEVICES=2 → cuda:0
HIDDEN = 64
LR = 0.01
WD = 5e-4
EPOCHS = 100
SEEDS = list(range(20))
OUT_DIR = '/home/yurenh2/graph-grape/results/diag_section23'
os.makedirs(OUT_DIR, exist_ok=True)


def forward_with_intermediates(bp, capture_for_grad=False):
    """Run a forward pass through ``bp`` while recording every layer's state.

    Written to mirror ``BPTrainer.forward()`` (not visible in this file --
    keep the two in sync if the trainer changes), but additionally captures
    the pre-activation ``Z`` and post-activation ``H`` of every layer.

    Layout of the returned lists, with ``L = bp.num_layers``:
      * ``H_list[0]`` is the input features ``bp.data['X']``.
      * ``Z_list[l]`` (l = 0..L-1) is the output of ``bp._graph_conv`` at
        layer ``l``.
      * ``H_list[l + 1]`` is ``relu(Z_list[l])`` for hidden layers and
        ``Z_list[L - 1]`` itself for the last layer (logits, no relu).

    Args:
        bp: Trainer-like object exposing ``data['X']``, ``num_layers``,
            ``residual_alpha``, ``weights`` and ``_graph_conv``.
        capture_for_grad: When true, ``retain_grad()`` is called on every
            ``Z`` so that ``Z.grad`` is populated by a later ``backward()``.

    Returns:
        Tuple ``(logits, Z_list, H_list)`` where ``logits`` is
        ``H_list[-1]``.
    """
    X = bp.data['X']
    H_list = [X]
    Z_list = []
    H = X
    H0 = None  # first hidden representation, used by the residual mix
    last = bp.num_layers - 1

    for layer in range(bp.num_layers):
        # Initial-residual mixing applies to interior layers only.
        if 0 < layer < last and bp.residual_alpha > 0 and H0 is not None:
            H = (1 - bp.residual_alpha) * H + bp.residual_alpha * H0

        Z = bp._graph_conv(H, bp.weights[layer], layer)
        if capture_for_grad:
            # Z is a non-leaf tensor; without this its .grad is dropped.
            Z.retain_grad()
        Z_list.append(Z)

        if layer < last:
            H = F.relu(Z)
            if layer == 0:
                H0 = H
        else:
            H = Z  # final logits, no relu
        H_list.append(H)

    return H_list[-1], Z_list, H_list  # logits, Z's, H's


def _weight_grad_norms(bp, L):
    """Return (Frobenius, RMS-normalised) gradient norms of the L weights."""
    fro, rms = [], []
    for layer in range(L):
        weight = bp.weights[layer]
        g = float(weight.grad.detach().norm().item())
        fro.append(g)
        rms.append(g / np.sqrt(weight.numel()))
    return fro, rms


def _preact_grad_norms(Zs):
    """Return (Frobenius, RMS-normalised) norms of dL/dZ_l for every Z_l.

    A ``Z`` whose ``.grad`` is ``None`` is recorded as 0.0 in both lists.
    """
    fro, rms = [], []
    for z in Zs:
        if z.grad is None:
            fro.append(0.0)
            rms.append(0.0)
            continue
        n_rows, n_cols = z.shape
        g = float(z.grad.detach().norm().item())
        fro.append(g)
        rms.append(g / np.sqrt(n_rows * n_cols))
    return fro, rms


def _forward_stats(Hs):
    """Return (M_F, M_rms, D_raw, D_norm) for the forward states ``Hs``.

    ``M_F`` is the norm of each H_l, ``M_rms`` the same divided by
    sqrt(rows * cols), ``D_raw`` the norm of H_l after subtracting its
    mean over dim 0, and ``D_norm`` is ``D_raw`` divided by ``D_raw[0]``.
    """
    M_F, M_rms, D_raw = [], [], []
    for H in Hs:
        h = H.detach()
        n_rows, n_cols = h.shape
        mf = float(h.norm().item())
        M_F.append(mf)
        M_rms.append(mf / np.sqrt(n_rows * n_cols))
        mu = h.mean(0, keepdim=True)
        D_raw.append(float((h - mu).norm().item()))

    # Fall back to 1.0 when the input dispersion is not strictly positive
    # (zero or NaN) so the division below stays defined.
    D0 = D_raw[0] if D_raw[0] > 0 else 1.0
    D_norm = [d / D0 for d in D_raw]
    return M_F, M_rms, D_raw, D_norm


def _probe_accuracies(Hs, data):
    """Fit a standardised logistic-regression probe on each H_l.

    The probe is trained on the ``train_mask`` rows and scored on the
    ``test_mask`` rows. A probe that raises is recorded as NaN (best
    effort, as before), but the reason is now reported on stderr instead
    of being silently discarded.
    """
    train_mask = data['train_mask']
    test_mask = data['test_mask']
    ytr = data['y'][train_mask].cpu().numpy()
    yte = data['y'][test_mask].cpu().numpy()

    accs = []
    for idx, H in enumerate(Hs):
        h = H.detach()
        Xtr = h[train_mask].cpu().numpy()
        Xte = h[test_mask].cpu().numpy()
        try:
            scaler = StandardScaler().fit(Xtr)
            Xtr_s = scaler.transform(Xtr)
            Xte_s = scaler.transform(Xte)
            clf = LogisticRegression(max_iter=2000, C=1.0).fit(Xtr_s, ytr)
            acc = float(clf.score(Xte_s, yte))
        except Exception as exc:  # best effort: keep the sweep running
            print(f' [probe] H[{idx}] failed '
                  f'({type(exc).__name__}: {exc}); recording NaN',
                  file=sys.stderr, flush=True)
            acc = float('nan')
        accs.append(acc)
    return accs


def diagnose(seed, L, data):
    """Train one L-layer GCN with ``BPTrainer`` and collect diagnostics.

    Seeds torch / numpy / CUDA, trains for ``EPOCHS`` steps, then runs one
    extra forward/backward pass on the training cross-entropy and records
    gradient norms, forward-state statistics and linear-probe accuracies.

    Args:
        seed: Integer RNG seed.
        L: Number of layers.
        data: Dataset dict; this function reads ``'y'``, ``'train_mask'``
            and ``'test_mask'`` (``BPTrainer`` may read more).

    Returns:
        Dict with keys ``L``, ``seed``, ``bp_acc``, ``W_grads_F``,
        ``W_grads_rms``, ``Z_grads_F``, ``Z_grads_rms``, ``M_F``,
        ``M_rms``, ``D_raw``, ``D_norm`` and ``probe_acc``. The ``W_*`` and
        ``Z_*`` lists have ``L`` entries; the remaining lists have
        ``L + 1`` entries (index 0 is the input features).
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed_all(seed)

    bp = BPTrainer(data=data, hidden_dim=HIDDEN, lr=LR, weight_decay=WD,
                   num_layers=L, residual_alpha=0.0, backbone='gcn')
    for _ in range(EPOCHS):
        bp.train_step()

    # Diagnostic forward at epoch 100
    bp.optimizer.zero_grad()
    logits, Zs, Hs = forward_with_intermediates(bp, capture_for_grad=True)
    mask = data['train_mask']
    loss = F.cross_entropy(logits[mask], data['y'][mask])
    loss.backward(retain_graph=False)

    # Weight gradients (original methodology)
    W_grads_F, W_grads_rms = _weight_grad_norms(bp, L)
    # Pre-activation gradients on Z_l (l=0..L-1)
    Z_grads_F, Z_grads_rms = _preact_grad_norms(Zs)
    # Forward state metrics on H_l (l=0..L)
    M_F, M_rms, D_raw, D_norm = _forward_stats(Hs)
    # Frozen linear probe on each H_l
    probe_acc = _probe_accuracies(Hs, data)

    # Coerce to a plain float so the value is JSON-serialisable whatever
    # scalar type evaluate() returns (its return type is not visible here).
    bp_acc = float(bp.evaluate('test_mask'))

    # Drop everything that still references the autograd graph before
    # asking the allocator to release cached blocks.
    del bp, logits, Zs, Hs, loss
    torch.cuda.empty_cache()

    return dict(L=L, seed=seed, bp_acc=bp_acc,
                W_grads_F=W_grads_F, W_grads_rms=W_grads_rms,
                Z_grads_F=Z_grads_F, Z_grads_rms=Z_grads_rms,
                M_F=M_F, M_rms=M_rms, D_raw=D_raw, D_norm=D_norm,
                probe_acc=probe_acc)


def main():
    """Run the diagnostic for every depth and seed, save JSON, print summary."""
    data = load_dataset('Cora', device=DEVICE)
    print(f"Cora: N={data['X'].shape[0]}, F={data['X'].shape[1]}, "
          f"C={data['num_classes']}", flush=True)

    all_results = {}
    for L in [20, 10, 6]:
        print(f'\n=== L={L} ===', flush=True)
        rows = []
        for s in SEEDS:
            r = diagnose(s, L, data)
            rows.append(r)
            wg = r['W_grads_F']
            print(f" L={L} s={s:2d} acc={r['bp_acc']:.4f} "
                  f"W_grads[0,mid,-1]=[{wg[0]:.2e}, {wg[len(wg)//2]:.2e}, {wg[-1]:.2e}] "
                  f"Z_grad[out]={r['Z_grads_F'][-1]:.2e}", flush=True)
        all_results[f'L={L}'] = rows

    out_path = os.path.join(OUT_DIR, 'diag_data_v2.json')
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2)
    print(f'\nSaved {out_path}')

    print('\n=== summary ===')
    for k, rows in all_results.items():
        Wg = np.array([r['W_grads_F'] for r in rows])
        # Count weight-gradient norms below 1e-38, i.e. effectively zero.
        n_under = int((Wg < 1e-38).sum())
        n_total = Wg.size
        accs = np.array([r['bp_acc'] for r in rows])
        print(f' {k}: BP acc {accs.mean():.4f}±{accs.std():.4f} '
              f'W_grads_F median={np.median(Wg):.3e} '
              f'<1e-38: {n_under}/{n_total} cells')


if __name__ == '__main__':
    main()