Diffstat (limited to 'experiments/run_diag_section23_v2.py')
-rw-r--r--  experiments/run_diag_section23_v2.py  172
1 file changed, 172 insertions, 0 deletions
diff --git a/experiments/run_diag_section23_v2.py b/experiments/run_diag_section23_v2.py
new file mode 100644
index 0000000..f583031
--- /dev/null
+++ b/experiments/run_diag_section23_v2.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+"""§2.3 diagnostic v2 — faithful reproduction of original methodology.
+
+Uses src.trainers.BPTrainer (the actual training stack used in the paper),
+matching results/gradient_reach_20seeds/per_seed_data.json, which shows
+GCN L=10 weight grad norms = 0.0 for all 20 seeds × 10 layers.
+
+Adds beyond the original:
+ - pre-activation grad G_Z[l] = ||dL/dZ_l||_F and RMS-normed variant
+ - forward magnitudes M[l] = ||H_l||_F and RMS-normed
+ - centered dispersion D[l] = ||H_l - mean||_F / D_0
+ - frozen linear probe probe_acc[l] on H_l
+
+Backbone: GCN. Cora. 100 epochs (matches original). 20 seeds. Depths {6, 10, 20}.
+Output: results/diag_section23/diag_data_v2.json
+"""
+import json, os, sys
+import numpy as np
+import torch
+import torch.nn.functional as F
+from sklearn.linear_model import LogisticRegression
+from sklearn.preprocessing import StandardScaler
+
+sys.path.insert(0, '/home/yurenh2/graph-grape')
+from src.data import load_dataset
+from src.trainers import BPTrainer
+
+DEVICE = 'cuda:0' # CUDA_VISIBLE_DEVICES=2 → cuda:0
+HIDDEN = 64
+LR = 0.01
+WD = 5e-4
+EPOCHS = 100
+SEEDS = list(range(20))
+OUT_DIR = '/home/yurenh2/graph-grape/results/diag_section23'
+os.makedirs(OUT_DIR, exist_ok=True)
+
+
+def forward_with_intermediates(bp, capture_for_grad=False):
+ """Re-implement BPTrainer.forward() but capture per-layer Z (pre-act) and H (post-act).
+    H[0] = X (input features). For l = 1..L: H[l] = relu(Z[l-1]) (or Z[l-1] for the last layer).
+    Z[0..L-1] are the pre-activation outputs of each conv.
+ """
+ X = bp.data['X']
+ H_list = [X]
+ Z_list = []
+ H = X
+ H0 = None
+ for l in range(bp.num_layers):
+        if 0 < l < bp.num_layers - 1 and bp.residual_alpha > 0 and H0 is not None:
+ H = (1 - bp.residual_alpha) * H + bp.residual_alpha * H0
+ Z = bp._graph_conv(H, bp.weights[l], l)
+ if capture_for_grad:
+ Z.retain_grad()
+ Z_list.append(Z)
+ if l < bp.num_layers - 1:
+ H = F.relu(Z)
+ if l == 0:
+ H0 = H
+ else:
+ H = Z # final logits, no relu
+ H_list.append(H)
+ return H_list[-1], Z_list, H_list # logits, Z's, H's
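+
+
+def check_capture(bp, Zs, Hs):
+    """Optional sanity check on the capture (illustrative addition, not part of
+    the original methodology); call after forward_with_intermediates() if
+    desired. Assumes bp.data carries the 'num_classes' key that main() reads.
+    """
+    assert len(Zs) == bp.num_layers          # one pre-activation per conv
+    assert len(Hs) == bp.num_layers + 1      # input X plus one state per layer
+    assert Hs[-1] is Zs[-1]                  # last layer applies no relu
+    assert Hs[-1].shape[1] == bp.data['num_classes']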
+
+
+def diagnose(seed, L, data):
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    torch.cuda.manual_seed_all(seed)
+ bp = BPTrainer(data=data, hidden_dim=HIDDEN, lr=LR, weight_decay=WD,
+ num_layers=L, residual_alpha=0.0, backbone='gcn')
+
+ for _ in range(EPOCHS):
+ bp.train_step()
+
+ # Diagnostic forward at epoch 100
+ bp.optimizer.zero_grad()
+ logits, Zs, Hs = forward_with_intermediates(bp, capture_for_grad=True)
+ mask = data['train_mask']
+ loss = F.cross_entropy(logits[mask], data['y'][mask])
+ loss.backward(retain_graph=False)
+
+ # Weight gradients (original methodology)
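+    # "F" is the raw Frobenius norm; "rms" divides by sqrt(W_l.numel()) so the
+    # wide input layer and the 64x64 hidden layers are on a comparable scale.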
+    W_grads_F = [bp.weights[l].grad.norm().item() for l in range(L)]
+    W_grads_rms = [g / np.sqrt(bp.weights[l].numel()) for l, g in enumerate(W_grads_F)]
+
+ # Pre-activation gradients on Z_l (l=0..L-1)
+ Z_grads_F = []
+ Z_grads_rms = []
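+    # Defensive: every Z_l feeds the loss, so z.grad should be populated after
+    # backward(); guard against None anyway so each row keeps length L.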
+    for z in Zs:
+        if z.grad is None:
+            Z_grads_F.append(0.0)
+            Z_grads_rms.append(0.0)
+            continue
+        gf = z.grad.norm().item()
+        Z_grads_F.append(gf)
+        Z_grads_rms.append(gf / np.sqrt(z.grad.numel()))
+
+ # Forward state metrics on H_l (l=0..L)
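+    # D_norm -> 0 with depth is the usual oversmoothing signature: rows of H_l
+    # collapse onto their mean even if the raw magnitude M stays large.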
+ M_F, M_rms = [], []
+ D_raw = []
+    for H in Hs:
+        Hd = H.detach()
+        M_F.append(Hd.norm().item())
+        M_rms.append(M_F[-1] / np.sqrt(Hd.numel()))
+        mu = Hd.mean(0, keepdim=True)
+        D_raw.append((Hd - mu).norm().item())
+ D0 = D_raw[0] if D_raw[0] > 0 else 1.0
+ D_norm = [d / D0 for d in D_raw]
+
+ # Frozen linear probe on each H_l
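+    # Standardizing keeps LogisticRegression well-conditioned when ||H_l||
+    # varies by orders of magnitude across depth; any fit failure (e.g. NaN
+    # features) is recorded as nan instead of aborting the whole seed.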
+ probe_acc = []
+    train_mask_b = data['train_mask']
+    test_mask_b = data['test_mask']
+    ytr = data['y'][train_mask_b].cpu().numpy()
+    yte = data['y'][test_mask_b].cpu().numpy()
+ for H in Hs:
+ Xtr = H.detach()[train_mask_b].cpu().numpy()
+ Xte = H.detach()[test_mask_b].cpu().numpy()
+ try:
+ sc = StandardScaler().fit(Xtr)
+ Xtr_s = sc.transform(Xtr)
+ Xte_s = sc.transform(Xte)
+ clf = LogisticRegression(max_iter=2000, C=1.0).fit(Xtr_s, ytr)
+ acc = float(clf.score(Xte_s, yte))
+ except Exception:
+ acc = float('nan')
+ probe_acc.append(acc)
+
+ bp_acc = bp.evaluate('test_mask')
+
+    del bp
+    torch.cuda.empty_cache()
+ return dict(L=L, seed=seed, bp_acc=bp_acc,
+ W_grads_F=W_grads_F, W_grads_rms=W_grads_rms,
+ Z_grads_F=Z_grads_F, Z_grads_rms=Z_grads_rms,
+ M_F=M_F, M_rms=M_rms, D_raw=D_raw, D_norm=D_norm,
+ probe_acc=probe_acc)
+
+
+def main():
+ data = load_dataset('Cora', device=DEVICE)
+ print(f"Cora: N={data['X'].shape[0]}, F={data['X'].shape[1]}, "
+ f"C={data['num_classes']}", flush=True)
+
+ all_results = {}
+ for L in [20, 10, 6]:
+ print(f'\n=== L={L} ===', flush=True)
+ rows = []
+ for s in SEEDS:
+ r = diagnose(s, L, data)
+ rows.append(r)
+ wg = r['W_grads_F']
+ print(f" L={L} s={s:2d} acc={r['bp_acc']:.4f} "
+ f"W_grads[0,mid,-1]=[{wg[0]:.2e}, {wg[len(wg)//2]:.2e}, {wg[-1]:.2e}] "
+ f"Z_grad[out]={r['Z_grads_F'][-1]:.2e}", flush=True)
+ all_results[f'L={L}'] = rows
+
+ out_path = os.path.join(OUT_DIR, 'diag_data_v2.json')
+ with open(out_path, 'w') as f:
+ json.dump(all_results, f, indent=2)
+ print(f'\nSaved {out_path}')
+
+ print('\n=== summary ===')
+ for k, rows in all_results.items():
+ Wg = np.array([r['W_grads_F'] for r in rows])
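+        # 1e-38 sits just below float32's smallest normal (~1.18e-38), so this
+        # counts seed x layer cells whose gradient norm is numerically zero in fp32.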
+ n_under = int((Wg < 1e-38).sum())
+ n_total = Wg.size
+ accs = np.array([r['bp_acc'] for r in rows])
+ print(f' {k}: BP acc {accs.mean():.4f}±{accs.std():.4f} '
+ f'W_grads_F median={np.median(Wg):.3e} '
+ f'<1e-38: {n_under}/{n_total} cells')
+
+
+if __name__ == '__main__':
+ main()