From bd9333eda60a9029a198acaeacb1eca4312bd1e8 Mon Sep 17 00:00:00 2001 From: YurenHao0426 Date: Mon, 4 May 2026 23:05:16 -0500 Subject: =?UTF-8?q?Initial=20release:=20GRAFT=20(KAFT)=20=E2=80=94=20NeurI?= =?UTF-8?q?PS=202026=20submission=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Topology-factorized Jacobian-aligned feedback for deep GNNs. Includes: - src/: GraphGrAPETrainer (KAFT) + BP / DFA / DFA-GNN / VanillaGrAPE baselines + multi-probe alignment estimator + dataset / sparse-mm utilities. - experiments/: 19 runners reproducing every figure / table in the paper. - figures/: 4 generators + the 4 PDFs cited in the report. - paper/: NeurIPS .tex and consolidated experiments_master notes. Smoke test: 50-epoch Cora GCN L=4 gives BP 77.3% / KAFT 79.0%. --- experiments/run_ablation_20seeds.py | 115 +++++++++ experiments/run_bp_graft_depth.py | 111 +++++++++ experiments/run_cafo_baseline.py | 198 +++++++++++++++ experiments/run_combo_20seeds.py | 454 ++++++++++++++++++++++++++++++++++ experiments/run_cora_perturb.py | 129 ++++++++++ experiments/run_cs_full.py | 147 +++++++++++ experiments/run_dblp_depth.py | 162 ++++++++++++ experiments/run_dblp_depth_scaling.py | 115 +++++++++ experiments/run_depth_extras.py | 92 +++++++ experiments/run_dfagnn_depth.py | 101 ++++++++ experiments/run_diag_section23_v2.py | 172 +++++++++++++ experiments/run_ff_baseline.py | 282 +++++++++++++++++++++ experiments/run_grad_reach_20seeds.py | 156 ++++++++++++ experiments/run_hero_extras.py | 156 ++++++++++++ experiments/run_pepita_baseline.py | 188 ++++++++++++++ experiments/run_realworld_hero_L20.py | 170 +++++++++++++ experiments/run_resgcn_20seeds.py | 147 +++++++++++ experiments/run_shallow_depth.py | 125 ++++++++++ experiments/run_wikics_paper_setup.py | 146 +++++++++++ 19 files changed, 3166 insertions(+) create mode 100644 experiments/run_ablation_20seeds.py create mode 100644 experiments/run_bp_graft_depth.py create mode 100644 experiments/run_cafo_baseline.py create mode 100644 experiments/run_combo_20seeds.py create mode 100644 experiments/run_cora_perturb.py create mode 100644 experiments/run_cs_full.py create mode 100644 experiments/run_dblp_depth.py create mode 100644 experiments/run_dblp_depth_scaling.py create mode 100644 experiments/run_depth_extras.py create mode 100644 experiments/run_dfagnn_depth.py create mode 100644 experiments/run_diag_section23_v2.py create mode 100644 experiments/run_ff_baseline.py create mode 100644 experiments/run_grad_reach_20seeds.py create mode 100644 experiments/run_hero_extras.py create mode 100644 experiments/run_pepita_baseline.py create mode 100644 experiments/run_realworld_hero_L20.py create mode 100644 experiments/run_resgcn_20seeds.py create mode 100644 experiments/run_shallow_depth.py create mode 100644 experiments/run_wikics_paper_setup.py (limited to 'experiments') diff --git a/experiments/run_ablation_20seeds.py b/experiments/run_ablation_20seeds.py new file mode 100644 index 0000000..61055ed --- /dev/null +++ b/experiments/run_ablation_20seeds.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +"""Ablation study with 20 seeds: BP → DFA → DFA-GNN → VanillaGrAPE → GRAFT.""" + +import torch +import numpy as np +import json +import os +from scipy import stats as scipy_stats +from src.data import load_dataset +from src.trainers import BPTrainer, DFATrainer, DFAGNNTrainer, VanillaGrAPETrainer, GraphGrAPETrainer + +device = 'cuda:0' +SEEDS = list(range(20)) +EPOCHS = 200 +OUT_DIR = 'results/ablation_20seeds' + +METHODS = { + 'BP': (BPTrainer, {}), + 'DFA': (DFATrainer, {}), + 'DFA-GNN': (DFAGNNTrainer, {'topo_mode': 'fixed_A'}), + 'VanillaGrAPE': (VanillaGrAPETrainer, { + 'diffusion_alpha': 0.5, 'diffusion_iters': 10, + 'lr_feedback': 0.5, 'num_probes': 64, 'topo_mode': 'fixed_A' + }), + 'GRAFT': (GraphGrAPETrainer, { + 'diffusion_alpha': 0.5, 'diffusion_iters': 10, + 'lr_feedback': 0.5, 'num_probes': 64, 'topo_mode': 'fixed_A' + }), +} + + +def train_one(cls, common, extra, seed): + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + t = cls(**common, **extra) + if hasattr(t, 'align_mode'): + t.align_mode = 'chain_norm' + bv, bt = 0, 0 + for ep in range(EPOCHS): + t.train_step() + if ep % 5 == 0: + v = t.evaluate('val_mask') + te = t.evaluate('test_mask') + if v > bv: bv, bt = v, te + del t; torch.cuda.empty_cache() + return bt + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + per_seed_file = os.path.join(OUT_DIR, 'per_seed_data.json') + if os.path.exists(per_seed_file): + with open(per_seed_file) as f: + per_seed_data = json.load(f) + else: + per_seed_data = {} + + results = {} + + for ds_name in ['Cora', 'CiteSeer', 'PubMed']: + data = load_dataset(ds_name, device=device) + common = dict(data=data, hidden_dim=64, lr=0.01, weight_decay=5e-4, + num_layers=6, residual_alpha=0.0, backbone='gcn') + + for mname, (cls, extra) in METHODS.items(): + key = f"{ds_name}_{mname}" + print(f"\n=== {key} (20 seeds) ===", flush=True) + + if key not in per_seed_data: + per_seed_data[key] = {} + + for seed in SEEDS: + sk = str(seed) + if sk in per_seed_data[key]: + print(f" seed {seed}: cached", flush=True) + continue + acc = train_one(cls, common, extra, seed) + per_seed_data[key][sk] = acc + print(f" seed {seed}: {acc*100:.1f}%", flush=True) + + with open(per_seed_file, 'w') as f: + json.dump(per_seed_data, f, indent=2) + + accs = np.array([per_seed_data[key][str(s)] for s in SEEDS]) * 100 + results[key] = { + 'mean': float(accs.mean()), 'std': float(accs.std()), + 'accs': accs.tolist(), + } + print(f" {mname}: {accs.mean():.1f} ± {accs.std():.1f}%") + + del data; torch.cuda.empty_cache() + + # Paired t-tests between adjacent methods + print("\n=== Paired t-tests (adjacent methods) ===") + method_names = list(METHODS.keys()) + for ds in ['Cora', 'CiteSeer', 'PubMed']: + print(f"\n{ds}:") + for i in range(len(method_names) - 1): + m1, m2 = method_names[i], method_names[i+1] + a1 = np.array(results[f"{ds}_{m1}"]['accs']) + a2 = np.array(results[f"{ds}_{m2}"]['accs']) + t_stat, p_val = scipy_stats.ttest_rel(a2, a1) + sig = '***' if p_val < 0.001 else ('**' if p_val < 0.01 else ('*' if p_val < 0.05 else 'ns')) + delta = a2.mean() - a1.mean() + results[f"{ds}_{m1}_vs_{m2}"] = { + 'delta': float(delta), 't_stat': float(t_stat), 'p_value': float(p_val) + } + print(f" {m1} → {m2}: Δ{delta:+.1f}% p={p_val:.4f} {sig}") + + with open(os.path.join(OUT_DIR, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f"\nSaved to {OUT_DIR}/results.json") + + +if __name__ == '__main__': + main() diff --git a/experiments/run_bp_graft_depth.py b/experiments/run_bp_graft_depth.py new file mode 100644 index 0000000..1e8dd76 --- /dev/null +++ b/experiments/run_bp_graft_depth.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +"""H9: BP + GRAFT depth sweep on Cora/CiteSeer/PubMed. + +E1 already did DBLP L={8,12,16,20,24,32}. This fills the gap for Cora/CiteSeer/PubMed +at L={8,10,12,16,20} so we can plot Figure 4(a)-style depth curves on 4 datasets. + +BP + GRAFT only (GRAFT+ResGCN not needed for this figure — that's stacking table). +""" + +import torch +import numpy as np +import json +import os +from src.data import load_dataset +from src.trainers import BPTrainer, GraphGrAPETrainer + +device = 'cuda:0' +SEEDS = list(range(20)) +EPOCHS = 200 +DEPTHS = [8, 10, 12, 16, 20] +OUT_DIR = 'results/bp_graft_depth_20seeds' + +grape_extra = dict(diffusion_alpha=0.5, diffusion_iters=10, + lr_feedback=0.5, num_probes=64, topo_mode='fixed_A') + +METHODS = { + 'BP': (BPTrainer, {}), + 'GRAFT': (GraphGrAPETrainer, grape_extra), +} + + +def train_one(cls, common, extra, seed): + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + t = cls(**common, **extra) + if hasattr(t, 'align_mode'): + t.align_mode = 'chain_norm' + bv, bt = 0, 0 + for ep in range(EPOCHS): + t.train_step() + if ep % 5 == 0: + v = t.evaluate('val_mask') + te = t.evaluate('test_mask') + if v > bv: bv, bt = v, te + del t; torch.cuda.empty_cache() + return bt + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + per_seed_file = os.path.join(OUT_DIR, 'per_seed_data.json') + if os.path.exists(per_seed_file): + with open(per_seed_file) as f: + per_seed_data = json.load(f) + else: + per_seed_data = {} + + datasets_cfg = { + 'Cora': lambda: load_dataset('Cora', device=device), + 'CiteSeer': lambda: load_dataset('CiteSeer', device=device), + 'PubMed': lambda: load_dataset('PubMed', device=device), + } + + for ds_name, loader in datasets_cfg.items(): + data = loader() + for L in DEPTHS: + common = dict(data=data, hidden_dim=64, lr=0.01, weight_decay=5e-4, + num_layers=L, residual_alpha=0.0, backbone='gcn') + + for mname, (cls, extra) in METHODS.items(): + key = f"{ds_name}_L{L}_{mname}" + if key not in per_seed_data: + per_seed_data[key] = {} + + print(f"\n=== {key} (20 seeds) ===", flush=True) + for seed in SEEDS: + sk = str(seed) + if sk in per_seed_data[key]: + print(f" seed {seed}: cached ({per_seed_data[key][sk]*100:.1f}%)", flush=True) + continue + try: + acc = train_one(cls, common, extra, seed) + per_seed_data[key][sk] = acc + print(f" seed {seed}: {acc*100:.1f}%", flush=True) + except Exception as e: + print(f" seed {seed}: FAILED - {e}", flush=True) + per_seed_data[key][sk] = 0.0 + + with open(per_seed_file, 'w') as f: + json.dump(per_seed_data, f, indent=2) + del data; torch.cuda.empty_cache() + + # Summary + print(f"\n{'=' * 70}\nBP/GRAFT depth sweep summary\n{'=' * 70}") + results = {} + for ds in datasets_cfg: + print(f"\n{ds}:") + for L in DEPTHS: + for m in METHODS: + key = f"{ds}_L{L}_{m}" + vals = np.array([per_seed_data[key][str(s)] for s in SEEDS]) * 100 + results[key] = {'mean': float(vals.mean()), 'std': float(vals.std()), + 'per_seed': vals.tolist()} + print(f" L={L:2d} {m:<6} {vals.mean():5.1f} ± {vals.std():4.1f}") + + with open(os.path.join(OUT_DIR, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f"\nSaved to {OUT_DIR}/results.json") + + +if __name__ == '__main__': + main() diff --git a/experiments/run_cafo_baseline.py b/experiments/run_cafo_baseline.py new file mode 100644 index 0000000..3d8c2d7 --- /dev/null +++ b/experiments/run_cafo_baseline.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 +"""H3: CaFo+CE (Cascaded Forward Learning with Top-Down Feedback, Park et al. 2023). + +Greedy layer-wise training for GCN L=6: + - Each hidden layer l has an auxiliary classifier W_aux_l: hidden → num_classes + - Forward through all layers with .detach() between layers (blocks upstream gradient) + - Per-layer CE loss on labeled nodes via auxiliary classifier + - Output layer uses standard cross-entropy + - No global backprop — each W_l only sees its local loss + +Tests CaFo on Cora/CiteSeer/PubMed/DBLP × 20 seeds, GCN L=6. +""" + +import torch +import torch.nn.functional as F +import numpy as np +import json +import os +from src.data import load_dataset, spmm +from run_dblp_depth import load_dblp + +device = 'cuda:0' +SEEDS = list(range(20)) +EPOCHS = 200 +OUT_DIR = 'results/cafo_baseline_20seeds' + + +class CaFoTrainer: + """CaFo+CE: greedy layer-wise training with per-layer CE loss.""" + + def __init__(self, data, hidden_dim, lr, weight_decay, + num_layers=2, residual_alpha=0.0, backbone='gcn', **_kw): + dev = data['X'].device + self.data = data + self.device = dev + self.lr = lr + self.wd = weight_decay + self.num_layers = num_layers + self.residual_alpha = residual_alpha + self.backbone = backbone + self._training = True + + d_in = data['num_features'] + d_out = data['num_classes'] + self.d_out = d_out + + dims = [d_in] + [hidden_dim] * (num_layers - 1) + [d_out] + # Main layer weights — autograd Parameters + self.weights = [] + for i in range(num_layers): + w = torch.empty(dims[i], dims[i + 1], device=dev) + torch.nn.init.xavier_uniform_(w) + w.requires_grad_(True) + self.weights.append(w) + + # Auxiliary classifier per hidden layer: hidden_dim -> d_out + self.W_aux = [] + for i in range(num_layers - 1): + w_aux = torch.empty(hidden_dim, d_out, device=dev) + torch.nn.init.xavier_uniform_(w_aux) + w_aux.requires_grad_(True) + self.W_aux.append(w_aux) + + params = self.weights + self.W_aux + self.optim = torch.optim.Adam(params, lr=lr, weight_decay=weight_decay) + + def _gcn_conv(self, H, W): + """GCN conv: A_hat @ (H @ W).""" + return spmm(self.data['A_hat'], H @ W) + + def train_step(self): + X = self.data['X'] + y = self.data['y'] + mask = self.data['train_mask'] + + self.optim.zero_grad() + + H = X + total_loss = 0.0 + for l in range(self.num_layers): + if l > 0: + H = H.detach() # block grad flow upstream + + Z = self._gcn_conv(H, self.weights[l]) + + if l < self.num_layers - 1: + H_new = F.relu(Z) + # Auxiliary classifier (projects hidden to classes) + Z_aux = H_new @ self.W_aux[l] + loss_l = F.cross_entropy(Z_aux[mask], y[mask]) + loss_l.backward() + total_loss += loss_l.item() + H = H_new + else: + # Output layer: standard CE + loss_final = F.cross_entropy(Z[mask], y[mask]) + loss_final.backward() + total_loss += loss_final.item() + + self.optim.step() + + with torch.no_grad(): + Z_out = self._forward_full_detached() + acc = (Z_out[mask].argmax(1) == y[mask]).float().mean().item() + return total_loss, acc, {} + + def _forward_full_detached(self): + """Full forward pass with no_grad for evaluation.""" + X = self.data['X'] + H = X + for l in range(self.num_layers): + Z = self._gcn_conv(H, self.weights[l].detach()) + if l < self.num_layers - 1: + H = F.relu(Z) + return Z + + @torch.no_grad() + def evaluate(self, mask_name='test_mask'): + self._training = False + Z = self._forward_full_detached() + self._training = True + mask = self.data[mask_name] + return (Z[mask].argmax(1) == self.data['y'][mask]).float().mean().item() + + +def train_one(seed, data, num_layers=6): + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + t = CaFoTrainer(data=data, hidden_dim=64, lr=0.01, weight_decay=5e-4, + num_layers=num_layers, residual_alpha=0.0, backbone='gcn') + bv, bt = 0, 0 + for ep in range(EPOCHS): + t.train_step() + if ep % 5 == 0: + v = t.evaluate('val_mask') + te = t.evaluate('test_mask') + if v > bv: bv, bt = v, te + del t; torch.cuda.empty_cache() + return bt + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + per_seed_file = os.path.join(OUT_DIR, 'per_seed_data.json') + if os.path.exists(per_seed_file): + with open(per_seed_file) as f: + per_seed_data = json.load(f) + else: + per_seed_data = {} + + datasets_cfg = { + 'Cora': lambda: load_dataset('Cora', device=device), + 'CiteSeer': lambda: load_dataset('CiteSeer', device=device), + 'PubMed': lambda: load_dataset('PubMed', device=device), + 'DBLP': lambda: load_dblp(), + } + + for ds_name, loader in datasets_cfg.items(): + data = loader() + key = f"{ds_name}_CaFo+CE" + if key not in per_seed_data: + per_seed_data[key] = {} + + print(f"\n=== {key} (20 seeds, GCN L=6) ===", flush=True) + for seed in SEEDS: + sk = str(seed) + if sk in per_seed_data[key]: + print(f" seed {seed}: cached ({per_seed_data[key][sk]*100:.1f}%)", flush=True) + continue + try: + acc = train_one(seed, data) + per_seed_data[key][sk] = acc + print(f" seed {seed}: {acc*100:.1f}%", flush=True) + except Exception as e: + print(f" seed {seed}: FAILED - {e}", flush=True) + per_seed_data[key][sk] = 0.0 + + with open(per_seed_file, 'w') as f: + json.dump(per_seed_data, f, indent=2) + + del data; torch.cuda.empty_cache() + + # Summary + print(f"\n{'=' * 70}\nCaFo+CE summary (20 seeds, GCN L=6)\n{'=' * 70}") + results = {} + for ds in datasets_cfg: + key = f"{ds}_CaFo+CE" + vals = np.array([per_seed_data[key][str(s)] for s in SEEDS]) * 100 + results[key] = {'mean': float(vals.mean()), 'std': float(vals.std()), + 'per_seed': vals.tolist()} + print(f" {ds:<12} {vals.mean():5.1f} ± {vals.std():4.1f}") + + with open(os.path.join(OUT_DIR, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f"\nSaved to {OUT_DIR}/results.json") + + +if __name__ == '__main__': + main() diff --git a/experiments/run_combo_20seeds.py b/experiments/run_combo_20seeds.py new file mode 100644 index 0000000..1598964 --- /dev/null +++ b/experiments/run_combo_20seeds.py @@ -0,0 +1,454 @@ +#!/usr/bin/env python3 +"""Task 2ceadaa7: GRAFT + Forward Tricks combo experiments (20 seeds). + +Combos: GRAFT+ResGCN, GRAFT+DropEdge, GRAFT+PairNorm, GRAFT+JKNet +Each compared to: BP, forward_trick_only, GRAFT_only, combo +""" + +import torch +import torch.nn.functional as F +import numpy as np +import json +import os +from scipy import stats as scipy_stats +from src.data import load_dataset, spmm, build_normalized_adj +from src.trainers import BPTrainer, GraphGrAPETrainer, _FeedbackTrainerBase +from run_deep_baselines import ResGCNTrainer, JKNetTrainer +from run_dropedge import BPDropEdgeTrainer +from run_pairnorm_baseline import BPPairNormTrainer, pairnorm +from run_dblp_depth import load_dblp + +device = 'cuda:0' +SEEDS = list(range(20)) +EPOCHS = 200 +OUT_DIR = 'results/combo_20seeds' + +grape_extra = dict(diffusion_alpha=0.5, diffusion_iters=10, + lr_feedback=0.5, num_probes=64, topo_mode='fixed_A') + + +# ═══════════════════════════════════════════════════════════════════════════ +# GRAFT + ResGCN combo (fixed version) +# ═══════════════════════════════════════════════════════════════════════════ +class GRAFTResGCN(GraphGrAPETrainer): + """GRAFT backward + ResGCN forward (skip connections).""" + + def forward(self): + X = self.data['X'] + H = X + H0 = None + Hs, Zs = [], [] + + for l in range(self.num_layers): + Z = self._graph_conv(H, self.weights[l], l) + Zs.append(Z) + if l < self.num_layers - 1: + H_new = F.relu(Z) + if H_new.size(1) == H.size(1): + H = H + H_new + else: + H = H_new + Hs.append(H) + if l == 0: + H0 = H + else: + return Z, {'Hs': Hs, 'Zs': Zs, 'H0': H0} + return Z, {'Hs': Hs, 'Zs': Zs, 'H0': H0} + + +# ═══════════════════════════════════════════════════════════════════════════ +# GRAFT + DropEdge combo +# ═══════════════════════════════════════════════════════════════════════════ +class GRAFTDropEdge(GraphGrAPETrainer): + """GRAFT backward + DropEdge forward (random edge dropping).""" + + def __init__(self, *args, drop_rate=0.5, **kwargs): + super().__init__(*args, **kwargs) + self.drop_rate = drop_rate + self._A_hat_orig = self.data['A_hat'] + self._edge_index_orig = self._A_hat_orig.indices() + self._edge_values_orig = self._A_hat_orig.values() + self._N = self._A_hat_orig.size(0) + + def _drop_edges(self): + if not self._training or self.drop_rate <= 0: + return self._A_hat_orig + mask = torch.rand(self._edge_values_orig.size(0), + device=self._edge_values_orig.device) > self.drop_rate + new_vals = self._edge_values_orig * mask.float() / (1 - self.drop_rate) + return torch.sparse_coo_tensor( + self._edge_index_orig, new_vals, (self._N, self._N) + ).coalesce() + + def forward(self): + # DropEdge only in forward pass, GRAFT backward uses original A_hat + self.data['A_hat'] = self._drop_edges() + result = super().forward() # uses GraphGrAPETrainer.forward() + self.data['A_hat'] = self._A_hat_orig + return result + + def evaluate(self, mask_name='test_mask'): + self.data['A_hat'] = self._A_hat_orig + return super().evaluate(mask_name) + + +# ═══════════════════════════════════════════════════════════════════════════ +# GRAFT + PairNorm combo +# ═══════════════════════════════════════════════════════════════════════════ +class GRAFTPairNorm(GraphGrAPETrainer): + """GRAFT backward + PairNorm forward (center & scale normalization).""" + + def __init__(self, *args, pn_scale=1.0, **kwargs): + super().__init__(*args, **kwargs) + self.pn_scale = pn_scale + + def forward(self): + X = self.data['X'] + H = X + H0 = None + Hs, Zs = [], [] + + if self.backbone == 'appnp': + for l in range(self.num_layers): + Z = H @ self.weights[l] + Zs.append(Z) + if l < self.num_layers - 1: + H = F.relu(Z) + H = pairnorm(H, self.pn_scale) + Hs.append(H) + if l == 0: + H0 = H + else: + Z = self._appnp_propagate(Z) + Zs[-1] = Z + return Z, {'Hs': Hs, 'Zs': Zs, 'H0': H0} + + for l in range(self.num_layers): + Z = self._graph_conv(H, self.weights[l], l) + Zs.append(Z) + if l < self.num_layers - 1: + H = F.relu(Z) + H = pairnorm(H, self.pn_scale) + Hs.append(H) + if l == 0: + H0 = H + else: + return Z, {'Hs': Hs, 'Zs': Zs, 'H0': H0} + return Z, {'Hs': Hs, 'Zs': Zs, 'H0': H0} + + +# ═══════════════════════════════════════════════════════════════════════════ +# GRAFT + JKNet combo +# ═══════════════════════════════════════════════════════════════════════════ +class GRAFTJKNet(GraphGrAPETrainer): + """GRAFT backward + JKNet forward (jumping knowledge max-pool). + + Note: JKNet changes the output architecture. We max-pool hidden layers + and project to num_classes. GRAFT backward operates on hidden layers + as usual; the JK projection is treated as the output layer. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # JK projection: hidden_dim -> num_classes + self.jk_proj = torch.randn(self.hidden_dim, self.d_out, + device=self.device) * 0.01 + # Add to Adam state + self._adam.append({'m': torch.zeros_like(self.jk_proj), + 'v': torch.zeros_like(self.jk_proj)}) + + def forward(self): + X = self.data['X'] + H = X + H0 = None + Hs, Zs = [], [] + + for l in range(self.num_layers): + Z = self._graph_conv(H, self.weights[l], l) + Zs.append(Z) + if l < self.num_layers - 1: + H = F.relu(Z) + Hs.append(H) + if l == 0: + H0 = H + + # JK max-pool over hidden layers + if Hs: + stacked = torch.stack(Hs, dim=0) # (L-1, N, d) + jk_repr = stacked.max(dim=0)[0] # (N, d) + Z_out = jk_repr @ self.jk_proj + # Override Hs[-1] for backward compatibility with _update_weights + # which uses Hs[-1] as input to output layer + Hs_for_backward = list(Hs) + Hs_for_backward[-1] = jk_repr + return Z_out, {'Hs': Hs_for_backward, 'Zs': Zs, 'H0': H0} + else: + return Z, {'Hs': Hs, 'Zs': Zs, 'H0': H0} + + def _update_weights(self, inter, E0, deltas): + """Override to handle JK projection separately.""" + # Update hidden layers using GRAFT feedback as usual + X = self.data['X'] + Hs = inter['Hs'] + H0 = inter['H0'] + + grads = [] + for l in range(self.num_layers): + if l == self.num_layers - 1: + # Skip the original output layer — JK projection replaces it + # But still compute gradient for W_L (unused in JK, but keeps indexing) + H_prev = Hs[-1] if Hs else X + g = H_prev.t() @ self._graph_conv_T(E0, l) + else: + if l == 0: + H_in = X + else: + H_prev = Hs[l - 1] + if self.residual_alpha > 0 and H0 is not None: + H_in = (1 - self.residual_alpha) * H_prev + self.residual_alpha * H0 + else: + H_in = H_prev + g = H_in.t() @ self._graph_conv_T(deltas[l], l) + grads.append(g) + + # Update JK projection: grad = jk_repr.T @ E0 + jk_repr = Hs[-1] if Hs else X + jk_grad = jk_repr.t() @ E0 + + if self._use_adam: + self._adam_t += 1 + for i in range(self.num_layers): + self.weights[i] = self.weights[i] - self._adam_step(i, grads[i]) + # Update jk_proj with Adam (use last index) + jk_idx = len(self._adam) - 1 + s = self._adam[jk_idx] + b1, b2, eps = self._adam_beta1, self._adam_beta2, self._adam_eps + t = self._adam_t + s['m'] = b1 * s['m'] + (1 - b1) * jk_grad + s['v'] = b2 * s['v'] + (1 - b2) * jk_grad ** 2 + m_hat = s['m'] / (1 - b1 ** t) + v_hat = s['v'] / (1 - b2 ** t) + self.jk_proj = self.jk_proj - self.lr * (m_hat / (v_hat.sqrt() + eps) + self.wd * self.jk_proj) + else: + for i in range(self.num_layers): + self.weights[i] = self.weights[i] - self.lr * (grads[i] + self.wd * self.weights[i]) + self.jk_proj = self.jk_proj - self.lr * (jk_grad + self.wd * self.jk_proj) + + +# ═══════════════════════════════════════════════════════════════════════════ +# Training +# ═══════════════════════════════════════════════════════════════════════════ + +def train_one(cls, common, extra, seed): + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + t = cls(**common, **extra) + if hasattr(t, 'align_mode'): + t.align_mode = 'chain_norm' + bv, bt = 0, 0 + for ep in range(EPOCHS): + t.train_step() + if ep % 5 == 0: + v = t.evaluate('val_mask') + te = t.evaluate('test_mask') + if v > bv: bv, bt = v, te + del t; torch.cuda.empty_cache() + return bt + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + per_seed_file = os.path.join(OUT_DIR, 'per_seed_data.json') + if os.path.exists(per_seed_file): + with open(per_seed_file) as f: + per_seed_data = json.load(f) + else: + per_seed_data = {} + + # Reuse existing per-seed data from other experiments + # BP, ResGCN, GRAFT from resgcn_20seeds + try: + with open('results/resgcn_20seeds/per_seed_data.json') as f: + resgcn_cache = json.load(f) + except: + resgcn_cache = {} + + # DropEdge from dropedge_20seeds + try: + with open('results/dropedge_20seeds/per_seed_data.json') as f: + de_cache = json.load(f) + except: + de_cache = {} + + # PairNorm from pairnorm_extended + try: + with open('results/pairnorm_extended/per_seed_data.json') as f: + pn_cache = json.load(f) + except: + pn_cache = {} + + METHODS = { + 'BP': (BPTrainer, {}), + 'ResGCN': (ResGCNTrainer, {}), + 'DropEdge': (BPDropEdgeTrainer, {'drop_rate': 0.5}), + 'PairNorm': (BPPairNormTrainer, {'pn_scale': 1.0}), + 'JKNet': (JKNetTrainer, {}), + 'GRAFT': (GraphGrAPETrainer, grape_extra), + 'GRAFT+ResGCN': (GRAFTResGCN, grape_extra), + 'GRAFT+DropEdge': (GRAFTDropEdge, {**grape_extra, 'drop_rate': 0.5}), + 'GRAFT+PairNorm': (GRAFTPairNorm, {**grape_extra, 'pn_scale': 1.0}), + 'GRAFT+JKNet': (GRAFTJKNet, grape_extra), + } + + datasets_cfg = { + 'Cora': lambda: load_dataset('Cora', device=device), + 'CiteSeer': lambda: load_dataset('CiteSeer', device=device), + 'DBLP': lambda: load_dblp(), + } + + for ds_name, loader in datasets_cfg.items(): + data = loader() + common = dict(data=data, hidden_dim=64, lr=0.01, weight_decay=5e-4, + num_layers=6, residual_alpha=0.0, backbone='gcn') + + for mname, (cls, extra) in METHODS.items(): + key = f"{ds_name}_{mname}" + + if key not in per_seed_data: + per_seed_data[key] = {} + + print(f"\n=== {key} (20 seeds) ===", flush=True) + + for seed in SEEDS: + sk = str(seed) + if sk in per_seed_data[key]: + print(f" seed {seed}: cached", flush=True) + continue + + # Try to pull from existing caches + cached = None + if mname == 'BP' and f"{ds_name}_BP" in resgcn_cache: + cached = resgcn_cache[f"{ds_name}_BP"].get(sk) + elif mname == 'ResGCN' and f"{ds_name}_ResGCN" in resgcn_cache: + cached = resgcn_cache[f"{ds_name}_ResGCN"].get(sk) + elif mname == 'GRAFT' and f"{ds_name}_GRAFT" in resgcn_cache: + cached = resgcn_cache[f"{ds_name}_GRAFT"].get(sk) + elif mname == 'DropEdge': + de_key = f"{ds_name}_gcn_L6" + if de_key in de_cache and sk in de_cache[de_key]: + cached = de_cache[de_key][sk].get('de05') + elif mname == 'PairNorm': + pn_key = f"{ds_name}_gcn_L6_PN" + if pn_key in pn_cache and sk in pn_cache[pn_key]: + cached = pn_cache[pn_key][sk] + + if cached is not None: + per_seed_data[key][sk] = cached + print(f" seed {seed}: from cache ({cached*100:.1f}%)", flush=True) + else: + try: + acc = train_one(cls, common, extra, seed) + per_seed_data[key][sk] = acc + print(f" seed {seed}: {acc*100:.1f}%", flush=True) + except Exception as e: + print(f" seed {seed}: FAILED - {e}", flush=True) + per_seed_data[key][sk] = 0.0 + + # Save after each seed + with open(per_seed_file, 'w') as f: + json.dump(per_seed_data, f, indent=2) + + del data; torch.cuda.empty_cache() + + # ═══════════════════════════════════════════════════════════════════════ + # Analysis + # ═══════════════════════════════════════════════════════════════════════ + results = {} + print("\n" + "=" * 120) + print("FULL RESULTS TABLE") + print("=" * 120) + + for ds in ['Cora', 'CiteSeer', 'DBLP']: + print(f"\n--- {ds} GCN L=6 lr=0.01 ---") + print(f"{'Method':<18} {'Mean±Std':>12} {'vs GRAFT':>18} {'vs FwdTrick':>18}") + print("-" * 70) + + # Get GRAFT accs for comparison + gr_accs = np.array([per_seed_data[f"{ds}_GRAFT"][str(s)] for s in SEEDS]) * 100 + + for mname in ['BP', 'ResGCN', 'DropEdge', 'PairNorm', 'JKNet', + 'GRAFT', 'GRAFT+ResGCN', 'GRAFT+DropEdge', 'GRAFT+PairNorm', 'GRAFT+JKNet']: + key = f"{ds}_{mname}" + if key not in per_seed_data or len(per_seed_data[key]) < 20: + print(f" {mname:<16} MISSING ({len(per_seed_data.get(key, {}))} seeds)") + continue + + accs = np.array([per_seed_data[key][str(s)] for s in SEEDS]) * 100 + m, s = accs.mean(), accs.std() + + results[key] = {'mean': float(m), 'std': float(s), 'accs': accs.tolist()} + + # Paired t-test vs GRAFT + if mname != 'GRAFT': + t_stat, p_val = scipy_stats.ttest_rel(accs, gr_accs) + delta = m - gr_accs.mean() + sig = '***' if p_val < 0.001 else ('**' if p_val < 0.01 else ('*' if p_val < 0.05 else 'ns')) + vs_graft = f"Δ{delta:+.1f} p={p_val:.4f}{sig}" + results[key]['vs_GRAFT'] = { + 'delta': float(delta), 'p_value': float(p_val), + 'significant': bool(p_val < 0.05) + } + else: + vs_graft = "—" + + # Paired t-test vs forward trick only + fwd_map = { + 'GRAFT+ResGCN': 'ResGCN', 'GRAFT+DropEdge': 'DropEdge', + 'GRAFT+PairNorm': 'PairNorm', 'GRAFT+JKNet': 'JKNet' + } + if mname in fwd_map: + fwd_key = f"{ds}_{fwd_map[mname]}" + if fwd_key in per_seed_data and len(per_seed_data[fwd_key]) >= 20: + fwd_accs = np.array([per_seed_data[fwd_key][str(s)] for s in SEEDS]) * 100 + t2, p2 = scipy_stats.ttest_rel(accs, fwd_accs) + d2 = m - fwd_accs.mean() + s2 = '***' if p2 < 0.001 else ('**' if p2 < 0.01 else ('*' if p2 < 0.05 else 'ns')) + vs_fwd = f"Δ{d2:+.1f} p={p2:.4f}{s2}" + results[key][f'vs_{fwd_map[mname]}'] = { + 'delta': float(d2), 'p_value': float(p2), + 'significant': bool(p2 < 0.05) + } + else: + vs_fwd = "N/A" + else: + vs_fwd = "" + + print(f" {mname:<16} {m:>5.1f}±{s:<5.1f} {vs_graft:>18} {vs_fwd:>18}") + + # Summary: which combos are additive? + print("\n" + "=" * 80) + print("COMBO ADDITIVITY SUMMARY") + print("=" * 80) + for ds in ['Cora', 'CiteSeer', 'DBLP']: + print(f"\n{ds}:") + gr_m = results.get(f"{ds}_GRAFT", {}).get('mean', 0) + for combo, fwd in [('GRAFT+ResGCN', 'ResGCN'), ('GRAFT+DropEdge', 'DropEdge'), + ('GRAFT+PairNorm', 'PairNorm'), ('GRAFT+JKNet', 'JKNet')]: + ck = f"{ds}_{combo}" + fk = f"{ds}_{fwd}" + if ck in results and fk in results: + c_m = results[ck]['mean'] + f_m = results[fk]['mean'] + vs_gr = results[ck].get('vs_GRAFT', {}) + vs_fw = results[ck].get(f'vs_{fwd}', {}) + better_than_both = c_m > gr_m and c_m > f_m + marker = "✓ ADDITIVE" if better_than_both else "✗ not additive" + print(f" {combo}: {c_m:.1f} | GRAFT={gr_m:.1f} | {fwd}={f_m:.1f} → {marker}") + + # Save + with open(os.path.join(OUT_DIR, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f"\nSaved to {OUT_DIR}/results.json") + + +if __name__ == '__main__': + main() diff --git a/experiments/run_cora_perturb.py b/experiments/run_cora_perturb.py new file mode 100644 index 0000000..1dabc95 --- /dev/null +++ b/experiments/run_cora_perturb.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +""" +Cora perturbation experiment: directly test causal factors. +Three perturbation types: +1. Edge rewiring: destroy community structure +2. Label shuffling: reduce homophily +3. Feature masking: reduce feature quality +""" + +import torch +import torch.nn.functional as F +import numpy as np +import json +import os +from src.data import load_dataset, build_normalized_adj, build_row_normalized_adj +from src.trainers import BPTrainer, DFATrainer, GraphGrAPETrainer + +device = 'cuda:0' +SEEDS = [0, 1, 2, 3, 4] +EPOCHS = 200 +OUT_DIR = 'results/cora_perturbation' + + +def perturb_edges(data, rewire_frac, seed=0): + """Randomly rewire a fraction of edges (destroys community structure).""" + d = {k: v.clone() if isinstance(v, torch.Tensor) else v for k, v in data.items()} + rng = torch.Generator().manual_seed(seed) + A = d['A_hat'] + idx = A.indices() + vals = A.values() + N = d['num_nodes'] + + n_rewire = int(rewire_frac * idx.shape[1]) + if n_rewire > 0: + perm = torch.randperm(idx.shape[1], generator=rng)[:n_rewire].to(idx.device) + new_targets = torch.randint(0, N, (n_rewire,), generator=rng).to(idx.device) + idx_new = idx.clone() + idx_new[1, perm] = new_targets + + A_new = torch.sparse_coo_tensor(idx_new, vals, (N, N)).coalesce() + d['A_hat'] = A_new + d['A_row'] = A_new # simplified + d['A_row_T'] = A_new + return d + + +def perturb_labels(data, shuffle_frac, seed=0): + """Shuffle a fraction of labels (reduces homophily).""" + d = {k: v.clone() if isinstance(v, torch.Tensor) else v for k, v in data.items()} + rng = torch.Generator().manual_seed(seed) + y = d['y'].clone() + N = len(y) + n_shuffle = int(shuffle_frac * N) + perm = torch.randperm(N, generator=rng)[:n_shuffle] + shuffled = y[perm][torch.randperm(n_shuffle, generator=rng)] + y[perm] = shuffled + d['y'] = y + return d + + +def perturb_features(data, mask_frac, seed=0): + """Zero out a fraction of feature dimensions (reduces feature quality).""" + d = {k: v.clone() if isinstance(v, torch.Tensor) else v for k, v in data.items()} + rng = torch.Generator().manual_seed(seed) + X = d['X'].clone() + F_dim = X.shape[1] + n_mask = int(mask_frac * F_dim) + mask_dims = torch.randperm(F_dim, generator=rng)[:n_mask] + X[:, mask_dims] = 0 + d['X'] = X + return d + + +def train_one(cls, common, extra, seed): + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + t = cls(**common, **extra) + if hasattr(t, 'align_mode'): + t.align_mode = 'chain_norm' + bv, bt = 0, 0 + for ep in range(EPOCHS): + t.train_step() + if ep % 5 == 0: + v, te = t.evaluate('val_mask'), t.evaluate('test_mask') + if v > bv: bv, bt = v, te + del t; torch.cuda.empty_cache() + return bt + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + data_orig = load_dataset('Cora', device=device) + grape_extra = dict(diffusion_alpha=0.5, diffusion_iters=10, + lr_feedback=0.5, num_probes=64, topo_mode='fixed_A') + results = {} + L = 6 + + perturbations = [ + ('edge_rewire', [0, 0.1, 0.2, 0.3, 0.5], perturb_edges), + ('label_shuffle', [0, 0.1, 0.2, 0.3, 0.5], perturb_labels), + ('feature_mask', [0, 0.2, 0.4, 0.6, 0.8], perturb_features), + ] + + for ptype, fracs, pfunc in perturbations: + print(f"\n=== {ptype} (Cora, GCN, L={L}) ===", flush=True) + print(f"{'frac':>6} | {'BP':>8} {'DFA':>8} {'GrAPE':>8} | {'Δ(BP)':>7}", flush=True) + + for frac in fracs: + bp_accs, gr_accs = [], [] + for seed in SEEDS: + data = pfunc(data_orig, frac, seed=seed) if frac > 0 else data_orig + common = dict(data=data, hidden_dim=64, lr=0.01, weight_decay=5e-4, + num_layers=L, residual_alpha=0.0, backbone='gcn') + bp_accs.append(train_one(BPTrainer, common, {}, seed)) + gr_accs.append(train_one(GraphGrAPETrainer, common, grape_extra, seed)) + + bp, gr = np.mean(bp_accs)*100, np.mean(gr_accs)*100 + delta = gr - bp + key = f"{ptype}|frac={frac}" + results[key] = {'bp': float(np.mean(bp_accs)), 'grape': float(np.mean(gr_accs)), + 'delta': float(gr - bp), 'frac': frac, 'ptype': ptype} + print(f"{frac:>6.1f} | {bp:>7.1f} {'—':>8} {gr:>7.1f} | {delta:>+6.1f}", flush=True) + + with open(os.path.join(OUT_DIR, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f"\nSaved to {OUT_DIR}/results.json") + + +if __name__ == '__main__': + main() diff --git a/experiments/run_cs_full.py b/experiments/run_cs_full.py new file mode 100644 index 0000000..d66fc59 --- /dev/null +++ b/experiments/run_cs_full.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +"""H19 CitationFull-CiteSeer (4.2K, deg 2.5, 6-class) — same regime as Planetoid CiteSeer. +Quick BP + GRAFT depth sweep to confirm/extend the 'GRAFT wins on real sparse citation' story.""" + +import torch, sys, numpy as np, time +import torch.nn as nn, torch.nn.functional as F +from torch_geometric.datasets import CitationFull +from torch_geometric.nn import GCNConv +from torch_geometric.utils import add_self_loops, degree + +sys.path.insert(0, '/home/yurenh2/graph-grape') +from src.trainers import GraphGrAPETrainer + +DATA_ROOT = '/home/yurenh2/graph-grape/data/CFull' +device = torch.device('cuda:0') + + +def build_A_hat(edge_index, N): + edge_index, _ = add_self_loops(edge_index, num_nodes=N) + row, col = edge_index + deg = degree(row, num_nodes=N, dtype=torch.float) + dis = deg.pow(-0.5); dis[dis == float('inf')] = 0 + return torch.sparse_coo_tensor(edge_index, dis[row]*dis[col], (N, N)).coalesce() + + +def build_row_norm(edge_index, N): + ei, _ = add_self_loops(edge_index, num_nodes=N) + row, col = ei + deg = degree(row, num_nodes=N, dtype=torch.float).clamp(min=1) + A_row = torch.sparse_coo_tensor(ei, 1.0/deg[row], (N,N)).coalesce() + A_row_T = torch.sparse_coo_tensor(ei.flip(0), 1.0/deg[col], (N,N)).coalesce() + return A_row, A_row_T + + +def make_split(N, seed, y, n_per_class=20, n_val=500): + g = torch.Generator().manual_seed(seed) + train_mask = torch.zeros(N, dtype=torch.bool) + val_mask = torch.zeros(N, dtype=torch.bool) + test_mask = torch.zeros(N, dtype=torch.bool) + C = int(y.max()) + 1 + for c in range(C): + idx = (y == c).nonzero().flatten() + idx = idx[torch.randperm(idx.size(0), generator=g)] + train_mask[idx[:n_per_class]] = True + remaining = (~train_mask).nonzero().flatten() + remaining = remaining[torch.randperm(remaining.size(0), generator=g)] + val_mask[remaining[:n_val]] = True + test_mask[remaining[n_val:]] = True + return train_mask, val_mask, test_mask + + +class GCN(nn.Module): + def __init__(self, in_dim, hidden, out_dim, L, dropout=0.1): + super().__init__() + self.convs = nn.ModuleList([GCNConv(in_dim if i==0 else hidden, + hidden if i0: x = F.dropout(x, self.dropout, self.training) + return x + + +def bp_one(L, seed, d, train_mask, val_mask, test_mask, epochs=200, lr=5e-3, hidden=128, dropout=0.1): + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + model = GCN(d.x.shape[1], hidden, int(d.y.max())+1, L, dropout=dropout).to(device) + opt = torch.optim.AdamW(model.parameters(), lr=lr) + sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs) + + @torch.no_grad() + def ev(m): + model.eval() + out = model(d.x.float(), d.edge_index) + return (out[m].argmax(1) == d.y[m]).float().mean().item() + + bv, bt = 0, 0 + for ep in range(epochs): + model.train() + out = model(d.x.float(), d.edge_index) + loss = F.cross_entropy(out[train_mask], d.y[train_mask]) + opt.zero_grad(); loss.backward(); opt.step() + sched.step() + if ep % 5 == 0: + v = ev(val_mask) + if v > bv: bv = v; bt = ev(test_mask) + return bt + + +def graft_one(L, seed, d, A_hat, A_row, A_row_T, train_mask, val_mask, test_mask, + epochs=200, lr=5e-3, hidden=128): + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + data = { + 'X': d.x.float(), 'A_hat': A_hat, 'A_row': A_row, 'A_row_T': A_row_T, + 'y': d.y, 'train_mask': train_mask, 'val_mask': val_mask, 'test_mask': test_mask, + 'num_features': d.x.shape[1], 'num_classes': int(d.y.max())+1, + 'num_nodes': d.num_nodes, 'traces': {}, + } + trainer = GraphGrAPETrainer( + data=data, hidden_dim=hidden, lr=lr, weight_decay=0.0, + lr_feedback=0.5, num_probes=64, topo_mode='fixed_A', max_topo_power=3, + diffusion_alpha=0.5, diffusion_iters=10, + num_layers=L, residual_alpha=0.0, backbone='gcn', + use_batchnorm=False, dropout=0.0, + ) + trainer.align_mode = 'chain_norm' + bv, bt = 0, 0 + for ep in range(epochs): + trainer.train_step() + if ep % 5 == 0: + v = trainer.evaluate('val_mask') + if v > bv: bv = v; bt = trainer.evaluate('test_mask') + return bt + + +def main(): + d = CitationFull(root=DATA_ROOT, name='CiteSeer')[0].to(device) + N = d.num_nodes + A_hat = build_A_hat(d.edge_index, N) + A_row, A_row_T = build_row_norm(d.edge_index, N) + + print(f'CitationFull-CiteSeer: N={N}, deg={d.edge_index.shape[1]/N:.1f}, C={int(d.y.max())+1}', flush=True) + + bp_res, gf_res = {}, {} + for L in [3, 5, 10, 16]: + bp_accs, gf_accs = [], [] + for s in [0, 1, 2]: + tm, vm, tem = make_split(N, s, d.y.cpu()) + tm, vm, tem = tm.to(device), vm.to(device), tem.to(device) + t0 = time.time() + bp = bp_one(L, s, d, tm, vm, tem) + t1 = time.time() + gf = graft_one(L, s, d, A_hat, A_row, A_row_T, tm, vm, tem) + t2 = time.time() + bp_accs.append(bp); gf_accs.append(gf) + print(f' L={L} s={s}: BP={bp:.4f}({t1-t0:.0f}s) GRAFT={gf:.4f}({t2-t1:.0f}s)', flush=True) + bp_m, bp_sd = np.mean(bp_accs), np.std(bp_accs) + gf_m, gf_sd = np.mean(gf_accs), np.std(gf_accs) + bp_res[L] = (bp_m, bp_sd); gf_res[L] = (gf_m, gf_sd) + print(f'>>> L={L}: BP {bp_m:.4f}±{bp_sd:.4f} GRAFT {gf_m:.4f}±{gf_sd:.4f} Δ={gf_m-bp_m:+.3f}', flush=True) + + +if __name__ == '__main__': + main() diff --git a/experiments/run_dblp_depth.py b/experiments/run_dblp_depth.py new file mode 100644 index 0000000..d63b94a --- /dev/null +++ b/experiments/run_dblp_depth.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python3 +""" +CitationFull-DBLP experiment + depth sweep L=2-6 补数据. +""" + +import torch +import torch.nn.functional as F +import numpy as np +import json +import os +import time +from torch_geometric.datasets import CitationFull +from src.data import build_normalized_adj, build_row_normalized_adj, spmm, precompute_traces +from src.trainers import BPTrainer, DFATrainer, GraphGrAPETrainer +from benchmark_efficient import GraphGrAPEEfficient + +device = 'cuda:0' +SEEDS = [0, 1, 2, 3, 4] +EPOCHS = 200 +OUT_DIR = 'results/dblp_depth' + + +def load_dblp(): + ds = CitationFull(root='./data', name='DBLP') + data = ds[0] + N, C = data.num_nodes, ds.num_classes + # Random split + rng = torch.Generator().manual_seed(42) + train_mask = torch.zeros(N, dtype=torch.bool) + val_mask = torch.zeros(N, dtype=torch.bool) + test_mask = torch.zeros(N, dtype=torch.bool) + for c in range(C): + idx = (data.y == c).nonzero(as_tuple=True)[0] + perm = torch.randperm(len(idx), generator=rng) + n_tr = max(1, int(0.05 * len(idx))) # 5% train (like Planetoid) + n_va = max(1, int(0.1 * len(idx))) + train_mask[idx[perm[:n_tr]]] = True + val_mask[idx[perm[n_tr:n_tr + n_va]]] = True + test_mask[idx[perm[n_tr + n_va:]]] = True + + A_hat = build_normalized_adj(data.edge_index, N) + A_row, A_row_T = build_row_normalized_adj(data.edge_index, N) + traces = {k: torch.tensor(0.0) for k in range(5)} # skip expensive trace computation + + return { + 'X': data.x.to(device), 'y': data.y.to(device), + 'A_hat': A_hat.to(device), 'A_row': A_row.to(device), 'A_row_T': A_row_T.to(device), + 'train_mask': train_mask.to(device), 'val_mask': val_mask.to(device), + 'test_mask': test_mask.to(device), + 'num_nodes': N, 'num_features': data.x.shape[1], 'num_classes': C, + 'traces': {k: v.to(device) for k, v in traces.items()}, + } + + +def train_one(cls, common, extra, seed): + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + t = cls(**common, **extra) + if hasattr(t, 'align_mode'): + t.align_mode = 'chain_norm' + bv, bt = 0, 0 + for ep in range(EPOCHS): + t.train_step() + if ep % 5 == 0: + v, te = t.evaluate('val_mask'), t.evaluate('test_mask') + if v > bv: bv, bt = v, te + del t; torch.cuda.empty_cache() + return bt + + +def time_method(cls, common, extra, n_warmup=10, n_steps=200): + torch.manual_seed(0) + t = cls(**common, **extra) + if hasattr(t, 'align_mode'): + t.align_mode = 'chain_norm' + for _ in range(n_warmup): + t.train_step() + torch.cuda.synchronize() + times = [] + for _ in range(n_steps): + torch.cuda.synchronize(); t0 = time.perf_counter() + t.train_step() + torch.cuda.synchronize(); times.append(time.perf_counter() - t0) + del t; torch.cuda.empty_cache() + return float(np.median(times) * 1000) + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + results = {} + + grape_extra = dict(diffusion_alpha=0.5, diffusion_iters=10, + lr_feedback=0.5, num_probes=64, topo_mode='fixed_A') + + # ======== Part 1: DBLP full sweep ======== + print("=" * 60) + print("Part 1: CitationFull-DBLP") + print("=" * 60) + dblp = load_dblp() + print(f"DBLP: N={dblp['num_nodes']}, F={dblp['num_features']}, C={dblp['num_classes']}, " + f"train={dblp['train_mask'].sum().item()}", flush=True) + + for bb in ['gcn', 'sage', 'gin', 'appnp']: + for L in [5, 6]: + for lr in [0.001, 0.005, 0.01]: + common = dict(data=dblp, hidden_dim=64, lr=lr, weight_decay=5e-4, + num_layers=L, residual_alpha=0.0, backbone=bb) + key = f"DBLP|{bb}|L={L}|lr={lr}" + row = {} + for mname, cls, extra in [('BP', BPTrainer, {}), + ('DFA', DFATrainer, dict(diffusion_alpha=0.5, diffusion_iters=10)), + ('GrAPE', GraphGrAPETrainer, grape_extra)]: + accs = [train_one(cls, common, extra, s) for s in SEEDS] + row[mname] = {'mean': float(np.mean(accs)), 'std': float(np.std(accs))} + results[key] = row + bp, dfa, gr = row['BP']['mean']*100, row['DFA']['mean']*100, row['GrAPE']['mean']*100 + print(f" {bb:>6} L={L} lr={lr:.3f} | BP {bp:.1f} DFA {dfa:.1f} GrAPE {gr:.1f} | " + f"Δ(BP) {gr-bp:+.1f} Δ(DFA) {gr-dfa:+.1f}", flush=True) + + # DBLP efficiency + print("\nDBLP Efficiency:") + for bb in ['gcn', 'sage', 'gin', 'appnp']: + for L in [5, 6]: + common = dict(data=dblp, hidden_dim=64, lr=0.01, weight_decay=5e-4, + num_layers=L, residual_alpha=0.0, backbone=bb) + bp_ms = time_method(BPTrainer, common, {}) + eff_ms = time_method(GraphGrAPEEfficient, common, + dict(lr_feedback=0.5, num_probes=64, max_topo_power=3, + diff_alpha=0.5, align_every=10)) + key = f"DBLP_eff|{bb}|L={L}" + results[key] = {'BP_ms': bp_ms, 'GrAPE_Eff_ms': eff_ms, 'speedup': bp_ms / eff_ms} + print(f" {bb:>6} L={L} | BP {bp_ms:.2f}ms GrAPE-Eff {eff_ms:.2f}ms | " + f"speedup {bp_ms/eff_ms:.2f}x", flush=True) + + # ======== Part 2: Depth sweep L=2-4 补数据 (Planetoid × GCN/SAGE/APPNP) ======== + print("\n" + "=" * 60) + print("Part 2: Depth sweep L=2,3,4 补数据") + print("=" * 60) + + from src.data import load_dataset + for ds_name in ['Cora', 'CiteSeer', 'PubMed']: + data = load_dataset(ds_name, device=device) + for bb in ['gcn', 'sage', 'appnp']: + for L in [2, 3, 4]: + common = dict(data=data, hidden_dim=64, lr=0.01, weight_decay=5e-4, + num_layers=L, residual_alpha=0.0, backbone=bb) + key = f"{ds_name}|{bb}|L={L}|lr=0.01" + row = {} + for mname, cls, extra in [('BP', BPTrainer, {}), + ('GrAPE', GraphGrAPETrainer, grape_extra)]: + accs = [train_one(cls, common, extra, s) for s in SEEDS] + row[mname] = {'mean': float(np.mean(accs)), 'std': float(np.std(accs))} + results[key] = row + bp, gr = row['BP']['mean']*100, row['GrAPE']['mean']*100 + print(f" {ds_name:>10} {bb:>6} L={L} | BP {bp:.1f} GrAPE {gr:.1f} | Δ {gr-bp:+.1f}", flush=True) + + with open(os.path.join(OUT_DIR, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f"\nSaved to {OUT_DIR}/results.json") + + +if __name__ == '__main__': + main() diff --git a/experiments/run_dblp_depth_scaling.py b/experiments/run_dblp_depth_scaling.py new file mode 100644 index 0000000..4c0bc11 --- /dev/null +++ b/experiments/run_dblp_depth_scaling.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +"""E1: DBLP depth scaling — upgrade depth_stress 3-seed to 20 seeds on DBLP, +extend to L={8,12,16,20,24,32}. Goal: confirm (or falsify) the preliminary +finding that GRAFT > ResGCN at L=16 (3-seed: 69.9 vs 63.7) and scales to L=32. + +BP vs ResGCN vs GRAFT vs GRAFT+ResGCN, GCN backbone, lr=0.01, 200 epochs.""" + +import torch +import numpy as np +import json +import os +from scipy import stats as scipy_stats +from src.trainers import BPTrainer, GraphGrAPETrainer +from run_deep_baselines import ResGCNTrainer +from run_combo_20seeds import GRAFTResGCN +from run_dblp_depth import load_dblp + +device = 'cuda:0' # selected via CUDA_VISIBLE_DEVICES +SEEDS = list(range(20)) +EPOCHS = 200 +DEPTHS = [8, 12, 16, 20, 24, 32] +OUT_DIR = 'results/dblp_depth_scaling_20seeds' + +grape_extra = dict(diffusion_alpha=0.5, diffusion_iters=10, + lr_feedback=0.5, num_probes=64, topo_mode='fixed_A') + +METHODS = { + 'BP': (BPTrainer, {}), + 'ResGCN': (ResGCNTrainer, {}), + 'GRAFT': (GraphGrAPETrainer, grape_extra), + 'GRAFT+ResGCN': (GRAFTResGCN, grape_extra), +} + + +def train_one(cls, common, extra, seed): + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + t = cls(**common, **extra) + if hasattr(t, 'align_mode'): + t.align_mode = 'chain_norm' + bv, bt = 0, 0 + for ep in range(EPOCHS): + t.train_step() + if ep % 5 == 0: + v = t.evaluate('val_mask') + te = t.evaluate('test_mask') + if v > bv: bv, bt = v, te + del t; torch.cuda.empty_cache() + return bt + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + per_seed_file = os.path.join(OUT_DIR, 'per_seed_data.json') + if os.path.exists(per_seed_file): + with open(per_seed_file) as f: + per_seed_data = json.load(f) + else: + per_seed_data = {} + + data = load_dblp() + + for L in DEPTHS: + print(f"\n{'=' * 70}\nDepth L={L}\n{'=' * 70}", flush=True) + common = dict(data=data, hidden_dim=64, lr=0.01, weight_decay=5e-4, + num_layers=L, residual_alpha=0.0, backbone='gcn') + + for mname, (cls, extra) in METHODS.items(): + key = f"DBLP_L{L}_{mname}" + if key not in per_seed_data: + per_seed_data[key] = {} + + print(f"\n--- {key} ({len(SEEDS)} seeds) ---", flush=True) + for seed in SEEDS: + sk = str(seed) + if sk in per_seed_data[key]: + print(f" seed {seed}: cached ({per_seed_data[key][sk]*100:.1f}%)", flush=True) + continue + try: + acc = train_one(cls, common, extra, seed) + per_seed_data[key][sk] = acc + print(f" seed {seed}: {acc*100:.1f}%", flush=True) + except Exception as e: + print(f" seed {seed}: FAILED - {e}", flush=True) + per_seed_data[key][sk] = 0.0 + + with open(per_seed_file, 'w') as f: + json.dump(per_seed_data, f, indent=2) + + # Summary + print(f"\n{'=' * 70}\nDBLP depth scaling summary\n{'=' * 70}") + results = {} + for L in DEPTHS: + print(f"\nL={L}:") + method_means = {} + for mname in METHODS: + key = f"DBLP_L{L}_{mname}" + vals = np.array([per_seed_data[key][str(s)] for s in SEEDS]) * 100 + method_means[mname] = (vals.mean(), vals.std()) + results[key] = {'mean': float(vals.mean()), 'std': float(vals.std()), + 'per_seed': vals.tolist()} + print(f" {mname:<15} {vals.mean():5.1f} ± {vals.std():4.1f}") + + # GRAFT vs ResGCN (paired) + g_accs = np.array([per_seed_data[f"DBLP_L{L}_GRAFT"][str(s)] for s in SEEDS]) * 100 + r_accs = np.array([per_seed_data[f"DBLP_L{L}_ResGCN"][str(s)] for s in SEEDS]) * 100 + t_gr, p_gr = scipy_stats.ttest_rel(g_accs, r_accs) + print(f" GRAFT vs ResGCN: Δ={g_accs.mean() - r_accs.mean():+.1f}, p={p_gr:.4f}") + + with open(os.path.join(OUT_DIR, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f"\nSaved to {OUT_DIR}/results.json") + + +if __name__ == '__main__': + main() diff --git a/experiments/run_depth_extras.py b/experiments/run_depth_extras.py new file mode 100644 index 0000000..66a7d45 --- /dev/null +++ b/experiments/run_depth_extras.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +"""H11: Fill depth sweep at L=14 and L=18 to densify Fig 4(a). +3 methods (BP / DFA-GNN / GRAFT) × 4 datasets × 2 depths × 20 seeds = 480 runs. +""" + +import torch +import numpy as np +import json +import os +from src.data import load_dataset +from src.trainers import BPTrainer, DFAGNNTrainer, GraphGrAPETrainer +from run_dblp_depth import load_dblp + +device = 'cuda:0' +SEEDS = list(range(20)) +EPOCHS = 200 +DEPTHS = [14, 18] +OUT_DIR = 'results/depth_extras_20seeds' + +grape_extra = dict(diffusion_alpha=0.5, diffusion_iters=10, + lr_feedback=0.5, num_probes=64, topo_mode='fixed_A') +dfagnn_extra = dict(diffusion_alpha=0.5, diffusion_iters=10, max_topo_power=3) + +METHODS = { + 'BP': (BPTrainer, {}), + 'DFA-GNN': (DFAGNNTrainer, dfagnn_extra), + 'GRAFT': (GraphGrAPETrainer, grape_extra), +} + + +def train_one(cls, common, extra, seed): + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + t = cls(**common, **extra) + if hasattr(t, 'align_mode'): + t.align_mode = 'chain_norm' + bv, bt = 0, 0 + for ep in range(EPOCHS): + t.train_step() + if ep % 5 == 0: + v = t.evaluate('val_mask') + te = t.evaluate('test_mask') + if v > bv: bv, bt = v, te + del t; torch.cuda.empty_cache() + return bt + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + per_seed_file = os.path.join(OUT_DIR, 'per_seed_data.json') + if os.path.exists(per_seed_file): + with open(per_seed_file) as f: + per_seed_data = json.load(f) + else: + per_seed_data = {} + + datasets_cfg = { + 'Cora': lambda: load_dataset('Cora', device=device), + 'CiteSeer': lambda: load_dataset('CiteSeer', device=device), + 'PubMed': lambda: load_dataset('PubMed', device=device), + 'DBLP': lambda: load_dblp(), + } + + for ds_name, loader in datasets_cfg.items(): + data = loader() + for L in DEPTHS: + common = dict(data=data, hidden_dim=64, lr=0.01, weight_decay=5e-4, + num_layers=L, residual_alpha=0.0, backbone='gcn') + for mname, (cls, extra) in METHODS.items(): + key = f"{ds_name}_L{L}_{mname}" + if key not in per_seed_data: + per_seed_data[key] = {} + print(f"\n=== {key} (20 seeds) ===", flush=True) + for seed in SEEDS: + sk = str(seed) + if sk in per_seed_data[key]: + continue + try: + acc = train_one(cls, common, extra, seed) + per_seed_data[key][sk] = acc + print(f" seed {seed}: {acc*100:.1f}%", flush=True) + except Exception as e: + print(f" seed {seed}: FAILED - {e}", flush=True) + per_seed_data[key][sk] = 0.0 + with open(per_seed_file, 'w') as f: + json.dump(per_seed_data, f, indent=2) + del data; torch.cuda.empty_cache() + + print(f"\nDone. Saved to {per_seed_file}") + + +if __name__ == '__main__': + main() diff --git a/experiments/run_dfagnn_depth.py b/experiments/run_dfagnn_depth.py new file mode 100644 index 0000000..ed6e6c3 --- /dev/null +++ b/experiments/run_dfagnn_depth.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +"""H7: DFA-GNN depth sweep for Figure 4(a)-style plot. + +Runs DFA-GNN at L ∈ {4, 8, 10, 12, 16, 20} × {Cora, CiteSeer, PubMed, DBLP} × 20 seeds. +L=6 data already exists from prior experiments; L=2/3 skipped (CiteSeer L=2 GRAFT soft spot). + +Combined with existing BP and GRAFT depth data, produces 3-method depth curves for Figure 4(a). +""" + +import torch +import numpy as np +import json +import os +from src.data import load_dataset +from src.trainers import DFAGNNTrainer +from run_dblp_depth import load_dblp + +device = 'cuda:0' +SEEDS = list(range(20)) +EPOCHS = 200 +DEPTHS = [4, 8, 10, 12, 16, 20] +OUT_DIR = 'results/dfagnn_depth_20seeds' + +dfagnn_extra = dict(diffusion_alpha=0.5, diffusion_iters=10, max_topo_power=3) + + +def train_one(data, L, seed): + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + t = DFAGNNTrainer(data=data, hidden_dim=64, lr=0.01, weight_decay=5e-4, + num_layers=L, residual_alpha=0.0, backbone='gcn', **dfagnn_extra) + bv, bt = 0, 0 + for ep in range(EPOCHS): + t.train_step() + if ep % 5 == 0: + v = t.evaluate('val_mask') + te = t.evaluate('test_mask') + if v > bv: bv, bt = v, te + del t; torch.cuda.empty_cache() + return bt + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + per_seed_file = os.path.join(OUT_DIR, 'per_seed_data.json') + if os.path.exists(per_seed_file): + with open(per_seed_file) as f: + per_seed_data = json.load(f) + else: + per_seed_data = {} + + datasets_cfg = { + 'Cora': lambda: load_dataset('Cora', device=device), + 'CiteSeer': lambda: load_dataset('CiteSeer', device=device), + 'PubMed': lambda: load_dataset('PubMed', device=device), + 'DBLP': lambda: load_dblp(), + } + + for ds_name, loader in datasets_cfg.items(): + data = loader() + for L in DEPTHS: + key = f"{ds_name}_L{L}_DFA-GNN" + if key not in per_seed_data: + per_seed_data[key] = {} + + print(f"\n=== {key} (20 seeds) ===", flush=True) + for seed in SEEDS: + sk = str(seed) + if sk in per_seed_data[key]: + print(f" seed {seed}: cached ({per_seed_data[key][sk]*100:.1f}%)", flush=True) + continue + try: + acc = train_one(data, L, seed) + per_seed_data[key][sk] = acc + print(f" seed {seed}: {acc*100:.1f}%", flush=True) + except Exception as e: + print(f" seed {seed}: FAILED - {e}", flush=True) + per_seed_data[key][sk] = 0.0 + + with open(per_seed_file, 'w') as f: + json.dump(per_seed_data, f, indent=2) + del data; torch.cuda.empty_cache() + + # Summary + print(f"\n{'=' * 70}\nDFA-GNN depth sweep summary (20 seeds)\n{'=' * 70}") + results = {} + for ds in datasets_cfg: + print(f"\n{ds}:") + for L in DEPTHS: + key = f"{ds}_L{L}_DFA-GNN" + vals = np.array([per_seed_data[key][str(s)] for s in SEEDS]) * 100 + results[key] = {'mean': float(vals.mean()), 'std': float(vals.std()), + 'per_seed': vals.tolist()} + print(f" L={L:2d} DFA-GNN {vals.mean():5.1f} ± {vals.std():4.1f}") + + with open(os.path.join(OUT_DIR, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f"\nSaved to {OUT_DIR}/results.json") + + +if __name__ == '__main__': + main() diff --git a/experiments/run_diag_section23_v2.py b/experiments/run_diag_section23_v2.py new file mode 100644 index 0000000..f583031 --- /dev/null +++ b/experiments/run_diag_section23_v2.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +"""§2.3 diagnostic v2 — faithful reproduction of original methodology. + +Uses src.trainers.BPTrainer (the actual training stack used in the paper), +matching results/gradient_reach_20seeds/per_seed_data.json which shows +GCN L=10 weight grad norms = 0.0 for all 20 seeds × 10 layers. + +Adds beyond the original: + - pre-activation grad G_Z[l] = ||dL/dZ_l||_F and RMS-normed variant + - forward magnitudes M[l] = ||H_l||_F and RMS-normed + - centered dispersion D[l] = ||H_l - mean||_F / D_0 + - frozen linear probe probe_acc[l] on H_l + +Backbone: GCN. Cora. 100 epochs (matches original). 20 seeds. Depths {6, 10, 20}. +Output: results/diag_section23/diag_data_v2.json +""" +import json, os, sys +import numpy as np +import torch +import torch.nn.functional as F +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import StandardScaler + +sys.path.insert(0, '/home/yurenh2/graph-grape') +from src.data import load_dataset, spmm +from src.trainers import BPTrainer + +DEVICE = 'cuda:0' # CUDA_VISIBLE_DEVICES=2 → cuda:0 +HIDDEN = 64 +LR = 0.01 +WD = 5e-4 +EPOCHS = 100 +SEEDS = list(range(20)) +OUT_DIR = '/home/yurenh2/graph-grape/results/diag_section23' +os.makedirs(OUT_DIR, exist_ok=True) + + +def forward_with_intermediates(bp, capture_for_grad=False): + """Re-implement BPTrainer.forward() but capture per-layer Z (pre-act) and H (post-act). + H[0] = X (input features). For l = 1..L: H[l] = relu(Z[l-1]) (or Z[l-1] for last layer). + Z[0..L-1] are pre-activation outputs of each conv. + """ + X = bp.data['X'] + H_list = [X] + Z_list = [] + H = X + H0 = None + for l in range(bp.num_layers): + if l > 0 and l < bp.num_layers - 1 and bp.residual_alpha > 0 and H0 is not None: + H = (1 - bp.residual_alpha) * H + bp.residual_alpha * H0 + Z = bp._graph_conv(H, bp.weights[l], l) + if capture_for_grad: + Z.retain_grad() + Z_list.append(Z) + if l < bp.num_layers - 1: + H = F.relu(Z) + if l == 0: + H0 = H + else: + H = Z # final logits, no relu + H_list.append(H) + return H_list[-1], Z_list, H_list # logits, Z's, H's + + +def diagnose(seed, L, data): + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + bp = BPTrainer(data=data, hidden_dim=HIDDEN, lr=LR, weight_decay=WD, + num_layers=L, residual_alpha=0.0, backbone='gcn') + + for _ in range(EPOCHS): + bp.train_step() + + # Diagnostic forward at epoch 100 + bp.optimizer.zero_grad() + logits, Zs, Hs = forward_with_intermediates(bp, capture_for_grad=True) + mask = data['train_mask'] + loss = F.cross_entropy(logits[mask], data['y'][mask]) + loss.backward(retain_graph=False) + + # Weight gradients (original methodology) + W_grads_F = [float(bp.weights[l].grad.detach().norm().item()) for l in range(L)] + W_grads_rms = [g / np.sqrt(bp.weights[l].numel()) for g, l in zip(W_grads_F, range(L))] + + # Pre-activation gradients on Z_l (l=0..L-1) + Z_grads_F = [] + Z_grads_rms = [] + for z in Zs: + if z.grad is None: + Z_grads_F.append(0.0); Z_grads_rms.append(0.0); continue + N, d_ = z.shape + gf = float(z.grad.detach().norm().item()) + Z_grads_F.append(gf) + Z_grads_rms.append(gf / np.sqrt(N * d_)) + + # Forward state metrics on H_l (l=0..L) + M_F, M_rms = [], [] + D_raw = [] + for H in Hs: + N, d_ = H.shape + mf = float(H.detach().norm().item()) + M_F.append(mf) + M_rms.append(mf / np.sqrt(N * d_)) + mu = H.detach().mean(0, keepdim=True) + D_raw.append(float((H.detach() - mu).norm().item())) + D0 = D_raw[0] if D_raw[0] > 0 else 1.0 + D_norm = [d / D0 for d in D_raw] + + # Frozen linear probe on each H_l + probe_acc = [] + ytr = data['y'][data['train_mask']].cpu().numpy() + yte = data['y'][data['test_mask']].cpu().numpy() + train_mask_b = data['train_mask'] + test_mask_b = data['test_mask'] + for H in Hs: + Xtr = H.detach()[train_mask_b].cpu().numpy() + Xte = H.detach()[test_mask_b].cpu().numpy() + try: + sc = StandardScaler().fit(Xtr) + Xtr_s = sc.transform(Xtr) + Xte_s = sc.transform(Xte) + clf = LogisticRegression(max_iter=2000, C=1.0).fit(Xtr_s, ytr) + acc = float(clf.score(Xte_s, yte)) + except Exception: + acc = float('nan') + probe_acc.append(acc) + + bp_acc = bp.evaluate('test_mask') + + del bp; torch.cuda.empty_cache() + return dict(L=L, seed=seed, bp_acc=bp_acc, + W_grads_F=W_grads_F, W_grads_rms=W_grads_rms, + Z_grads_F=Z_grads_F, Z_grads_rms=Z_grads_rms, + M_F=M_F, M_rms=M_rms, D_raw=D_raw, D_norm=D_norm, + probe_acc=probe_acc) + + +def main(): + data = load_dataset('Cora', device=DEVICE) + print(f"Cora: N={data['X'].shape[0]}, F={data['X'].shape[1]}, " + f"C={data['num_classes']}", flush=True) + + all_results = {} + for L in [20, 10, 6]: + print(f'\n=== L={L} ===', flush=True) + rows = [] + for s in SEEDS: + r = diagnose(s, L, data) + rows.append(r) + wg = r['W_grads_F'] + print(f" L={L} s={s:2d} acc={r['bp_acc']:.4f} " + f"W_grads[0,mid,-1]=[{wg[0]:.2e}, {wg[len(wg)//2]:.2e}, {wg[-1]:.2e}] " + f"Z_grad[out]={r['Z_grads_F'][-1]:.2e}", flush=True) + all_results[f'L={L}'] = rows + + out_path = os.path.join(OUT_DIR, 'diag_data_v2.json') + with open(out_path, 'w') as f: + json.dump(all_results, f, indent=2) + print(f'\nSaved {out_path}') + + print('\n=== summary ===') + for k, rows in all_results.items(): + Wg = np.array([r['W_grads_F'] for r in rows]) + n_under = int((Wg < 1e-38).sum()) + n_total = Wg.size + accs = np.array([r['bp_acc'] for r in rows]) + print(f' {k}: BP acc {accs.mean():.4f}±{accs.std():.4f} ' + f'W_grads_F median={np.median(Wg):.3e} ' + f'<1e-38: {n_under}/{n_total} cells') + + +if __name__ == '__main__': + main() diff --git a/experiments/run_ff_baseline.py b/experiments/run_ff_baseline.py new file mode 100644 index 0000000..095b811 --- /dev/null +++ b/experiments/run_ff_baseline.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +"""H4: Forward-Forward with Virtual-Node variant (FF+VN, Hinton 2022 + graph adaptation). + +Each layer trained locally to discriminate positive vs negative samples via a +"goodness" function (sum of squared activations). For graph data with virtual +node: + - Positive sample: augment graph with a virtual node connected to all real + nodes. The VN feature encodes the CORRECT class label (one-hot). + - Negative sample: same graph augmentation but VN feature encodes a WRONG + (random) class label. + - Goodness at layer l: g_l = mean(H_l^2) (clamped via sigmoid threshold θ) + - Local loss: binary cross-entropy on goodness, positive should exceed θ, + negative should stay below θ. + - Each layer trained independently on its own local loss. + +Inference: take final-layer goodness at virtual node across candidate labels, +pick argmax. + +Runs on Cora/CiteSeer/PubMed/DBLP × 20 seeds, GCN L=6. +""" + +import torch +import torch.nn.functional as F +import numpy as np +import json +import os +from src.data import load_dataset, spmm +from run_dblp_depth import load_dblp + +device = 'cuda:0' +SEEDS = list(range(20)) +EPOCHS = 200 +OUT_DIR = 'results/ff_baseline_20seeds' + + +class FFTrainer: + """FF+VN for GCN L=6: virtual node carries label, per-layer goodness-discriminator.""" + + def __init__(self, data, hidden_dim, lr, weight_decay, + num_layers=2, residual_alpha=0.0, backbone='gcn', + ff_threshold=2.0, **_kw): + dev = data['X'].device + self.data = data + self.device = dev + self.lr = lr + self.wd = weight_decay + self.num_layers = num_layers + self.backbone = backbone + self.theta = ff_threshold + + d_in_orig = data['num_features'] + d_out = data['num_classes'] + self.d_in = d_in_orig + d_out # augmented: original features + label one-hot slot + self.d_out = d_out + self.N_orig = data['num_nodes'] + + dims = [self.d_in] + [hidden_dim] * (num_layers - 1) + [hidden_dim] + self.weights = [] + for i in range(num_layers): + w = torch.empty(dims[i], dims[i + 1], device=dev) + torch.nn.init.xavier_uniform_(w) + w.requires_grad_(True) + self.weights.append(w) + + self.optim = torch.optim.Adam(self.weights, lr=lr, weight_decay=weight_decay) + + # Pre-build augmented adjacency with virtual node + self.A_hat_aug = self._build_vn_adj() + + def _build_vn_adj(self): + """Augment A_hat with a virtual node (index N) connected to all N real nodes. + Re-normalize symmetrically.""" + N = self.N_orig + A = self.data['A_hat'] # (N, N) sparse + # For simplicity build dense adjacency (OK for small graphs) + if A.is_sparse: + A_dense = A.to_dense() + else: + A_dense = A + # Add row/col for VN (index N) + A_big = torch.zeros(N + 1, N + 1, device=A.device) + A_big[:N, :N] = A_dense + A_big[N, :N] = 1.0 # VN connects to all + A_big[:N, N] = 1.0 # symmetric + A_big[N, N] = 1.0 # self-loop for VN + # Symmetric re-normalize: D^(-1/2) (A + I) D^(-1/2). Our A_hat already has + # self-loops + normalization per convention. For simplicity just re-normalize. + deg = A_big.sum(dim=1) + deg_inv_sqrt = deg.pow(-0.5) + deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0 + D_inv_sqrt = torch.diag(deg_inv_sqrt) + A_norm = D_inv_sqrt @ A_big @ D_inv_sqrt + return A_norm + + def _make_input(self, label_vec): + """Build augmented X (N+1, d_in_orig + d_out) with VN (row N) carrying + label_vec (one-hot vector of length d_out) in its last d_out slots. + Real nodes (rows 0..N-1) have 0s in label slots.""" + X_orig = self.data['X'] + N = self.N_orig + # Original features padded with zeros in label slots + zeros_lbl = torch.zeros(N, self.d_out, device=self.device) + X_real = torch.cat([X_orig, zeros_lbl], dim=1) + # Virtual node: zero features, label_vec in label slots + zeros_feat = torch.zeros(1, X_orig.shape[1], device=self.device) + X_vn = torch.cat([zeros_feat, label_vec.unsqueeze(0)], dim=1) + return torch.cat([X_real, X_vn], dim=0) + + def _forward_layer(self, H, l): + """One GCN layer on augmented graph.""" + HW = H @ self.weights[l] + return self.A_hat_aug @ HW + + def _forward_all(self, X_aug): + """Full forward through L layers, returning [H_l for l in 0..L].""" + H = X_aug + Hs = [H] + for l in range(self.num_layers): + Z = self._forward_layer(H, l) + if l < self.num_layers - 1: + H = F.relu(Z) + else: + H = Z + Hs.append(H) + return Hs + + def _goodness(self, H): + """Goodness = sum of squared activations (Hinton 2022).""" + return (H ** 2).sum(dim=1).mean() + + def train_step(self): + y = self.data['y'] + mask = self.data['train_mask'] + + # Pick one labeled node at random per step for simplicity + # Or: use all labeled nodes with aggregated goodness + # For efficiency, use all at once: VN label is the majority train label + # But that doesn't make sense — VN should carry different labels in pos/neg. + # Compromise: random positive/negative labels sampled per step, using VN + + # Positive: pick one of the labeled classes as VN label (one-hot) + train_labels = y[mask] + labeled_node_count = mask.sum().item() + if labeled_node_count == 0: + return 0.0, 0.0, {} + # Use all training labels to construct a distribution + # For simplicity: pos sample uses one-hot majority class; neg uses random wrong + pos_label_idx = train_labels[torch.randint(0, labeled_node_count, (1,), device=self.device)].item() + pos_label = F.one_hot(torch.tensor(pos_label_idx, device=self.device), self.d_out).float() + + # Negative: pick a wrong class + wrong_classes = [c for c in range(self.d_out) if c != pos_label_idx] + neg_label_idx = wrong_classes[torch.randint(0, len(wrong_classes), (1,)).item()] + neg_label = F.one_hot(torch.tensor(neg_label_idx, device=self.device), self.d_out).float() + + X_pos = self._make_input(pos_label) + X_neg = self._make_input(neg_label) + + self.optim.zero_grad() + + # Forward both, collect per-layer goodness + Hs_pos = self._forward_all(X_pos) + Hs_neg = self._forward_all(X_neg) + + total_loss = 0.0 + for l in range(1, self.num_layers + 1): # skip input + H_pos = Hs_pos[l] + H_neg = Hs_neg[l] + # Detach previous-layer outputs to block upstream gradient (FF principle) + # But layers are connected through Hs_pos[l-1] which gets used in next layer. + # Detach Hs_pos[l] so gradient at layer l+1 doesn't flow to l. + # Simpler: recompute per-layer with detach + # Actually just use local loss per layer on goodness + + g_pos = self._goodness(H_pos) + g_neg = self._goodness(H_neg) + + # FF loss: logistic + loss_l = F.softplus(-(g_pos - self.theta)).mean() + F.softplus(g_neg - self.theta).mean() + total_loss += loss_l.item() + loss_l.backward(retain_graph=(l < self.num_layers)) + + self.optim.step() + + return total_loss, 0.0, {} + + @torch.no_grad() + def evaluate(self, mask_name='test_mask'): + """For each test node, try each candidate VN label, pick the one with + highest final-layer goodness at the test node's position.""" + mask = self.data[mask_name] + y = self.data['y'] + + # For each candidate class c: build input with VN carrying class c, forward + goodness_per_class = [] + for c in range(self.d_out): + lbl = F.one_hot(torch.tensor(c, device=self.device), self.d_out).float() + X_aug = self._make_input(lbl) + Hs = self._forward_all(X_aug) + # Use final hidden layer + H_final = Hs[-1][:self.N_orig] # exclude VN + # Per-node goodness + gn = (H_final ** 2).sum(dim=1) # (N,) + goodness_per_class.append(gn) + goodness = torch.stack(goodness_per_class, dim=1) # (N, C) + preds = goodness.argmax(dim=1) + return (preds[mask] == y[mask]).float().mean().item() + + +def train_one(seed, data): + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + t = FFTrainer(data=data, hidden_dim=64, lr=0.01, weight_decay=5e-4, + num_layers=6, residual_alpha=0.0, backbone='gcn') + bv, bt = 0, 0 + for ep in range(EPOCHS): + t.train_step() + if ep % 5 == 0: + v = t.evaluate('val_mask') + te = t.evaluate('test_mask') + if v > bv: bv, bt = v, te + del t; torch.cuda.empty_cache() + return bt + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + per_seed_file = os.path.join(OUT_DIR, 'per_seed_data.json') + if os.path.exists(per_seed_file): + with open(per_seed_file) as f: + per_seed_data = json.load(f) + else: + per_seed_data = {} + + datasets_cfg = { + 'Cora': lambda: load_dataset('Cora', device=device), + 'CiteSeer': lambda: load_dataset('CiteSeer', device=device), + 'PubMed': lambda: load_dataset('PubMed', device=device), + 'DBLP': lambda: load_dblp(), + } + + for ds_name, loader in datasets_cfg.items(): + data = loader() + key = f"{ds_name}_FF+VN" + if key not in per_seed_data: + per_seed_data[key] = {} + + print(f"\n=== {key} (20 seeds, GCN L=6) ===", flush=True) + for seed in SEEDS: + sk = str(seed) + if sk in per_seed_data[key]: + print(f" seed {seed}: cached ({per_seed_data[key][sk]*100:.1f}%)", flush=True) + continue + try: + acc = train_one(seed, data) + per_seed_data[key][sk] = acc + print(f" seed {seed}: {acc*100:.1f}%", flush=True) + except Exception as e: + print(f" seed {seed}: FAILED - {e}", flush=True) + per_seed_data[key][sk] = 0.0 + + with open(per_seed_file, 'w') as f: + json.dump(per_seed_data, f, indent=2) + + del data; torch.cuda.empty_cache() + + # Summary + print(f"\n{'=' * 70}\nFF+VN summary (20 seeds, GCN L=6)\n{'=' * 70}") + results = {} + for ds in datasets_cfg: + key = f"{ds}_FF+VN" + vals = np.array([per_seed_data[key][str(s)] for s in SEEDS]) * 100 + results[key] = {'mean': float(vals.mean()), 'std': float(vals.std()), + 'per_seed': vals.tolist()} + print(f" {ds:<12} {vals.mean():5.1f} ± {vals.std():4.1f}") + + with open(os.path.join(OUT_DIR, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f"\nSaved to {OUT_DIR}/results.json") + + +if __name__ == '__main__': + main() diff --git a/experiments/run_grad_reach_20seeds.py b/experiments/run_grad_reach_20seeds.py new file mode 100644 index 0000000..b5ad53b --- /dev/null +++ b/experiments/run_grad_reach_20seeds.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +"""Gradient reach with 20 seeds (0-19) for statistical significance. + +Extends 5-seed results. Loads existing seeds 0-4 data if available, +runs seeds 5-19, then combines for final statistics. +""" + +import torch +import torch.nn.functional as F +import numpy as np +import json +import os +from scipy import stats as scipy_stats +from src.data import load_dataset, spmm +from src.trainers import BPTrainer, GraphGrAPETrainer + +device = 'cuda:0' +ALL_SEEDS = list(range(20)) +EPOCHS = 100 +OUT_DIR = 'results/gradient_reach_20seeds' +OLD_FILE = 'results/gradient_reach_5seeds/results.json' + + +def measure_one(data, L, backbone, seed): + A = data['A_hat'] + common = dict(data=data, hidden_dim=64, lr=0.01, weight_decay=5e-4, + num_layers=L, residual_alpha=0.0, backbone=backbone) + grape_extra = dict(diffusion_alpha=0.5, diffusion_iters=10, + lr_feedback=0.5, num_probes=64, topo_mode='fixed_A') + + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + bp = BPTrainer(**common) + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + gr = GraphGrAPETrainer(**common, **grape_extra) + gr.align_mode = 'chain_norm' + + for _ in range(EPOCHS): + bp.train_step() + gr.train_step() + + # BP gradients + bp.optimizer.zero_grad() + Z_bp, _ = bp.forward() + mask = data['train_mask'] + loss = F.cross_entropy(Z_bp[mask], data['y'][mask]) + loss.backward() + bp_norms = [bp.weights[l].grad.norm().item() for l in range(L)] + + # GRAFT feedback norms + Z_gr, inter = gr.forward() + E0, E_bar = gr._output_error(Z_gr) + graft_norms = [] + for l in range(L - 1): + power = min(L - l, gr.max_topo_power) + topo_E = E_bar + for _ in range(power): + topo_E = spmm(A, topo_E) + fb = topo_E @ gr.Rs[l] + relu_gate = (inter['Zs'][l].detach() > 0).float() + graft_norms.append((relu_gate * fb).norm().item()) + + bp_acc = bp.evaluate('test_mask') + gr_acc = gr.evaluate('test_mask') + + del bp, gr; torch.cuda.empty_cache() + return bp_norms, graft_norms, bp_acc, gr_acc + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + data = load_dataset('Cora', device=device) + + # Load existing per-seed data if available + old_per_seed_file = os.path.join(OUT_DIR, 'per_seed_data.json') + if os.path.exists(old_per_seed_file): + with open(old_per_seed_file) as f: + per_seed_data = json.load(f) + print(f"Loaded existing per-seed data from {old_per_seed_file}") + else: + per_seed_data = {} + + configs = [ + ('gcn', 6), + ('gcn', 10), + ('appnp', 6), + ('appnp', 10), + ] + + for backbone, L in configs: + key = f"{backbone}_L{L}" + print(f"\n=== {backbone.upper()} L={L} (20 seeds) ===", flush=True) + + if key not in per_seed_data: + per_seed_data[key] = {} + + for seed in ALL_SEEDS: + seed_key = str(seed) + if seed_key in per_seed_data[key]: + print(f" seed {seed}: already done, skipping", flush=True) + continue + + bn, gn, ba, ga = measure_one(data, L, backbone, seed) + per_seed_data[key][seed_key] = { + 'bp_norms': bn, 'graft_norms': gn, + 'bp_acc': ba, 'gr_acc': ga + } + print(f" seed {seed}: BP {ba*100:.1f}% GRAFT {ga*100:.1f}%", flush=True) + + # Save incrementally + with open(old_per_seed_file, 'w') as f: + json.dump(per_seed_data, f, indent=2) + + # Aggregate results + results = {} + for backbone, L in configs: + key = f"{backbone}_L{L}" + sd = per_seed_data[key] + + bp_accs = np.array([sd[str(s)]['bp_acc'] for s in ALL_SEEDS]) * 100 + gr_accs = np.array([sd[str(s)]['gr_acc'] for s in ALL_SEEDS]) * 100 + t_stat, p_val = scipy_stats.ttest_rel(gr_accs, bp_accs) + + avg_bp_norms = np.mean([sd[str(s)]['bp_norms'] for s in ALL_SEEDS], axis=0) + avg_gr_norms = np.mean([sd[str(s)]['graft_norms'] for s in ALL_SEEDS], axis=0) + + results[key] = { + 'bp_acc_mean': float(bp_accs.mean()), + 'bp_acc_std': float(bp_accs.std()), + 'gr_acc_mean': float(gr_accs.mean()), + 'gr_acc_std': float(gr_accs.std()), + 'delta_mean': float((gr_accs - bp_accs).mean()), + 'delta_std': float((gr_accs - bp_accs).std()), + 't_stat': float(t_stat), + 'p_value': float(p_val), + 'n_seeds': 20, + 'avg_bp_norms': avg_bp_norms.tolist(), + 'avg_gr_norms': avg_gr_norms.tolist(), + 'bp_accs': bp_accs.tolist(), + 'gr_accs': gr_accs.tolist(), + } + + sig = '***' if p_val < 0.001 else ('**' if p_val < 0.01 else ('*' if p_val < 0.05 else 'ns')) + print(f"\n {key}:") + print(f" BP: {bp_accs.mean():.1f} ± {bp_accs.std():.1f}%") + print(f" GRAFT: {gr_accs.mean():.1f} ± {gr_accs.std():.1f}%") + print(f" Δ: {(gr_accs-bp_accs).mean():+.1f} ± {(gr_accs-bp_accs).std():.1f}% t={t_stat:.2f} p={p_val:.4f} {sig}") + print(f" BP norm L0: {avg_bp_norms[0]:.6f}") + print(f" GRAFT norm L0: {avg_gr_norms[0]:.4f}") + + with open(os.path.join(OUT_DIR, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f"\nSaved to {OUT_DIR}/results.json") + + +if __name__ == '__main__': + main() diff --git a/experiments/run_hero_extras.py b/experiments/run_hero_extras.py new file mode 100644 index 0000000..c6fed4d --- /dev/null +++ b/experiments/run_hero_extras.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +"""E0a+E0c+E0e: hero-table coverage expansion. Adds 3 datasets (PubMed stack, +Coauthor-Physics, Coauthor-CS) × 5 methods (BP, DFA, DFA-GNN, GRAFT, +GRAFT+ResGCN) × 20 seeds, all GCN L=6. Goal: 6-row hero table of homophilous +citation/coauthor graphs where GRAFT or GRAFT+ResGCN is best per row.""" + +import torch +import numpy as np +import json +import os +from scipy import stats as scipy_stats +from src.data import load_dataset, spmm +from src.trainers import BPTrainer, DFATrainer, DFAGNNTrainer, GraphGrAPETrainer +from run_deep_baselines import ResGCNTrainer +from run_combo_20seeds import GRAFTResGCN +from run_large_graph_scout import load_and_check +import torch.nn.functional as F + +device = 'cuda:0' +SEEDS = list(range(20)) +EPOCHS = 200 +OUT_DIR = 'results/hero_extras_20seeds' + +grape_extra = dict(diffusion_alpha=0.5, diffusion_iters=10, + lr_feedback=0.5, num_probes=64, topo_mode='fixed_A') +dfagnn_extra = dict(diffusion_alpha=0.5, diffusion_iters=10, max_topo_power=3) + + +# DFA-GNN + ResGCN wrapper (from run_dfagnn_resgcn.py but inlined for module independence) +class DFAGNNResGCN(DFAGNNTrainer): + def forward(self): + X = self.data['X'] + H = X + H0 = None + Hs, Zs = [], [] + for l in range(self.num_layers): + Z = self._graph_conv(H, self.weights[l], l) + Zs.append(Z) + if l < self.num_layers - 1: + H_new = F.relu(Z) + if H_new.size(1) == H.size(1): + H = H + H_new + else: + H = H_new + Hs.append(H) + if l == 0: + H0 = H + else: + return Z, {'Hs': Hs, 'Zs': Zs, 'H0': H0} + return Z, {'Hs': Hs, 'Zs': Zs, 'H0': H0} + + +METHODS = { + 'BP': (BPTrainer, {}), + 'DFA': (DFATrainer, dfagnn_extra), + 'DFA-GNN': (DFAGNNTrainer, dfagnn_extra), + 'GRAFT': (GraphGrAPETrainer, grape_extra), + 'GRAFT+ResGCN': (GRAFTResGCN, grape_extra), +} + + +def train_one(cls, common, extra, seed): + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + t = cls(**common, **extra) + if hasattr(t, 'align_mode'): + t.align_mode = 'chain_norm' + bv, bt = 0, 0 + for ep in range(EPOCHS): + t.train_step() + if ep % 5 == 0: + v = t.evaluate('val_mask') + te = t.evaluate('test_mask') + if v > bv: bv, bt = v, te + del t; torch.cuda.empty_cache() + return bt + + +def load_dataset_hero(name): + """Return data dict in the same format as load_dataset, for any hero-list dataset.""" + if name == 'PubMed': + return load_dataset('PubMed', device=device) + # Coauthor-* uses load_and_check which returns (stats, data) tuple + stats, data = load_and_check(name) + if data is None: + raise RuntimeError(f"Failed to load {name}") + return data + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + per_seed_file = os.path.join(OUT_DIR, 'per_seed_data.json') + if os.path.exists(per_seed_file): + with open(per_seed_file) as f: + per_seed_data = json.load(f) + else: + per_seed_data = {} + + # Order from fastest (PubMed ~19K) to slower (Physics ~34K, CS ~18K) + DATASETS = ['PubMed', 'Coauthor-CS', 'Coauthor-Physics'] + + for ds_name in DATASETS: + print(f"\n{'=' * 70}\n{ds_name} (GCN L=6, 20 seeds, 5 methods)\n{'=' * 70}", flush=True) + data = load_dataset_hero(ds_name) + + common = dict(data=data, hidden_dim=64, lr=0.01, weight_decay=5e-4, + num_layers=6, residual_alpha=0.0, backbone='gcn') + + for mname, (cls, extra) in METHODS.items(): + key = f"{ds_name}_{mname}" + if key not in per_seed_data: + per_seed_data[key] = {} + + print(f"\n--- {key} ---", flush=True) + for seed in SEEDS: + sk = str(seed) + if sk in per_seed_data[key]: + print(f" seed {seed}: cached ({per_seed_data[key][sk]*100:.1f}%)", flush=True) + continue + try: + acc = train_one(cls, common, extra, seed) + per_seed_data[key][sk] = acc + print(f" seed {seed}: {acc*100:.1f}%", flush=True) + except Exception as e: + print(f" seed {seed}: FAILED - {e}", flush=True) + per_seed_data[key][sk] = 0.0 + + with open(per_seed_file, 'w') as f: + json.dump(per_seed_data, f, indent=2) + + del data; torch.cuda.empty_cache() + + # Summary + print(f"\n{'=' * 70}\nHero-extras summary (20 seeds, GCN L=6)\n{'=' * 70}") + results = {} + for ds in DATASETS: + print(f"\n{ds}:") + method_means = {} + for mname in METHODS: + key = f"{ds}_{mname}" + vals = np.array([per_seed_data[key].get(str(s), 0.0) for s in SEEDS]) * 100 + method_means[mname] = (vals.mean(), vals.std()) + results[key] = {'mean': float(vals.mean()), 'std': float(vals.std()), + 'per_seed': vals.tolist()} + print(f" {mname:<16} {vals.mean():5.1f} ± {vals.std():4.1f}") + + # Flag best method + best_method = max(method_means.keys(), key=lambda k: method_means[k][0]) + print(f" >>> Best: {best_method} ({method_means[best_method][0]:.1f}%)") + + with open(os.path.join(OUT_DIR, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f"\nSaved to {OUT_DIR}/results.json") + + +if __name__ == '__main__': + main() diff --git a/experiments/run_pepita_baseline.py b/experiments/run_pepita_baseline.py new file mode 100644 index 0000000..2327217 --- /dev/null +++ b/experiments/run_pepita_baseline.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python3 +"""H2: PEPITA (Dellaferrera & Kreiman 2022) adapted to GCN L=6. + +Algorithm (per batch / full graph): + 1. Forward pass 1 (clean): X -> H_0, ..., H_{L-2}, Z_out + 2. Compute E0 = softmax(Z_out) - y_onehot (masked to train nodes, unscaled) + 3. Project error to input: X_mod = X - E0 @ F (F is fixed random: C × d_in) + 4. Forward pass 2 (modulated): X_mod -> H_0^m, ..., H_{L-2}^m + 5. Weight updates: + Output layer W_{L-1}: standard gradient via E0 (only place BP-like) + Hidden layer W_l (l < L-1): gradient ~ (agg_input_l)^T @ relu_gate * (H_l^clean - H_l^mod) + +Runs on 4 datasets × 20 seeds at GCN L=6. +""" + +import torch +import torch.nn.functional as F +import numpy as np +import json +import os +from src.data import load_dataset, spmm +from src.trainers import _FeedbackTrainerBase, label_spreading +from run_dblp_depth import load_dblp + +device = 'cuda:0' +SEEDS = list(range(20)) +EPOCHS = 200 +OUT_DIR = 'results/pepita_baseline_20seeds' + + +class PEPITATrainer(_FeedbackTrainerBase): + """PEPITA backward rule for GCN.""" + + def __init__(self, data, hidden_dim, lr, weight_decay, + diffusion_alpha=0.5, diffusion_iters=10, + num_layers=2, residual_alpha=0.0, backbone='gcn', + pepita_fb_scale=0.05, **_kw): + super().__init__(data, hidden_dim, lr, weight_decay, + diffusion_alpha, diffusion_iters, + num_layers, residual_alpha, backbone, + _kw.get('use_batchnorm', False), _kw.get('dropout', 0.0)) + # Fixed random feedback: C × d_in, projects output error back to input + self.F_fb = torch.randn(self.d_out, self.d_in, device=self.device) * pepita_fb_scale + + def _pepita_output_error_unscaled(self, Z_out): + """Raw error (not divided by n_labeled) for perturbation purposes.""" + mask = self.data['train_mask'] + y = self.data['y'] + probs = F.softmax(Z_out.detach(), dim=1) + y_oh = F.one_hot(y, self.d_out).float() + E = torch.zeros_like(probs) + E[mask] = probs[mask] - y_oh[mask] + return E + + def train_step(self): + # Pass 1: clean forward + Z_out_clean, inter_clean = self.forward() + # Perturbation error (unscaled) + E_unscaled = self._pepita_output_error_unscaled(Z_out_clean) + # Gradient error (scaled by n_labeled) for output layer + E0_scaled, _ = self._output_error(Z_out_clean) + + # Modulate input + X_orig = self.data['X'] + X_mod = X_orig - E_unscaled @ self.F_fb + + # Pass 2: modulated forward + self.data['X'] = X_mod + try: + Z_out_mod, inter_mod = self.forward() + finally: + self.data['X'] = X_orig + + # Per-layer gradients + grads = [] + for l in range(self.num_layers): + if l == self.num_layers - 1: + # Output layer: standard gradient via scaled E0 + H_prev = inter_clean['Hs'][-1] if inter_clean['Hs'] else X_orig + g = H_prev.t() @ self._graph_conv_T(E0_scaled, l) + else: + # Hidden layer: activity difference, relu-gated + if l == 0: + H_prev = X_orig + else: + H_prev = inter_clean['Hs'][l - 1] + + relu_gate = (inter_clean['Zs'][l].detach() > 0).float() + # activity difference (post-ReLU) + delta_post = inter_clean['Hs'][l] - inter_mod['Hs'][l] + # scale by n_labeled like BP does + n_labeled = self.data['train_mask'].sum().float().clamp(min=1.0) + delta = relu_gate * delta_post / n_labeled + + g = H_prev.t() @ self._graph_conv_T(delta, l) + grads.append(g) + + # Apply Adam + if self._use_adam: + self._adam_t += 1 + for i in range(self.num_layers): + self.weights[i] = self.weights[i] - self._adam_step(i, grads[i]) + else: + for i in range(self.num_layers): + self.weights[i] = self.weights[i] - self.lr * (grads[i] + self.wd * self.weights[i]) + + with torch.no_grad(): + mask = self.data['train_mask'] + loss = F.cross_entropy(Z_out_clean[mask], self.data['y'][mask]).item() + acc = (Z_out_clean[mask].argmax(1) == self.data['y'][mask]).float().mean().item() + return loss, acc, {} + + +def train_one(cls, common, extra, seed): + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + t = cls(**common, **extra) + bv, bt = 0, 0 + for ep in range(EPOCHS): + t.train_step() + if ep % 5 == 0: + v = t.evaluate('val_mask') + te = t.evaluate('test_mask') + if v > bv: bv, bt = v, te + del t; torch.cuda.empty_cache() + return bt + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + per_seed_file = os.path.join(OUT_DIR, 'per_seed_data.json') + if os.path.exists(per_seed_file): + with open(per_seed_file) as f: + per_seed_data = json.load(f) + else: + per_seed_data = {} + + datasets_cfg = { + 'Cora': lambda: load_dataset('Cora', device=device), + 'CiteSeer': lambda: load_dataset('CiteSeer', device=device), + 'PubMed': lambda: load_dataset('PubMed', device=device), + 'DBLP': lambda: load_dblp(), + } + + for ds_name, loader in datasets_cfg.items(): + data = loader() + common = dict(data=data, hidden_dim=64, lr=0.01, weight_decay=5e-4, + num_layers=6, residual_alpha=0.0, backbone='gcn') + + key = f"{ds_name}_PEPITA" + if key not in per_seed_data: + per_seed_data[key] = {} + + print(f"\n=== {key} (20 seeds, GCN L=6) ===", flush=True) + for seed in SEEDS: + sk = str(seed) + if sk in per_seed_data[key]: + print(f" seed {seed}: cached ({per_seed_data[key][sk]*100:.1f}%)", flush=True) + continue + try: + acc = train_one(PEPITATrainer, common, {}, seed) + per_seed_data[key][sk] = acc + print(f" seed {seed}: {acc*100:.1f}%", flush=True) + except Exception as e: + print(f" seed {seed}: FAILED - {e}", flush=True) + per_seed_data[key][sk] = 0.0 + + with open(per_seed_file, 'w') as f: + json.dump(per_seed_data, f, indent=2) + + del data; torch.cuda.empty_cache() + + # Summary + print(f"\n{'=' * 70}\nPEPITA summary (20 seeds, GCN L=6)\n{'=' * 70}") + results = {} + for ds in datasets_cfg: + key = f"{ds}_PEPITA" + vals = np.array([per_seed_data[key][str(s)] for s in SEEDS]) * 100 + results[key] = {'mean': float(vals.mean()), 'std': float(vals.std()), + 'per_seed': vals.tolist()} + print(f" {ds:<12} {vals.mean():5.1f} ± {vals.std():4.1f}") + + with open(os.path.join(OUT_DIR, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f"\nSaved to {OUT_DIR}/results.json") + + +if __name__ == '__main__': + main() diff --git a/experiments/run_realworld_hero_L20.py b/experiments/run_realworld_hero_L20.py new file mode 100644 index 0000000..93a6c91 --- /dev/null +++ b/experiments/run_realworld_hero_L20.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +"""H33: 20-seed extension of L=20 hero on 4 real-world datasets × {BP, DFA, DFA-GNN, GRAFT}. +Paper setup (5%/class, hidden=64, lr=0.01, no scheduler, 200 epochs, GCN backbone, no dropout/BN/res). + +Tightens DBLP std (0.121 at 10-seed bimodal) for paper-grade stats. +Run as: python run_realworld_hero_L20.py [SEED_START SEED_END] + default: 10..19 (extending prior seeds 0..9). +""" + +import sys, time +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch_geometric.datasets import CitationFull, Coauthor +from torch_geometric.nn import GCNConv +from torch_geometric.utils import add_self_loops, degree + +sys.path.insert(0, '/home/yurenh2/graph-grape') +from src.trainers import GraphGrAPETrainer + +device = torch.device('cuda:2') + + +def build_A_hat(edge_index, N): + edge_index, _ = add_self_loops(edge_index, num_nodes=N) + row, col = edge_index + deg = degree(row, num_nodes=N, dtype=torch.float) + dis = deg.pow(-0.5); dis[dis == float('inf')] = 0 + return torch.sparse_coo_tensor(edge_index, dis[row]*dis[col], (N, N)).coalesce() + + +def build_row_norm(edge_index, N): + ei, _ = add_self_loops(edge_index, num_nodes=N) + row, col = ei + deg = degree(row, num_nodes=N, dtype=torch.float).clamp(min=1) + A_row = torch.sparse_coo_tensor(ei, 1.0/deg[row], (N,N)).coalesce() + A_row_T = torch.sparse_coo_tensor(ei.flip(0), 1.0/deg[col], (N,N)).coalesce() + return A_row, A_row_T + + +def paper_split(N, y, seed, train_frac=0.05, n_val=500): + g = torch.Generator().manual_seed(seed) + train_mask = torch.zeros(N, dtype=torch.bool) + val_mask = torch.zeros(N, dtype=torch.bool) + test_mask = torch.zeros(N, dtype=torch.bool) + C = int(y.max()) + 1 + for c in range(C): + idx = (y == c).nonzero().flatten() + idx = idx[torch.randperm(idx.size(0), generator=g)] + n_tr = max(1, int(round(train_frac * idx.size(0)))) + train_mask[idx[:n_tr]] = True + remaining = (~train_mask).nonzero().flatten() + remaining = remaining[torch.randperm(remaining.size(0), generator=g)] + val_mask[remaining[:n_val]] = True + test_mask[remaining[n_val:]] = True + return train_mask, val_mask, test_mask + + +class GCN(nn.Module): + def __init__(self, in_dim, hidden, out_dim, L): + super().__init__() + self.convs = nn.ModuleList([GCNConv(in_dim if i==0 else hidden, + hidden if i