#!/usr/bin/env python3
"""E0a+E0c+E0e: hero-table coverage expansion.

Adds 3 datasets (PubMed stack, Coauthor-Physics, Coauthor-CS) × 5 methods
(BP, DFA, DFA-GNN, KAFT, KAFT+ResGCN) × 20 seeds, all GCN L=6.

Goal: 6-row hero table of homophilous citation/coauthor graphs where KAFT or
KAFT+ResGCN is best per row.

The run is resumable: per-seed accuracies are cached in
``OUT_DIR/per_seed_data.json`` and seeds already present there are skipped.
"""
import json
import os

import numpy as np
import torch
import torch.nn.functional as F
from scipy import stats as scipy_stats

from src.data import load_dataset, spmm
from src.trainers import BPTrainer, DFATrainer, DFAGNNTrainer, KAFTTrainer
from run_deep_baselines import ResGCNTrainer
from run_combo_20seeds import GRAFTResGCN
from run_large_graph_scout import load_and_check

device = 'cuda:0'
SEEDS = list(range(20))
EPOCHS = 200
OUT_DIR = 'results/hero_extras_20seeds'

# Extra constructor kwargs, merged with the shared ``common`` kwargs in main().
grape_extra = dict(diffusion_alpha=0.5, diffusion_iters=10, lr_feedback=0.5,
                   num_probes=64, topo_mode='fixed_A')
dfagnn_extra = dict(diffusion_alpha=0.5, diffusion_iters=10, max_topo_power=3)


class DFAGNNResGCN(DFAGNNTrainer):
    """DFA-GNN + ResGCN wrapper.

    Taken from run_dfagnn_resgcn.py but inlined for module independence.
    This class is defined here but is not registered in ``METHODS``.
    """

    def forward(self):
        """Run the layer stack with residual connections on hidden layers.

        Returns:
            A tuple ``(Z, cache)`` where ``Z`` is the output of the last graph
            convolution (no ReLU applied) and ``cache`` is a dict with keys
            ``'Hs'`` (hidden states after each non-final layer), ``'Zs'``
            (graph-conv outputs of every layer) and ``'H0'`` (the hidden
            state after the first layer, or ``None`` if there is only one
            layer).
        """
        # NOTE(review): the flattened original left the pairing of its
        # `else: return` ambiguous; it is reconstructed here as "last layer
        # returns its pre-activation" — confirm against run_dfagnn_resgcn.py.
        H = self.data['X']
        H0 = None
        Hs, Zs = [], []
        for layer in range(self.num_layers):
            Z = self._graph_conv(H, self.weights[layer], layer)
            Zs.append(Z)
            if layer == self.num_layers - 1:
                # Final layer: no activation, no residual.
                break
            H_new = F.relu(Z)
            # A skip connection is only possible when the width is unchanged.
            H = H + H_new if H_new.size(1) == H.size(1) else H_new
            Hs.append(H)
            if layer == 0:
                H0 = H
        return Z, {'Hs': Hs, 'Zs': Zs, 'H0': H0}


# Display name -> (trainer class, extra constructor kwargs).
METHODS = {
    'BP': (BPTrainer, {}),
    'DFA': (DFATrainer, dfagnn_extra),
    'DFA-GNN': (DFAGNNTrainer, dfagnn_extra),
    'KAFT': (KAFTTrainer, grape_extra),
    'KAFT+ResGCN': (GRAFTResGCN, grape_extra),
}


def train_one(cls, common, extra, seed):
    """Train one trainer instance for ``EPOCHS`` epochs and return its score.

    Args:
        cls: Trainer class, instantiated as ``cls(**common, **extra)``.
        common: Constructor kwargs shared by all methods.
        extra: Method-specific constructor kwargs.
        seed: Seed applied to torch (CPU and CUDA) and numpy before the
            trainer is constructed.

    Returns:
        The ``'test_mask'`` evaluation recorded at the checkpoint with the
        best ``'val_mask'`` evaluation; checkpoints are taken every 5 epochs.
        Returns 0.0 if validation never exceeds 0.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed_all(seed)

    trainer = cls(**common, **extra)
    try:
        if hasattr(trainer, 'align_mode'):
            trainer.align_mode = 'chain_norm'
        best_val, best_test = 0.0, 0.0
        for ep in range(EPOCHS):
            trainer.train_step()
            if ep % 5 == 0:
                val = trainer.evaluate('val_mask')
                test = trainer.evaluate('test_mask')
                if val > best_val:
                    best_val, best_test = val, test
        return best_test
    finally:
        # Release GPU memory even when training raised (e.g. OOM), so one
        # failed seed does not starve the seeds that follow it.
        del trainer
        torch.cuda.empty_cache()


def load_dataset_hero(name):
    """Return data dict in the same format as load_dataset, for any hero-list dataset.

    Args:
        name: Dataset name. ``'PubMed'`` goes through ``load_dataset``; any
            other name goes through ``load_and_check``.

    Raises:
        RuntimeError: If ``load_and_check`` returns ``None`` for the data.
    """
    if name == 'PubMed':
        return load_dataset('PubMed', device=device)
    # Coauthor-* uses load_and_check which returns (stats, data) tuple
    _stats, data = load_and_check(name)
    if data is None:
        raise RuntimeError(f"Failed to load {name}")
    return data


def _save_json(path, obj):
    """Write ``obj`` to ``path`` as indented JSON."""
    with open(path, 'w') as f:
        json.dump(obj, f, indent=2)


def _summarize(per_seed_data, datasets):
    """Print the per-dataset table and return the results dict.

    Statistics are computed over the seeds that actually completed; a
    method with missing seeds is annotated with ``(n=k/total)``.
    """
    results = {}
    for ds in datasets:
        print(f"\n{ds}:")
        method_means = {}
        for mname in METHODS:
            key = f"{ds}_{mname}"
            seed_accs = per_seed_data.get(key, {})
            done = [seed_accs[str(s)] for s in SEEDS if str(s) in seed_accs]
            if not done:
                print(f"  {mname:<16} no completed seeds")
                results[key] = {'mean': None, 'std': None,
                                'per_seed': [], 'n_seeds': 0}
                continue
            vals = np.array(done, dtype=float) * 100
            mean, std = float(vals.mean()), float(vals.std())
            method_means[mname] = (mean, std)
            results[key] = {'mean': mean, 'std': std,
                            'per_seed': vals.tolist(),
                            'n_seeds': len(done)}
            line = f"  {mname:<16} {mean:5.1f} ± {std:4.1f}"
            if len(done) < len(SEEDS):
                line += f"  (n={len(done)}/{len(SEEDS)})"
            print(line)
        # Flag best method
        if method_means:
            best_method = max(method_means, key=lambda k: method_means[k][0])
            print(f"  >>> Best: {best_method} ({method_means[best_method][0]:.1f}%)")
    return results


def main():
    """Run every (dataset, method, seed) combination and write the summary.

    Seeds already present in the per-seed cache are skipped. A seed whose
    training raises is reported and left out of the cache, so it is retried
    on the next invocation and does not enter the statistics as 0%.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    per_seed_file = os.path.join(OUT_DIR, 'per_seed_data.json')
    if os.path.exists(per_seed_file):
        with open(per_seed_file) as f:
            per_seed_data = json.load(f)
    else:
        per_seed_data = {}

    # Run order: PubMed (~19K), Coauthor-CS (~18K), then the larger
    # Coauthor-Physics (~34K).
    datasets = ['PubMed', 'Coauthor-CS', 'Coauthor-Physics']

    for ds_name in datasets:
        print(f"\n{'=' * 70}\n{ds_name} (GCN L=6, 20 seeds, 5 methods)\n{'=' * 70}",
              flush=True)
        data = load_dataset_hero(ds_name)
        common = dict(data=data, hidden_dim=64, lr=0.01, weight_decay=5e-4,
                      num_layers=6, residual_alpha=0.0, backbone='gcn')

        for mname, (cls, extra) in METHODS.items():
            key = f"{ds_name}_{mname}"
            seed_accs = per_seed_data.setdefault(key, {})
            print(f"\n--- {key} ---", flush=True)
            for seed in SEEDS:
                sk = str(seed)  # JSON object keys are strings
                if sk in seed_accs:
                    print(f"  seed {seed}: cached ({seed_accs[sk]*100:.1f}%)",
                          flush=True)
                    continue
                try:
                    acc = train_one(cls, common, extra, seed)
                except Exception as e:
                    # Best-effort sweep: report and move on, but do not
                    # cache a fake 0.0 accuracy for the failed seed.
                    print(f"  seed {seed}: FAILED - {e}", flush=True)
                    continue
                seed_accs[sk] = acc
                print(f"  seed {seed}: {acc*100:.1f}%", flush=True)
                # Checkpoint after every finished seed so an interrupted
                # run can resume from here.
                _save_json(per_seed_file, per_seed_data)

        del data
        torch.cuda.empty_cache()

    # Summary
    print(f"\n{'=' * 70}\nHero-extras summary (20 seeds, GCN L=6)\n{'=' * 70}")
    results = _summarize(per_seed_data, datasets)
    _save_json(os.path.join(OUT_DIR, 'results.json'), results)
    print(f"\nSaved to {OUT_DIR}/results.json")


if __name__ == '__main__':
    main()