#!/usr/bin/env python3
"""H33: 20-seed extension of L=20 hero on 4 real-world datasets × {BP, DFA, DFA-GNN, GRAFT}.
This script runs the BP and GRAFT arms only.
Paper setup (5%/class, hidden=64, lr=0.01, no scheduler, 200 epochs, GCN backbone, no dropout/BN/res).

Tightens DBLP std (0.121 at 10-seed bimodal) for paper-grade stats.
Run as: python run_realworld_hero_L20.py [SEED_START [SEED_END]]
        SEED_END is exclusive; default: 10 20, i.e. seeds 10..19 (extending prior seeds 0..9).
"""

import sys, time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.datasets import CitationFull, Coauthor
from torch_geometric.nn import GCNConv
from torch_geometric.utils import add_self_loops, degree

sys.path.insert(0, '/home/yurenh2/graph-grape')
from src.trainers import GraphGrAPETrainer

# Device that the datasets, split masks and the BP model are moved to below.
# NOTE(review): the GPU index is hard-coded; running on another machine or
# card requires editing this line.
device = torch.device('cuda:2')


def build_A_hat(edge_index, N):
    """Build the symmetrically normalised adjacency (with self-loops) as a sparse tensor.

    Self-loops are added via ``add_self_loops``; each edge ``(i, j)`` of the
    augmented edge list then gets the weight ``deg(i)^-1/2 * deg(j)^-1/2``,
    where ``deg`` counts how often a node appears in the first (row) index.

    Args:
        edge_index: edge list that unpacks into a row and a column index
            tensor, as accepted by ``add_self_loops``.
        N: number of nodes; also the side length of the returned matrix.

    Returns:
        A coalesced ``(N, N)`` sparse COO tensor of edge weights.
    """
    with_loops, _ = add_self_loops(edge_index, num_nodes=N)
    src, dst = with_loops
    inv_sqrt_deg = degree(src, num_nodes=N, dtype=torch.float).pow(-0.5)
    # A zero degree yields inf under pow(-0.5); zero it out so it cannot
    # propagate into the edge weights.
    inv_sqrt_deg[inv_sqrt_deg == float('inf')] = 0
    weights = inv_sqrt_deg[src] * inv_sqrt_deg[dst]
    return torch.sparse_coo_tensor(with_loops, weights, (N, N)).coalesce()


def build_row_norm(edge_index, N):
    """Build two inverse-degree-weighted sparse adjacencies (with self-loops).

    ``deg`` counts occurrences of a node in the first (row) index of the
    self-loop-augmented edge list and is clamped to at least 1.

    Args:
        edge_index: edge list that unpacks into a row and a column index
            tensor, as accepted by ``add_self_loops``.
        N: number of nodes; also the side length of both matrices.

    Returns:
        ``(A_row, A_row_T)``, both coalesced ``(N, N)`` sparse COO tensors.
        For every edge ``(i, j)``, ``A_row`` holds ``1/deg(i)`` at ``(i, j)``
        and ``A_row_T`` holds ``1/deg(j)`` at ``(j, i)``.
    """
    loops_ei, _ = add_self_loops(edge_index, num_nodes=N)
    src, dst = loops_ei
    inv_deg = 1.0 / degree(src, num_nodes=N, dtype=torch.float).clamp(min=1)
    shape = (N, N)
    forward = torch.sparse_coo_tensor(loops_ei, inv_deg[src], shape).coalesce()
    # NOTE(review): the flipped matrix is weighted by its own (new) row index,
    # so for a symmetric edge list it coincides with A_row rather than being
    # the matrix transpose of A_row -- confirm this is what the consumer expects.
    backward = torch.sparse_coo_tensor(loops_ei.flip(0), inv_deg[dst], shape).coalesce()
    return forward, backward


def paper_split(N, y, seed, train_frac=0.05, n_val=500):
    """Draw a seeded train/val/test split with a per-class training quota.

    For each class ``c`` in ``0..y.max()`` the members are shuffled and the
    first ``max(1, round(train_frac * n_c))`` become training nodes.  All
    non-training nodes are then shuffled together; the first ``n_val`` form
    the validation set and the rest the test set (val/test are not
    stratified).

    Args:
        N: total number of nodes (length of each returned mask).
        y: integer class labels, one per node.
        seed: seed for the private ``torch.Generator`` driving every shuffle.
        train_frac: fraction of each class assigned to training.
        n_val: number of validation nodes.

    Returns:
        ``(train_mask, val_mask, test_mask)`` as boolean tensors of length ``N``.
    """
    rng = torch.Generator().manual_seed(seed)
    train_mask, val_mask, test_mask = (
        torch.zeros(N, dtype=torch.bool) for _ in range(3)
    )

    num_classes = int(y.max()) + 1
    for cls in range(num_classes):
        members = torch.nonzero(y == cls).reshape(-1)
        shuffled = members[torch.randperm(members.numel(), generator=rng)]
        # At least one training node per class, even for tiny classes.
        quota = max(1, int(round(train_frac * members.numel())))
        train_mask[shuffled[:quota]] = True

    leftover = torch.nonzero(~train_mask).reshape(-1)
    leftover = leftover[torch.randperm(leftover.numel(), generator=rng)]
    val_mask[leftover[:n_val]] = True
    test_mask[leftover[n_val:]] = True
    return train_mask, val_mask, test_mask


class GCN(nn.Module):
    """Stack of ``L`` ``GCNConv`` layers with ReLU between consecutive layers.

    The first layer maps ``in_dim`` inputs, the last layer emits ``out_dim``
    outputs, and every other width is ``hidden``.  No activation follows the
    final layer, so ``forward`` returns its raw output.
    """

    def __init__(self, in_dim, hidden, out_dim, L):
        super().__init__()
        layers = []
        for depth in range(L):
            fan_in = in_dim if depth == 0 else hidden
            fan_out = out_dim if depth == L - 1 else hidden
            layers.append(GCNConv(fan_in, fan_out))
        self.convs = nn.ModuleList(layers)

    def forward(self, x, ei):
        """Apply every layer to ``x`` using edge index ``ei``; ReLU after all but the last."""
        final = len(self.convs) - 1
        for depth, conv in enumerate(self.convs):
            x = conv(x, ei)
            if depth != final:
                x = F.relu(x)
        return x


def bp_one(L, seed, d, tm, vm, tem, epochs=200, lr=0.01, hidden=64):
    """Train an ``L``-layer ``GCN`` with backprop and return the val-selected test accuracy.

    The model is optimised with Adam (``weight_decay=5e-4``) on the
    cross-entropy over ``tm`` nodes.  Every fifth epoch (0, 5, 10, ...) the
    validation accuracy is measured; whenever it strictly improves on the best
    seen so far, the test accuracy at that point is recorded.

    Args:
        L: number of GCN layers.
        seed: seed applied to torch (CPU and all CUDA devices) and numpy.
        d: graph object providing ``x``, ``y`` and ``edge_index``.
        tm, vm, tem: train / validation / test node masks.
        epochs: number of full-batch training steps.
        lr: Adam learning rate.
        hidden: hidden width of the GCN.

    Returns:
        Test accuracy at the best validation checkpoint (0 if validation
        accuracy never exceeded 0).
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed_all(seed)

    n_classes = int(d.y.max()) + 1
    model = GCN(d.x.shape[1], hidden, n_classes, L).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

    @torch.no_grad()
    def accuracy(mask):
        # Fraction of correctly classified nodes under `mask`, in eval mode.
        model.eval()
        logits = model(d.x.float(), d.edge_index)
        hits = logits[mask].argmax(1) == d.y[mask]
        return hits.float().mean().item()

    best_val = 0
    test_at_best = 0
    for epoch in range(epochs):
        model.train()
        logits = model(d.x.float(), d.edge_index)
        loss = F.cross_entropy(logits[tm], d.y[tm])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % 5 != 0:
            continue
        val_acc = accuracy(vm)
        if val_acc > best_val:
            # Only look at the test set when validation improves.
            test_at_best = accuracy(tem)
            best_val = val_acc
    return test_at_best


def graft_one(L, seed, d, A_hat, A_row, A_row_T, tm, vm, tem,
              epochs=200, lr=0.01, hidden=64):
    """Train an ``L``-layer model with ``GraphGrAPETrainer`` and return the val-selected test score.

    The trainer is configured with a GCN backbone, no batch-norm, no dropout
    and ``residual_alpha=0.0``, then switched to ``align_mode='chain_norm'``.
    Every fifth epoch ``evaluate('val_mask')`` is queried; on a strict
    improvement the value of ``evaluate('test_mask')`` is recorded.

    Args:
        L: number of layers passed to the trainer.
        seed: seed applied to torch (CPU and all CUDA devices) and numpy.
        d: graph object providing ``x``, ``y`` and ``num_nodes``.
        A_hat, A_row, A_row_T: precomputed sparse adjacencies handed to the
            trainer unchanged.
        tm, vm, tem: train / validation / test node masks.
        epochs: number of ``train_step`` calls.
        lr: learning rate passed to the trainer.
        hidden: hidden width passed to the trainer.

    Returns:
        The test score at the best validation checkpoint (0 if the validation
        score never exceeded 0).  Presumably an accuracy -- the trainer's
        ``evaluate`` is defined outside this file.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Input bundle in the dictionary layout the trainer is constructed with.
    bundle = {
        'X': d.x.float(),
        'A_hat': A_hat,
        'A_row': A_row,
        'A_row_T': A_row_T,
        'y': d.y,
        'train_mask': tm,
        'val_mask': vm,
        'test_mask': tem,
        'num_features': d.x.shape[1],
        'num_classes': int(d.y.max())+1,
        'num_nodes': d.num_nodes,
        'traces': {},
    }
    trainer = GraphGrAPETrainer(
        data=bundle,
        hidden_dim=hidden,
        lr=lr,
        weight_decay=5e-4,
        lr_feedback=0.5,
        num_probes=64,
        topo_mode='fixed_A',
        max_topo_power=3,
        diffusion_alpha=0.5,
        diffusion_iters=10,
        num_layers=L,
        residual_alpha=0.0,
        backbone='gcn',
        use_batchnorm=False,
        dropout=0.0,
    )
    trainer.align_mode = 'chain_norm'

    best_val = 0
    test_at_best = 0
    for epoch in range(epochs):
        trainer.train_step()
        if epoch % 5 != 0:
            continue
        val_score = trainer.evaluate('val_mask')
        if val_score > best_val:
            # Only look at the test set when validation improves.
            test_at_best = trainer.evaluate('test_mask')
            best_val = val_score
    return test_at_best


# Benchmarks as (display name, zero-argument loader) pairs.  Each loader is
# lazy: the dataset is only instantiated when it is called, and it returns the
# first graph ([0]) of the dataset.
DATASETS = [
    (
        'CFull-CiteSeer',
        lambda: CitationFull(
            root='/home/yurenh2/graph-grape/data/CFull',
            name='CiteSeer',
        )[0],
    ),
    (
        'CFull-DBLP',
        lambda: CitationFull(
            root='/home/yurenh2/graph-grape/data/CFull',
            name='DBLP',
        )[0],
    ),
    (
        'CFull-PubMed',
        lambda: CitationFull(
            root='/home/yurenh2/graph-grape/data/CFull',
            name='PubMed',
        )[0],
    ),
    (
        'Coauthor-Physics',
        lambda: Coauthor(
            root='/home/yurenh2/graph-grape/data/Coauthor',
            name='Physics',
        )[0],
    ),
]


def main():
    """Run the BP-vs-GRAFT comparison at depth 20 over a range of seeds.

    Command line: ``[SEED_START [SEED_END]]``.  The range is half-open
    (``SEED_END`` itself is not run) and defaults to ``10 20``.

    For every entry of ``DATASETS`` the graph is loaded onto ``device`` and
    the normalised adjacencies are built once.  For each seed a fresh
    ``paper_split`` is drawn and both ``bp_one`` and ``graft_one`` are run on
    it.  Per-seed results with wall-clock times, and the per-dataset
    mean ± std (``np.std`` with its default settings), are printed to stdout.
    Results are collected in a local dict for the final summary only; nothing
    is written to disk.

    Raises:
        SystemExit: if the requested seed range is empty.  Without this check
            every dataset would be loaded only to report NaN statistics.
        ValueError: if a seed argument is not an integer (raised by ``int``).
    """
    s_lo = int(sys.argv[1]) if len(sys.argv) > 1 else 10
    s_hi = int(sys.argv[2]) if len(sys.argv) > 2 else 20
    if s_lo >= s_hi:
        # Fail fast: an empty range makes np.mean/np.std return NaN after all
        # the (expensive) dataset loading has already happened.
        raise SystemExit(
            f'empty seed range: SEED_START ({s_lo}) must be smaller than '
            f'SEED_END ({s_hi}, exclusive)'
        )
    seeds = list(range(s_lo, s_hi))
    L = 20

    print(f'>>> Hero L=20 extension: seeds={seeds}', flush=True)
    out = {}
    for name, loader in DATASETS:
        print(f'\n=== {name} ===', flush=True)
        d = loader().to(device)
        N = d.num_nodes
        # Adjacency operators depend only on the graph, so build them once per dataset.
        A_hat = build_A_hat(d.edge_index, N)
        A_row, A_row_T = build_row_norm(d.edge_index, N)
        print(f'  N={N}, deg={d.edge_index.shape[1]/N:.1f}, C={int(d.y.max())+1}', flush=True)

        bp_a, gf_a = [], []
        for s in seeds:
            # The split is drawn on CPU labels, then the masks are moved to the device.
            tm, vm, tem = paper_split(N, d.y.cpu(), s)
            tm = tm.to(device); vm = vm.to(device); tem = tem.to(device)
            t0 = time.time()
            bp = bp_one(L, s, d, tm, vm, tem)
            t1 = time.time()
            gf = graft_one(L, s, d, A_hat, A_row, A_row_T, tm, vm, tem)
            t2 = time.time()
            bp_a.append(bp); gf_a.append(gf)
            print(f'  s={s} L={L}: BP={bp:.4f}({t1-t0:.0f}s)  GRAFT={gf:.4f}({t2-t1:.0f}s)', flush=True)
        bp_m, bp_sd = float(np.mean(bp_a)), float(np.std(bp_a))
        gf_m, gf_sd = float(np.mean(gf_a)), float(np.std(gf_a))
        out[name] = dict(seeds=seeds, BP=bp_a, GRAFT=gf_a, BP_mean=bp_m, BP_std=bp_sd,
                        GRAFT_mean=gf_m, GRAFT_std=gf_sd)
        print(f'  >>> {name} L=20 (seeds {s_lo}-{s_hi-1}): BP {bp_m:.4f}±{bp_sd:.4f}  GRAFT {gf_m:.4f}±{gf_sd:.4f}  Δ={gf_m-bp_m:+.3f}', flush=True)
        # Drop references to this dataset's tensors before loading the next one.
        del d, A_hat, A_row, A_row_T
        torch.cuda.empty_cache()

    print('\n=== SUMMARY (this run) ===', flush=True)
    for k, v in out.items():
        print(f'  {k}: BP {v["BP_mean"]:.4f}±{v["BP_std"]:.4f}  GRAFT {v["GRAFT_mean"]:.4f}±{v["GRAFT_std"]:.4f}', flush=True)


if __name__ == '__main__':
    main()