# diag/wl.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166

"""1-WL color-refinement instrument for diagnosing GNN failures (H1 vs H2).

A GIN with L layers == L rounds of 1-WL refinement (injective sum aggregation).
A failure on sample i is attributed by label purity of its WL color classes:

  converged-WL class IMPURE (train labels conflict under same color)
      -> H2        : 1-WL ceiling. No MPNN at ANY depth separates -> needs >1-WL (noise).
  converged pure, but L-round class impure
      -> H1a_depth : separable only with MORE rounds -> deterministic RR-on-graph / depth helps.
  L-round class pure (info present at depth L) but model wrong
      -> H1b_opt   : optimization / capacity. Train better.

Refinement is dataset-global (shared per-round signature->label map) so node colors and
graph-color histograms are comparable across graphs.
"""
from collections import Counter, defaultdict
import numpy as np


def edges_to_adj(n, edge_index):
    """Build per-node neighbor lists from a two-row edge index.

    Row 0 of ``edge_index`` holds source nodes and row 1 holds target nodes.
    Each pair ``(a, b)`` appends ``b`` to the list of ``a`` only, so an
    undirected graph must list both directions. Returns a list of ``n`` lists.
    """
    pairs = np.asarray(edge_index)
    sources = pairs[0].tolist()
    targets = pairs[1].tolist()
    neighbors = [[] for _ in range(n)]
    for src, dst in zip(sources, targets):
        neighbors[src].append(dst)
    return neighbors


def wl_refine(adjs, inits=None, max_rounds=None):
    """Dataset-level 1-WL. Returns (node_rounds, ghist_rounds, conv_round).
    node_rounds[r][g] = int color array (global labels) of graph g after r rounds.
    ghist_rounds[r][g] = canonical color histogram (hashable) of graph g after r rounds.
    conv_round       = round index at which the global partition stabilized.
    """
    if inits is None:
        inits = [np.zeros(len(a), dtype=np.int64) for a in adjs]
    else:
        inits = [np.asarray(x, dtype=np.int64) for x in inits]
    if max_rounds is None:
        max_rounds = max((len(a) for a in adjs), default=0) + 2

    d = {}
    def lab(s):
        v = d.get(s)
        if v is None:
            v = len(d); d[s] = v
        return v

    cur = [np.array([lab(('i', int(c))) for c in init], dtype=np.int64) for init in inits]
    node_rounds = [cur]
    nclasses = [len(d)]

    for _r in range(max_rounds):
        d = {}
        nxt = []
        for adj in adjs:
            c = cur_g = node_rounds[-1][len(nxt)]
            arr = np.empty(len(adj), dtype=np.int64)
            for v in range(len(adj)):
                sig = (int(c[v]), tuple(sorted(int(c[u]) for u in adj[v])))
                arr[v] = lab(sig)
            nxt.append(arr)
        node_rounds.append(nxt)
        nclasses.append(len(d))
        if nclasses[-1] == nclasses[-2]:          # global #classes stopped growing -> converged
            break

    conv_round = len(node_rounds) - 1
    ghist_rounds = [[_hist(c) for c in nr] for nr in node_rounds]
    return node_rounds, ghist_rounds, conv_round


def _hist(colors):
    return tuple(sorted(Counter(colors.tolist()).items()))


def graph_colors_at(ghist_rounds, conv_round, L):
    """Return the per-graph color histograms at depth ``L``, clamped to ``conv_round``."""
    depth = L if L < conv_round else conv_round
    return ghist_rounds[depth]


# ---------- classification attribution ----------
def attribute_classification(ghist_rounds, conv_round, L, y, train_idx, eval_idx):
    """Bucket each eval graph by the train-label purity of its WL color class.

    Args:
        ghist_rounds: per-round list of hashable graph colors, indexed
            ``[round][graph]``.
        conv_round: index into ``ghist_rounds`` of the converged round.
        L: model depth; the round actually used is ``min(L, conv_round)``.
        y: integer-convertible labels indexed by graph.
        train_idx: iterable of training graph indices (consumed once).
        eval_idx: iterable of graph indices to attribute (consumed once).

    Returns:
        dict with
        ``buckets``  - eval index -> one of ``'novel'`` (converged color never
                       seen in train), ``'H2'`` (converged color has conflicting
                       train labels), ``'H1a_depth'`` (converged color pure but
                       depth-L color conflicting), ``'H1b_opt'`` (both pure);
        ``counts``   - bucket name -> number of entries in ``buckets``;
        ``wl_optimal_acc_converged`` / ``wl_optimal_acc_Ldepth`` - fraction of
                       eval samples whose label equals the train-majority label
                       of their converged / depth-L color (unseen colors count
                       as misses); NaN when ``eval_idx`` is empty;
        ``L_used``, ``conv_round`` - the depth actually used and the converged
                       round passed in.
    """
    y = np.asarray(y)
    # Materialize once: both index sets are walked and eval_idx is also counted,
    # which a one-shot iterator could not support.
    train_idx = list(train_idx)
    eval_idx = list(eval_idx)

    conv = ghist_rounds[conv_round]
    Lr = min(L, conv_round)
    lr = ghist_rounds[Lr]

    # color -> list of train labels observed under that color
    conv_train, lr_train = defaultdict(list), defaultdict(list)
    for i in train_idx:
        label = int(y[i])
        conv_train[conv[i]].append(label)
        lr_train[lr[i]].append(label)

    def pure(dct, key):
        # .get() avoids creating empty defaultdict entries for unseen colors
        labs = dct.get(key)
        return labs is not None and len(set(labs)) == 1

    def majority(dct, key):
        labs = dct.get(key)
        return Counter(labs).most_common(1)[0][0] if labs else None

    buckets = {}
    wl_opt = lr_opt = 0
    for i in eval_idx:
        target = int(y[i])
        if conv[i] not in conv_train:
            buckets[i] = 'novel'
        elif not pure(conv_train, conv[i]):
            buckets[i] = 'H2'
        elif not pure(lr_train, lr[i]):
            buckets[i] = 'H1a_depth'
        else:
            buckets[i] = 'H1b_opt'
        if majority(conv_train, conv[i]) == target:
            wl_opt += 1
        if majority(lr_train, lr[i]) == target:
            lr_opt += 1

    n = len(eval_idx)
    # Accuracy over zero samples is undefined; report NaN instead of dividing by zero.
    acc_conv = wl_opt / n if n else float('nan')
    acc_lr = lr_opt / n if n else float('nan')
    return {
        'buckets': buckets,
        'counts': dict(Counter(buckets.values())),
        'wl_optimal_acc_converged': acc_conv,   # train-majority vote per converged color
        'wl_optimal_acc_Ldepth': acc_lr,        # train-majority vote per depth-L color
        'L_used': Lr, 'conv_round': conv_round,
    }


# ---------- regression decomposition ----------
def decompose_regression(ghist_rounds, conv_round, L, y, train_idx, eval_idx):
    """H2 floor = ORACLE within-color variance on FULL data (best possible function of the WL
    color: do same-color graphs share the target?). This is the true information ceiling and is
    NOT confounded by train/test coverage. The train-fitted floors are also reported to expose
    how much apparent error is really novel-color generalization, plus coverage fractions."""
    y = np.asarray(y, dtype=np.float64)
    conv = ghist_rounds[conv_round]
    Lr = min(L, conv_round)
    lr = ghist_rounds[Lr]
    full_idx = list(range(len(y)))

    # oracle: best constant per converged color over ALL data -> irreducible by any MPNN
    conv_mean_full = _group_mean(conv, y, full_idx)
    e_oracle = np.array([conv_mean_full[conv[i]] - y[i] for i in eval_idx])

    # train-fitted (achievable with this split); fallback to global mean on unseen colors
    conv_mean_tr = _group_mean(conv, y, train_idx)
    lr_mean_tr = _group_mean(lr, y, train_idx)
    gmean = float(y[list(train_idx)].mean())
    e_conv_tr = np.array([conv_mean_tr.get(conv[i], gmean) - y[i] for i in eval_idx])
    e_lr_tr = np.array([lr_mean_tr.get(lr[i], gmean) - y[i] for i in eval_idx])

    conv_count = Counter(conv[i] for i in full_idx)
    train_colors = set(conv[i] for i in train_idx)
    frac_unseen = float(np.mean([conv[i] not in train_colors for i in eval_idx]))
    frac_singleton = float(np.mean([conv_count[conv[i]] == 1 for i in eval_idx]))
    return {
        'mse_floor_oracle_H2': float((e_oracle ** 2).mean()),      # TRUE 1-WL ceiling
        'mse_floor_converged_train': float((e_conv_tr ** 2).mean()),
        'mse_floor_Ldepth_train': float((e_lr_tr ** 2).mean()),
        'frac_test_unseen_color': frac_unseen,
        'frac_test_singleton_color': frac_singleton,
        'L_used': Lr, 'conv_round': conv_round,
        'var_target_eval': float(y[list(eval_idx)].var()),
    }


def _group_mean(colors, y, idx):
    acc = defaultdict(list)
    for i in idx:
        acc[colors[i]].append(float(y[i]))
    return {k: float(np.mean(v)) for k, v in acc.items()}