% paper/experiments_master.tex

% =============================================================================
% GRAFT — Master experiment notes (all results, grouped by category)
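% Standalone .tex; compile with `pdflatex notes/experiments_master.tex` at repo root.
% \graphicspath below searches ../ and ./ relative to the directory pdflatex is
% run from, so the graft_*.pdf figures at the repo root resolve both when
% compiling from the repo root (via ./) and from this file's directory (via ../).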
% =============================================================================
\documentclass[10pt]{article}
\usepackage[margin=0.9in]{geometry}
\usepackage[table]{xcolor}
\usepackage{tabularx,booktabs,multirow,float,graphicx,amsmath,amssymb,hyperref}
\definecolor{bestg}{HTML}{D6F0DC}
\definecolor{negr}{HTML}{F8D7DA}
\newcommand{\best}[1]{\colorbox{bestg}{$#1$}}
\newcommand{\nega}[1]{\colorbox{negr}{$#1$}}
\graphicspath{{../}{./}}
\hypersetup{colorlinks=true,linkcolor=blue!50!black}

\title{GRAFT --- Master Experiment Notes\\\large All experiments grouped by category}
\author{Internal notes (auto-aggregated)}
\date{Last updated: 2026-04-30}

\begin{document}
\maketitle
\tableofcontents

\section*{Reading guide}
Numbers come from \texttt{neurips\_v4\_main.tex} (Tables T1--T12), \texttt{drafts/hero\_table.tex}, \texttt{drafts/hero\_realworld\_L20.tex}, and the \texttt{results/} folder. Figure files are PDFs at the repo root. Categories are in topical order, not story order; each section is self-contained.

\textbf{Default experimental setup (unless noted):} GCN backbone, hidden=64, lr=0.01, 200 epochs, no LR scheduler, no residual / BatchNorm / Dropout, 5\,\%/class semi-supervised split (Planetoid-style), 20 seeds, paired $t$-test BH-corrected, mean$\pm$std on test accuracy. ``Paper setup'' refers to this default. Deviations are stated per table.

\textbf{Main datasets.} Cora, CiteSeer, PubMed (Planetoid), DBLP (CitationFull). Real-world large: CitationFull-CiteSeer (4.2K, deg 2.5, 6-cl), CitationFull-DBLP (17.7K, deg 5.4, 4-cl), CitationFull-PubMed (19.7K biomed, deg 4.5, 3-cl), Coauthor-Physics (34.5K, deg 14.4, 5-cl).

% =============================================================================
\section{Main accuracy (BP vs GRAFT, paper setup)}\label{sec:main}

\subsection{Per-backbone, per-depth (T2)}
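Source: \texttt{tab:main}, paper line 217. 4 datasets $\times$ 4 backbones $\times$ \{$L=5,6$\} $\times$ 20 seeds, paired-$t$ BH-corrected. GRAFT improves over BP in \textbf{86 of 96} paired comparisons (the 96 counts the full LR sweep, cf.\ \S\ref{sec:bh}; the table below lists the 32 paper-setup configurations). Among the tabulated settings, all non-GIN settings are significant at $q\!=\!0.05$ except SAGE $L\!=\!5$ on PubMed and DBLP (ns).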

\begin{table}[H]
\centering\small
\caption{BP vs GRAFT per (dataset, backbone, depth). GIN excepted because its $(1+\epsilon)I$ identity already provides a residual gradient path.}
\begin{tabularx}{\textwidth}{ll *{4}{>{\centering\arraybackslash}X}}
\toprule
Dataset & Backbone-$L$ & BP & GRAFT & $\Delta$ & $p$ \\
\midrule
\multirow{8}{*}{Cora}
& gcn $L\!=\!5$  & $74.3{\pm 2.5}$  & \best{78.8{\pm 1.0}}  & $+4.5$  & $<\!0.001$ \\
& gcn $L\!=\!6$  & $69.4{\pm 5.7}$  & \best{78.2{\pm 1.1}}  & $+8.7$  & $0.002$ \\
& sage $L\!=\!5$ & $74.4{\pm 2.8}$  & \best{77.9{\pm 0.9}}  & $+3.5$  & $<\!0.001$ \\
& sage $L\!=\!6$ & $69.5{\pm 4.9}$  & \best{78.4{\pm 0.9}}  & $+8.9$  & $<\!0.001$ \\
& appnp $L\!=\!5$ & $74.8{\pm 2.7}$ & \best{79.1{\pm 1.1}}  & $+4.3$  & $<\!0.001$ \\
& appnp $L\!=\!6$ & $66.4{\pm 5.0}$ & \best{77.8{\pm 2.9}}  & $+11.4$ & $<\!0.001$ \\
& gin $L\!=\!5$  & $78.5{\pm 1.3}$  & \best{80.1{\pm 1.0}}  & $+1.6$  & $<\!0.001$ \\
& gin $L\!=\!6$  & $77.8{\pm 1.5}$  & $77.8{\pm 1.5}$       & $+0.0$  & ns \\
\midrule
\multirow{8}{*}{CiteSeer}
& gcn $L\!=\!5$ & $60.6{\pm 3.1}$ & \best{63.7{\pm 1.8}} & $+3.1$ & $0.002$ \\
& gcn $L\!=\!6$ & $55.7{\pm 3.6}$ & \best{63.5{\pm 2.2}} & $+7.7$ & $<\!0.001$ \\
& sage $L\!=\!5$ & $61.2{\pm 3.2}$ & \best{63.9{\pm 1.8}} & $+2.8$ & $0.005$ \\
& sage $L\!=\!6$ & $55.8{\pm 4.8}$ & \best{62.0{\pm 2.1}} & $+6.2$ & $0.007$ \\
& appnp $L\!=\!5$ & $61.3{\pm 2.7}$ & \best{64.6{\pm 1.6}} & $+3.2$ & $<\!0.001$ \\
& appnp $L\!=\!6$ & $53.3{\pm 5.4}$ & \best{64.7{\pm 1.7}} & $+11.4$ & $<\!0.001$ \\
& gin $L\!=\!5$ & \best{66.7{\pm 1.3}} & $65.2{\pm 1.3}$ & $-1.5$ & $<\!0.001$ \\
& gin $L\!=\!6$ & \best{65.1{\pm 1.7}} & $63.1{\pm 2.3}$ & $-2.1$ & $0.004$ \\
\midrule
\multirow{8}{*}{PubMed}
& gcn $L\!=\!5$ & $75.8{\pm 2.1}$ & \best{76.9{\pm 0.7}} & $+1.2$ & $0.032$ \\
& gcn $L\!=\!6$ & $73.2{\pm 2.7}$ & \best{75.8{\pm 1.1}} & $+2.6$ & $<\!0.001$ \\
& sage $L\!=\!5$ & $75.8{\pm 1.8}$ & \best{76.6{\pm 0.4}} & $+0.8$ & ns \\
& sage $L\!=\!6$ & $74.5{\pm 1.8}$ & \best{76.5{\pm 1.0}} & $+2.0$ & $0.001$ \\
& appnp $L\!=\!5$ & $76.9{\pm 1.8}$ & \best{79.1{\pm 0.4}} & $+2.2$ & $<\!0.001$ \\
& appnp $L\!=\!6$ & $73.7{\pm 3.7}$ & \best{78.3{\pm 0.9}} & $+4.6$ & $<\!0.001$ \\
& gin $L\!=\!5$ & $76.6{\pm 0.7}$ & \best{77.7{\pm 0.6}} & $+1.1$ & $<\!0.001$ \\
& gin $L\!=\!6$ & $76.4{\pm 1.3}$ & \best{76.9{\pm 1.0}} & $+0.5$ & ns \\
\midrule
\multirow{8}{*}{DBLP}
& gcn $L\!=\!5$ & $82.1{\pm 0.4}$ & \best{83.1{\pm 0.3}} & $+0.9$ & $<\!0.001$ \\
& gcn $L\!=\!6$ & $81.3{\pm 0.5}$ & \best{82.9{\pm 0.3}} & $+1.5$ & $<\!0.001$ \\
& sage $L\!=\!5$ & $82.4{\pm 0.3}$ & $82.5{\pm 0.4}$ & $+0.2$ & ns \\
& sage $L\!=\!6$ & $81.7{\pm 0.5}$ & \best{82.5{\pm 0.3}} & $+0.8$ & $0.002$ \\
& appnp $L\!=\!5$ & $81.6{\pm 0.4}$ & \best{83.1{\pm 0.4}} & $+1.5$ & $<\!0.001$ \\
& appnp $L\!=\!6$ & $79.6{\pm 1.2}$ & \best{83.2{\pm 0.4}} & $+3.6$ & $<\!0.001$ \\
& gin $L\!=\!5$ & $81.8{\pm 0.4}$ & \best{82.3{\pm 0.4}} & $+0.5$ & $0.001$ \\
& gin $L\!=\!6$ & $81.6{\pm 0.6}$ & \best{82.2{\pm 0.5}} & $+0.6$ & $0.004$ \\
\bottomrule
\end{tabularx}
\end{table}

\subsection{BP vs GRAFT visual summary}
\begin{figure}[H]\centering
\includegraphics[width=0.85\textwidth]{graft_vs_bp_boxscatter.pdf}
\caption{Per-seed scatter+box of GRAFT vs BP across paper-setup configurations (4 datasets, GCN $L=5,6$).}
\end{figure}

% =============================================================================
\section{Backward-method baselines (vs DFA / DFA-GNN / VanillaGrAPE / PEPITA / FF / CaFo)}\label{sec:backwards}

\subsection{Leaderboard (T1, paper)}
Source: \texttt{tab:leaderboard}, paper line 184. GCN $L\!=\!6$, 20 seeds.
\begin{table}[H]\centering\small
\begin{tabularx}{\textwidth}{l *{3}{>{\centering\arraybackslash}X}}
\toprule
Method & Cora & CiteSeer & DBLP \\
\midrule
\multicolumn{4}{l}{\emph{BP $+$ forward-side anti-over-smoothing}}\\
BP (vanilla)            & $68.8{\pm 4.6}$ & $54.0{\pm 4.1}$ & $80.5{\pm 1.0}$ \\
BP $+$ ResGCN           & $77.5{\pm 1.6}$ & $63.0{\pm 2.2}$ & $82.3{\pm 0.4}$ \\
BP $+$ JKNet            & $78.2{\pm 1.0}$ & $64.4{\pm 1.2}$ & $79.9{\pm 0.8}$ \\
BP $+$ PairNorm         & $69.0{\pm 3.2}$ & $55.4{\pm 3.4}$ & $79.0{\pm 0.8}$ \\
BP $+$ DropEdge         & $74.8{\pm 1.8}$ & $64.0{\pm 1.6}$ & $81.6{\pm 0.5}$ \\
\midrule
\multicolumn{4}{l}{\emph{Feedback-alignment baselines (graph-agnostic backward)}}\\
DFA                     & $70.4{\pm 6.8}$ & $60.2{\pm 2.4}$ & --- \\
DFA-GNN                 & $68.1{\pm 5.9}$ & $60.0{\pm 2.2}$ & --- \\
VanillaGrAPE            & $77.5{\pm 1.7}$ & $62.3{\pm 1.5}$ & $82.0{\pm 0.6}$ \\
\midrule
\multicolumn{4}{l}{\emph{GRAFT and combinations}}\\
\textbf{GRAFT}          & $76.7{\pm 1.8}$ & $62.4{\pm 1.9}$ & $82.1{\pm 0.4}$ \\
\textbf{GRAFT $+$ ResGCN}  & $77.8{\pm 1.9}$ & $61.5{\pm 2.2}$ & \best{82.7{\pm 0.6}} \\
\textbf{GRAFT $+$ JKNet}   & \best{78.3{\pm 1.6}} & $61.8{\pm 2.2}$ & $82.4{\pm 0.4}$ \\
\textbf{GRAFT $+$ PairNorm}& $75.8{\pm 1.5}$ & \best{64.3{\pm 2.0}} & $80.7{\pm 0.6}$ \\
\textbf{GRAFT $+$ DropEdge}& $70.8{\pm 3.8}$ & $62.1{\pm 1.8}$ & $80.7{\pm 0.7}$ \\
\bottomrule
\end{tabularx}
\caption{T1: Backward-method leaderboard at $L=6$. (DFA/DFA-GNN DBLP cells filled in T1' below.)}
\end{table}

\subsection{Wide backward-only hero (drafts)}
Source: \texttt{drafts/hero\_table.tex} (4 datasets, 6 backward methods, 20 seeds, $L=6$). PEPITA and FF$+$VN collapse to roughly chance / majority-class accuracy on these graphs (note the $\pm 0.0$ cells).

\begin{table}[H]\centering\small
\begin{tabularx}{\textwidth}{l *{6}{>{\centering\arraybackslash}X}}
\toprule
Dataset & BP & DFA & DFA-GNN & PEPITA & FF$+$VN & GRAFT \\
\midrule
Cora     & $68.8{\pm 4.6}$ & $70.4{\pm 6.8}$ & $70.1{\pm 6.1}$ & $31.9{\pm 0.0}$ & $25.5{\pm 8.8}$ & \best{76.7{\pm 1.8}} \\
CiteSeer & $54.0{\pm 4.1}$ & $60.2{\pm 2.4}$ & $60.0{\pm 1.8}$ & $18.2{\pm 0.3}$ & $19.0{\pm 2.0}$ & \best{62.4{\pm 1.9}} \\
PubMed   & $73.2{\pm 3.0}$ & $72.4{\pm 2.0}$ & $70.8{\pm 2.0}$ & $41.6{\pm 2.6}$ & $39.7{\pm 5.0}$ & \best{74.4{\pm 1.6}} \\
DBLP     & $80.5{\pm 1.0}$ & $81.5{\pm 1.2}$ & $81.0{\pm 1.1}$ & $47.7{\pm 5.5}$ & $44.7{\pm 0.0}$ & \best{82.1{\pm 0.4}} \\
\bottomrule
\end{tabularx}
\caption{Wide hero (not in paper). DBLP DFA/DFA-GNN cells filled here.}
\end{table}

\subsection{Hidden / deferred baselines (not in hero)}
\begin{itemize}\setlength\itemsep{1pt}
\item \textbf{CaFo$+$CE} (Park et al.\ 2023): Cora 79.5, CiteSeer 66.3, PubMed 76.4, DBLP 81.8 (20 seeds, $L=6$). Beats GRAFT on 3/4 datasets (+2.0 to +3.9). Greedy layer-wise (no gradient chain), different paradigm $\Rightarrow$ hidden from hero per paper-direction. Data: \texttt{results/cafo\_baseline\_20seeds/}.
\item \textbf{ForwardGNN-SF}: deferred (separate conda env + multi-file integration); paper SF on Cora $L=3$ reports $\sim$84.5 (close to BP 86.0).
\end{itemize}

\subsection{Ablation: learned alignment $\times$ topology factor (T3)}
Source: \texttt{tab:ablation}, paper line 273. GCN $L=6$, 20 seeds. Learned alignment dominates accuracy; explicit topology factor is marginal in raw accuracy but causal under intervention (\S\ref{sec:wrong-topo}).
\begin{table}[H]\centering\small
\begin{tabularx}{\textwidth}{l *{4}{>{\centering\arraybackslash}X}}
\toprule
Method & Cora & CiteSeer & PubMed & DBLP \\
\midrule
DFA (random $R$, $P\!=\!I$) & $70.4{\pm 6.8}$ & $60.2{\pm 2.4}$ & $72.2{\pm 1.5}$ & --- \\
DFA-GNN (random $R$, topo pseudo-error) & $68.1{\pm 5.9}$ & $60.0{\pm 2.2}$ & $70.5{\pm 2.0}$ & --- \\
VanillaGrAPE (learned $R$, $P\!=\!I$) & \best{77.3{\pm 1.0}} & $61.9{\pm 1.2}$ & \best{74.4{\pm 1.3}} & $82.0{\pm 0.6}$ \\
\textbf{GRAFT} (learned $R$, $P_\ell(\hat A)$) & \best{77.3{\pm 1.4}} & \best{62.8{\pm 1.6}} & $74.1{\pm 1.6}$ & \best{82.1{\pm 0.6}} \\
\bottomrule
\end{tabularx}
\end{table}

% =============================================================================
\section{Stackability (GRAFT $\times$ forward-side methods)}\label{sec:stack}
Source: \texttt{tab:stackability}, paper line 374. GCN $L=6$, 20 seeds.
\begin{table}[H]\centering\small
\begin{tabularx}{\textwidth}{l *{3}{>{\centering\arraybackslash}X}}
\toprule
Method & Cora & CiteSeer & DBLP \\
\midrule
BP                  & $68.8{\pm 4.6}$ & $54.0{\pm 4.1}$ & $80.5{\pm 1.0}$ \\
BP $+$ ResGCN       & $77.5{\pm 1.6}$ & $63.0{\pm 2.2}$ & $82.3{\pm 0.4}$ \\
BP $+$ JKNet        & $78.2{\pm 1.0}$ & \best{64.4{\pm 1.2}} & $79.9{\pm 0.8}$ \\
BP $+$ PairNorm     & $69.0{\pm 3.2}$ & $55.4{\pm 3.4}$ & $79.0{\pm 0.8}$ \\
BP $+$ DropEdge     & $74.8{\pm 1.8}$ & $64.0{\pm 1.6}$ & $81.6{\pm 0.5}$ \\
\midrule
GRAFT (backward only) & $76.7{\pm 1.8}$ & $62.4{\pm 1.9}$ & $82.1{\pm 0.4}$ \\
\midrule
GRAFT $+$ ResGCN    & $77.8{\pm 1.9}$ & $61.5{\pm 2.2}$ & \best{82.7{\pm 0.6}} \\
GRAFT $+$ JKNet     & \best{78.3{\pm 1.6}} & $61.8{\pm 2.2}$ & $82.4{\pm 0.4}$ \\
GRAFT $+$ PairNorm  & $75.8{\pm 1.5}$ & \best{64.3{\pm 2.0}} & $80.7{\pm 0.6}$ \\
GRAFT $+$ DropEdge  & $70.8{\pm 3.8}$ & $62.1{\pm 1.8}$ & $80.7{\pm 0.7}$ \\
\bottomrule
\end{tabularx}
\end{table}

\textbf{Notes.} GRAFT $+$ DropEdge is the one combination that fails to stack: forward--backward topology mismatch (forward drops edges, backward $P_\ell(\hat A)$ uses full $\hat A$). Synchronized variant recovers part of the gap but not all of it.

% =============================================================================
\section{Depth survival}\label{sec:depth}

\subsection{Cora / DBLP depth stress (T8)}
Source: \texttt{tab:depth-stress}, paper line 657. GCN, 3 seeds.
\begin{table}[H]\centering\small
\begin{tabularx}{\textwidth}{cl *{4}{>{\centering\arraybackslash}X}}
\toprule
Dataset & $L$ & BP & ResGCN & GRAFT & GRAFT $+$ ResGCN \\
\midrule
\multirow{6}{*}{Cora}
& 6  & $71.4{\pm 1.1}$ & $78.0{\pm 2.0}$ & $76.4{\pm 2.1}$ & \best{78.1{\pm 0.7}} \\
& 8  & $39.7{\pm 5.3}$ & \best{78.2{\pm 2.3}} & $63.8{\pm 5.0}$ & $51.7{\pm 11.0}$ \\
& 10 & $35.1{\pm 4.4}$ & \best{76.9{\pm 2.2}} & $54.5{\pm 4.7}$ & $47.3{\pm 5.3}$ \\
& 12 & $32.8{\pm 1.9}$ & \best{76.6{\pm 1.2}} & $45.7{\pm 1.8}$ & $42.3{\pm 1.3}$ \\
& 16 & $29.3{\pm 2.2}$ & \best{73.5{\pm 2.5}} & $35.4{\pm 2.6}$ & $31.6{\pm 0.5}$ \\
& 20 & $24.3{\pm 6.7}$ & \best{49.2{\pm 20.9}} & $38.3{\pm 5.0}$ & $34.1{\pm 3.1}$ \\
\midrule
\multirow{6}{*}{DBLP}
& 6  & $79.9{\pm 0.9}$ & $82.3{\pm 0.3}$ & $82.6{\pm 0.5}$ & \best{83.0{\pm 0.5}} \\
& 8  & $78.8{\pm 1.0}$ & $81.9{\pm 0.6}$ & \best{82.2{\pm 0.4}} & $81.6{\pm 1.1}$ \\
& 10 & $71.1{\pm 11.9}$ & \best{80.4{\pm 0.7}} & $78.1{\pm 1.0}$ & $69.4{\pm 0.9}$ \\
& 12 & $66.8{\pm 6.4}$ & \best{80.0{\pm 1.3}} & $73.4{\pm 3.2}$ & $64.8{\pm 8.1}$ \\
& 16 & $45.4{\pm 0.7}$ & $63.7{\pm 13.2}$ & \best{69.9{\pm 0.1}} & $60.3{\pm 11.3}$ \\
& 20 & $46.1{\pm 1.4}$ & $61.3{\pm 7.4}$ & \best{61.8{\pm 11.0}} & $46.8{\pm 3.0}$ \\
\bottomrule
\end{tabularx}
\caption{Three observations: (i) GRAFT sweet spot $L\!=\!5$--$8$. (ii) Cora $L\!\geq\!10$: ResGCN dominates. (iii) DBLP $L\!=\!16$: GRAFT \emph{overtakes} ResGCN (69.9 vs 63.7).}
\end{table}

\subsection{4 large real-world datasets, depth sweep (BP / DFA / DFA-GNN / GRAFT)}
Source: \texttt{gen\_realworld\_depth\_fig.py}, 3 seeds per cell. CitationFull-CiteSeer, CitationFull-DBLP, CitationFull-PubMed-biomed, Coauthor-Physics. $L\in\{3,5,8,10,12,14,16,18,20\}$.

\begin{figure}[H]\centering
\includegraphics[width=\textwidth]{graft_realworld_depth.pdf}
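\caption{Real-world depth survival (3 seeds per cell). Shallow ($L=3$): all methods tied at $\geq 0.83$; from $L\!\geq\!10$ BP/DFA/DFA-GNN collapse, while GRAFT descends gracefully and stays $\geq$10\,p.p.\ above the second-best at $L\!=\!20$ on every dataset in this 3-seed sweep. (The 20-seed $L\!=\!20$ table in the next subsection shows smaller margins on DBLP, PubMed, and Physics.)}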
\end{figure}

\subsection{Real-world hero at $L=20$ (20 seeds)}
Source: \texttt{drafts/hero\_realworld\_L20.tex} + \texttt{realworld\_hero\_L20\_20seed.log}.
\begin{table}[H]\centering\small
\begin{tabularx}{\textwidth}{l *{4}{>{\centering\arraybackslash}X} c}
\toprule
Dataset & BP & DFA & DFA-GNN & GRAFT & $p$ (vs BP) \\
\midrule
CFull-CiteSeer       & $25.3{\pm 1.3}$  & $21.2{\pm 4.2}$ & $19.6{\pm 0.4}$ & \best{37.1{\pm 8.1}} & $5\!\times\!10^{-6}$ \\
CFull-DBLP           & $54.6{\pm 2.8}$  & $44.7{\pm 0.0}$ & $44.7{\pm 0.0}$ & \best{57.3{\pm 12.0}} & $0.34$ \\
CFull-PubMed (biomed)& $41.9{\pm 1.3}$  & $40.0{\pm 0.3}$ & $39.9{\pm 0.0}$ & \best{49.9{\pm 9.6}}  & $0.002$ \\
Coauthor-Physics     & $58.5{\pm 15.5}$ & $50.6{\pm 0.2}$ & $50.5{\pm 0.0}$ & \best{65.4{\pm 5.1}}  & $0.07$ \\
\bottomrule
\end{tabularx}
\caption{20-seed paired-$t$. GRAFT has the highest mean on every dataset; the gain over BP is significant on CiteSeer (\,$p\!=\!5\!\times\!10^{-6}$\,) and PubMed (\,$p\!=\!0.002$\,), marginal on Physics (\,$p\!=\!0.07$\,), and not significant on DBLP (\,$p\!=\!0.34$\,), due to bimodal split-seed behaviour at $L\!=\!20$. DFA / DFA-GNN $\sigma\approx 0$ on 3 datasets = deterministic majority-class collapse.}
\end{table}

\subsection{Combined Fig 4-style depth panel}
\begin{figure}[H]\centering
\includegraphics[width=0.92\textwidth]{graft_fig4_combined.pdf}
\caption{Depth sweep across the four Planetoid-style datasets (Fig 4(a)) plus complementary panels.}
\end{figure}

\subsection{Original 4-dataset depth sweep}
\begin{figure}[H]\centering
\includegraphics[width=0.92\textwidth]{graft_depth_sweep.pdf}
\caption{Cora/CiteSeer/PubMed/DBLP, BP vs DFA-GNN vs GRAFT, $L\in\{4,8,10,12,16,20\}$, 20 seeds.}
\end{figure}

% =============================================================================
\section{Robustness}\label{sec:robustness}

\subsection{Wrong-topology causal control (T5)}\label{sec:wrong-topo}
Source: \texttt{tab:wrong-topo}, paper line 338. GCN $L=6$, 20 seeds. Forward uses true graph; only backward $P_\ell(\hat A)$ varies.
\begin{table}[H]\centering\small
\begin{tabularx}{\textwidth}{l *{3}{>{\centering\arraybackslash}X} c}
\toprule
Backward graph & Cora & CiteSeer & DBLP & vs.\ GRAFT \\
\midrule
GRAFT (correct $\hat A$) & $77.2{\pm 1.3}$ & \best{62.7{\pm 1.6}} & $81.9{\pm 0.8}$ & --- \\
VanillaGrAPE ($P=I$) & \best{77.5{\pm 1.7}} & $62.3{\pm 1.5}$ & \best{82.0{\pm 0.6}} & ns \\
\midrule
Rewired ($\tilde A$)         & \nega{32.3{\pm 1.3}} & \nega{29.6{\pm 8.0}} & \nega{46.1{\pm 5.1}} & $-33$ to $-45^{***}$ \\
Permuted ($\Pi\hat A\Pi^\top$) & \nega{32.5{\pm 2.0}} & \nega{48.1{\pm 6.5}} & \nega{75.8{\pm 3.9}} & $-6$ to $-45^{***}$ \\
Erd\H{o}s--R\'enyi           & \nega{31.9{\pm 0.0}} & \nega{27.4{\pm 5.8}} & \nega{44.8{\pm 0.3}} & $-35$ to $-45^{***}$ \\
\bottomrule
\end{tabularx}
\caption{Removing topology ($P=I$) is benign; \emph{wrong} topology is catastrophic. Forward--backward consistency is what the topology factor enforces.}
\end{table}

\subsection{Perturbation sweep (DFA-GNN-style Fig 4b/c/d)}
Source: \texttt{results/perturb\_20seeds/results.json} + \texttt{results/perturb\_extend/}. 3 attacks $\times$ 3 datasets (Cora, CiteSeer, PubMed) $\times$ 3 methods (BP, DFA-GNN, GRAFT) $\times$ rates $\{0,0.1,0.2,0.3,0.5,0.7\}$ $\times$ 20 seeds. Attacks: edge rewire, feature mask, label flip.

\begin{figure}[H]\centering
\includegraphics[width=\textwidth]{graft_perturb_sweep.pdf}
\caption{Perturbation robustness (DFA-GNN Fig 4b/c/d format). Top row: edge rewire; middle: feature mask; bottom: label flip. GRAFT keeps a positive margin over BP at most rates; both methods degrade symmetrically at extreme rates.}
\end{figure}

\textbf{Selected paired-$t$ from \texttt{perturb\_20seeds}} (CiteSeer edge-rewire example): rate$=$0\,$\Rightarrow$ BP 53.8/GRAFT 62.6 ($p\!=\!2.6\!\times\!10^{-8}$); rate$=$0.1\,$\Rightarrow$ 36.4/42.7 ($p\!=\!1\!\times\!10^{-4}$); rate$=$0.2\,$\Rightarrow$ 25.2/27.9 (ns); rate$=$0.3\,$\Rightarrow$ 21.6/20.4 (ns). The crossover is symmetric across attack types.

\subsection{Hyperparameter sensitivity (T9)}
Source: \texttt{tab:sensitivity}, paper line 689. Cora GCN $L=6$, 3 seeds. Default in \textbf{bold}.
\begin{table}[H]\centering\small
\begin{tabularx}{\textwidth}{l *{5}{>{\centering\arraybackslash}X}}
\toprule
\multicolumn{6}{l}{\textbf{(a) Probe count} (alignment every 10 steps, $K=3$)} \\
Probes & 16 & 32 & \textbf{64} & 128 & 256 \\
Acc (\%) & $74.6{\pm 0.8}$ & $76.1{\pm 1.1}$ & $\mathbf{77.5}{\pm 1.6}$ & $77.5{\pm 1.2}$ & $77.1{\pm 3.5}$ \\
\midrule
\multicolumn{6}{l}{\textbf{(b) Alignment frequency} (64 probes, $K=3$)} \\
Every $N$ steps & 1 & 5 & \textbf{10} & 20 & 50 \\
Acc (\%) & $77.1{\pm 1.8}$ & $76.9{\pm 0.3}$ & $\mathbf{78.2}{\pm 0.9}$ & $71.9{\pm 4.9}$ & $73.0{\pm 3.2}$ \\
\midrule
\multicolumn{6}{l}{\textbf{(c) Hop cap $K$} (64 probes, alignment every 10 steps)} \\
$K$ & 1 & 2 & \textbf{3} & 5 & --- \\
Acc (\%) & $77.1{\pm 1.9}$ & $76.0{\pm 0.9}$ & $\mathbf{78.3}{\pm 0.6}$ & $78.2{\pm 0.7}$ & --- \\
\bottomrule
\end{tabularx}
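\caption{Variation $\leq 3$\,pp across probe count and hop cap; alignment frequency is the sensitive axis (up to $6.3$\,pp drop when aligning less often than every 10 steps). Defaults at or near optimum on each axis.}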
\end{table}

% =============================================================================
\section{Alignment analysis (per-layer cosine, gradient reach)}\label{sec:align}

\subsection{Per-layer cosine vs true BP gradient (T11)}
Source: \texttt{tab:per-layer-cos}, paper line 741. Cora GCN $L=6$, 200 epochs, 20 seeds.
\begin{table}[H]\centering\small
\begin{tabularx}{\textwidth}{l *{5}{>{\centering\arraybackslash}X}}
\toprule
Layer (input $\to$ output) & $\ell\!=\!0$ & $\ell\!=\!1$ & $\ell\!=\!2$ & $\ell\!=\!3$ & $\ell\!=\!4$ \\
\midrule
$\cos(\delta^{\text{GRAFT}},\nabla^{\text{BP}})$
& $0.33{\pm 0.12}$ & $0.36{\pm 0.15}$ & $0.39{\pm 0.16}$ & $0.42{\pm 0.16}$ & $0.59{\pm 0.19}$ \\
\bottomrule
\end{tabularx}
\caption{All five layers strictly positive (95\% CI $>$ 0); higher near loss, smooth degradation with depth (multi-probe variance $\uparrow$ as more matrices chain).}
\end{table}

\subsection{Gradient-reach summary (paper \S5.1, prose)}
At GCN $L=10$, BP gradient norms $\|\partial\mathcal{L}/\partial Z_\ell\|_F < 10^{-38}$ across all 20 seeds and all hidden layers (single-precision underflow). Forward representations remain $\Theta(1)$. GRAFT $\|\delta_\ell\|_F\!\approx\!0.7$--$1.2$ across all layers with tight CI. Accuracy gap at $L=10$: GCN $\Delta=+16.3\%$ ($p=4\!\times\!10^{-4}$), APPNP $\Delta=+10.8\%$ ($p=8\!\times\!10^{-3}$). At $L=6$: BP norms $\sim 0.02$, GRAFT $\sim 0.17$ ($\sim 8\times$).

% =============================================================================
\section{Efficiency}\label{sec:efficient}

\subsection{Wall-clock (T4)}
Source: \texttt{tab:efficiency}, paper line 292. ms / training step, 5 timing runs, median reported.
\begin{table}[H]\centering\small
\begin{tabularx}{\textwidth}{ll *{3}{>{\centering\arraybackslash}X} >{\centering\arraybackslash}X}
\toprule
Dataset & $L$ & BP & ResGCN & GRAFT-Opt & Speedup vs BP \\
\midrule
Cora & 6  & 4.16 & 4.80 & \best{2.62} & $1.59\times$ \\
Cora & 10 & 7.03 & 6.40 & \best{4.07} & $1.73\times$ \\
DBLP & 6  & 5.51 & 5.35 & \best{5.34} & $1.03\times$ \\
DBLP & 10 & \best{7.13} & 7.42 & 7.33 & $0.97\times$ \\
\bottomrule
\end{tabularx}
\caption{Cora speedup driven by avoiding autograd + replacing $L$-step sequential backward with $O(1)$ batched kernels. DBLP speedup vanishes (large SpMM saturates GPU). Memory $1.2$--$1.4\times$ peak.}
\end{table}

\subsection{Reference vs Optimized accuracy parity (T12)}
Source: \texttt{tab:ref-vs-opt}, paper line 764. 9 settings, 5 seeds.
\begin{table}[H]\centering\small
\begin{tabularx}{\textwidth}{l *{3}{>{\centering\arraybackslash}X}}
\toprule
Setting (GCN/SAGE/APPNP $L=6$) & Cora & CiteSeer & DBLP \\
\midrule
GCN  & $76.9{\pm 2.2}$ & $61.6{\pm 2.7}$ & $82.5{\pm 0.3}$ \\
SAGE & $75.6{\pm 1.1}$ & $61.5{\pm 2.1}$ & $82.2{\pm 0.4}$ \\
APPNP& $76.1{\pm 1.7}$ & $59.4{\pm 1.7}$ & $82.8{\pm 0.3}$ \\
\bottomrule
\end{tabularx}
\caption{All within $\pm 2\%$ of reference; no setting significantly different at $p<0.05$.}
\end{table}

% =============================================================================
\section{Negative results / regime boundary}\label{sec:negative}

\subsection{Heterophily (T10)}
Source: \texttt{tab:hetero}, paper line 715. 3 seeds, GCN $L=6$.
\begin{table}[H]\centering\small
\begin{tabularx}{\textwidth}{l *{3}{>{\centering\arraybackslash}X} *{2}{>{\centering\arraybackslash}X}}
\toprule
Dataset & $N$ & deg & $h$ & BP & GRAFT \\
\midrule
Texas    & 183  & 1.8  & 0.108 & $47.4$ & $47.4$ \\
Cornell  & 183  & 1.6  & 0.131 & $39.5$ & $37.7$ \\
Chameleon& 2{,}277 & 15.9 & 0.235 & $52.3{\pm 1.2}$ & \nega{26.7{\pm 5.0}} \\
Squirrel & 5{,}201 & 41.7 & 0.224 & $28.1{\pm 3.5}$ & \nega{21.2{\pm 0.3}} \\
Actor    & 7{,}600 & 3.9  & 0.219 & $26.8{\pm 1.1}$ & $26.4{\pm 0.8}$ \\
\bottomrule
\end{tabularx}
\caption{GRAFT relies on homophily: for $h<0.3$ it never beats BP and can lose badly (Chameleon, Squirrel). Edge-flow backward propagates supervision \emph{across} class boundaries.}
\end{table}

\subsection{Large dense graphs (paper-side prose)}
\begin{itemize}\setlength\itemsep{1pt}
\item \textbf{ogbn-arxiv} (169K nodes, 40 classes): GRAFT trails BP by 25--35\,pp at all class-counts (6/9/40). Identity-augmented kernel $(1{-}\beta)\hat A^k+\beta I$ at $\beta=0.5$ improves the 6-class case 48.6$\to$53.7\,\% but BP still 73.6\,\%.
\item \textbf{Flickr} (89K, deg $\sim$10, 7-cl, social): both BP and GRAFT collapse to majority at $L\!\geq\!10$ in paper setup.
\item \textbf{WikiCS} (11.7K, deg 36.9, 10-cl): GRAFT loses at every depth $L\in\{3,5,10,14,20\}$, $\Delta\!=\!-9$ to $-20$\,pp. This confirms the regime boundary: dense (deg $>$ 20) $\Rightarrow$ BP stays stable, while GRAFT collapses to majority-class accuracy (0.229) at deep $L$.
\end{itemize}

\subsection{Graph-level regression (Peptides-struct, PPI)}
\begin{itemize}\setlength\itemsep{1pt}
\item \textbf{Peptides-struct} (LRGB MAE): GRAFT carries an intrinsic $+0.11$ MAE offset from pool-transpose on graph-level regression; reuse of \texttt{src/trainers.GraphGrAPETrainer} v4 reproduces the same offset $\Rightarrow$ not a port bug. Failure mode of the framing.
\item \textbf{PPI} (multi-label F1): GRAFT trails BP by $0.04$ to $0.12$ F1 at all depths (avg deg 18, dense).
\end{itemize}

\subsection{Other rejected candidates (triaged)}
ENZYMES (TUDataset, graph-level), Cora-Full ($\geq 70$ classes, both methods collapse $L\!\geq\!5$), Roman-empire / Chameleon / Squirrel / Texas / Cornell / Actor (heterophily, App N.1), Reddit2 (dense social), QM9 / ogbg-molhiv / MalNet-Tiny (graph-level regression / classification), CitationFull-Cora\_ML (3K, both methods saturate at $L=3$, similar profile to other CFull's). All triaged with rationale in \texttt{drafts/experiment\_queue.md}.

% =============================================================================
\section{BH multiple-comparisons correction}\label{sec:bh}
144 paired tests grouped: 96 BP-vs-GRAFT (full LR sweep), 12 ablation contrasts (DFA $\to$ DFA-GNN $\to$ VanillaGrAPE $\to$ GRAFT $\times$ 3 datasets), 12 wrong-topology, 12 stackability, 12 depth-stress at $L=8,10$. After BH at $q=0.05$: \textbf{117/144} significant; every test that survived unadjusted $p<0.05$ also survives BH. Non-significant residuals concentrated in GIN backbone, PubMed-SAGE, and high-perturbation feature-masking conditions.

% =============================================================================
\section{Additional running notes}\label{sec:notes}

\subsection{Identified GRAFT-win regime}
Sparse (deg $\leq$ 8) $\cap$ few-class ($\leq$ 10) $\cap$ node-level single-label $\cap$ homophilous ($h>0.5$) $\cap$ Planetoid-style 5\,\%/class semi-sup $\cap$ $L\geq 5$ where BP already starts to fail. Within this regime, GRAFT's advantage grows with depth.

\subsection{Hyperparams that consistently work}
hidden=64, lr=0.01 (Adam, weight\_decay=$5\!\times\!10^{-4}$), 200 epochs, no scheduler, no residual / BN / Dropout, 64 probes, alignment every 10 steps, $K=3$ hop cap, diffusion $\alpha=0.5$ for 10 iters.

\subsection{Failure-prone hyperparam choices we hit}
hidden=128, AdamW $+$ cosine LR, 20-per-class semi-sup. These broke the GRAFT port until we reverted to paper setup. Documented in commit history; flagged in CLAUDE memory as recurrent failure mode.

\subsection{Artifacts inventory}
\begin{itemize}\setlength\itemsep{1pt}
\item \texttt{neurips\_v4\_main.tex}: live paper, T1--T12 + appendix.
\item \texttt{drafts/hero\_table.tex}: wide backward-only hero, not in paper.
\item \texttt{drafts/hero\_realworld\_L20.tex}: deep real-world hero, not in paper.
\item \texttt{drafts/deep\_real\_world\_section.md}: prose for the new real-world section.
\item \texttt{graft\_depth\_sweep.\{pdf,png\}}, \texttt{graft\_perturb\_sweep.\{pdf,png\}}, \texttt{graft\_fig4\_combined.\{pdf,png\}}, \texttt{graft\_realworld\_depth.\{pdf,png\}}, \texttt{graft\_vs\_bp\_boxscatter.\{pdf,png\}}.
\item \texttt{results/}: per-experiment JSON dumps (\texttt{perturb\_20seeds/}, \texttt{ablation\_20seeds/}, \texttt{cafo\_baseline\_20seeds/}, \texttt{bp\_graft\_depth\_20seeds/}, \dots).
\item Logs: \texttt{realworld\_hero\_L20\_20seed.log}, \texttt{wikics\_paper\_setup.log}, \texttt{realworld\_10seed.log}, \texttt{realworld\_dfa\_10seed.log}, \texttt{cfull\_paper\_setup.log}, \texttt{dblpfull\_full\_depth.log}, \texttt{pubmedfull\_full\_depth.log}, \texttt{physics\_full\_depth.log}, \texttt{csfull\_full\_depth.log}, \texttt{perturb\_sweep.log}, \texttt{perturb\_extras.log}.
\end{itemize}

\end{document}