% assets/ept_method_intro.tex

\documentclass[11pt]{article}

\usepackage[margin=1in]{geometry}
\usepackage{amsmath,amssymb}
\usepackage{bm}
\usepackage[round]{natbib}
\usepackage{enumitem}
\usepackage{booktabs}
\usepackage[colorlinks=true,linkcolor=blue,citecolor=blue,urlcolor=blue]{hyperref}

% --- light-weight notation ---------------------------------------------------
\newcommand{\R}{\mathbb{R}}
\newcommand{\C}{\mathbb{C}}
\renewcommand{\Re}{\operatorname{Re}}
\newcommand{\xin}{x_{\mathrm{in}}}
\newcommand{\zstar}{z^{\ast}}
\newcommand{\zbar}{\bar{z}}
\newcommand{\Fnc}{F_{\mathrm{nc}}}
\newcommand{\Jnc}{J_{\mathrm{nc}}}
\newcommand{\half}{\tfrac12}
\newcommand{\grad}{\nabla}
\newcommand{\dd}{\,\mathrm{d}}
\newcommand{\inner}[2]{\langle #1,\, #2\rangle}
\DeclareMathOperator{\Attn}{Attn}
\DeclareMathOperator{\FFN}{FFN}
\DeclareMathOperator{\softmax}{softmax}
\DeclareMathOperator{\LSE}{LSE}
\DeclareMathOperator{\LN}{LN}
\DeclareMathOperator{\jvp}{jvp}
\DeclareMathOperator{\vjp}{vjp}
\DeclareMathOperator*{\argmin}{arg\,min}

\title{\bfseries Training a Transformer Language Model with Equilibrium Propagation:\\
from energy-based EP to non-conservative, holomorphic, tracking-AEP}
\author{Method introduction (internal)}
\date{2026-06-21}

\begin{document}
\maketitle

\begin{abstract}
We train a transformer-class language model in which \emph{both} attention and the
feed-forward network learn \emph{without backpropagation through the computation},
using Equilibrium Propagation (EP). This note is written for a reader who knows
\emph{classic} energy-based EP \citep{scellier2017} --- the two-phase free/nudged
relaxation of a conservative, symmetric-Jacobian system --- but has not met the
non-conservative / asymmetric / holomorphic extensions. We first recall why classic
EP \emph{requires} a conservative system, then show that softmax self-attention
breaks that requirement (independent $Q,K,V$ give an asymmetric Jacobian). We then
introduce, from first principles, the pieces that repair this: the
\emph{asymmetric / adjoint} EP correction $J\!\to\!J^{\!\top}$
\citep{scurria2026}; the \emph{holomorphic} EP estimator \citep{laborieux2022};
the \emph{Convergent Energy Transformer} (CET) route \citep{hoier2026} that
sidesteps the problem by making attention conservative; and finally \emph{our}
recipe: a damped non-conservative equilibrium-transformer block, trained with
\emph{tracking-AEP} (re-linearizing the correction at the moving common-mode
midpoint) plus a residual-driven stabilization stack. We report what is solidly
validated --- component gradients match backprop at cosine $0.99$--$1.0$, and EP
trains the block stably and competitively with a backprop transformer at equal
parameters on a character-level LM --- and clearly mark the larger-scale work
(the $C{=}512$ ``residual-defense'' line) as \emph{ongoing}.
\end{abstract}

\tableofcontents

%==============================================================================
\section{Recap: classic energy-based EP and why it needs a conservative system}
\label{sec:classic}

\paragraph{Setup.}
Classic EP \citep{scellier2017} trains a dynamical system whose state
$z\in\R^{d}$ relaxes, under a fixed input/clamp, to the minimum of a scalar
\emph{energy} $E(z,\theta)$. Two ideas make it a learning rule.

\paragraph{Two phases.}
\begin{itemize}[leftmargin=1.4em,itemsep=2pt]
  \item \emph{Free phase.} Run the gradient dynamics $\dot z=-\grad_z E(z,\theta)$
        to the free equilibrium $\zstar=\argmin_z E(z,\theta)$,
        in practice an Euler relaxation to a fixed point.
  \item \emph{Nudged phase.} Add the task loss to the energy with a small strength
        $\beta$, $E_\beta = E + \beta\,\ell(z)$, and relax to the nudged
        equilibrium $z_\beta$.
\end{itemize}

\paragraph{The contrastive gradient.}
EP's central identity is that the loss gradient w.r.t.\ any parameter is the
\emph{contrastive difference of $\partial E/\partial\theta$ across the two phases}:
\begin{equation}
  \frac{\partial \mathcal{L}}{\partial \theta}
  \;\approx\;
  \frac{1}{\beta}\!\left[
    \frac{\partial E}{\partial\theta}(z_\beta,\theta)
    -\frac{\partial E}{\partial\theta}(\zstar,\theta)
  \right]
  \qquad(\text{one-sided, bias }O(\beta)).
  \label{eq:ep-onesided}
\end{equation}
Centered / symmetric nudging \citep{laborieux2021} uses $\pm\beta$ and averages,
reducing the estimator bias to $O(\beta^2)$:
\begin{equation}
  \frac{\partial \mathcal{L}}{\partial \theta}
  \;\approx\;
  \frac{1}{2\beta}\!\left[
    \frac{\partial E}{\partial\theta}(z_{+\beta},\theta)
    -\frac{\partial E}{\partial\theta}(z_{-\beta},\theta)
  \right].
  \label{eq:ep-centered}
\end{equation}
The update is \emph{local}: each parameter reads only the two equilibria of the
terms it touches; there is no backward pass and no weight transport. As
$\beta\!\to\!0$ with a converged free phase, the EP estimate equals the
implicit/equilibrium gradient, and (in an RNN with static input) it equals the
step-wise BPTT gradient \citep{ernoult2019}.

\paragraph{Why this needs a conservative / symmetric-Jacobian system.}
Equations \eqref{eq:ep-onesided}--\eqref{eq:ep-centered} are only valid because
the dynamics are the \emph{gradient} of a scalar energy. Write the force as
$F(z) = -\grad_z E(z)$ and its Jacobian as $J=\partial F/\partial z$. If $F$
descends an energy, then $J = -\,\partial^2 E/\partial z^2$ is a Hessian and is
therefore \emph{symmetric}, $J=J^{\!\top}$. This symmetry is exactly what makes the
nudged perturbation a faithful surrogate for the loss \emph{adjoint}: linearizing
the nudged relaxation around $\zstar$ produces a response governed by
$J^{-1}$, and because $J=J^{\!\top}$ this self-adjoint operator is the same one
the true gradient (which involves $(J^{\!\top})^{-1}$) requires. We therefore
record the four implicit premises of classic EP --- the transformer will break all
four, and each fix below targets exactly one of them:
\begin{description}[leftmargin=2.6em,itemsep=2pt]
  \item[(A) Conservative / symmetric.] A scalar energy $E$ exists, so $J=J^{\!\top}$.
  \item[(B) Free phase converged.] The readout sits at the true fixed point;
        residual $\approx 0$.
  \item[(C) Small-$\beta$ linear response, clean nudge.] $\beta\!\to\!0$ is a mere
        perturbation, and no non-analytic ``clamp'' contaminates the estimate.
  \item[(D) The fixed point stays stable throughout training.] After every weight
        update the free phase still relaxes to a stable fixed point.
\end{description}

%==============================================================================
\section{The gap: softmax attention is non-conservative}
\label{sec:gap}

A pre-LN transformer block computes, for a state $z$,
\begin{equation}
  \Attn(z) = \softmax\!\Big(\tfrac{Q(z)K(z)^{\!\top}}{\sqrt{d}},\ \text{causal}\Big)V(z)\,W_O,
  \qquad
  Q=zW_Q,\ K=zW_K,\ V=zW_V,
  \label{eq:attn}
\end{equation}
with \emph{independent} projections $W_Q,W_K,W_V$. The query--key coupling
$i\!\to\!j$ is governed by $W_QW_K^{\!\top}$, while $j\!\to\!i$ is governed by
$W_KW_Q^{\!\top}$; these differ, and $V$ is a third independent map. Consequently
the attention Jacobian is \emph{asymmetric}, $J_{\Attn}\neq J_{\Attn}^{\!\top}$, and
\emph{no scalar energy has this gradient}. An untied $4\times$ FFN
($W_2\,\mathrm{GELU}(W_1\cdot)$ with $W_2\neq W_1^{\!\top}$) is non-conservative for
the same reason. Premise~(A) fails.

Empirically this is not a cosmetic issue: with an asymmetric $J$ the nudged phase
relaxes under $J$ but the correct loss adjoint needs $J^{\!\top}$, so the raw EP
contrast is \emph{biased}. Measured against the true backprop gradient, uncorrected
EP gives an attention-parameter cosine of only $\approx 0.25$ (essentially the
wrong direction), even though the loss-adjacent output projection looks fine. (This
is the same pathology that limits feedback alignment, which only trains the layer
right before the loss and leaves $Q/K/V$ at cosine $\approx 0.25$ and the upstream
FFN at $\approx -0.01$.)

There are two ways out, and we will use the second:
\begin{enumerate}[leftmargin=1.6em,itemsep=2pt]
  \item \textbf{Energy route} (make attention conservative): fold attention into a
        scalar energy with a \emph{tied} value, so $F=-\grad E$ and classic EP is
        exactly valid. This is the CET route (\S\ref{sec:cet-energy}); it costs the
        $Q\!\neq\!K$ asymmetry and the free value that make attention expressive.
  \item \textbf{Force route} (keep real attention, repair the \emph{estimator}):
        leave \eqref{eq:attn} as a non-conservative \emph{force} and add a
        correction that turns $J$ into $J^{\!\top}$ in the nudged phase. This is the
        AEP route (\S\ref{sec:aep}), and it is what our block uses.
\end{enumerate}

%==============================================================================
\section{AEP, holomorphic EP, and the force-form readout}
\label{sec:aep}

\subsection{Force-form (vector-field) EP}
\label{sec:vf}
The first step is to drop the energy and write the dynamics directly as a force
$F(z)$, relaxing $\dot z=F(z)$ to a fixed point $\zstar$. The parameter gradient is
then read off a \emph{vector-field} (VF) contrast \citep{scurria2026}:
\begin{equation}
  \frac{\partial\mathcal{L}}{\partial\theta}
  \;\approx\;
  \frac{\partial}{\partial\theta}\,\big\langle a,\ F(\zstar;\theta)\big\rangle,
  \qquad
  a \;=\; \frac{z_{-\beta}-z_{+\beta}}{2\beta}\ \approx\ -\frac{\dd \zstar}{\dd\beta},
  \label{eq:vf}
\end{equation}
where $a$ is the centered contrast (the ``adjoint state'') read from the two nudged
equilibria, and the right-hand side is \emph{one} autograd call evaluated at the
fixed point only --- per-term local bookkeeping, \emph{not} backprop through the
relaxation steps. Every term of the block (attention, FFN, LayerNorm affines, and
the embeddings, which enter through the input clamp $-(z-\xin)$) is a term of the
same $F$, so \eqref{eq:vf} trains them jointly with no per-module schedule.

\paragraph{Attribution / honest caveat.}
The force-form VF readout \eqref{eq:vf} is \emph{not ours}: it is the baseline of
\citet{scurria2026}. Crucially it \emph{collapses on its own} for a non-conservative
system (their CIFAR-10 VF reaches chance, $10\%$; MNIST $64\%$ vs.\ $92.7\%$),
exactly mirroring our measured cosine $\approx 0.25$ for uncorrected attention. VF
is therefore the ``starting point that fails''; what rescues it is the next step.

\subsection{The AEP correction: \texorpdfstring{$J\!\to\!J^{\!\top}$}{J to J transpose}}
\label{sec:aep-corr}
For a non-conservative $F$, the nudged relaxation linearized at $\zstar$ runs under
$J=\partial F/\partial z$, but the true adjoint requires $J^{\!\top}$. \emph{Asymmetric
EP} (AsymEP) \citep{scurria2026} repairs this by adding to the nudged force a term
that subtracts twice the antisymmetric part of the Jacobian. With
$v=z-\zstar$ and $\Jnc$ the Jacobian of the \emph{non-conservative} part $\Fnc$,
\begin{equation}
  \mathrm{corr}(z) \;=\; \Jnc\,v - \Jnc^{\!\top} v
  \;=\; (\Jnc-\Jnc^{\!\top})\,v
  \;=\; 2\,A_J\,v,
  \qquad
  A_J \equiv \tfrac12\big(\Jnc-\Jnc^{\!\top}\big),
  \label{eq:aep}
\end{equation}
and subtracting it from the force is \emph{mathematically identical} to adding their
term $-2A_J(\zstar)(z-\zstar)$. The
nudged force becomes $f \;=\; F(z) \mp \beta\,\grad_z\ell(z) - \mathrm{corr}(z)$,
so the attention part of the nudged linearization is replaced as
\begin{equation}
  J\,v \;-\; (J-J^{\!\top})\,v \;=\; J^{\!\top} v ,
\end{equation}
i.e.\ \emph{$J$ is turned into $J^{\!\top}$}, restoring the correct adjoint and hence the
exact gradient for $Q\!\neq\!K$ attention. Two structural facts make this cheap and
local:
\begin{itemize}[leftmargin=1.4em,itemsep=2pt]
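  \item \emph{The symmetric (conservative) parts cancel.} The damping $-c\,z$ has
        Jacobian $-cI$ (symmetric), and the input clamp and any FFN written as a
        tied-weight Hopfield energy are likewise symmetric, so they contribute $0$
        to $A_J$. When attention is the only non-conservative term, a \emph{single}
        correction on the attention term therefore repairs the \emph{whole} block;
        the conservative terms ride along and are already exact under VF. (An
        \emph{untied} FFN, as in \S\ref{sec:gap}, is itself non-conservative and
        has to be counted in $\Fnc$.)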
  \item \emph{It is matrix-free.} We never build $\Jnc$. Each nudged step uses one
        Jacobian-vector product and one vector-Jacobian product,
        $\Jnc v=\jvp(\Fnc,\zstar,v)$ and $\Jnc^{\!\top} v=\vjp(\Fnc,\zstar,v)$.
\end{itemize}

\paragraph{Attribution.}
The correction \eqref{eq:aep} is \citet{scurria2026}'s, \emph{not} ours. Their scope
is feedforward / Hopfield nets on static MNIST/CIFAR with an \emph{explicitly
constructed} Jacobian, no attention, no sequence model, and no stability controller.
\emph{Ours on this line} is: (i) the matrix-free $\jvp/\vjp$ form (their explicit
Jacobian is infeasible at transformer state dimension $B\!\cdot\!T\!\cdot\!C$);
(ii) the application to data-dependent \emph{softmax attention}; (iii) the
combination with holomorphic estimation (\S\ref{sec:holo}); (iv) the common-mode
\emph{tracking} variant (\S\ref{sec:tracking}); and (v) the transformer-LM
application together with the stability stack (\S\ref{sec:stab}).

\paragraph{Validity window.}
The correction is linearized \emph{at $\zstar$}, so the nudged trajectory must stay
inside the linear-response window. At Euler step size $\varepsilon{=}0.1$ a nudged-phase
horizon of $T_2\!\approx\!20$ steps is comfortably inside; $T_2\gtrsim 60$ can leave it
(\S\ref{sec:stab}).

\subsection{Holomorphic EP: variance-reduced, higher-order estimates}
\label{sec:holo}
The $\pm\beta$ contrast trades bias against noise: small $\beta$ shrinks the
$O(\beta^2)$ bias but amplifies the $1/\beta$ noise on $(z_{-\beta}-z_{+\beta})/2\beta$.
Holomorphic EP \citep{laborieux2022} removes this trade-off by replacing the two
real points with $N$ points on a \emph{complex circle},
$\beta_k = r\,e^{2\pi i k/N}$, relaxing the \emph{holomorphically extended} dynamics
and reading the contrast off a discrete Cauchy integral:
\begin{equation}
  a \;=\; -\,\Re\!\left[\frac{1}{Nr}\sum_{k=0}^{N-1} e^{-i\phi_k}\,(z_k-\zstar)\right],
  \qquad \phi_k=\tfrac{2\pi k}{N},
  \label{eq:holo}
\end{equation}
whose bias is $O(r^{N})$ instead of $O(r^{2})$ --- so $r$ may be $5$--$10\times$
larger at equal bias, cutting the $1/\beta$ noise by the same factor. The
holomorphic extension is built by hand (complex LayerNorm with non-conjugate
variance, softmax as a ratio of exponentials, the $\tanh$-form GELU, which is
holomorphic in a neighborhood of the real axis because the poles of $\tanh$ lie off it);
the AEP correction \eqref{eq:aep} is linear in $v$ with a real coefficient matrix
(it is linearized at the real $\zstar$), hence \emph{complex-linear}, so
it preserves holomorphy and is applied to the real and imaginary parts separately.
No clamps appear inside the holomorphic nudge --- clamps are non-analytic and would
destroy the $O(r^N)$ bias order. This addresses premise~(C). \citet{laborieux2022}
is the source; we add only the combination with the AEP correction and with softmax
attention.

%==============================================================================
\section{The equilibrium-transformer block (and the CET alternative)}
\label{sec:block}

\subsection{Our damped, non-conservative block (\texttt{thick})}
\label{sec:thick}
The state is $z\in\R^{B\times T\times C}$, one vector per token position. Inference
is a relaxation to a fixed point under a \emph{single force} $F$,
$z\leftarrow z+\varepsilon F(z)$ for $T_1$ steps ($\varepsilon{=}0.1$, $T_1{\approx}150$),
after which logits $=\zstar W_h$. The force is a pre-LN transformer block written as
a force rather than a layer stack:
\begin{equation}
  F(z) =
  \underbrace{-(z-\xin)}_{\text{input clamp}}
  +\underbrace{\Attn(\LN_1(z))}_{\text{causal MHSA},\ W_Q,W_K,W_V,W_O}
  +\underbrace{W_2\mathrm{GELU}(W_1\LN_2(z)+b_1)+b_2}_{\text{untied }4\times\text{ FFN}}
  -\underbrace{c\,z}_{\text{damping}}.
  \label{eq:thick}
\end{equation}
Here $\xin=\mathrm{tok}[\mathrm{idx}]+\mathrm{pos}$ is the (trained) input
embedding, clamped as a boundary condition through the $-(z-\xin)$ term; this is the
same fixed-point map a Deep Equilibrium model \citep{bai2019} uses. The block is
strongly non-conservative ($Q\!\neq\!K$, untied FFN), and AEP makes EP exact for it.

\paragraph{Why the $-c\,z$ damping is the key recipe move.}
Raw attention at high gain has \emph{no} fixed point: the residual floors at
$\sim\!3\times10^{-2}$ and the relaxation never settles, so the entire EP family
(corrected or not) cannot even start (there is no $\zstar$ to nudge around). Adding
$-c\,z$ ($c\!\geq\!1$) makes the map contractive enough to \emph{create a stable
fixed point at any attention strength}, while leaving the map non-conservative
(independent $Q/K/V$ are untouched). Critically, the damping's Jacobian $-cI$ is
symmetric, so it \emph{cancels in $A_J$} \eqref{eq:aep}: it buys a fixed point
without polluting the AEP correction, which still sees only attention's
non-reciprocal part. Together, ``damping $+$ AEP'' is the minimal recipe that makes
real attention EP-trainable, taking the attention-parameter cosine from
$\approx 0.25$ (uncorrected) to $0.99$--$1.0$ even at high gain.

\paragraph{A subtlety for LN-inside blocks.}
Because LayerNorm sits \emph{inside} \eqref{eq:thick} and its Jacobian scales like
$1/\sigma(z)$, large damping shrinks $\|\zstar\|$ and thereby \emph{inflates} the
effective Jacobian (measured: plain-relax residual $8.8\times10^{-3}$ at $c{=}0$
vs.\ $3.4\times10^{-2}$ at $c{=}2$). So for \texttt{thick} we keep $c$ small ($c{=}1$)
and the actual stabilizer is the Jacobian-norm penalty of \S\ref{sec:stab}, not the
damping. (For a simpler ``thin'' variant whose FFN is an energy-based modern-Hopfield
memory and whose attention is a raw damped force, the damping \emph{is} required.)

\subsection{The CET / energy route (the conservative alternative)}
\label{sec:cet-energy}
\textbf{CET} here means the \emph{Convergent Energy Transformer} of
\citet{hoier2026} --- an energy-based transformer block, trained with EP, that we
reproduced (on masked image completion) as the prior SOTA for ``EP $+$ attention''.
Its trick is to make attention \emph{conservative} so classic EP applies with
\emph{no} correction: attention is folded into a scalar energy
\begin{equation}
  E_{\mathrm{att}}(z) \;=\;
  -\frac{1}{\gamma}\sum_{\text{heads},\,i}
  \LSE_{j}\!\big(\gamma\, q_i\!\cdot\!k_j\big)
  \quad(\text{causal-masked}),
  \label{eq:cet}
\end{equation}
whose force \emph{ties the value to the key} ($v\!\equiv\!k$), plus a confinement
$\tfrac12 c\|z\|^2$ (because $E_{\mathrm{att}}$ is unbounded below) and a
modern-Hopfield memory energy $E_{\mathrm{mem}}(z)=-\sum\mathrm{relu}(zW_m)^2$
playing the role of the FFN (its force is a \emph{tied}-weight squared-ReLU MLP). On
this energy $F=-\grad E$ exactly, so classic EP is valid with symmetric Jacobian and
no AEP. In our reproduction EP matched truncated-BPTT (``EP $\approx$ TBPTE'',
gradient cosine $0.99$). The trade-off is expressivity: the tied value and
reciprocal coupling are the least expressive form of attention. Under \emph{exact}
gradients on the LM, this conservative route (and a monotone-DEQ variant
\citep{winston2020}) costs $\approx 0.15$--$0.2$ CE relative to the non-conservative
\texttt{thick} block --- which is precisely why we pay for the AEP machinery and keep
real attention.

%==============================================================================
\section{Our recipe: tracking-AEP and the stabilization stack}
\label{sec:recipe}

\subsection{Tracking-AEP: re-linearize at the moving common mode}
\label{sec:tracking}
The AEP correction \eqref{eq:aep} is frozen at $\zstar$. Near a good solution this
becomes the binding error: as the model sharpens, the true gradient shrinks below
the \emph{bias floor} of the frozen linearization, and the highly non-normal block
Jacobian makes that floor large (we measure $\|\Jnc v-\Jnc^{\!\top} v\|/\|\Jnc v\|=1.37$
at $\zstar$). The fix is to re-linearize the antisymmetric correction not at the
frozen $\zstar$ but at the \emph{instantaneous common mode} of the two nudged
trajectories,
\begin{equation}
  \zbar \;=\; \half\big(z_{+}+z_{-}\big),
  \qquad
  \mathrm{corr}(z) \;=\; \Jnc(\zbar)\,v - \Jnc(\zbar)^{\!\top} v,
  \quad v = z-\zbar,
  \label{eq:track}
\end{equation}
evaluated step-by-step as $\zbar$ moves with the nudge (run the $+$ and $-$ phases in
lockstep, recompute $\jvp/\vjp$ about the running $\zbar$). This is exact transposed
differential dynamics with no compounding linearization error, and it tolerates a
loosely converged free phase (it does not demand an ultra-tight one). At a plateau checkpoint where the
frozen estimator had collapsed (gradient cosine vs.\ BPTT $-0.045$, batch-to-batch
self-coherence $-0.27$, magnitude ratio $\sim\!4000\times$), tracking-AEP restores
cosine $0.997$, self-coherence $+0.95$, magnitude ratio $0.9$. Tracking-AEP and the
common-mode formulation \eqref{eq:track} are \emph{ours}.

\subsection{The validity threshold and the residual as the health signal}
\label{sec:validity}
The governing empirical fact is that the EP estimator has a \emph{validity threshold}
in the free-phase relative residual
\begin{equation}
  \mathrm{res} \;=\; \frac{\|z^{+}-\zstar\|}{\|\zstar\|}
  \qquad(\text{one extra relaxation step}),
\end{equation}
which is the load-bearing health signal (premise~(B)). Gradient cosine vs.\ the exact
reference degrades sharply with res: $\approx 0.85$ at $\mathrm{res}\!\sim\!5\times10^{-5}$,
batch-dependent $0.2$--$0.9$ at $10^{-3}$, and noise at $10^{-2}$. BPTT has no such
threshold (it differentiates the actual finite unroll, converged or not); \emph{this
asymmetry, and nothing deeper, is the EP-specific difficulty}. Accordingly the free
phase is run adaptively: relax to $T_1{=}150$, then continue in chunks until
$\mathrm{res}\!\le\!10^{-4}$ before nudging. We emphasize there is \emph{no} structural
``EP ceiling'': an early ``EP caps at $\sim\!2.5$'' verdict was traced to two
undertrained/invalid-regime runs and retracted.

\subsection{The stabilization stack}
\label{sec:stab}
Training pushes the dynamics off the contractive manifold (premise~(D)) --- and not
only for EP: even \emph{exact} BPTT on this architecture walks off the manifold on
long horizons (residual $\to 4.7\times10^{-2}$, val CE $\to 3.0$). The stack that
keeps the system valid:
\begin{itemize}[leftmargin=1.4em,itemsep=3pt]
  \item \textbf{Frozen / controlled Jacobian-norm penalty (\texttt{jacreg}).} A soft
        penalty $\lambda\,\|\Jnc(\zstar)\|_F^2$, estimated matrix-free by Hutchinson
        (one $\jvp$ on a random probe, differentiated w.r.t.\ $\theta$). This is
        \citet{bai2021}'s DEQ-stabilization penalty, \emph{not} ours. It keeps the
        free phase contractive and hence the estimator inside its validity region.
        A continuous controller drives it,
        $\lambda \leftarrow \mathrm{clip}\big(\lambda\,(\mathrm{res}_{\mathrm{EMA}}/\mathrm{target})^{0.3}\big)$,
        on an EMA-smoothed residual (the raw residual is noisy and a multiplicative
        controller on it random-walks). A key hard lesson: the controller \emph{floor}
        is load-bearing and must never anneal to zero --- two independent
        $\lambda\!\to\!0$ runs died identically (val CE $60$--$77$, $\mathrm{res}\!\equiv\!0$),
        which the post-mortem identified as an \emph{explosion disguised as convergence
        by floating-point absorption} (once $|\varepsilon F|<\mathrm{ulp}(z)$ the update
        is rounded away and the relaxation freezes), not a benign dead state.
  \item \textbf{Residual, not spectral radius, as the control signal.} The block
        Jacobian is highly non-normal, so transient growth is invisible to
        eigenvalues (measured $\rho(J){=}0.94$ ``stable'' while the relaxation
        diverged to $\mathrm{res}\,0.21$). The one-step residual \emph{is} the
        transient; we control on it.
  \item \textbf{Validity gate.} When the residual exceeds a gate, the EP update is
        mathematically undefined, so we apply only the homeostat (jacreg) and skip the
        nudge --- a fast recovery step. At larger scale this gate is load-bearing
        (off-equilibrium EP updates poison the weights).
  \item \textbf{Adaptive $T_2$ by hindsight snapshot selection.} On slow-mixing
        batches a long nudge phase can diverge through non-normal transient growth,
        and step-size early-stopping \emph{fails} (the transient triggers it
        spuriously). Instead, run to $T_{2\max}$ in lockstep, snapshot the contrast
        $a_t$ every few steps, and return the \emph{most settled} snapshot (smallest
        increment of $a_t$); judging by increments of the \emph{quantity of interest}
        rather than step sizes makes transient growth harmless. This is ours; it
        lifts probe cosine from $0.871$ to $0.932$.
\end{itemize}

\subsection{Ongoing: the residual-defense term (\texttt{resreg}) --- under validation}
\label{sec:resreg}
At larger width ($C{=}512$) we observe a distinct, \emph{still-open} failure that we
call the below-$2.10$ wall: frozen-jacreg, tracking-AEP EP descends to best
$\approx 2.09$ and then bifurcates within $\sim\!200$ steps (residual
$5\!\times\!10^{-3}\!\to\!0.15$, gradient cosine $0.98\!\to\!0$, CE $\to\!4{+}$),
while \emph{exact} BPTT with the identical recipe sails past to $1.72$. The diagnosed
root cause is an \emph{objective mismatch}: EP optimizes the (refined) fixed point and
never defends the finite-step residual that evaluation actually uses, whereas BPTT
differentiates the finite unroll and so implicitly rewards contraction. The diverged
state is a forward bifurcation to a \emph{limit cycle}, so more relaxation steps cannot
fix it; only a residual \emph{cost} can. The proposed fix is an explicit T1-residual
penalty on the \emph{evaluated} state $z_{150}=\mathrm{relax}(\xin,T_1)$ taken before
any refinement,
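% NOTE(review): the denominator constant is written \delta to avoid clashing with the
% Euler step size \varepsilon in the numerator -- confirm it is a small numerical floor.
\begin{equation}
  R_{\mathrm{res}} \;=\; \frac{\|\varepsilon F(z_{150})\|^2}{\|z_{150}\|^2+\delta},
  \qquad
  \text{gradient w.r.t.\ }\theta\text{ with }z_{150}\text{ detached},
  \label{eq:resreg}
\end{equation}
where $\delta>0$ is a small constant guarding the division (not the step size
$\varepsilon$); the penalty is
scaled task-relative and added to the EP gradient (run with the validity gate off, so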
the penalty is not bypassed exactly when the residual is high). \textbf{Status: this is
ongoing.} The residual-defense term \eqref{eq:resreg} held the residual pinned at
$1$--$5\times10^{-4}$ and reached best $2.0573$ (past the wall) through only step
$\sim\!1000$ before a storage cleanup deleted the run; full re-validation toward the
$\approx 1.8$ BPTT ceiling is pending. We present it as a diagnosis $+$ proposed fix,
\emph{not} a finished result. (The objective-mismatch diagnosis, the common-mode
tracking estimator, the residual-driven controller and validity gate, and this
residual-defense term are ours.)

%==============================================================================
\section{Established results (and what is still open)}
\label{sec:results}

\paragraph{Solidly validated.}
\begin{itemize}[leftmargin=1.4em,itemsep=3pt]
  \item \textbf{EP/AEP component gradients match backprop.} On the character LM,
        AEP gives causal-attention parameters cosine $0.99$, the (Hopfield) FFN
        $1.00$, and the full LM block $0.99$ vs.\ the true backprop gradient
        --- versus feedback alignment at $Q/K/V\approx 0.25$, FFN $\approx -0.01$.
        On the CET reproduction, global cosine $0.99$ and EP $\approx$ TBPTE on
        masked-image completion.
  \item \textbf{EP trains the equilibrium transformer stably, without backprop.}
        With the stabilization stack, end-to-end EP runs $10\text{k}+$ steps with
        zero non-finite steps.
  \item \textbf{It matches/beats a BP transformer at equal parameters.} On the
        Shakespeare character-LM (single block, $C{=}128$), in a fully controlled
        $14$k-step comparison (Table~\ref{tab:results}): EP reaches val CE
        \textbf{1.676} (multi-seed $1.680\pm0.005$, $3$ seeds); the like-for-like
        standard BP transformer (matched in parameter \emph{shape} to the thick
        block) reaches $1.610$; EP \emph{beats} the thinner BP baseline ($1.689$).
        The total gap of $0.066$ decomposes into an architecture tax $\approx 0.025$
        (BPTT on the identical block $1.635$) and an EP-rule tax $\approx 0.041\pm0.005$
        --- real, tightly reproducible, and consistent with the measured estimator
        misalignment (cosine $0.85$--$0.93$).
\end{itemize}

\begin{table}[t]
  \centering
  \small
  \begin{tabular}{llc}
    \toprule
    \textbf{training rule} & \textbf{architecture / recipe} & \textbf{best val CE}\\
    \midrule
    BP & standard transformer (like-for-like for \texttt{thick}) & \textbf{1.610}\\
    BPTT $+$ $\lambda$-controller $+$ param-EMA & \texttt{thick} (exact grad, same stabilizer) & 1.635\\
    \textbf{EP} & \texttt{thick}; tracking-AEP $+$ adaptive $T_1/T_2$ & \textbf{1.676}\\
    BP & standard transformer (thin-matched) & 1.689\\
    BPTT (exact grad) & \texttt{thick}, unregularized & 2.021 (destabilizes late)\\
    random & --- & 4.174\\
    \bottomrule
  \end{tabular}
  \caption{Fully-controlled $14$k-step comparison on Shakespeare char-LM
    (random $=\ln 65$). EP matches the architecture-controlled exact-gradient
    run to within $0.041$ and beats the thin-matched BP baseline. ``BPTT as
    ablation'' separates the training-rule cost (EP$-$BPTT) from the
    architecture cost (BPTT$-$BP).}
  \label{tab:results}
\end{table}

\paragraph{Honest framing of the controlled comparison.}
EP beats \emph{bare} BPTT ($1.676$ vs.\ $2.021$), but Table~\ref{tab:results} shows that
most of that win is EP's \emph{mandatory} stabilization loop doubling as regularization:
bare exact-gradient training walks off the contractive manifold late in the $14$k-step
run, and the same controller that EP cannot live without also lifts BPTT to $1.635$. The
contraction controller is good for the equilibrium architecture regardless of training
rule; EP merely forced its discovery.

\paragraph{Ongoing / under validation.}
The $C{=}512$ work is \emph{not} a finished result. (i) The $2.40$ plateau there is
diagnosed as a late-training EP estimator bias-floor / batch-incoherence, which
tracking-AEP breaks in training ($2.40\!\to\!2.16$, still descending in a $2500$-step
warm-start test). (ii) The below-$2.10$ wall is diagnosed as the objective mismatch of
\S\ref{sec:resreg}; the residual-defense term~\eqref{eq:resreg} was validated as keeping the
residual tight and carrying training past the wall (best $2.0573$), but \emph{only through
step $\sim\!1000$} before the run was lost; a full re-run toward the $\approx 1.8$ BPTT
ceiling is pending. These should
be read as diagnoses with promising partial evidence, not as established numbers.

%==============================================================================
\section*{Attribution summary}
\addcontentsline{toc}{section}{Attribution summary}

\begin{description}[leftmargin=2.2em,itemsep=2pt]
  \item[Theirs.] Classic energy-based EP and centered nudging
    \citep{scellier2017,laborieux2021}; EP $\equiv$ BPTT in the converged, $\beta\!\to\!0$
    limit \citep{ernoult2019}; holomorphic EP \citep{laborieux2022}; the asymmetric/AEP
    correction $J\!\to\!J^{\!\top}$ \emph{and} the force-form VF readout
    \citep{scurria2026}; the Jacobian-norm penalty \citep{bai2021}; DEQ
    \citep{bai2019} and monotone DEQ \citep{winston2020}; the Convergent Energy
    Transformer / CET \citep{hoier2026}.
  \item[Ours.] The transformer application of the force route and the damping recipe
    (damping $+$ AEP making real attention EP-trainable at any gain); the matrix-free
    $\jvp/\vjp$ form of the correction at transformer scale and its combination with
    holomorphic estimation and softmax attention; \emph{tracking-AEP} (common-mode
    re-linearization, Eq.~\ref{eq:track}); the residual-driven controller, the validity
    gate, and adaptive-$T_2$ snapshot selection; and the (ongoing) residual-defense term
    \texttt{resreg} (Eq.~\ref{eq:resreg}) with its objective-mismatch diagnosis.
\end{description}

%==============================================================================
\begin{thebibliography}{9}
\bibitem[Bai et al., 2019]{bai2019}
  S.~Bai, J.~Z.~Kolter, V.~Koltun.
  \emph{Deep Equilibrium Models}. NeurIPS 2019.

\bibitem[Bai et al., 2021]{bai2021}
  S.~Bai, V.~Koltun, J.~Z.~Kolter.
  \emph{Stabilizing Equilibrium Models by Jacobian Regularization}. ICML 2021.

\bibitem[Ernoult et al., 2019]{ernoult2019}
  M.~Ernoult, J.~Grollier, D.~Querlioz, Y.~Bengio, B.~Scellier.
  \emph{Updates of Equilibrium Prop Match Gradients of Backprop Through Time in an
  RNN with Static Input}. NeurIPS 2019.

\bibitem[H{\o}ier et al., 2026]{hoier2026}
  R.~H{\o}ier, K.~Kerjan, B.~Scellier.
  \emph{Training a Convergent Energy Transformer with Equilibrium Propagation} (CET).
  ICLR 2026 Associative Memory workshop; OpenReview \texttt{Qrfml76eWJ}.

\bibitem[Laborieux et al., 2021]{laborieux2021}
  A.~Laborieux, M.~Ernoult, B.~Scellier, Y.~Bengio, J.~Grollier, D.~Querlioz.
  \emph{Scaling Equilibrium Propagation to Deep ConvNets by Drastically Reducing its
  Gradient Estimator Bias} (centered/symmetric nudging). Frontiers in Neuroscience, 2021.

\bibitem[Laborieux \& Zenke, 2022]{laborieux2022}
  A.~Laborieux, F.~Zenke.
  \emph{Holomorphic Equilibrium Propagation Computes Exact Gradients Through Finite Size
  Oscillations}. NeurIPS 2022.

\bibitem[Scellier \& Bengio, 2017]{scellier2017}
  B.~Scellier, Y.~Bengio.
  \emph{Equilibrium Propagation: Bridging the Gap between Energy-Based Models and
  Backpropagation}. Frontiers in Computational Neuroscience, 2017.

\bibitem[Scurria et al., 2026]{scurria2026}
  A.~Scurria, P.~Vanden Abeele, B.~Mognetti, S.~Massar.
  \emph{Equilibrium Propagation for Non-Conservative Systems} (AsymEP).
  arXiv:2602.03670, 2026.

\bibitem[Winston \& Kolter, 2020]{winston2020}
  E.~Winston, J.~Z.~Kolter.
  \emph{Monotone Operator Equilibrium Networks} (monotone DEQ). NeurIPS 2020.
\end{thebibliography}

\end{document}