summaryrefslogtreecommitdiff
path: root/assets
diff options
context:
space:
mode:
author Yuren Hao <yurenh2@illinois.edu> 2026-07-03 05:56:50 -0500
committer Yuren Hao <yurenh2@illinois.edu> 2026-07-03 05:56:50 -0500
commit b83947778e2c776f757a07d4719b7ce961d7ed55 (patch)
tree b9cc01d7adda691d9156d9d04f4fb2f644674e96 /assets
Initial commit: ept — backprop-free equilibrium transformer (EP)
Code (ep_run/), organized docs (docs/{method,campaign,hardware,outreach,paper}), analysis scripts (scripts/), ONBOARDING.md entry point. Large data/checkpoints git-ignored (share separately). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_014FAPDWQ49M5Ye3NpTndTpn
Diffstat (limited to 'assets')
-rw-r--r--assets/ept_method_intro.tex606
-rw-r--r--assets/frozen_vs_adaptive.pngbin0 -> 217748 bytes
2 files changed, 606 insertions, 0 deletions
diff --git a/assets/ept_method_intro.tex b/assets/ept_method_intro.tex
new file mode 100644
index 0000000..31d458d
--- /dev/null
+++ b/assets/ept_method_intro.tex
@@ -0,0 +1,606 @@
+\documentclass[11pt]{article}
+
+\usepackage[margin=1in]{geometry}
+\usepackage{amsmath,amssymb}
+\usepackage{bm}
+\usepackage[round]{natbib}
+\usepackage{enumitem}
+\usepackage{booktabs}
+\usepackage[colorlinks=true,linkcolor=blue,citecolor=blue,urlcolor=blue]{hyperref}
+
+% --- light-weight notation ---------------------------------------------------
+\newcommand{\R}{\mathbb{R}}
+\newcommand{\C}{\mathbb{C}}
+\renewcommand{\Re}{\operatorname{Re}}
+\newcommand{\xin}{x_{\mathrm{in}}}
+\newcommand{\zstar}{z^{\ast}}
+\newcommand{\zbar}{\bar{z}}
+\newcommand{\Fnc}{F_{\mathrm{nc}}}
+\newcommand{\Jnc}{J_{\mathrm{nc}}}
+\newcommand{\half}{\tfrac12}
+\newcommand{\grad}{\nabla}
+\newcommand{\dd}{\,\mathrm{d}}
+\newcommand{\inner}[2]{\langle #1,\, #2\rangle}
+\DeclareMathOperator{\Attn}{Attn}
+\DeclareMathOperator{\FFN}{FFN}
+\DeclareMathOperator{\softmax}{softmax}
+\DeclareMathOperator{\LSE}{LSE}
+\DeclareMathOperator{\LN}{LN}
+\DeclareMathOperator{\jvp}{jvp}
+\DeclareMathOperator{\vjp}{vjp}
+\DeclareMathOperator*{\argmin}{arg\,min}
+
+\title{\bfseries Training a Transformer Language Model with Equilibrium Propagation:\\
+from energy-based EP to non-conservative, holomorphic, tracking-AEP}
+\author{Method introduction (internal)}
+\date{2026-06-21}
+
+\begin{document}
+\maketitle
+
+\begin{abstract}
+We train a transformer-class language model in which \emph{both} attention and the
+feed-forward network learn \emph{without backpropagation through the computation},
+using Equilibrium Propagation (EP). This note is written for a reader who knows
+\emph{classic} energy-based EP \citep{scellier2017} --- the two-phase free/nudged
+relaxation of a conservative, symmetric-Jacobian system --- but has not met the
+non-conservative / asymmetric / holomorphic extensions. We first recall why classic
+EP \emph{requires} a conservative system, then show that softmax self-attention
+breaks that requirement (independent $Q,K,V$ give an asymmetric Jacobian). We then
+introduce, from first principles, the pieces that repair this: the
+\emph{asymmetric / adjoint} EP correction $J\!\to\!J^{\!\top}$
+\citep{scurria2026}; the \emph{holomorphic} EP estimator \citep{laborieux2022};
+the \emph{Convergent Energy Transformer} (CET) route \citep{hoier2026} that
+sidesteps the problem by making attention conservative; and finally \emph{our}
+recipe: a damped non-conservative equilibrium-transformer block, trained with
+\emph{tracking-AEP} (re-linearizing the correction at the moving common-mode
+midpoint) plus a residual-driven stabilization stack. We report what is solidly
+validated --- component gradients match backprop at cosine $0.99$--$1.0$, and EP
+trains the block stably and competitively with a backprop transformer at equal
+parameters on a character-level LM --- and clearly mark the larger-scale work
+(the $C{=}512$ ``residual-defense'' line) as \emph{ongoing}.
+\end{abstract}
+
+\tableofcontents
+
+%==============================================================================
+\section{Recap: classic energy-based EP and why it needs a conservative system}
+\label{sec:classic}
+
+\paragraph{Setup.}
+Classic EP \citep{scellier2017} trains a dynamical system whose state
+$z\in\R^{d}$ relaxes, under a fixed input/clamp, to the minimum of a scalar
+\emph{energy} $E(z,\theta)$. Two ideas make it a learning rule.
+
+\paragraph{Two phases.}
+\begin{itemize}[leftmargin=1.4em,itemsep=2pt]
+ \item \emph{Free phase.} Run the gradient dynamics $\dot z=-\grad_z E(z,\theta)$
+ to the free equilibrium $\zstar=\argmin_z E(z,\theta)$,
+ in practice an Euler relaxation to a fixed point.
+ \item \emph{Nudged phase.} Add the task loss to the energy with a small strength
+ $\beta$, $E_\beta = E + \beta\,\ell(z)$, and relax to the nudged
+ equilibrium $z_\beta$.
+\end{itemize}
+
+\paragraph{The contrastive gradient.}
+EP's central identity is that the loss gradient w.r.t.\ any parameter is the
+\emph{contrastive difference of $\partial E/\partial\theta$ across the two phases}:
+\begin{equation}
+ \frac{\partial \mathcal{L}}{\partial \theta}
+ \;\approx\;
+ \frac{1}{\beta}\!\left[
+ \frac{\partial E}{\partial\theta}(z_\beta,\theta)
+ -\frac{\partial E}{\partial\theta}(\zstar,\theta)
+ \right]
+ \qquad(\text{one-sided, bias }O(\beta)).
+ \label{eq:ep-onesided}
+\end{equation}
+Centered / symmetric nudging \citep{laborieux2021} nudges with both $+\beta$ and $-\beta$ and takes the
+central difference, reducing the estimator bias from $O(\beta)$ to $O(\beta^2)$:
+\begin{equation}
+ \frac{\partial \mathcal{L}}{\partial \theta}
+ \;\approx\;
+ \frac{1}{2\beta}\!\left[
+ \frac{\partial E}{\partial\theta}(z_{+\beta})
+ -\frac{\partial E}{\partial\theta}(z_{-\beta})
+ \right].
+ \label{eq:ep-centered}
+\end{equation}
+The update is \emph{local}: each parameter reads only the two equilibria of the
+terms it touches; there is no backward pass and no weight transport. As
+$\beta\!\to\!0$ with a converged free phase, the EP estimate equals the
+implicit/equilibrium gradient, and (in an RNN with static input) it equals the
+step-wise BPTT gradient \citep{ernoult2019}.
+
+\paragraph{Why this needs a conservative / symmetric-Jacobian system.}
+Equations \eqref{eq:ep-onesided}--\eqref{eq:ep-centered} are only valid because
+the dynamics are the \emph{gradient} of a scalar energy. Write the force as
+$F(z) = -\grad_z E(z)$ and its Jacobian as $J=\partial F/\partial z$. If $F$
+descends an energy, then $J = -\,\partial^2 E/\partial z^2$ is a Hessian and is
+therefore \emph{symmetric}, $J=J^{\!\top}$. This symmetry is exactly what makes the
+nudged perturbation a faithful surrogate for the loss \emph{adjoint}: linearizing
+the nudged relaxation around $\zstar$ produces a response governed by
+$J^{-1}$, and because $J=J^{\!\top}$ this self-adjoint operator is the same one
+the true gradient (which involves $(J^{\!\top})^{-1}$) requires. We therefore
+record the four implicit premises of classic EP --- the transformer will break all
+four, and each fix below targets exactly one of them:
+\begin{description}[leftmargin=2.6em,itemsep=2pt]
+ \item[(A) Conservative / symmetric.] A scalar energy $E$ exists, so $J=J^{\!\top}$.
+ \item[(B) Free phase converged.] The readout sits at the true fixed point;
+ residual $\approx 0$.
+ \item[(C) Small-$\beta$ linear response, clean nudge.] $\beta\!\to\!0$ is a mere
+ perturbation, and no non-analytic ``clamp'' contaminates the estimate.
+ \item[(D) The fixed point stays stable throughout training.] After every weight
+ update the free phase still relaxes to a stable fixed point.
+\end{description}
+
+%==============================================================================
+\section{The gap: softmax attention is non-conservative}
+\label{sec:gap}
+
+A pre-LN transformer block computes, for a state $z$,
+\begin{equation}
+ \Attn(z) = \softmax\!\Big(\tfrac{Q(z)K(z)^{\!\top}}{\sqrt{d}},\ \text{causal}\Big)V(z)\,W_O,
+ \qquad
+ Q=zW_Q,\ K=zW_K,\ V=zW_V,
+ \label{eq:attn}
+\end{equation}
+with \emph{independent} projections $W_Q,W_K,W_V$. The query--key coupling
+$i\!\to\!j$ is governed by $W_QW_K^{\!\top}$, while $j\!\to\!i$ is governed by
+$W_KW_Q^{\!\top}$; these differ, and $V$ is a third independent map. Consequently
+the attention Jacobian is \emph{asymmetric}, $J_{\Attn}\neq J_{\Attn}^{\!\top}$, and
+\emph{no scalar energy has this gradient}. An untied $4\times$ FFN
+($W_2\,\mathrm{GELU}(W_1\cdot)$ with $W_2\neq W_1^{\!\top}$) is non-conservative for
+the same reason. Premise~(A) fails.
+
+Empirically this is not a cosmetic issue: with an asymmetric $J$ the nudged phase
+relaxes under $J$ but the correct loss adjoint needs $J^{\!\top}$, so the raw EP
+contrast is \emph{biased}. Measured against the true backprop gradient, uncorrected
+EP gives an attention-parameter cosine of only $\approx 0.25$ (essentially the
+wrong direction), even though the loss-adjacent output projection looks fine. (This
+is the same pathology that limits feedback alignment, which only trains the layer
+right before the loss and leaves $Q/K/V$ at cosine $\approx 0.25$ and the upstream
+FFN at $\approx -0.01$.)
+
+There are two ways out, and we will use the second:
+\begin{enumerate}[leftmargin=1.6em,itemsep=2pt]
+ \item \textbf{Energy route} (make attention conservative): fold attention into a
+ scalar energy with a \emph{tied} value, so $F=-\grad E$ and classic EP is
+ exactly valid. This is the CET route (\S\ref{sec:cet-energy}); it costs the
+ $Q\!\neq\!K$ asymmetry and the free value that make attention expressive.
+ \item \textbf{Force route} (keep real attention, repair the \emph{estimator}):
+ leave \eqref{eq:attn} as a non-conservative \emph{force} and add a
+ correction that turns $J$ into $J^{\!\top}$ in the nudged phase. This is the
+ AEP route (\S\ref{sec:aep}), and it is what our block uses.
+\end{enumerate}
+
+%==============================================================================
+\section{AEP, holomorphic EP, and the force-form readout}
+\label{sec:aep}
+
+\subsection{Force-form (vector-field) EP}
+\label{sec:vf}
+The first step is to drop the energy and write the dynamics directly as a force
+$F(z)$, relaxing $\dot z=F(z)$ to a fixed point $\zstar$. The parameter gradient is
+then read off a \emph{vector-field} (VF) contrast \citep{scurria2026}:
+\begin{equation}
+ \frac{\partial\mathcal{L}}{\partial\theta}
+ \;\approx\;
+ \frac{\partial}{\partial\theta}\,\big\langle a,\ F(\zstar;\theta)\big\rangle,
+ \qquad
+ a \;=\; \frac{z_{-\beta}-z_{+\beta}}{2\beta}\ \approx\ -\frac{\dd \zstar}{\dd\beta},
+ \label{eq:vf}
+\end{equation}
+where $a$ is the centered contrast (the ``adjoint state'') read from the two nudged
+equilibria, and the right-hand side is \emph{one} autograd call evaluated at the
+fixed point only --- per-term local bookkeeping, \emph{not} backprop through the
+relaxation steps. Every term of the block (attention, FFN, LayerNorm affines, and
+the embeddings, which enter through the input clamp $-(z-\xin)$) is a term of the
+same $F$, so \eqref{eq:vf} trains them jointly with no per-module schedule.
+
+\paragraph{Attribution / honest caveat.}
+The force-form VF readout \eqref{eq:vf} is \emph{not ours}: it is the baseline of
+\citet{scurria2026}. Crucially it \emph{collapses on its own} for a non-conservative
+system (their CIFAR-10 VF reaches chance, $10\%$; MNIST $64\%$ vs.\ $92.7\%$),
+exactly mirroring our measured cosine $\approx 0.25$ for uncorrected attention. VF
+is therefore the ``starting point that fails''; what rescues it is the next step.
+
+\subsection{The AEP correction: \texorpdfstring{$J\!\to\!J^{\!\top}$}{J to J transpose}}
+\label{sec:aep-corr}
+For a non-conservative $F$, the nudged relaxation linearized at $\zstar$ runs under
+$J=\partial F/\partial z$, but the true adjoint requires $J^{\!\top}$. \emph{Asymmetric
+EP} (AsymEP) \citep{scurria2026} repairs this by adding to the nudged force a term
+that subtracts twice the antisymmetric part of the Jacobian. With
+$v=z-\zstar$ and $\Jnc$ the Jacobian of the \emph{non-conservative} part $\Fnc$,
+\begin{equation}
+ \mathrm{corr}(z) \;=\; \Jnc\,v - \Jnc^{\!\top} v
+ \;=\; (\Jnc-\Jnc^{\!\top})\,v
+ \;=\; 2\,A_J\,v,
+ \qquad
+ A_J \equiv \tfrac12\big(\Jnc-\Jnc^{\!\top}\big),
+ \label{eq:aep}
+\end{equation}
+so that $-\mathrm{corr}(z)$, the term actually added to the force, is \emph{mathematically identical} to their $-2A_J(\zstar)(z-\zstar)$. The
+nudged force becomes $f \;=\; F(z) \mp \beta\,\grad_z\ell(z) - \mathrm{corr}(z)$,
+so the attention part of the nudged linearization is replaced as
+\begin{equation}
+ J\,v \;-\; (J-J^{\!\top})\,v \;=\; J^{\!\top} v ,
+\end{equation}
+i.e.\ \emph{$J$ is turned into $J^{\!\top}$}, restoring the correct adjoint and hence the
+exact gradient for $Q\!\neq\!K$ attention. Two structural facts make this cheap and
+local:
+\begin{itemize}[leftmargin=1.4em,itemsep=2pt]
+ \item \emph{The symmetric (conservative) parts cancel.} The damping $-c\,z$ has
+ Jacobian $-cI$ (symmetric), the FFN-as-Hopfield-energy and the input clamp
+ are symmetric, so they contribute $0$ to $A_J$. Thus a \emph{single}
+ correction on the attention term repairs the \emph{whole} block; FFN/clamp
+ ride along in the conservative part and are already exact under VF.
+ \item \emph{It is matrix-free.} We never build $\Jnc$. Each nudged step uses one
+ Jacobian-vector product and one vector-Jacobian product,
+ $\Jnc v=\jvp(\Fnc,\zstar,v)$ and $\Jnc^{\!\top} v=\vjp(\Fnc,\zstar,v)$.
+\end{itemize}
+
+\paragraph{Attribution.}
+The correction \eqref{eq:aep} is \citet{scurria2026}'s, \emph{not} ours. Their scope
+is feedforward / Hopfield nets on static MNIST/CIFAR with an \emph{explicitly
+constructed} Jacobian, no attention, no sequence model, and no stability controller.
+\emph{Ours on this line} is: (i) the matrix-free $\jvp/\vjp$ form (their explicit
+Jacobian is infeasible at transformer state dimension $B\!\cdot\!T\!\cdot\!C$);
+(ii) the application to data-dependent \emph{softmax attention}; (iii) the
+combination with holomorphic estimation (\S\ref{sec:holo}); (iv) the common-mode
+\emph{tracking} variant (\S\ref{sec:tracking}); and (v) the transformer-LM
+application together with the stability stack (\S\ref{sec:stab}).
+
+\paragraph{Validity window.}
+The correction is linearized \emph{at $\zstar$}, so the nudged trajectory must stay
+inside the linear-response window. At $\varepsilon{=}0.1$ a nudge horizon
+$T_2\!\approx\!20$ is comfortably inside; $T_2\gtrsim 60$ can leave it (\S\ref{sec:stab}).
+
+\subsection{Holomorphic EP: variance-reduced, higher-order estimates}
+\label{sec:holo}
+The $\pm\beta$ contrast trades bias against noise: small $\beta$ shrinks the
+$O(\beta^2)$ bias but amplifies the $1/\beta$ noise on $(z_{-\beta}-z_{+\beta})/(2\beta)$.
+Holomorphic EP \citep{laborieux2022} removes this trade-off by replacing the two
+real points with $N$ points on a \emph{complex circle},
+$\beta_k = r\,e^{2\pi i k/N}$, relaxing the \emph{holomorphically extended} dynamics
+and reading the contrast off a discrete Cauchy integral:
+\begin{equation}
+ a \;=\; -\,\Re\!\left[\frac{1}{Nr}\sum_{k=0}^{N-1} e^{-i\phi_k}\,(z_k-\zstar)\right],
+ \qquad \phi_k=\tfrac{2\pi k}{N},
+ \label{eq:holo}
+\end{equation}
+whose bias is $O(r^{N})$ instead of $O(r^{2})$ --- so $r$ may be $5$--$10\times$
+larger at equal bias, cutting the $1/\beta$ noise by the same factor. The
+holomorphic extension is built by hand (complex LayerNorm with non-conjugate
+variance, softmax as a ratio of exponentials, the $\tanh$-form GELU which is an
+entire function); the AEP correction \eqref{eq:aep} is \emph{linear in $v$ with real coefficients}, so
+applying it to the real and imaginary parts of $v$ separately is complex-linear and preserves holomorphy.
+No clamps appear inside the holomorphic nudge --- clamps are non-analytic and would
+destroy the $O(r^N)$ bias order. This addresses premise~(C). \citet{laborieux2022}
+is the source; we add only the combination with the AEP correction and with softmax
+attention.
+
+%==============================================================================
+\section{The equilibrium-transformer block (and the CET alternative)}
+\label{sec:block}
+
+\subsection{Our damped, non-conservative block (\texttt{thick})}
+\label{sec:thick}
+The state is $z\in\R^{B\times T\times C}$, one vector per token position. Inference
+is a relaxation to a fixed point under a \emph{single force} $F$,
+$z\leftarrow z+\varepsilon F(z)$ for $T_1$ steps ($\varepsilon{=}0.1$, $T_1{\approx}150$),
+after which logits $=\zstar W_h$. The force is a pre-LN transformer block written as
+a force rather than a layer stack:
+\begin{equation}
+ F(z) =
+ \underbrace{-(z-\xin)}_{\text{input clamp}}
+ +\underbrace{\Attn(\LN_1(z))}_{\text{causal MHSA},\ W_Q,W_K,W_V,W_O}
+ +\underbrace{W_2\mathrm{GELU}(W_1\LN_2(z)+b_1)+b_2}_{\text{untied }4\times\text{ FFN}}
+ -\underbrace{c\,z}_{\text{damping}}.
+ \label{eq:thick}
+\end{equation}
+Here $\xin=\mathrm{tok}[\mathrm{idx}]+\mathrm{pos}$ is the (trained) input
+embedding, clamped as a boundary condition through the $-(z-\xin)$ term; this is the
+same fixed-point map a Deep Equilibrium model \citep{bai2019} uses. The block is
+strongly non-conservative ($Q\!\neq\!K$, untied FFN), and AEP makes EP exact for it.
+
+\paragraph{Why the $-c\,z$ damping is the key recipe move.}
+Raw attention at high gain has \emph{no} fixed point: the residual floors at
+$\sim\!3\times10^{-2}$ and the relaxation never settles, so the entire EP family
+(corrected or not) cannot even start (there is no $\zstar$ to nudge around). Adding
+$-c\,z$ ($c\!\geq\!1$) makes the map contractive enough to \emph{create a stable
+fixed point at any attention strength}, while leaving the map non-conservative
+(independent $Q/K/V$ are untouched). Critically, the damping's Jacobian $-cI$ is
+symmetric, so it \emph{cancels in $A_J$} \eqref{eq:aep}: it buys a fixed point
+without polluting the AEP correction, which still sees only attention's
+non-reciprocal part. Together, ``damping $+$ AEP'' is the minimal recipe that makes
+real attention EP-trainable, taking the attention-parameter cosine from
+$\approx 0.25$ (uncorrected) to $0.99$--$1.0$ even at high gain.
+
+\paragraph{A subtlety for LN-inside blocks.}
+Because LayerNorm sits \emph{inside} \eqref{eq:thick} and its Jacobian scales like
+$1/\sigma(z)$, large damping shrinks $\|\zstar\|$ and thereby \emph{inflates} the
+effective Jacobian (measured: plain-relax residual $8.8\times10^{-3}$ at $c{=}0$
+vs.\ $3.4\times10^{-2}$ at $c{=}2$). So for \texttt{thick} we keep $c$ small ($c{=}1$)
+and the actual stabilizer is the Jacobian-norm penalty of \S\ref{sec:stab}, not the
+damping. (For a simpler ``thin'' variant whose FFN is an energy-based modern-Hopfield
+memory and whose attention is a raw damped force, the damping \emph{is} required.)
+
+\subsection{The CET / energy route (the conservative alternative)}
+\label{sec:cet-energy}
+\textbf{CET} here means the \emph{Convergent Energy Transformer} of
+\citet{hoier2026} --- an energy-based transformer block, trained with EP, that we
+reproduced (on masked image completion) as the prior SOTA for ``EP $+$ attention''.
+Its trick is to make attention \emph{conservative} so classic EP applies with
+\emph{no} correction: attention is folded into a scalar energy
+\begin{equation}
+ E_{\mathrm{att}}(z) \;=\;
+ -\frac{1}{\gamma}\sum_{\text{heads},\,i}
+ \LSE_{j}\!\big(\gamma\, q_i\!\cdot\!k_j\big)
+ \quad(\text{causal-masked}),
+ \label{eq:cet}
+\end{equation}
+whose force \emph{ties the value to the key} ($v\!\equiv\!k$), plus a confinement
+$\tfrac12 c\|z\|^2$ (because $E_{\mathrm{att}}$ is unbounded below) and a
+modern-Hopfield memory energy $E_{\mathrm{mem}}(z)=-\sum\mathrm{relu}(zW_m)^2$
+playing the role of the FFN (its force is a \emph{tied}-weight squared-ReLU MLP). On
+this energy $F=-\grad E$ exactly, so classic EP is valid with symmetric Jacobian and
+no AEP. In our reproduction EP matched truncated-BPTT (``EP $\approx$ TBPTE'',
+gradient cosine $0.99$). The trade-off is expressivity: the tied value and
+reciprocal coupling are the least expressive form of attention. Under \emph{exact}
+gradients on the LM, this conservative route (and a monotone-DEQ variant
+\citep{winston2020}) costs $\approx 0.15$--$0.2$ CE relative to the non-conservative
+\texttt{thick} block --- which is precisely why we pay for the AEP machinery and keep
+real attention.
+
+%==============================================================================
+\section{Our recipe: tracking-AEP and the stabilization stack}
+\label{sec:recipe}
+
+\subsection{Tracking-AEP: re-linearize at the moving common mode}
+\label{sec:tracking}
+The AEP correction \eqref{eq:aep} is frozen at $\zstar$. Near a good solution this
+becomes the binding error: as the model sharpens, the true gradient shrinks below
+the \emph{bias floor} of the frozen linearization, and the highly non-normal block
+Jacobian makes that floor large (we measure $\|\Jnc v-\Jnc^{\!\top} v\|/\|\Jnc v\|=1.37$
+at $\zstar$). The fix is to re-linearize the antisymmetric correction not at the
+frozen $\zstar$ but at the \emph{instantaneous common mode} of the two nudged
+trajectories,
+\begin{equation}
+ \zbar \;=\; \half\big(z_{+}+z_{-}\big),
+ \qquad
+ \mathrm{corr}(z) \;=\; \Jnc(\zbar)\,v - \Jnc(\zbar)^{\!\top} v,
+ \quad v = z-\zbar,
+ \label{eq:track}
+\end{equation}
+evaluated step-by-step as $\zbar$ moves with the nudge (run the $+$ and $-$ phases in
+lockstep, recompute $\jvp/\vjp$ about the running $\zbar$). This is exact transposed
+differential dynamics with no compounding linearization error, and it tolerates a loosely
+converged free phase (it does not demand an ultra-tight one). At a plateau checkpoint where the
+frozen estimator had collapsed (gradient cosine vs.\ BPTT $-0.045$, batch-to-batch
+self-coherence $-0.27$, magnitude ratio $\sim\!4000\times$), tracking-AEP restores
+cosine $0.997$, self-coherence $+0.95$, magnitude ratio $0.9$. Tracking-AEP and the
+common-mode formulation \eqref{eq:track} are \emph{ours}.
+
+\subsection{The validity threshold and the residual as the health signal}
+\label{sec:stab}
+The governing empirical fact is that the EP estimator has a \emph{validity threshold}
+in the free-phase relative residual
+\begin{equation}
+ \mathrm{res} \;=\; \frac{\|z^{+}-\zstar\|}{\|\zstar\|}
+ \qquad(z^{+}\text{: the state after one extra relaxation step from }\zstar),
+\end{equation}
+which is the load-bearing health signal (premise~(B)). Gradient cosine vs.\ the exact
+reference degrades sharply with res: $\approx 0.85$ at $\mathrm{res}\!\sim\!5\times10^{-5}$,
+batch-dependent $0.2$--$0.9$ at $10^{-3}$, and noise at $10^{-2}$. BPTT has no such
+threshold (it differentiates the actual finite unroll, converged or not); \emph{this
+asymmetry, and nothing deeper, is the EP-specific difficulty}. Accordingly the free
+phase is run adaptively: relax to $T_1{=}150$, then continue in chunks until
+$\mathrm{res}\!\le\!10^{-4}$ before nudging. We emphasize there is \emph{no} structural
+``EP ceiling'': an early ``EP caps at $\sim\!2.5$'' verdict was traced to two
+undertrained/invalid-regime runs and retracted.
+
+\subsection{The stabilization stack}
+Training pushes the dynamics off the contractive manifold (premise~(D)) --- and not
+only for EP: even \emph{exact} BPTT on this architecture walks off the manifold on
+long horizons (residual $\to 4.7\times10^{-2}$, val CE $\to 3.0$). The stack that
+keeps the system valid:
+\begin{itemize}[leftmargin=1.4em,itemsep=3pt]
+ \item \textbf{Frozen / controlled Jacobian-norm penalty (\texttt{jacreg}).} A soft
+ penalty $\lambda\,\|\Jnc(\zstar)\|_F^2$, estimated matrix-free by Hutchinson
+ (one $\jvp$ on a random probe, differentiated w.r.t.\ $\theta$). This is
+ \citet{bai2021}'s DEQ-stabilization penalty, \emph{not} ours. It keeps the
+ free phase contractive and hence the estimator inside its validity region.
+ A continuous controller drives it,
+ $\lambda \leftarrow \mathrm{clip}\big(\lambda\,(\mathrm{res}_{\mathrm{EMA}}/\mathrm{target})^{0.3}\big)$,
+ on an EMA-smoothed residual (the raw residual is noisy and a multiplicative
+ controller on it random-walks). A key hard lesson: the controller \emph{floor}
+ is load-bearing and must never anneal to zero --- two independent
+ $\lambda\!\to\!0$ runs died identically (val CE $60$--$77$, $\mathrm{res}\!\equiv\!0$),
+ which the post-mortem showed to be an \emph{explosion disguised as convergence by
+ floating-point absorption} ($\varepsilon F<\mathrm{ulp}(z)$ freezes the
+ relaxation), not a benign dead state.
+ \item \textbf{Residual, not spectral radius, as the control signal.} The block
+ Jacobian is highly non-normal, so transient growth is invisible to
+ eigenvalues (measured $\rho(J){=}0.94$ ``stable'' while the relaxation
+ diverged to $\mathrm{res}\,0.21$). The one-step residual \emph{is} the
+ transient; we control on it.
+ \item \textbf{Validity gate.} When the residual exceeds a gate, the EP update is
+ mathematically undefined, so we apply only the homeostat (jacreg) and skip the
+ nudge --- a fast recovery step. At larger scale this gate is load-bearing
+ (off-equilibrium EP updates poison the weights).
+ \item \textbf{Adaptive $T_2$ by hindsight snapshot selection.} On slow-mixing
+ batches a long nudge phase can diverge through non-normal transient growth,
+ and step-size early-stopping \emph{fails} (the transient triggers it
+ spuriously). Instead, run to $T_{2\max}$ in lockstep, snapshot the contrast
+ $a_t$ every few steps, and return the \emph{most settled} snapshot (smallest
+ increment of $a_t$); judging by increments of the \emph{quantity of interest}
+ rather than step sizes makes transient growth harmless. This is ours; it
+ lifts probe cosine from $0.871$ to $0.932$.
+\end{itemize}
+
+\subsection{Ongoing: the residual-defense term (\texttt{resreg}) --- under validation}
+\label{sec:resreg}
+At larger width ($C{=}512$) we observe a distinct, \emph{still-open} failure that we
+call the below-$2.10$ wall: frozen-jacreg, tracking-AEP EP descends to best
+$\approx 2.09$ and then bifurcates within $\sim\!200$ steps (residual
+$5\!\times\!10^{-3}\!\to\!0.15$, gradient cosine $0.98\!\to\!0$, CE $\to\!4{+}$),
+while \emph{exact} BPTT with the identical recipe sails past to $1.72$. The diagnosed
+root cause is an \emph{objective mismatch}: EP optimizes the (refined) fixed point and
+never defends the finite-step residual that evaluation actually uses, whereas BPTT
+differentiates the finite unroll and so implicitly rewards contraction. The diverged
+state is a forward bifurcation to a \emph{limit cycle}, so more relaxation steps cannot
+fix it; only a residual \emph{cost} can. The proposed fix is an explicit $T_1$-residual
+penalty on the \emph{evaluated} state $z_{150}=\mathrm{relax}(\xin,T_1)$ taken before
+any refinement,
+\begin{equation}
+ R_{\mathrm{res}} \;=\; \frac{\|\varepsilon F(z_{150})\|^2}{\|z_{150}\|^2+\varepsilon},
+ \qquad
+ \text{gradient w.r.t.\ }\theta\text{ with }z_{150}\text{ detached},
+ \label{eq:resreg}
+\end{equation}
+scaled task-relative and added to the EP gradient (run with the validity gate off, so
+the penalty is not bypassed exactly when the residual is high). \textbf{Status: this is
+ongoing.} The residual-defense term \eqref{eq:resreg} held the residual pinned at
+$1$--$5\times10^{-4}$ and reached best $2.0573$ (past the wall) through only step
+$\sim\!1000$ before a storage cleanup deleted the run; full re-validation toward the
+$\approx 1.8$ BPTT ceiling is pending. We present it as a diagnosis $+$ proposed fix,
+\emph{not} a finished result. (The objective-mismatch diagnosis, the common-mode
+tracking estimator, the residual-driven controller and validity gate, and this
+residual-defense term are ours.)
+
+%==============================================================================
+\section{Established results (and what is still open)}
+\label{sec:results}
+
+\paragraph{Solidly validated.}
+\begin{itemize}[leftmargin=1.4em,itemsep=3pt]
+ \item \textbf{EP/AEP component gradients match backprop.} On the character LM,
+ AEP gives causal-attention parameters cosine $0.99$, the (Hopfield) FFN
+ $1.00$, and the full LM block $0.99$ vs.\ the true backprop gradient
+ --- versus feedback alignment at $Q/K/V\approx 0.25$, FFN $\approx -0.01$.
+ On the CET reproduction, global cosine $0.99$ and EP $\approx$ TBPTE on
+ masked-image completion.
+ \item \textbf{EP trains the equilibrium transformer stably, without backprop.}
+ With the stabilization stack, end-to-end EP runs $10\text{k}+$ steps with
+ zero non-finite steps.
+ \item \textbf{It is competitive with a BP transformer at equal parameters.} On
+ Shakespeare character-LM (single block, $C{=}128$), at a fully controlled
+ $14$k-step comparison (Table~\ref{tab:results}): EP reaches val CE
+ \textbf{1.676} (multi-seed $1.680\pm0.005$, $3$ seeds); the like-for-like
+ standard BP transformer (matched in parameter \emph{shape} to the thick
+ block) reaches $1.610$; EP \emph{beats} the thinner BP baseline ($1.689$).
+ The total gap of $0.066$ decomposes into an architecture tax $\approx 0.025$
+ (BPTT on the identical block $1.635$) and an EP-rule tax $\approx 0.041\pm0.005$
+ --- real, tightly reproducible, and consistent with the measured estimator
+ misalignment (cosine $0.85$--$0.93$).
+\end{itemize}
+
+\begin{table}[t]
+ \centering
+ \small
+ \begin{tabular}{llc}
+ \toprule
+ \textbf{training rule} & \textbf{architecture / recipe} & \textbf{best val CE}\\
+ \midrule
+ BP & standard transformer (like-for-like for \texttt{thick}) & \textbf{1.610}\\
+ BPTT $+$ $\lambda$-controller $+$ param-EMA & \texttt{thick} (exact grad, same stabilizer) & 1.635\\
+ \textbf{EP} & \texttt{thick}; tracking-AEP $+$ adaptive $T_1/T_2$ & \textbf{1.676}\\
+ BP & standard transformer (thin-matched) & 1.689\\
+ BPTT (exact grad) & \texttt{thick}, unregularized & 2.021 (destabilizes late)\\
+ random & --- & 4.174\\
+ \bottomrule
+ \end{tabular}
+ \caption{Fully-controlled $14$k-step comparison on Shakespeare char-LM
+ (random $=\ln 65$). EP matches the architecture-controlled exact-gradient
+ run to within $0.041$ and beats the thin-matched BP baseline. ``BPTT as
+ ablation'' separates the training-rule cost (EP$-$BPTT) from the
+ architecture cost (BPTT$-$BP).}
+ \label{tab:results}
+\end{table}
+
+\paragraph{Honest framing of the controlled comparison.}
+EP beats \emph{bare} BPTT, but the controlled table shows most of that win is EP's
+\emph{mandatory} stabilization loop doubling as regularization: bare exact-gradient
+training walks off the contractive manifold at $14$k, and the same controller that EP
+cannot live without also lifts BPTT to $1.635$. The contraction controller is good for
+the equilibrium architecture regardless of training rule; EP merely forced its
+discovery.
+
+\paragraph{Ongoing / under validation.}
+The $C{=}512$ work is \emph{not} a finished result. (i) The $2.40$ plateau there is
+diagnosed as a late-training EP estimator bias-floor / batch-incoherence, which
+tracking-AEP breaks in training ($2.40\!\to\!2.16$, still descending in a $2500$-step
+warm-start test). (ii) The below-$2.10$ wall is diagnosed as the objective mismatch of
+\S\ref{sec:resreg}; the residual-defense term \eqref{eq:resreg} validated res-tight and
+past the wall (best $2.0573$) \emph{only through step $\sim\!1000$} before the run was
+lost, and a full re-run toward the $\approx 1.8$ BPTT ceiling is pending. These should
+be read as diagnoses with promising partial evidence, not as established numbers.
+
+%==============================================================================
+\section*{Attribution summary}
+\addcontentsline{toc}{section}{Attribution summary}
+
+\begin{description}[leftmargin=2.2em,itemsep=2pt]
+ \item[Theirs.] Classic energy-based EP and centered nudging
+ \citep{scellier2017,laborieux2021}; EP $\equiv$ BPTT in the converged, $\beta\!\to\!0$
+ limit \citep{ernoult2019}; holomorphic EP \citep{laborieux2022}; the asymmetric/AEP
+ correction $J\!\to\!J^{\!\top}$ \emph{and} the force-form VF readout
+ \citep{scurria2026}; the Jacobian-norm penalty \citep{bai2021}; DEQ
+ \citep{bai2019} and monotone DEQ \citep{winston2020}; the Convergent Energy
+ Transformer / CET \citep{hoier2026}.
+ \item[Ours.] The transformer application of the force route and the damping recipe
+ (damping $+$ AEP making real attention EP-trainable at any gain); the matrix-free
+ $\jvp/\vjp$ form of the correction at transformer scale and its combination with
+ holomorphic estimation and softmax attention; \emph{tracking-AEP} (common-mode
+ re-linearization, Eq.~\ref{eq:track}); the residual-driven controller, the validity
+ gate, and adaptive-$T_2$ snapshot selection; and the (ongoing) residual-defense term
+ \texttt{resreg} (Eq.~\ref{eq:resreg}) with its objective-mismatch diagnosis.
+\end{description}
+
+%==============================================================================
+\begin{thebibliography}{9}
+\bibitem[Bai et al., 2019]{bai2019}
+ S.~Bai, J.~Z.~Kolter, V.~Koltun.
+ \emph{Deep Equilibrium Models}. NeurIPS 2019.
+
+\bibitem[Bai et al., 2021]{bai2021}
+ S.~Bai, V.~Koltun, J.~Z.~Kolter.
+ \emph{Stabilizing Equilibrium Models by Jacobian Regularization}. ICML 2021.
+
+\bibitem[Ernoult et al., 2019]{ernoult2019}
+ M.~Ernoult, J.~Grollier, D.~Querlioz, Y.~Bengio, B.~Scellier.
+ \emph{Updates of Equilibrium Prop Match Gradients of Backprop Through Time in an
+ RNN with Static Input}. NeurIPS 2019.
+
+\bibitem[H{\o}ier et al., 2026]{hoier2026}
+ R.~H{\o}ier, K.~Kerjan, B.~Scellier.
+ \emph{Training a Convergent Energy Transformer with Equilibrium Propagation} (CET).
+ ICLR 2026 Associative Memory workshop; OpenReview \texttt{Qrfml76eWJ}.
+
+\bibitem[Laborieux et al., 2021]{laborieux2021}
+ A.~Laborieux, M.~Ernoult, B.~Scellier, Y.~Bengio, J.~Grollier, D.~Querlioz.
+ \emph{Scaling Equilibrium Propagation to Deep ConvNets by Drastically Reducing its
+ Gradient Estimator Bias} (centered/symmetric nudging). Frontiers in Neuroscience, 2021.
+
+\bibitem[Laborieux \& Zenke, 2022]{laborieux2022}
+ A.~Laborieux, F.~Zenke.
+ \emph{Holomorphic Equilibrium Propagation Computes Exact Gradients Through Finite Size
+ Oscillations}. NeurIPS 2022.
+
+\bibitem[Scellier \& Bengio, 2017]{scellier2017}
+ B.~Scellier, Y.~Bengio.
+ \emph{Equilibrium Propagation: Bridging the Gap between Energy-Based Models and
+ Backpropagation}. Frontiers in Computational Neuroscience, 2017.
+
+\bibitem[Scurria et al., 2026]{scurria2026}
+ A.~Scurria, P.~Vanden Abeele, B.~Mognetti, S.~Massar.
+ \emph{Equilibrium Propagation for Non-Conservative Systems} (AsymEP).
+ arXiv:2602.03670, 2026.
+
+\bibitem[Winston \& Kolter, 2020]{winston2020}
+ E.~Winston, J.~Z.~Kolter.
+ \emph{Monotone Operator Equilibrium Networks} (monotone DEQ). NeurIPS 2020.
+\end{thebibliography}
+
+\end{document}
diff --git a/assets/frozen_vs_adaptive.png b/assets/frozen_vs_adaptive.png
new file mode 100644
index 0000000..e45e77b
--- /dev/null
+++ b/assets/frozen_vs_adaptive.png
Binary files differ