From b83947778e2c776f757a07d4719b7ce961d7ed55 Mon Sep 17 00:00:00 2001 From: Yuren Hao Date: Fri, 3 Jul 2026 05:56:50 -0500 Subject: =?UTF-8?q?Initial=20commit:=20ept=20=E2=80=94=20backprop-free=20e?= =?UTF-8?q?quilibrium=20transformer=20(EP)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Code (ep_run/), organized docs (docs/{method,campaign,hardware,outreach,paper}), analysis scripts (scripts/), ONBOARDING.md entry point. Large data/checkpoints git-ignored (share separately). Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_014FAPDWQ49M5Ye3NpTndTpn --- ep_run/CODEX_VERDICT.md | 151 +++ ep_run/EP_DIAGNOSIS_DOSSIER.md | 99 ++ ep_run/FUGU_OPTIONS_VERDICT.md | 263 +++++ ep_run/FUGU_Q1_VERDICT.md | 123 +++ ep_run/FUGU_Q_OPTIONS.md | 29 + ep_run/FUGU_VERDICT_FULL.md | 160 +++ ep_run/GPT55_BUG_HUNT.md | 249 +++++ ep_run/adaptive_eps_calib.py | 40 + ep_run/adaptive_eps_calib2.py | 39 + ep_run/alert.sh | 13 + ep_run/analogET_extracted.txt | 1861 ++++++++++++++++++++++++++++++++ ep_run/analyze.py | 95 ++ ep_run/analyze_all.py | 86 ++ ep_run/analyze_ln_jacobian.py | 166 +++ ep_run/analyze_softmax_jacobian.py | 168 +++ ep_run/anderson_control.py | 67 ++ ep_run/asym_probe.py | 922 ++++++++++++++++ ep_run/auto_probe.py | 25 + ep_run/bench_gpu2.py | 33 + ep_run/bf16_dbg.py | 29 + ep_run/bf16_dbg2.py | 30 + ep_run/bias_var.py | 63 ++ ep_run/bp_charlm.py | 78 ++ ep_run/compile_bench.py | 44 + ep_run/cos_monitor.py | 49 + ep_run/cos_sweep.log | 15 + ep_run/cos_sweep.py | 35 + ep_run/data_prep.log | 4 + ep_run/diag_cos.py | 45 + ep_run/drift_diag.py | 87 ++ ep_run/eig_control.py | 50 + ep_run/eig_jacreg.py | 38 + ep_run/eig_probe.py | 43 + ep_run/ep_ajr_check.py | 22 + ep_run/ep_c3_watch.py | 19 + ep_run/ep_c_check.py | 22 + ep_run/ep_eps05_grid.py | 29 + ep_run/ep_eps05_track.py | 24 + ep_run/ep_eps05_track2.py | 26 + ep_run/ep_eps05_watch.py | 20 + ep_run/ep_fast_check.py | 23 + ep_run/ep_fast_timing.py | 20 + ep_run/ep_jacreg_binary.py | 22 + ep_run/ep_jacreg_grid.py | 27 + ep_run/ep_jacreg_spike.py | 26 + ep_run/ep_resreg_check.py | 23 + ep_run/ep_resreg_grid.py | 24 + ep_run/ep_rr_check.py | 22 + ep_run/ep_sn_monitor.py | 43 + ep_run/ep_t2fix_watch.py | 20 + ep_run/epmc.json | 1 + ep_run/eps_sweep_s3200.py | 29 + ep_run/eval_relax_s3200.py | 23 + ep_run/extracted_paper.txt | 2039 ++++++++++++++++++++++++++++++++++++ ep_run/factorized_exit.py | 330 ++++++ ep_run/fast_probe.py | 41 + ep_run/gcalib.py | 38 + ep_run/gen_ept.py | 32 + ep_run/grad_quality.py | 64 ++ ep_run/holo_ep.py | 332 ++++++ ep_run/jnc_scaling.py | 46 + ep_run/knockout_s3200.py | 27 + ep_run/local_layers.py | 305 ++++++ ep_run/lt_ep_anderson.py | 54 + ep_run/lt_ep_attention.py | 129 +++ ep_run/lt_ep_compare.py | 69 ++ ep_run/lt_ep_diag.py | 57 + ep_run/lt_ep_ffn.py | 119 +++ ep_run/lt_ep_stack.py | 165 +++ ep_run/lt_ep_train.py | 630 +++++++++++ ep_run/mdpi_paper.html | 10 + ep_run/model.py | 156 +++ ep_run/model_local.py | 470 +++++++++ ep_run/oracle_adjoint_train.py | 368 +++++++ ep_run/prepare_tinystories.py | 40 + ep_run/prepare_tinystories_bpe.py | 49 + ep_run/probe_geometry.py | 162 +++ ep_run/profile_ep.log | 21 + ep_run/profile_ep.py | 40 + ep_run/ra_mlp.py | 287 +++++ ep_run/rearm_203.sh | 8 + ep_run/redx_freezer.py | 19 + ep_run/redx_freezer2.py | 21 + ep_run/redx_trajprobe.py | 47 + ep_run/resreg_probe.py | 62 ++ ep_run/resreg_warm_probe_loop.py | 49 + ep_run/sample_eq.py | 70 ++ ep_run/scurria_nonconservative.txt | 1708 ++++++++++++++++++++++++++++++ ep_run/solver_wall.py | 61 ++ ep_run/spec_bifurcation.py | 38 + ep_run/spec_check.py | 15 + ep_run/spec_rho_vs_c.py | 29 + ep_run/speed_probe.py | 63 ++ ep_run/stiefel_feedback.py | 126 +++ ep_run/t2fix_freezer.py | 21 + ep_run/t2fix_rho_prober.py | 52 + ep_run/test_aselect_deepdive.py | 323 ++++++ ep_run/test_compile_aselect.py | 23 + ep_run/track_probe.py | 77 ++ ep_run/train.py | 183 ++++ ep_run/train_local.py | 300 ++++++ ep_run/train_local_ce.py | 580 ++++++++++ ep_run/train_recon.py | 322 ++++++ ep_run/train_stiefel.py | 211 ++++ ep_run/verify_aep_manual.py | 62 ++ ep_run/watch_all.sh | 15 + ep_run/watch_clean.py | 36 + ep_run/watch_contraction.py | 41 + ep_run/watch_hr.py | 33 + ep_run/watch_runs.py | 39 + 110 files changed, 16378 insertions(+) create mode 100644 ep_run/CODEX_VERDICT.md create mode 100644 ep_run/EP_DIAGNOSIS_DOSSIER.md create mode 100644 ep_run/FUGU_OPTIONS_VERDICT.md create mode 100644 ep_run/FUGU_Q1_VERDICT.md create mode 100644 ep_run/FUGU_Q_OPTIONS.md create mode 100644 ep_run/FUGU_VERDICT_FULL.md create mode 100644 ep_run/GPT55_BUG_HUNT.md create mode 100644 ep_run/adaptive_eps_calib.py create mode 100644 ep_run/adaptive_eps_calib2.py create mode 100755 ep_run/alert.sh create mode 100644 ep_run/analogET_extracted.txt create mode 100644 ep_run/analyze.py create mode 100644 ep_run/analyze_all.py create mode 100644 ep_run/analyze_ln_jacobian.py create mode 100644 ep_run/analyze_softmax_jacobian.py create mode 100644 ep_run/anderson_control.py create mode 100644 ep_run/asym_probe.py create mode 100644 ep_run/auto_probe.py create mode 100644 ep_run/bench_gpu2.py create mode 100644 ep_run/bf16_dbg.py create mode 100644 ep_run/bf16_dbg2.py create mode 100644 ep_run/bias_var.py create mode 100644 ep_run/bp_charlm.py create mode 100644 ep_run/compile_bench.py create mode 100644 ep_run/cos_monitor.py create mode 100644 ep_run/cos_sweep.log create mode 100644 ep_run/cos_sweep.py create mode 100644 ep_run/data_prep.log create mode 100644 ep_run/diag_cos.py create mode 100644 ep_run/drift_diag.py create mode 100644 ep_run/eig_control.py create mode 100644 ep_run/eig_jacreg.py create mode 100644 ep_run/eig_probe.py create mode 100644 ep_run/ep_ajr_check.py create mode 100644 ep_run/ep_c3_watch.py create mode 100644 ep_run/ep_c_check.py create mode 100644 ep_run/ep_eps05_grid.py create mode 100644 ep_run/ep_eps05_track.py create mode 100644 ep_run/ep_eps05_track2.py create mode 100644 ep_run/ep_eps05_watch.py create mode 100644 ep_run/ep_fast_check.py create mode 100644 ep_run/ep_fast_timing.py create mode 100644 ep_run/ep_jacreg_binary.py create mode 100644 ep_run/ep_jacreg_grid.py create mode 100644 ep_run/ep_jacreg_spike.py create mode 100644 ep_run/ep_resreg_check.py create mode 100644 ep_run/ep_resreg_grid.py create mode 100644 ep_run/ep_rr_check.py create mode 100644 ep_run/ep_sn_monitor.py create mode 100644 ep_run/ep_t2fix_watch.py create mode 100644 ep_run/epmc.json create mode 100644 ep_run/eps_sweep_s3200.py create mode 100644 ep_run/eval_relax_s3200.py create mode 100644 ep_run/extracted_paper.txt create mode 100644 ep_run/factorized_exit.py create mode 100644 ep_run/fast_probe.py create mode 100644 ep_run/gcalib.py create mode 100644 ep_run/gen_ept.py create mode 100644 ep_run/grad_quality.py create mode 100644 ep_run/holo_ep.py create mode 100644 ep_run/jnc_scaling.py create mode 100644 ep_run/knockout_s3200.py create mode 100644 ep_run/local_layers.py create mode 100644 ep_run/lt_ep_anderson.py create mode 100644 ep_run/lt_ep_attention.py create mode 100644 ep_run/lt_ep_compare.py create mode 100644 ep_run/lt_ep_diag.py create mode 100644 ep_run/lt_ep_ffn.py create mode 100644 ep_run/lt_ep_stack.py create mode 100644 ep_run/lt_ep_train.py create mode 100644 ep_run/mdpi_paper.html create mode 100644 ep_run/model.py create mode 100644 ep_run/model_local.py create mode 100644 ep_run/oracle_adjoint_train.py create mode 100644 ep_run/prepare_tinystories.py create mode 100644 ep_run/prepare_tinystories_bpe.py create mode 100644 ep_run/probe_geometry.py create mode 100644 ep_run/profile_ep.log create mode 100644 ep_run/profile_ep.py create mode 100644 ep_run/ra_mlp.py create mode 100644 ep_run/rearm_203.sh create mode 100644 ep_run/redx_freezer.py create mode 100644 ep_run/redx_freezer2.py create mode 100644 ep_run/redx_trajprobe.py create mode 100644 ep_run/resreg_probe.py create mode 100644 ep_run/resreg_warm_probe_loop.py create mode 100644 ep_run/sample_eq.py create mode 100644 ep_run/scurria_nonconservative.txt create mode 100644 ep_run/solver_wall.py create mode 100644 ep_run/spec_bifurcation.py create mode 100644 ep_run/spec_check.py create mode 100644 ep_run/spec_rho_vs_c.py create mode 100644 ep_run/speed_probe.py create mode 100644 ep_run/stiefel_feedback.py create mode 100644 ep_run/t2fix_freezer.py create mode 100644 ep_run/t2fix_rho_prober.py create mode 100644 ep_run/test_aselect_deepdive.py create mode 100644 ep_run/test_compile_aselect.py create mode 100644 ep_run/track_probe.py create mode 100644 ep_run/train.py create mode 100644 ep_run/train_local.py create mode 100644 ep_run/train_local_ce.py create mode 100644 ep_run/train_recon.py create mode 100644 ep_run/train_stiefel.py create mode 100644 ep_run/verify_aep_manual.py create mode 100755 ep_run/watch_all.sh create mode 100644 ep_run/watch_clean.py create mode 100644 ep_run/watch_contraction.py create mode 100644 ep_run/watch_hr.py create mode 100644 ep_run/watch_runs.py (limited to 'ep_run') diff --git a/ep_run/CODEX_VERDICT.md b/ep_run/CODEX_VERDICT.md new file mode 100644 index 0000000..7d29c39 --- /dev/null +++ b/ep_run/CODEX_VERDICT.md @@ -0,0 +1,151 @@ +# CODEX VERDICT: EP below-2.10 divergence + +## Ruling + +Verdict: (b) STRUCTURAL. + +Converging the EP adjoint phase is necessary for a correct equilibrium-gradient estimate, but it is not sufficient to make this training problem behave like BPTT. The exact equilibrium gradient differentiates `L(z*)`. BPTT differentiates the actual deployed computation `L(z_T)` with `T=150`. Those are different objectives whenever convergence is not effectively complete. The missing term is the finite-horizon residual/contraction term. No `t2sel` or `hr` knob can add that term to the exact fixed-point gradient. + +## Fact-set check + +1. Correct in substance. `runs/bptt_clean.log` reaches best val CE `1.8277` and keeps the finite-`T1` residual small, around `4e-4` to `1e-3` late in training. The failure is EP-specific in the comparable EP logs. The `rho~0.982` value is referenced by the rho prober header and scripts, but the requested log set does not contain the full `spec_bifurcation.py` output, so the CE/residual part is directly verified and the exact rho number is not independently reprinted in the available logs. + +2. Correct. `runs/t2_sweep.log` shows `cos(g_EP,g_transpose)` rising from `0.742413` at `t2sel=10` to `0.998194` at `t2sel=160`. `runs/hr_ceiling_sweep.log` shows the remaining gap flat across `hr=0.04..0.8` at about `0.94..0.946`. That is adjoint-phase truncation, not beta-radius tuning. The code agrees: `lt_ep_train.py` calls `holo_a_track`/`holo_a_select2` with a fixed `t2sel`, and `holo_ep.py` selects a finite nudged snapshot rather than enforcing an adjoint residual. + +3. Correct. The same sweep reports `cos(g_transpose,g_BPTT)=0.974976` at free-phase step residual `2.17e-5`. `diag_probe.log` shows the exact fixed-point regime, residual around `1e-9`, where `cos(g_transpose,g_BPTT)=1.000000`. This is the finite-horizon/free-residual gap. + +4. Correct in causal direction, with one evidence caveat. `ep_redx.log` shows the sudden event: CE goes from `2.7417` at step 3200 with residual `2.5e-2` to CE `41` with residual `1.1e-1` at step 3300. `redx_traj.log` shows EP gradient quality degrading and residuals growing along the approach. The rho drift and damping-resistant `c` sweep are encoded in `spec_bifurcation.py`, `spec_rho_vs_c.py`, and cited in `t2fix_rho.log`, but the actual c-sweep output is not present in the requested logs. The important point remains: the finite-`T1` residual becomes hypersensitive near marginality. + +5. Correct as the root cause, but not yet empirically closed for `t2sel=160`. `runs/t2fix_rho.log` only has steps 100, 200, 300 at CE about `6`, with `rho~0.794`; it does not prove that `t2sel=160` will pass or fail near CE 2.x. The math below decides the open question: exact equilibrium gradients still optimize the wrong objective for finite-time deployment. + +## Why the exact equilibrium gradient lacks contraction defense + +Let the relaxation map be + +```text +Phi_theta(z) = z + eps F_theta(z) +``` + +and let `z*` satisfy `F_theta(z*) = 0`. The equilibrium objective is + +```text +J_inf(theta) = L(z*(theta)). +``` + +Differentiating the fixed-point equation gives + +```text +F_z dz*/dtheta + F_theta = 0 +dz*/dtheta = -F_z^{-1} F_theta +``` + +Equivalently, solve the equilibrium adjoint + +```text +F_z^T lambda = -L_z(z*) +grad_theta J_inf = L_theta + lambda^T F_theta. +``` + +That is exactly what the EP/AEP estimator is trying to approximate. It contains `F_z^{-1}`, so slow modes can amplify loss sensitivity. But it contains no term for the finite relaxation length, no `T`, no initial residual, no `Phi^T`, and no derivative of `rho(Phi_z)` unless changing that contraction also changes `z*` or the equilibrium loss. A parameter that changes the convergence rate while leaving the fixed point and readout loss unchanged has zero exact equilibrium gradient. + +Scalar counterexample: + +```text +F_k(z) = -k (z - z*) +Phi_k(z) = z + eps F_k(z) +L = L(z*) +``` + +For any positive `k`, the fixed point is the same. Therefore + +```text +d L(z*) / d k = 0. +``` + +But the finite state is + +```text +z_T - z* = (1 - eps k)^T (z_0 - z*), +``` + +so + +```text +d L(z_T) / d k +``` + +contains a term proportional to + +```text +T (1 - eps k)^(T-1). +``` + +That is exactly the contraction-defense term. It is large near `rho=1`, precisely where `rho^150` becomes explosive. It vanishes only in the true infinite-time limit when `rho<1` with enough margin. + +For the full model, BPTT differentiates + +```text +z_{t+1} = Phi_theta(z_t) +grad_theta L(z_T) + = L_z(z_T)^T sum_{k=0}^{T-1} + (prod_{s=k+1}^{T-1} Phi_z(z_s)) eps F_theta(z_k) + + direct terms. +``` + +Those products are the same objects that determine finite-time contraction. When they decay slowly, the finite-horizon gradient feels it. Equilibrium EP replaces this whole finite product chain with the fixed-point inverse at `z*` and takes `T=infinity`; the transient residual term is gone. + +The code implements this split exactly. In `lt_ep_train.py`, `ep_step` relaxes to `T1`, optionally refines beyond `T1`, and computes the task gradient at `zs` through `(a * f).sum()`. `bptt_step` unrolls exactly `T1` steps and differentiates `ce(blk, z, y)` at the final unrolled state. `evaluate()` also uses exactly `T1` relaxation steps. Therefore BPTT is optimizing the evaluated computation and EP is optimizing the refined fixed-point computation. + +## Consequence + +The exact equilibrium adjoint can be correct and still push into a marginal operator, because the equilibrium objective is indifferent to settling time except through its effect on `z*`. The evidence that EP can reach `cos(g_EP,g_transpose)=0.998` only proves that EP can compute the fixed-point gradient. It does not prove that the fixed-point gradient contains BPTT's finite-horizon stabilizer. It does not. + +So the fix is not "set `t2sel=160` and call the adjoint converged." That removes one estimator error. It does not change the target objective. If the deployed model is `T1=150`, the training signal must include finite-horizon dynamics or an explicit contraction objective. + +## Local forward-only fix + +This is fundamental for pure equilibrium EP on `L(z*)`, but not fundamental for local forward-only learning if the objective is changed. + +Concrete construction: finite-horizon forward-mode/RTRL eligibility training for `L(z_T)` plus, if needed, a local contraction penalty. + +Run the physical relaxation forward for `T=150`. Alongside the state, propagate local eligibility traces: + +```text +e_{t+1}^{(p)} = Phi_z(z_t) e_t^{(p)} + eps dF_theta(z_t)/dp +``` + +At `T`, form the local three-factor update + +```text +Delta p proportional to - L_z(z_T)^T e_T^{(p)}. +``` + +This is forward-mode differentiation of the actual finite unroll. It is not reverse BPTT, and it is not a digital root finder. Exact per-parameter RTRL is expensive; practical versions use blockwise, low-rank, or random-direction eligibility traces. But this is the correct class of construction because it preserves the finite product terms that defend contraction. + +If hardware or cost makes forward-mode eligibility too expensive, the alternative local objective is an explicit contraction homeostat: + +```text +R_contr = E_v sum_t log( ||Phi_z(z_t) v_t|| / ||v_t|| ) +``` + +estimated with two nearby physical trajectories or JVP hardware, or a hard monotone/contractive parameterization enforcing a negative log norm. This changes the objective/architecture. It is a valid local fix, but it is not "better EP gradient quality." + +## Single decisive experiment + +Run oracle exact-equilibrium-adjoint training, not merely deeper EP, from the same pre-drift checkpoint and with `resreg=0`. + +At every update: + +1. Relax/refine to `z*`. +2. Solve the exact adjoint `F_z(z*)^T lambda = -L_z(z*)` by GMRES or an equivalent oracle. +3. Apply `grad_theta = L_theta + lambda^T F_theta`. +4. Track `cos(oracle,g_transpose)`, finite-`T1` residual, and free-phase rho on the fixed validation batch every 100 steps. + +Decision rule: + +```text +If oracle equilibrium-adjoint training keeps rho near the BPTT value and clears the wall, (a). +If it still drifts rho toward 1 and blows while cos(oracle,g_transpose) is near 1, (b). +``` + +My ruling is that the second outcome will occur. The exact equilibrium gradient is the wrong gradient for the finite-150-step computation; it cannot contain the missing finite-horizon contraction-defense term by construction. diff --git a/ep_run/EP_DIAGNOSIS_DOSSIER.md b/ep_run/EP_DIAGNOSIS_DOSSIER.md new file mode 100644 index 0000000..22308fa --- /dev/null +++ b/ep_run/EP_DIAGNOSIS_DOSSIER.md @@ -0,0 +1,99 @@ +# EP below-CE-2.1 divergence — complete diagnosis dossier (2026-06-22, CORRECTED) + +## Setup +Equilibrium transformer block: fixed point of a damped relaxation `z ← z + ε·F(z)`, ε=0.1, +`F(z) = x_in − (1+c)z + Attn(LN z) + FFN(LN z)`, c=1. Attention is **non-conservative** (independent +WQ,WK,WV,WO; qknorm RMSNorms q,k). Untied 4× GELU FFN. Trained **backprop-free** with **AsymEP** +(Scurria 2602.03670: nudged dynamics get `−2A_J(x⁰)(x−x⁰)`, making the nudged Jacobian = Jᵀ at the +free equilibrium). Code: `lt_ep_train.py` (`force`/`tforce`:81-106, `relax`:123, `ep_step`:140), +`holo_ep.py` (holomorphic estimator). Eval/BPTT use the T1=150 relaxed state. + +**SYMPTOM:** EP descends, then **suddenly** diverges below CE≈2.1 (e.g. val 2.74 → 41 within ~100 steps, +T1-residual 2.5e-2 → 0.42). Exact **BPTT on the identical model trains cleanly to CE 1.83.** + +## CORRECTED diagnosis (measured this round — supersedes earlier framings) + +**Fact 1 — it is a forward LIMIT CYCLE, there is no fixed point at the diverging operator.** +`eval_relax` on the marginal ckpt redx **s3200** (val 2.74, just before the blowup): relax from the +embedding for **6000 steps** → +`res(t): 50→3.6e-2, 150→2.3e-2, 500→2.5e-2, 1000→2.5e-2, 2000→2.3e-2, 4000→2.3e-2, 6000→2.4e-2`, +tail(last 1000) min 2.08e-2 / max 2.73e-2, **non-monotone**. It **floors ~2.3e-2 and oscillates — never +reaches a fixed point.** (Reproduces an earlier lost-run finding: limit cycle, FTLE<0.) + +**Fact 2 — the cycle is driven by the non-conservative ATTENTION.** +Knockout: scale the attention output (`WO ×= α`), eval_relax 3000 steps: +``` +α=1.0: res-floor 2.5e-2, osc 6.0e-3 CYCLE +α=0.7: res-floor 1.6e-2, osc 3.0e-3 CYCLE (smaller) +α=0.4: res-floor 4.1e-3, osc 5.3e-4 nearly gone +α=0.2: res-floor 3.2e-4 CONVERGED (true fixed point restored) +α=0.0: res-floor 1.3e-3, osc 1.2e-3 tiny FFN-only cycle +``` +Reducing the attention monotonically shrinks the cycle and restores convergence. **The attention's +non-conservativity drives the limit cycle.** + +**Fact 3 — hypothesis: a Hopf-type bifurcation.** A relaxation `z←z+εF(z)` (map `M=I+εJ`) can only +*oscillate* (limit cycle) if `M` has a **complex eigenvalue pair crossing |λ|=1**. A symmetric/conservative +J has real eigenvalues → monotone convergence or blow-up, never a cycle. As EP training grows the attention +asymmetry/gain, a complex pair crosses → Hopf → limit cycle → readout of a cycle-point degrades → CE explodes. + +## RETRACTED framings (do not anchor on these) +- codex's "(b) structural: equilibrium gradient L(z\*) blind to contraction → forward-mode/RTRL fix" + — **assumed a fixed point z\* exists.** It does not at the diverging operator (limit cycle). The scalar + counterexample (param changing convergence rate but not z\*) is moot when z\* doesn't exist. +- "ρ drifts to 0.998 / slow convergence" — was a **transient artifact** of a ρ-probe window (caught the + initial 3.6e-2→2.3e-2 decay, missed the floor+oscillation). + +## Still-valid facts (about the GRADIENT estimator — separate axis from the forward cycle) +- BPTT (exact grad) → CE 1.83, converges; its trajectory does NOT drive the attention into the cycling regime. +- AsymEP gradient is accurate WHEN a converged fixed point exists: cos(g_EP, exact-adjoint)=0.99 at hr=0.2, + res 1e-9; the "0.94 ceiling" was nudged/adjoint-phase truncation (cos rises 0.74→0.998 as nudge-depth + t2sel 10→160). i.e. the estimator is fine *given a fixed point* — but at the diverging state there is none. + +## AEP paper (Scurria 2602.03670) context +- AsymEP is exact AT the **stationary state** (needs convergence). Appendix G.3 explicitly treats the + **stability of non-conservative dynamics** — they can **oscillate** — controlled by the **asymmetry ratio + r_str** (Eq 37-38: `J = γ(√(1−r_str²)·S̃/‖S̃‖ + r_str·Ã/‖Ã‖)`) + **gain γ** + conservative init + `Var[J]∝1/N`. "AsymEP reduces oscillations." + +--- + +## Q1 (THIS query) — CONFIRM THE MECHANISM +Is the divergence a **non-conservative Hopf bifurcation**: the attention's antisymmetric part A drives a +**complex conjugate eigenvalue pair of the relaxation map M=I+εJ across |λ|=1**, producing the forward limit +cycle (Facts 1-3)? +1. Is the evidence (limit cycle in Fact 1 + the attention-scaling knockout in Fact 2) **conclusive** for a + Hopf bifurcation, or what is the gap / what alternative (e.g. real-eigenvalue saddle-node, a 2-cycle from + the discrete Euler step εF, an FFN contribution) is not yet excluded? +2. What is the **single cleanest measurement** to nail it — e.g. compute the eigenvalues of `M=I+εJ` at + s3200 (is there a complex pair with |λ|≥1, vs a real λ≥1)? a Floquet/period analysis of the cycle? an + ε-sweep (does shrinking ε convert the cycle to convergence — distinguishing a continuous-time vs + discrete-Euler instability)? +3. Verify the mechanism against the actual `lt_ep_train.py` force/relax code. + +## Q2 — THE FIX +Given Q1 (a Hopf bifurcation from the attention's non-conservativity), what is the best way to keep the +operator **below** the bifurcation (so a fixed point exists and AsymEP is valid) while preserving as much +attention expressivity as possible? Candidates: (a) **adaptive asymmetry penalty** (our `jacreg` penalizes +‖J_nc‖≈‖A‖, ramped on the residual/cycle onset; the validated 2.40 runs used this, the diverging runs froze +it weak); (b) **structural r_str-style parameterization** (bound the antisymmetric part by construction, +paper Eq 38); (c) **gain control** (γ scaling / qknorm — bound the spectral gain); (d) a **direct +cycle-amplitude / log-norm μ_P(J) penalty**. Which is most effective AND analog-realizable (forward-only, +local)? Give a concrete recipe. + +## Q3 — THE THESIS +Can a non-conservative attention stay **sub-Hopf** (no limit cycle) AND be expressive enough for coherent +language, or is there a **fundamental expressivity-vs-stability tradeoff** (the expressivity needs +asymmetry/gain that triggers the bifurcation)? Estimate the bifurcation threshold (in r_str/γ terms) for this +architecture and whether the sub-threshold regime suffices for an LM. Is a hybrid (bounded-asymmetry core + +thin correction) the realistic ceiling? + +## Q4 — EQUILIBRIUM vs NON-EQUILIBRIUM PRIMITIVE +AsymEP requires a **stationary state**, which does not exist in the limit cycle. Two routes: (i) keep the +operator below the Hopf (fixed point exists → AsymEP exact), accepting the expressivity bound; (ii) **embrace +the non-equilibrium** (limit-cycle) computation with a learning rule native to it (oscillatory / reservoir / +time-averaged). Which is the right primitive for analog hardware, and is (ii) even tractable with a local +forward-only rule? + +--- +Answer **Q1 → Q2 → Q3 → Q4 in order**, each rigorously and grounded in the code/data. Be decisive. diff --git a/ep_run/FUGU_OPTIONS_VERDICT.md b/ep_run/FUGU_OPTIONS_VERDICT.md new file mode 100644 index 0000000..4e3ed25 --- /dev/null +++ b/ep_run/FUGU_OPTIONS_VERDICT.md @@ -0,0 +1,263 @@ +# FUGU_OPTIONS_VERDICT — Q1–Q3 (independently verified) + +Scope: answers grounded in `lt_ep_train.py` (`force`/`tforce` :81-106, `relax` :123-133, +`ep_step` :140-232, `jacreg` :211-219, weight caps :52-53/398-399/563-567), `holo_ep.py`, +the calibration probes (`adaptive_eps_calib.py`, `adaptive_eps_calib2.py`, `eps_sweep_s3200.py`, +`jnc_scaling.py`, `lt_ep_anderson.py`), and the diagnosis dossiers. Each claim is flagged +**[SOLID]** (proved by code/data in repo) or **[UNCERTAIN]** (reasoned, not measured here). + +--- + +## Shared mechanism (the object all three questions act on) + +**[SOLID]** The active free relaxation is explicit (forward) Euler: +`z = z + eps * blk.force(z, xin).detach()` (`relax`, :123-133). In thick mode the force is +`F(z) = -(z - xin) + Attn(LN1 z) + FFN(LN2 z) - c*z` (`tforce`/`force` :81-85, :102-106), c=1. +So the per-step linear stability object is the **discrete map** `M = I + eps*J`, `J = dF/dz`. + +**[SOLID]** For a continuous eigenvalue `mu = a + i b` of `J`, the Euler multiplier is +`lambda = 1 + eps*mu`, and the map is stable iff `|1+eps*mu| < 1`, i.e. +`eps < eps_crit = -2a/(a^2 + b^2)` for `a < 0`. A continuous-STABLE rotating mode (`a<0`, `b` large) +is destabilized purely by too-large `eps`. + +**[SOLID]** The ε-monotonicity training data are decisive that this is an *integration* wall, not a +*gradient-quality* wall: eps=0.1 blew @ CE 2.74; eps=0.1 with a strictly better gradient (t2sel=160, +cos 0.998) blew EARLIER @ 3.02; eps=0.05 reached 2.41 before blowing. Better gradient → not later but +earlier; smaller step → strictly lower wall. That is exactly the `|1+eps*mu|>1` signature. + +### One correction to the dossier's "continuous/analog is stable at s3200" framing +**[SOLID — verified, refines prior verdict]** The eps-sweep "CONVERGED at eps=0.01" is measured with a +*different residual* than the cycle floor. `eps_sweep_s3200.py:17` reports the **step** residual +`r = ‖z2-z‖/‖z‖ = eps·‖F‖/‖z‖`; `adaptive_eps_calib.py:15` reports the **force** residual +`g = ‖F‖/‖z‖`. At eps=0.01 the sweep's `r≈8.9e-4` is just `0.01 × 0.089` — i.e. the *same* force-floor +`g≈0.09` that is called a "cycle" at eps=0.1. `FUGU_Q_OPTIONS.md` itself flags this: +"s3200 g floors ~0.09 even at tiny ε (genuinely no fixed point at the marginal op, OR just slow +finite-step convergence — ambiguous)." +**Implication:** the eps-sweep robustly proves *the oscillation/blow-up is a discrete-Euler artifact* +(the cycle amplitude dies as eps→0). It does **not** by itself prove the s3200 operator has a true +attracting fixed point (g→0) in continuous time — the force floor g≈0.09 persists. The clean +continuous-stable case is s2000 (g→0). So "analog HW would have no problem" is **[SOLID]** for the +*oscillatory blow-up* but **[UNCERTAIN]** for "s3200 settles to a usable equilibrium." The decisive +missing measurement remains the leading eigenpair of `J`/`M` at a continued fixed-point branch +(sign of `Re mu`). + +--- + +## Q1 — Evaluate (a) adaptive ε, (b) jacreg, (c) smaller fixed ε + +**Bottom line:** +- **(c) smaller fixed ε — RELOCATES the wall. [SOLID]** Already shown empirically (2.74→2.41). +- **(b) jacreg — RAISES/RELOCATES the wall from the model side. [SOLID it raises eps_crit; UNCERTAIN whether it can eliminate]** It lifts `eps_crit` by cutting `|Im mu|`/gain, but at fixed ε it is still a wall in `eps_crit`-space; it also taxes the expressivity it suppresses. +- **(a) adaptive ε — ELIMINATES the fixed-ε wall *iff* its floor stays below the instantaneous `eps_crit`; otherwise it degenerates to (c). [SOLID for the mechanism; the guarantee is conditional]** + +### Ranking +**To remove the measured software wall while preserving the model and the analog target:** +1. **Adaptive ε / robust solver** — only option that removes the *fixed-step* wall with **zero model/expressivity cost** and **zero change to the analog target**. It is pure integration-axis. +2. **jacreg** — effective secondary homeostat; raises `eps_crit`, but changes the learned operator and can cap the non-normality the good (BPTT-1.83) solution uses. +3. **smaller fixed ε** — diagnostic/fallback only; permanently pays the small-step cost on *every* example (including smooth ones) and still fails once stiffening passes the new floor. + +**For the analog (continuous) target specifically:** adaptive ε and smaller fixed ε are *emulator* +choices that leave the model identical to what analog HW runs — they are the right kind of fix. +jacreg *changes the model that analog HW would run* (see Q2). + +### (a) Adaptive ε — grounded in code +**[SOLID]** `adaptive_eps_calib2.py` uses the correct signal: shrink only on **overshoot** +(`g_t > prev*tol` → `eps*=down`), grow otherwise. The naive `adaptive_eps_calib.py` controller +(shrink on slow contraction) is shown to mis-park ε at the floor on all ops — it conflates small-ε's +slow contraction with instability. The corrected controller behaves as a continuous-relaxation +emulator: stiff s3200 → ε to 0.003-0.008; smooth s2000 → ε grows toward 0.1 and reaches g=0. + +### Is adaptive ε *guaranteed* to eliminate the wall? — the eps_min question +**[SOLID, decisive]** No, not unconditionally. With a hard floor `eps_min`, adaptive ε eliminates the +wall only while `eps_min < eps_crit = -2a/(a^2+b^2)`. If training keeps stiffening the rotating mode so +`eps_crit` falls below `eps_min`, adaptive ε becomes a fixed small step at the floor — i.e. it +**degenerates into option (c) and merely relocates the wall.** So the guarantee is conditional on the +floor, and equivalently on whether `eps_crit` (hence `|Im mu|`) is bounded away from where the floor +sits. + +### Does |Im μ| (b) saturate or grow unboundedly as CE drops? +This is the crux, and the honest answer is split: + +- **[SOLID] There IS structural stiffness-bounding machinery in the code that argues for saturation.** + (i) `qknorm` RMSNorms q,k → softmax logits are bounded regardless of weight growth (`attn` :63-67); + (ii) **weight-norm caps**: `capw = {WQ,WK,WV,WO,Wm,Wh,fc,pj}` are each projected back to + `capx × initial-norm` every optimizer step (`:52-53`, `:398-399`, `:563-567`); (iii) damping `c=1` + gives a passive `-(1+c)z = xin-2z` contraction floor; (iv) LayerNorm bounds input scale into attn/FFN; + (v) weight decay. With qknorm + capped projections + LN, the per-matrix gains feeding `J_nc` cannot + grow without bound, which bounds `|Im mu|` and therefore keeps `eps_crit` bounded **below**. This is a + genuine reason to expect `|Im mu|` to **saturate** (or at least be bounded) rather than diverge. + +- **[SOLID, opposing data point] But within the *observed* range stiffness was still rising:** fixed + ε=0.1→0.05 moved the wall 2.74→2.41 rather than removing it, i.e. `eps_crit` was still falling across + that CE interval. So saturation, if it exists, had not yet bitten in the measured window. + +- **[UNCERTAIN] No direct eigenvalue/`|Im mu|`-vs-CE trace exists in the repo.** `jnc_scaling.py` + measures `‖J_nc‖` growth-per-step vs width but is not a CE-resolved `|Im mu|` curve. So whether `b` + truly plateaus before `eps_crit` reaches a practical `eps_min` is **not measured**. + +**Synthesis (decisive, hedged correctly):** adaptive ε is the best wall-eliminator and the only +zero-tax, analog-faithful one — **and** the code's caps/qknorm/damping make it *likely* that `|Im mu|` +is bounded, so a sufficiently small `eps_min` should eliminate (not merely relocate) the wall in +practice. But this is a *bounded-floor* guarantee, not an unconditional one: if `|Im mu|` were to grow +without bound, any finite `eps_min` is eventually a wall. **Recommended:** make the floor itself +log an `eps_crit` proxy (overshoot persisting at the floor) and either drop the floor, reject the step, +or hand off to Anderson — i.e. fail-open rather than fail-into-(c). + +--- + +## Q2 — The jacreg paradox + +**Verdict: no paradox. jacreg works by RAISING `eps_crit` from the model side — it fixes the SAME +discretization wall, not a demonstrated continuous-time instability. Relative to adaptive ε it is a +sim-crutch for the measured failure, but it carries a *separate, real* analog benefit (settling +quality), and it would become a genuine fix if a true continuous instability (Re μ≥0) ever emerges.** + +### Why a model-side stiffness penalty fixes a simulation artifact — mechanism +**[SOLID]** `jacreg` is a Hutchinson JVP penalty `R = jacreg·‖J_nc·er‖²/‖er‖²` (`:211-219`), and in thick +mode `nc_force = Attn + FFN` (`:92-97`). Minimizing `‖J_nc‖` reduces the learned non-conservative +gain, which reduces the rotating component `|b|=|Im mu|` (and non-normal amplification). Since +`eps_crit = -2a/(a^2+b^2)`, smaller `|b|` → **larger** `eps_crit` → fixed ε=0.1 stays under the +Euler-stability boundary longer. So jacreg moves the *same* `|1+eps*mu|=1` wall by shrinking `b`, while +adaptive ε moves the *same* wall by shrinking `eps`. Two knobs on one inequality. + +### Raising eps_crit vs fixing a continuous-time problem +**[SOLID for measured regime]** For s3200-type failures the relevant mode has `Re mu < 0` (the cycle +dies as eps→0). There is no *established* continuous instability to fix, so jacreg's contribution there +is purely "raise eps_crit" — discretization-wall relief from the model side. +**[UNCERTAIN beyond it]** If training ever drives `Re mu → 0⁺` (a true Hopf), then no integrator +(adaptive ε, implicit, Anderson) can stabilize the original continuous equilibrium; only a model-side +change (jacreg, stronger damping/c, structural monotonicity, gain/asymmetry bounds) is a real fix. +jacreg is the insurance policy for that case. + +### Does the benefit transfer to analog hardware? — two benefits, separated +**[SOLID] (i) The "prevents eps=0.1 Euler blow-up" benefit does NOT transfer.** Analog HW has no `eps` +and does not iterate `z←z+εF`; it performs continuous relaxation. If `Re mu<0`, the analog ODE is +stable and never had this wall. To the extent jacreg only buys eps_crit headroom, it is papering over a +sim artifact analog wouldn't have — a crutch. + +**[SOLID/UNCERTAIN-magnitude] (ii) The "less stiff/less ringy continuous dynamics" benefit DOES +transfer.** Even with `Re mu<0`, a large `|Im mu|` mode has a poor damping ratio: it rings, settles +slowly, demands more bandwidth, longer observation/integration windows, and is more noise/delay +sensitive — all of which degrade the *physical* free-phase settling and the readout of nudged +equilibria on analog HW. Reducing `‖J_nc‖` improves the continuous damping ratio. So jacreg is *also* a +legitimate analog settling/robustness regularizer. **[UNCERTAIN]** the size of this analog benefit is +not measured here. + +### Real fix or sim-crutch, relative to adaptive ε? +**[SOLID]** For the *confirmed explicit-Euler artifact*: +- **adaptive ε / Anderson / implicit = the real fix of the emulator** — they preserve the learned + vector field and make the digital sim stop inventing a cycle the analog system wouldn't have. +- **jacreg = a model-changing crutch for that artifact**, but simultaneously a *real* (if secondary) + analog settling regularizer and the *only* lever if a genuine continuous instability appears. + +**Recommended composition (not "either/or"):** (1) use adaptive ε / a real solver as the primary +emulator fix so the sim is faithful; (2) keep jacreg as a **bounded, adaptive** homeostat +(the controller already exists, `:520-529`) sized for analog settling-time/robustness or true +marginality — NOT as a strong fixed penalty that taxes the non-normality the BPTT-1.83 solution needs. +The historical evidence fits this: the validated ~2.40 runs used *adaptive* jacreg; the diverging runs +*froze it weak* — i.e. they removed the homeostat, not the integrator. + +--- + +## Q3 — Anderson acceleration / implicit (IMEX) integrators + +**Verdict: Yes — they can replace explicit Euler as the *solver* and kill the discretization +instability, and they are compatible with AsymEP *provided they converge to the same equilibria of the +same vector fields*. They change nothing about the analog model; they are emulator-fidelity choices. +Implicit Euler is unconditionally stable but per-step expensive (the solve is itself a relaxation). +Anderson is the more practical lever: it both accelerates and can suppress the Euler cycle when a true +fixed point exists, but it is not guaranteed and needs damping/restarts/residual gating.** + +### (i) Compatibility with AsymEP +**[SOLID]** The EP estimator depends on the *states*, not on how they were reached. `ep_step` computes +`zs = relax(...)` and treats it as the free equilibrium (`:142-144`); the AsymEP correction uses local +`Jv = jvp(nc_force, zs, v)`, `JTv = vjp(nc_force, zs, v)`, `corr = Jv - JTv` at `zs` (`:172-178`); the +parameter gradient is `(a * f).sum()` with `f = force(zs.detach(), xin, cg=True)` (`:202-205`). None of +this requires explicit Euler — it requires that `zs` is a genuine root `F(zs)≈0` and that the nudged +states are equilibria of the nudged/corrected force. A better solver that returns the *same roots* is +fully compatible, and the `-2A` correction is computed *at* `z*` regardless of the solver that found it. + +**[SOLID — important, refines prior framing] The nudged phase must also be re-solved.** The free phase +is not the only explicit-Euler loop: the nudge (`nudge()` :163-180) and every holomorphic estimator +(`holo_a`, `holo_a_select2`, `holo_a_track`, `holo_a_lockin` in `holo_ep.py`) advance with +`z = z + eps*(f - corr)`. The `-2A` correction lives *inside* these loops. So "swap the integrator" +means swap it in **both** phases; a solver that converges the free `z*` but leaves the nudged phase on +coarse Euler will still corrupt `a = -dz*/dβ`. + +**[SOLID] Hard limit:** if the continuous field has no attracting root in the operating regime, no +solver can manufacture the stationary state AsymEP needs — it will fail, find a spurious root, or +return a numerical artifact. A solver fixes *integration*, not *non-existence of equilibrium*. (This is +why the s3200 force-floor ambiguity from the Shared-mechanism section matters: confirm a true root +exists before trusting AsymEP there.) + +### (ii) Implicit / IMEX — tractable or self-defeating? +**[SOLID, theory]** Backward Euler multiplier is `1/(1-h·mu)`, A-stable: for any `Re mu<0` it is stable +at *every* step size, so it would kill the stiff-rotation Euler cycle outright. +**[SOLID, cost]** Each backward step solves `y - h·F(y) - z_n = 0`, where `F` contains LN, causal +softmax attention, and FFN. A Newton/Krylov/Picard solve needs several force evals and matrix-free +JVP/VJP linear solves over the full `B·T·C` state per step — i.e. **the per-step solve is itself a +relaxation/root-find**, which is the self-defeating risk for a default inner loop. +**[UNCERTAIN/qualitative] IMEX nuance:** making only the cheap leak `-(1+c)z` implicit is trivial but +does **not** tame the dangerous learned rotating attention mode (the danger is in `J_nc`, not the leak); +treating `J_nc` implicitly reintroduces the big linear solve. So implicit/IMEX is best as a **robust +fallback / macro-step / offline reference**, not the default per-step integrator. + +### (iii) Anderson — speed only, or stabilization too? +**[SOLID, conceptual]** Anderson (DEQ-style; `lt_ep_anderson.py` stores recent `X`, `G(X)=z+εF`, solves +a small regularized least-squares for the mixing coefficients, extrapolates) is a quasi-Newton/GMRES-on- +the-residual. For a Picard/Euler map whose oscillatory multiplier sits just outside the unit circle, +the residual-minimizing extrapolation can **suppress the limit cycle**, not merely speed a contracting +one — so it is more than acceleration. `lt_ep_anderson.py` is explicitly framed as exactly this test +("can a fixed-point solver converge the free phase where plain relaxation cannot?"). +**[SOLID, caveats]** Not guaranteed: it cannot create a root that doesn't exist; aggressive mixing can +diverge; it needs damping (β-mixing), restarts, and residual-monotonicity gating; and (per (i)) it must +wrap the nudged phase too. Net: **strongest practical candidate** — cheaper than full implicit Newton, +able to stabilize when a root exists, but must be safeguarded. + +### (iv) Does integrator choice matter for the ANALOG target? +**[SOLID] For the analog model itself: no.** Analog HW performs the true continuous relaxation of `F`; +it runs no explicit Euler, no Anderson, no backward Euler. The integrator is not part of the deployed +computation. +**[SOLID] For digital training/eval of that target: yes, decisively.** Coarse explicit Euler can invent +a limit cycle the analog system would never exhibit, corrupting both the loss and the equilibrium the +EP gradient is taken at. The correct framing — and the right way to state it in the thesis — is exactly: + +> Analog HW does the true continuous relaxation; the simulator only needs a **faithful + cheap emulator** +> of that relaxation. Adaptive ε, Anderson, and implicit/IMEX are all just *better emulators* — they +> change the simulation's fidelity/cost, not the EP objective or the analog primitive. + +The one asymmetry to keep in mind: **jacreg is NOT in this "just a better emulator" bucket** (it edits +the model the analog HW would run), whereas adaptive ε / Anderson / implicit ARE. That is the precise +sense in which the integrator family is the analog-faithful fix and jacreg is the model-side one. + +### Recommended solver strategy +1. Replace fixed ε=0.1 explicit Euler in the **free** phase with an overshoot/step-rejection adaptive + solver (the corrected `adaptive_eps_calib2.py` logic), with a fail-open floor (Q1). +2. Add **damped Anderson with restarts + residual gating** for both free and nudged phases once the + residual stalls/cycles; solve `F=0` rather than running a fixed Euler count and hoping. +3. Keep **implicit/backward Euler as a reference/fallback**, not the default inner loop (per-step cost). +4. Leave **AsymEP unchanged in principle**: find `z*`, find nudged equilibria, apply `Jv-JTv` at `z*`, + and **gate the update** (`res_gate`, `:153-162`) when residual says no stationary state was found. +5. Retain **jacreg as a bounded adaptive homeostat** (analog settling / true-Hopf insurance), not as the + primary fix. +6. For analog claims, report **solver-independent diagnostics**: force residual `‖F(z*)‖/‖z*‖` (NOT just + the eps-scaled step residual — they differ by a factor of eps, which confounded the eps-sweep), and, + when feasible, the leading continuous `mu` (sign of `Re mu`) and settling/ringing time. + +--- + +## Summary table + +| Option | Eliminates or relocates wall | Changes model? | Analog-faithful? | Verdict | +|---|---|---|---|---| +| (a) adaptive ε | Eliminates if eps_min < eps_crit; else relocates | No | Yes (emulator) | **Primary fix** [SOLID mechanism; bounded-floor guarantee] | +| (b) jacreg | Raises eps_crit (relocates in eps_crit-space) | Yes | No for the wall; yes for settling | **Secondary homeostat / crutch + true-Hopf insurance** | +| (c) smaller fixed ε | Relocates only | No | Yes but inefficient | **Diagnostic / fallback** [SOLID] | +| Anderson | Can eliminate cycle if a root exists | No | Yes (emulator) | **Best practical solver, needs safeguards** | +| Implicit/IMEX | Eliminates (A-stable) | No | Yes (emulator) | **Correct but per-step costly; fallback/reference** | + +Key uncertainties flagged: (1) whether `|Im mu|` saturates vs grows as CE drops is **not directly +measured** — code caps/qknorm/damping argue for bounded, but ε=0.1→0.05 data show it was still rising +in-window; (2) whether s3200 has a true continuous fixed point (g→0) vs only a dead oscillation is +**ambiguous** because the eps-sweep's step-residual ≠ force-residual; the clean continuous-stable +evidence is s2000, not s3200. diff --git a/ep_run/FUGU_Q1_VERDICT.md b/ep_run/FUGU_Q1_VERDICT.md new file mode 100644 index 0000000..03c3c8d --- /dev/null +++ b/ep_run/FUGU_Q1_VERDICT.md @@ -0,0 +1,123 @@ +# Q1 verdict — below-CE-2.1 divergence mechanism + +## Bottom line + +**Refute “conclusive” as stated.** The dossier evidence is **strong and code-consistent** evidence that the s3200 free relaxation has lost the usable attracting fixed point and is in an **attention-gain-driven non-conservative oscillatory regime**. It is **not yet conclusive evidence for a Hopf / Neimark-Sacker bifurcation** specifically. + +The missing piece is **local spectral evidence at an actual fixed point / continued fixed-point branch**: a leading **complex conjugate eigenvalue pair** of the relaxation map + +\[ +G(z)=z+\varepsilon F(z), \qquad M = DG(z^*) = I + \varepsilon J, \qquad J=\partial F/\partial z|_{z^*} +\] + +crossing the unit circle, with real eigenvalue instabilities absent. The current data show the attractor and the causal knob, but not the local bifurcation class. + +## 1. Is Fact 1 + Fact 2 conclusive for Hopf? + +**No.** It is conclusive for a narrower statement: + +> At the redx s3200 checkpoint, with `eps=0.1`, the implemented forward relaxation does not converge from the evaluated embedding state to a fixed point within 6000 steps; instead the one-step residual floors around `~2.3e-2` and oscillates. Reducing the attention output gain by scaling `WO` monotonically shrinks this oscillation and restores convergence by about `alpha=0.2`. + +That is highly consistent with a non-conservative attention-driven oscillatory instability, but it does **not** uniquely identify a Hopf bifurcation. + +### What the evidence establishes + +- **Forward non-convergence / cycle-like attractor:** `eval_relax_s3200.py` applies the same explicit relaxation update as training/eval, records the normalized one-step displacement, and the dossier logs a persistent non-monotone residual floor: about `2.3e-2` after thousands of steps, with tail min/max `2.08e-2 / 2.73e-2`. That is incompatible with ordinary monotone convergence to the free fixed point on that trajectory. +- **Attention gain is the main causal knob:** `knockout_s3200.py` scales `blk.WO` by `alpha`, i.e. scales the attention output contribution, and the dossier logs monotonic shrinkage of the residual floor/oscillation from `alpha=1.0` to `0.2`, where convergence is restored. +- **The code really allows non-conservative oscillatory dynamics:** the thick force contains independent attention projections plus an untied FFN inside an explicit Euler relaxation map. There is no energy/gradient-flow guarantee in the active `attn_mode='thick'` path. + +### The gap + +A Hopf/Neimark-Sacker claim is a **local spectral claim** about the derivative of the map at a fixed point branch. The current facts are **trajectory/knockout facts**: + +- Fact 1 shows a sustained oscillatory forward trajectory at `eps=0.1`; it does not show which eigenvalue crossed first. +- Fact 2 shows that reducing total attention output gain removes the oscillation; it does not isolate the **antisymmetric Jacobian part** `A=(J-J^T)/2`, nor does it rule out other nonlinear or discrete-time routes to an oscillatory attractor. +- Because the reported s3200 `alpha=1` trajectory does not converge, an eigenvalue computed at an arbitrary T1 or cycle point would be only an **instantaneous Jacobian**, not a formal Hopf test unless the underlying fixed point/branch is also identified or continued. + +### Alternatives not yet excluded + +1. **Real-eigenvalue fixed-point loss / saddle-node-like route.** + A real leading eigenvalue of `M` crossing `+1` would indicate loss of contraction along a non-oscillatory mode. The observed limit cycle could then be a secondary nonlinear attractor reached after the fixed point destabilizes or disappears, not the primary Hopf mechanism. + +2. **Discrete Euler artifact.** + The actual implemented dynamics are not continuous-time integration; they are the map `z <- z + eps*F(z)`. If `J` has eigenvalue `nu=a+ib`, the Euler-map eigenvalue is `mu=1+eps*nu`. It is possible to have `a<0` — stable continuous-time linear dynamics — but `|1+eps*nu|>1` at `eps=0.1` because the step is too large. That would be a numerical/discrete relaxation instability, not a true continuous-time Hopf. A real `mu<-1` would be the clean period-2 / flip case. + +3. **FFN contribution.** + The thick `nc_force` treats `attention + FFN` as the non-conservative part, and the FFN is untied. The knockout log itself shows `alpha=0.0` still has a tiny residual/oscillation (`res-floor ~1.3e-3`, `osc ~1.2e-3`), so an FFN-only contribution is not zero. The data do support attention as the dominant driver, but not attention as the exclusive source. + +4. **qknorm / attention nonlinearity contribution.** + The evaluated block has `blk.qknorm=True`, and q/k RMSNorm is inside attention. Scaling `WO` suppresses the whole attention output path, including effects mediated by qknorm. Therefore the knockout does not separate “antisymmetric attention matrix/gain” from the nonlinear qknorm-shaped attention Jacobian. + +So the rigorous conclusion is: + +> **Plausible and likely:** attention-dominated non-conservative complex-mode instability. +> **Not yet proven:** Hopf/Neimark-Sacker crossing of a complex conjugate pair as the bifurcation mechanism. + +## 2. Single cleanest measurement + +**Do the local Jacobian spectrum measurement.** More precisely: compute the leading eigenvalues of both `J = dF/dz` and `M = I + eps*J` on the s3200 checkpoint along a fixed-point branch restored by attention scaling. This is more decisive than a pure epsilon sweep or Floquet analysis, because it directly distinguishes complex-pair Hopf from real-eigenvalue loss and also predicts whether `eps=0.1` is a discrete-Euler artifact. + +### Exact measurement to run later, not now + +Freeze the redx s3200 checkpoint, same batch/sequence and `qknorm=True`. Define + +\[ +F_\alpha(z,x)=-(z-x)+\alpha\,Attn(LN_1 z)+FFN(LN_2 z)-c z +\] + +where `alpha` is implemented exactly as in `knockout_s3200.py` by scaling `WO`. For `alpha` values bracketing the observed transition — at minimum around `0.2`, `0.4`, `0.7`, `1.0`, then refined near the first loss of convergence — do: + +1. Find a true fixed point `z*_alpha` satisfying `||F_alpha(z*)||/||z*||` very small, using long relaxation where it converges and preferably continuation/Newton from the previous `alpha` so the branch can be followed up to the marginal point. +2. At each `z*_alpha`, compute the leading eigenvalues `nu_i` of + `J_alpha = dF_alpha/dz | z*_alpha` + using JVP/Arnoldi or another matrix-free eigensolver. +3. Convert them to relaxation-map eigenvalues + `mu_i = 1 + 0.1 * nu_i`. +4. Record the leading `|mu_i|`, whether the leading pair is complex or real, and for complex `nu=a+ib` also record `a=Re(nu)` and the Euler stability threshold + \[ + eps_crit = -2a/(a^2+b^2) \quad \text{when } a<0. + \] + +### Outcome table + +- **Confirms Hopf / Neimark-Sacker of the implemented relaxation map:** + A complex conjugate pair is the leading spectrum and crosses `|mu|=1` as `alpha` or training step increases; real eigenvalues stay inside the unit circle. The observed oscillation frequency should be compatible with `arg(mu)` per relaxation step. This confirms the map-level Hopf mechanism. + +- **Confirms true continuous-time Hopf rather than Euler artifact:** + The same complex pair has `Re(nu)` crossing through `0` to positive values. Then shrinking `eps` changes the discretization but does not restore continuous-time stability once `Re(nu)>0`. + +- **Shows Euler-step artifact instead:** + The leading pair is complex and `|1+0.1*nu| >= 1`, but `Re(nu) < 0`. Then the continuous-time linearization is damped, while the explicit Euler step is unstable. The predicted stabilizing step is `eps < eps_crit`; an epsilon sweep would be confirmatory, but the spectrum already gives the answer. + +- **Shows real saddle-node / steady instability instead:** + The leading eigenvalue crossing is real near `mu=+1` / `nu=0`. Then the Hopf claim is wrong; the limit cycle is downstream nonlinear behavior after a real fixed-point loss. + +- **Shows flip / two-cycle artifact:** + A real map eigenvalue crosses `mu=-1` or is `< -1`. Then the oscillation is a discrete period-doubling / 2-cycle-type instability, not Hopf. + +- **Shows FFN is materially involved:** + If the unstable/near-unstable pair remains when `alpha=0`, or if the leading antisymmetric contribution is dominated by the FFN block, then “attention antisymmetric part drives it” is overstated. If the pair moves safely inside the unit circle as `alpha` is reduced and disappears with attention removed, then the attention-dominant mechanism is supported. + +Why not make the epsilon sweep the primary measurement? It is useful, but indirect. If smaller `eps` converges, that could indicate an Euler artifact, but it would not by itself distinguish complex Euler instability from real flip or other nonlinear step-size effects. The Jacobian spectrum gives the bifurcation class and the epsilon prediction in one measurement. + +Why not Floquet/period first? Floquet multipliers of the observed cycle would quantify stability of the cycle, and period/frequency could corroborate `arg(mu)`, but they do not identify which fixed-point eigenvalue caused the attractor to appear. Use Floquet/period only as a follow-up. + +## 3. Consistency with the actual code + +The proposed mechanism is **consistent with the implemented force and relaxation map**, with the caveat that the code implicates `attention + FFN` as the active non-conservative block, not mathematically pure attention alone. + +- **The thick force is exactly the stated form.** In `lt_ep_train.py`, `tforce` computes layer-normed attention and FFN and returns `-(z - xin) + self.attn(h1) + ff - self.c * z` (`lt_ep_train.py:81-85`). With `c=1`, this is `xin - 2z + Attn(LN z) + FFN(LN z)`. The autograd-enabled `force` path for `attn_mode == 'thick'` computes the same structure and returns it at `lt_ep_train.py:99-106`. + +- **The relaxation is explicit Euler.** `relax` updates `z` by `z = z + eps * blk.force(z, xin).detach()` (`lt_ep_train.py:123-133`). Therefore the linearized relaxation map is exactly `M = I + eps*J`. + +- **The free phase used by EP is this relaxation state.** `ep_step` embeds the input, computes `zs = relax(..., T1, eps)`, then measures a one-step residual from that state (`lt_ep_train.py:140-145`). The code explicitly records this as the T1 free-phase state before any optional refinement (`lt_ep_train.py:146`). + +- **The attention path is non-conservative in the active model.** Attention uses independent `WQ`, `WK`, `WV`, `WO` projections (`lt_ep_train.py:58-68`), and optional q/k RMSNorm when `blk.qknorm` is set (`lt_ep_train.py:63-65`). The eval scripts do set `blk.qknorm=True` (`eval_relax_s3200.py:8`, `knockout_s3200.py:10`). There is no tied-energy construction in the thick path. + +- **The knockout really scales attention output.** `knockout_s3200.py` loads the same checkpoint and performs `blk.WO.mul_(alpha)` before relaxation (`knockout_s3200.py:9-17`). Thus the logged alpha trend is a legitimate intervention on total attention output gain. + +- **The code itself treats FFN as part of the non-conservative component.** In thick mode, `nc_force` returns `attention + FFN`, not attention alone (`lt_ep_train.py:92-97`). The AEP nudged correction also applies `Jv - JTv` of this `nc_force` in the real/thick modes (`lt_ep_train.py:171-179`). In `holo_ep.py`, the holomorphic and real-axis thick forces match the same `-(z-xin)+att+ff-c*z` structure (`holo_ep.py:36-51`, `holo_ep.py:134-152`), and their AEP correction again uses `Jv-JTv` of `blk.nc_force` (`holo_ep.py:76-84`, `holo_ep.py:176-185`). + +## Final verdict + +**The Hopf story is code-consistent and likely, but not proven.** The current evidence nails an attention-dominated non-conservative forward oscillation at the implemented `eps=0.1`; it does **not** yet nail the bifurcation class. The decisive next measurement is the **leading spectrum of `J` and `M=I+eps*J` on the s3200 fixed-point branch under attention-gain continuation**. A complex conjugate pair crossing `|mu|=1`, with real modes stable and with `Re(nu)` interpreted to rule in/out Euler-step instability, would settle the question. diff --git a/ep_run/FUGU_Q_OPTIONS.md b/ep_run/FUGU_Q_OPTIONS.md new file mode 100644 index 0000000..513e21a --- /dev/null +++ b/ep_run/FUGU_Q_OPTIONS.md @@ -0,0 +1,29 @@ +# Fugu query — fix options for the EP below-CE-2.1 divergence (2026-06-23) + +Read for full chain: EP_DIAGNOSIS_DOSSIER.md, EP_BELOW210_DIAGNOSIS_FIX.md (cont.6/cont.7), FUGU_VERDICT_FULL.md. +Code: lt_ep_train.py (force/tforce :81-106, relax :123 `z=z+eps*blk.force(z,xin).detach()`, ep_step :140, jacreg :211-219), holo_ep.py. + +## CONFIRMED diagnosis (measured this session) +The divergence is the **explicit-Euler instability of the *stiffening* rotating (non-conservative) attention mode**: +- relax is explicit Euler `z←z+εF`, ε=0.1 → stability object = discrete map M=I+εJ. Attention non-conservative (indep WQ/WK/WV/WO) → J has a complex eigenvalue μ=a+ib, **a<0 (continuous-STABLE)**, b≠0. Euler unstable when |1+εμ|>1, i.e. ε>ε_crit=−2a/(a²+b²). EP training makes attention expressive → mode stiffens (b grows) → ε_crit shrinks → at fixed ε it crosses → forward LIMIT CYCLE → loss blows. +- **ε-monotonicity (decisive), 3 training runs identical except ε:** ε=0.1→blew@**2.74**; ε=0.1,t2sel=160 (BETTER gradient, cos0.998)→blew@**3.02** (EARLIER — gradient is NOT the lever); ε=0.05→blew@**2.41**. Smaller ε → strictly lower wall; fixed ε only RELOCATES it (as attention keeps stiffening, even ε=0.05 eventually too coarse). +- **⇒ digital-sim integration artifact:** Re μ<0 ⇒ the CONTINUOUS ODE (analog HW, ε→0) is stable; only the coarse explicit-Euler sim cycles. eval ε-sweep on marginal ckpt s3200 confirms (res-floor shrinks monotonically as ε↓; ε=0.01 converges). +- AsymEP gradient is accurate WHEN a fixed point exists (cos 0.99 vs exact adjoint at tight res); the failure is the FORWARD relaxation losing its fixed point, not the gradient. + +## The 3 candidate fixes +(a) **adaptive ε** [#30]: closed-loop step-size — grow ε when contracting, shrink on residual OVERSHOOT. Integration-axis; SIM-only; no model change / no expressivity cost. +(b) **jacreg**: penalize ‖J_nc·v‖ (non-conservative Jacobian = attention+FFN; lt_ep_train.py:211-219 JVP through blk.nc_force) → reduces |Im μ| → raises ε_crit. MODEL change. Early 2.40-validated runs used *adaptive* jacreg; diverging runs froze it weak. +(c) **smaller fixed ε** (0.02/0.03): just moves the wall down; confirmatory, not a fix. + +## Adaptive-ε calibration so far (eval-only on s3200 cycling-op + s2000 smooth-op; ground-truthed by the ε-sweep) +- NAIVE controller (shrink if g=‖F‖/‖z‖ falls <2%/step): WRONG — parks ε at floor 0.005 on ALL ops, because slow per-step contraction (ρ~0.99) is misread as "ε too big." Conflates small-ε's slow contraction with instability. +- CORRECTED (shrink only on OVERSHOOT g_t>g_{t-1}; grow otherwise): adaptive as desired — s3200 (stiff): ε→0.003-0.008, g floors ~0.10 (≈ε=0.01 benchmark 0.09); s2000 (smooth): converges to **g=0** (true fixed pt), ε grows toward 0.1. Tradeoff in overshoot-tolerance: strict (tol1.0) catches s3200 cycle but over-shrinks near s2000's converged tail (float-noise ticks→spurious shrink); permissive (tol1.02) stays fast on smooth but MISSES s3200's slow cycle (stays ε~0.095, g~0.23). → suggests EMA-smoothed signal / step-doubling local-error / noise-relative tol. +- NOTE: s3200 g floors ~0.09 even at tiny ε (genuinely no fixed point at the marginal op, OR just slow finite-step convergence — ambiguous); s2000→g=0 cleanly. + +## QUESTIONS — answer Q1→Q3 in order, decisive, grounded in code/data, flag solid-vs-uncertain. + +**Q1 — Evaluate (a)/(b)/(c).** Which ELIMINATES the wall vs merely RELOCATES it? Which fits the *analog* target (continuous relaxation)? Rank + justify. Is adaptive ε guaranteed to eliminate the wall, or does its ε_min floor just relocate it (like fixed small ε) IF the rotating mode stiffens without bound — and does b/|Im μ| stiffen unboundedly as CE drops, or saturate? + +**Q2 — The jacreg paradox.** If the divergence is fundamentally a *simulation* discrete-Euler artifact (Re μ<0, continuous/analog stable), WHY does jacreg work? Is it just **raising ε_crit** (cutting |Im μ| so fixed ε becomes stable) — fixing the SAME discretization wall from the model side — or fixing a genuine continuous-time problem? **Does jacreg's benefit transfer to analog hardware** (continuous, no ε): (i) papering over a sim artifact analog wouldn't have, or (ii) also improving the continuous dynamics (faster/less-ringing settling — useful on analog)? So: is jacreg a "real fix" or a sim-crutch, relative to adaptive ε? + +**Q3 — Anderson / implicit integrators.** Can Anderson acceleration or implicit/semi-implicit (IMEX) integration replace explicit Euler to kill the discretization instability? (i) Compatible with **AsymEP** — which needs free equilibrium z*, nudged equilibrium, and the local Jacobian at z* (the −2A correction)? Does changing the forward integrator break the EP gradient estimator (which assumes relaxation reaches z*)? (ii) Implicit Euler's per-step nonlinear solve — tractable for a transformer block, or self-defeating (the solve is itself a relaxation)? (iii) Anderson on the fixed-point iteration — only speeds convergence, or also STABILIZES (suppresses the limit cycle)? (iv) Does the integrator choice matter for the **analog** target (continuous, no integrator), or is this purely sim-side acceleration — i.e. is the right framing "analog HW does the true continuous relaxation; the sim just needs a faithful+cheap emulator, and adaptive-ε / Anderson / implicit are all just better emulators"? diff --git a/ep_run/FUGU_VERDICT_FULL.md b/ep_run/FUGU_VERDICT_FULL.md new file mode 100644 index 0000000..3c06293 --- /dev/null +++ b/ep_run/FUGU_VERDICT_FULL.md @@ -0,0 +1,160 @@ +# FUGU_VERDICT_FULL — Q1–Q4 + +## Q1 — Mechanism: confirm/refute the non-conservative Hopf claim + +**Verdict:** confirm the broad failure mode, but do **not** overclaim the exact bifurcation label yet. The code/data are conclusive for an **attention-dominated non-conservative forward oscillatory instability with no usable fixed point** at redx `s3200`. They are **not yet conclusive** that the route is specifically a local continuous-time Hopf bifurcation of a fixed point. The best current statement is: + +> The implemented relaxation map `z_{t+1} = z_t + eps*F(z_t)` has crossed from a stationary computation into an attention-driven oscillatory attractor. The most likely local mechanism is a complex-conjugate pair of the map Jacobian `M = I + eps*J` crossing `|lambda| = 1` — a Hopf/Neimark-Sacker-type instability of the Euler relaxation. But the eigenvalue crossing has not yet been measured, so the exact bifurcation class remains a hypothesis. + +Grounding in the code: + +- `relax` is explicit Euler: `z = z + eps * blk.force(z, xin).detach()` in `lt_ep_train.py:123-133`. Therefore the relevant stability object for the implemented computation is the **discrete map** `M = I + eps*J`, not only the continuous vector field `J=dF/dz`. +- In the relevant `attn_mode='thick'` branch, `tforce` / `force` implement + `F(z) = -(z - xin) + Attn(LN1(z)) + FFN(LN2(z)) - c*z` (`lt_ep_train.py:81-85`, `102-106`). With `c=1`, the passive term is `xin - 2z`; learned attention/FFN Jacobian must fit inside that contraction margin. +- Attention is genuinely non-conservative in the implementation: independent `WQ/WK/WV/WO`, causal softmax, optional q/k RMSNorm (`qknorm`) in `lt_ep_train.py:58-68`. It is not the gradient of the tied conservative `attn_energy` path. +- In thick mode, `nc_force` includes **attention plus the untied FFN** (`lt_ep_train.py:92-97`). Thus the knockout supports “attention is dominant,” but the code also explains why a tiny FFN-only oscillation can remain when attention output is zeroed. + +What the measurements prove: + +1. **It is not slow convergence.** At s3200 the residual decays initially and then floors/oscillates around `~2.3e-2` through 6000 relaxation steps, with non-monotone tail `2.08e-2` to `2.73e-2`. That rules out the earlier “rho close to one but still convergent” framing for the actual forward computation. +2. **Attention is causally responsible for the large cycle.** Scaling `WO` monotonically shrinks the oscillation: `alpha=1.0` cycles, `0.7` cycles smaller, `0.4` is nearly gone, and `0.2` restores a true fixed point. That is strong causal evidence that attention’s non-conservative/gain component drives the failure. +3. **The estimator is not the primary explanation once no fixed point exists.** `ep_step` assumes `zs = relax(...)` is a free equilibrium and forms the AEP/nudged update around it. The dossier says AsymEP is accurate when a fixed point exists; at s3200 the required object is absent. + +What remains unexcluded: + +- **Discrete Euler artifact vs continuous Hopf.** For a continuous eigenvalue `mu=a+ib` of `J`, the Euler multiplier is `lambda=1+eps*mu`; stability requires `(1+eps*a)^2 + (eps*b)^2 < 1`. A stiff rotating mode with `a<0` can still have `|1+eps*mu|>1` at `eps=0.1`. Then the digital relaxation cycles even if the underlying continuous-time analog ODE would converge for smaller `eps`. +- **Real-multiplier alternatives.** A real `lambda` crossing `+1` would indicate saddle-node/pitchfork/loss of stationary solutions; a real `lambda` crossing `-1` would indicate a flip/period-doubling route. The observed smooth oscillation and attention scaling favor a complex pair, but do not prove one. +- **Global/coexisting-attractor route.** The long relaxation proves that the trajectory from the embedding does not settle to a stationary computation. It does not, by itself, prove the cycle emerged through a local fixed-point Hopf rather than a global basin/coexisting-attractor mechanism. +- **FFN contribution.** Because `alpha=0` still leaves a tiny cycle and thick-mode `nc_force` includes the FFN, the precise claim is “attention-dominated,” not “attention-only.” + +**Single cleanest confirming measurement:** perform an **attention-output-scale continuation with leading eigenvalues of the actual Euler map `M=I+eps*J` at the converged fixed point just below the transition**. + +Concretely, for the s3200 checkpoint: set `WO <- alpha*WO`, solve to tight fixed-point residual for subcritical `alpha`, compute leading eigenvalues of `J=dF/dz` and `M=I+eps*J` at `z*(alpha)`, and increase `alpha` until convergence is lost. This is cleaner than eigenvalues at an arbitrary point on the already-existing cycle, because Hopf is a fixed-point stability statement; Floquet analysis is useful second, but characterizes the cycle after it exists. + +Outcomes: + +- **Complex pair of `M` reaches/crosses `|lambda|=1` at the same `alpha_c` where the fixed point disappears:** confirms the Hopf/Neimark-Sacker mechanism for the implemented relaxation map. +- **The corresponding continuous eigenvalues `mu` have `Re(mu)` crossing zero:** confirms a true continuous-time Hopf, relevant to analog ODE hardware. +- **`|1+eps*mu|>1` while `Re(mu)<0`, and smaller `eps` restores convergence:** the failure is mainly an explicit-Euler/stiff-rotation artifact, not a continuous-time Hopf. +- **A real `lambda>=1`:** not Hopf; look for saddle-node/pitchfork/loss of stationary solution. +- **A real `lambda<=-1`:** not Hopf; a discrete flip/period-doubling route is implicated. +- **All fixed-point multipliers stay inside the unit circle up to loss of convergence:** likely global/coexisting attractor or basin issue rather than local Hopf. +- **Floquet multipliers of the observed cycle all stable except phase:** confirms a stable limit cycle, but still does not identify how the stationary solution was lost. + +So Q1: **confirm attention-driven non-conservative oscillatory non-convergence; keep “Hopf” as the leading, not-yet-proven, local mechanism until the fixed-point eigenvalue continuation is measured.** + +--- + +## Q2 — Fix: keep the operator below Hopf while preserving expressivity + +**Verdict:** the best immediate fix is a **residual-triggered adaptive stability homeostat**, implemented primarily with adaptive `jacreg`, plus `qknorm` and modest attention-gain/spectral guardrails. Direct cycle/residual penalties should be alarms/gates, not the primary shaping objective. A structural `r_str` parameterization is the cleanest long-term analog design, but it is less immediately surgical for the current transformer attention code. + +Important detail: the current `jacreg` is not a pure antisymmetric penalty. In `lt_ep_train.py:211-219`, it estimates `||J_nc v||^2 / ||v||^2` by a JVP through `blk.nc_force`; in thick mode `J_nc` is attention plus FFN (`lt_ep_train.py:92-97`). Thus it penalizes learned non-conservative/gain response — a proxy for dangerous rotating dynamics — not exactly `||(J-J^T)/2||`. That proxy is nevertheless the best-supported control knob in the dossier. + +Why adaptive `jacreg` is the right primary fix: + +- It targets the learned recurrent Jacobian that must remain inside the passive `-2z` contraction margin. +- The controller is already wired to the right observable: the free-phase residual. `lt_ep_train.py:520-529` increases `jr` when `res/res_target` rises and relaxes it when dynamics settle. +- The failure is abrupt; a fixed weak penalty can allow training to walk past the bifurcation. The penalty must adapt to residual/cycle onset. +- The dossier states the validated stable runs used adaptive `jacreg`, while diverging runs froze it weakly. +- It preserves more expressivity than simply shrinking all attention: attention can remain strong where it does not destroy the fixed point. + +Role of the other candidates: + +- **Structural `r_str` bound:** best theoretical/hardware guarantee if the recurrent operator can be parameterized as bounded symmetric plus bounded antisymmetric components. But in this code the attention state-Jacobian is data-dependent through LN, q/k projections, softmax, values, and `WO`; a simple weight-level `r_str` does not directly bound the actual rotating eigenvalues. Use this for redesign, not as the immediate rescue. +- **Gain control / `gamma` / `qknorm`:** necessary guardrail, insufficient alone. `qknorm` is already enabled in the s3200 scripts, yet `alpha=1` cycles. The `WO` knockout proves gain matters; use gain caps, but do not rely on blunt global gain reduction as the main solution. +- **Direct cycle-amplitude / residual penalty:** extremely analog-measurable, but symptom-level. It activates when the operator is already near/off the stationary manifold and may punish slow-but-stable modes. Use it to gate invalid EP updates and drive the homeostat. +- **Log-norm / contraction penalty:** theoretically stronger if computed in the right metric, but global and expensive; less obviously forward-local. Use as an offline diagnostic or occasional calibration, not the main analog training primitive. + +Concrete recipe: + +1. **Keep `qknorm` on** for thick attention. It bounds q/k logits and reduces Jacobian stiffness, but is not sufficient by itself. +2. **Initialize inside the basin.** Use small residual-branch initialization (`resinit < 1`, scaling `WO` and `pj`) and keep `c=1` or stronger initial leak. +3. **Use adaptive `jacreg` with nonzero floor and enough ceiling.** Keep the existing controller structure. Set `res_target` well below the measured cycle floor, roughly `1e-3` to `5e-3`; keep `jr_max` high enough to recover, e.g. the code’s `16` scale; use residual EMA to avoid controller thrash. +4. **Turn on `res_gate`.** If free/refined residual exceeds the validity gate, skip task EP/nudge gradients and apply only stabilization. Since the observed cycle floor is around `2.3e-2`, a gate of order `5e-3` to `1e-2` is appropriate. +5. **Prefer branch-aware regularization if modifying code.** Penalize attention’s learned Jacobian more strongly than FFN, because the knockout identifies attention as dominant; keep a lighter FFN penalty because the FFN-only tiny cycle exists. +6. **Add slow attention-output gain rails.** Because post-hoc `WO` scaling restores convergence at `alpha=0.2` and is near-safe by `alpha=0.4`, impose a soft cap/homeostat on `WO` or attention-output spectral/branch gain. Use it as a rail, not the primary objective. +7. **Use `resreg` only as secondary T1 protection.** The `resreg` term (`lt_ep_train.py:220-231`) protects the finite `T1` state used by evaluation/BPTT, but it does not replace fixed-point stability control. +8. **Monitor tail oscillation, not only a short residual probe.** The previous false “slow convergence” framing came from seeing transient decay and missing the residual floor. Track `T1` residual plus tail min/max or autocorrelation. + +Analog-realizable version: + +- Measure `||z_{t+1}-z_t||/||z_t||` or continuous `||F(z)||/||z||` locally during settling. +- Approximate `jacreg` forward-only by injecting small random state perturbations `eta` and measuring `F_nc(z+delta eta)-F_nc(z)`; use that to locally reduce attention/FFN array gains or asymmetry budgets. +- Do not require exact software `vjp`/`J-J^T` as the hardware primitive unless the substrate supports reciprocal probes. For hardware, use forward perturb-and-measure gain/curl proxies plus residual gating. + +So Q2: **primary fix = adaptive `jacreg`-style stability homeostasis; guardrails = `qknorm`, small residual initialization, spectral/gain caps, and `res_gate`; long-term clean analog design = structural `r_str/gamma` bounded operator.** + +--- + +## Q3 — Thesis: can sub-Hopf non-conservative attention be expressive enough? + +**Verdict:** yes, at least for this architecture/scale. The data show a practical expressivity-vs-stability tradeoff in rotating/gain budget, but not a fundamental theorem that coherent language requires post-Hopf dynamics. + +The right thesis is: + +> Non-conservative attention can be expressive below the Hopf boundary, but it must operate with a measured stability margin. Beyond that margin, the model becomes an oscillator rather than a valid equilibrium language model. + +Evidence: + +- Exact BPTT on the identical model trains cleanly to CE `1.83` and does not drive the forward operator into the cycling regime. That strongly suggests the architecture contains stable expressive LM solutions. +- The dossier says AsymEP matches the exact adjoint when a true fixed point exists. Therefore the failure is not an inherent fixed-point gradient ceiling; it is the EP trajectory crossing the stationary-computation boundary. +- The knockout gives a local threshold estimate at s3200: `alpha=1.0` cycles, `0.7` cycles, `0.4` is nearly gone but still floored, `0.2` converges. For this checkpoint/batch/`eps=0.1`, the strict fixed-point critical attention-output scale is roughly between `0.2` and `0.4`, plausibly near `0.3` of the trained s3200 attention-output gain. + +In `r_str/gamma` terms: + +- The code does not measure `r_str` directly, so a numeric `r_str` threshold would be fake precision. +- The dangerous quantity is the **effective rotating learned Jacobian relative to the contraction margin**: roughly asymmetric/rotating fraction times total attention/FFN gain, divided by the passive damping from `-(1+c)z`. +- For the implemented Euler map, the boundary is `|1+eps*mu|=1`, not just `Re(mu)=0`. With `eps=0.1`, high imaginary frequency can destabilize the map even when continuous-time damping remains negative. +- Operationally, the threshold is wherever the leading complex multiplier of `M=I+eps*J` reaches one. In the s3200 `WO`-scale coordinate, stay on the `alpha<=0.2-0.3` side for a strict fixed-point criterion unless the weights re-adapt under regularization. + +Does the sub-threshold regime suffice? + +- **For a coherent small LM:** yes, the BPTT result is strong evidence. +- **For maximum transformer expressivity:** stability imposes a cost. It limits sharp recurrent routing, high non-normal amplification, and strong directed cycles. Extra capacity should come from width, depth, longer but stable settling, or controlled feedforward correction, not uncontrolled curl. +- **For current AsymEP:** stable runs around CE `2.40` do not prove a ceiling; they show the present local training/control recipe has not yet matched BPTT. + +Is a hybrid the ceiling? + +**Likely yes for competitive analog language hardware.** The realistic design is: + +1. a bounded-asymmetry equilibrium core that stays sub-Hopf and supports exact AsymEP; +2. non-conservative attention inside a measured `r_str/gamma` or spectral-margin budget, with qknorm and gain homeostasis; +3. a thin explicit correction/readout/feedforward/digital-clocked path for operations that would otherwise require too much recurrent curl. + +So Q3: **sub-Hopf non-conservative attention can be expressive enough; the tradeoff is real but practical, not proven fundamental. The local s3200 threshold is `alpha_c ~ 0.2-0.4` — probably near `0.3` — or, generally, the point where the leading complex multiplier of `M` hits the unit circle. A bounded-asymmetry core plus thin correction is the realistic ceiling.** + +--- + +## Q4 — Primitive: equilibrium AsymEP or native non-equilibrium learning? + +**Verdict:** for this codebase and near-term analog hardware, the right primitive is **equilibrium computation kept below Hopf**. Preserve a stationary state `z*`; then AsymEP is exact in the regime it assumes. Do not embrace the accidental limit cycle as the main primitive unless you replace the objective, readout, and learning rule. + +Why the current code requires a stationary state: + +- `ep_step` first computes `zs = relax(...)` and treats it as the free equilibrium (`lt_ep_train.py:142-146`). +- The AEP correction uses `v = z - zs`, `Jv`, and `JTv` at `zs` (`lt_ep_train.py:172-178`). That is a local stationary-state linearization. A phase point on a cycle is not the `z*` required by the implicit-gradient formula. +- The block parameter gradient is formed from `(a * f).sum()` with `f = blk.force(zs.detach(), xin, cg=True)` (`lt_ep_train.py:202-205`). If no fixed point exists, this is not the derivative of a stationary solve. +- `holo_ep.py` has the same assumption: `holo_a`, `holo_a_select2`, and `holo_a_track` expand nudged trajectories around `zs`; `holo_a_lockin` is a demodulated nudge estimator around a base state, not a full learning rule for a self-sustained free limit cycle. + +What a true non-equilibrium route would require: + +1. define the computation as a periodic orbit, phase-averaged state, invariant measure, or reservoir trajectory instead of `z*`; +2. define CE on a time/phase average, not an arbitrary T1 sample; +3. handle the neutral phase mode of the orbit; +4. replace fixed-point adjoints with Floquet/Poincare adjoints, eligibility traces, perturbation learning, or lock-in demodulation over periods; +5. keep the orbit stable while learning all recurrent attention weights; +6. demonstrate locality, forward-only operation, and sample efficiency for language-scale credit assignment. + +Is route (ii) tractable forward-only? + +- **For fixed reservoirs plus readout:** yes; oscillatory analog reservoirs can be useful and trainable with local/readout or perturbation rules. +- **For full recurrent attention weights:** not currently as a clean replacement for AsymEP. It is possible as a research program — phase-demodulated perturbation learning, e-prop-like traces, Floquet-local approximations — but it will be approximate, noisy, phase-sensitive, and likely much less efficient. +- **For this project:** training through the s3200 cycle with existing EP/holo estimators is invalid. Gate task updates when residual indicates loss of equilibrium; apply stabilizing homeostasis; resume only after a fixed point is restored. + +Analog-hardware conclusion: + +- Equilibrium analog hardware has a simple primitive: settle, nudge, measure local contrast, update. +- A limit-cycle primitive requires clocks/phase references, demodulation windows, eligibility storage, and phase-stable credit assignment. That may be viable for special-purpose oscillatory reservoirs, but it gives up the main simplicity and exactness of EP. + +So Q4: **keep the operator below Hopf.** Treat non-equilibrium oscillatory learning as a separate reservoir/auxiliary research direction, not the central primitive for this AsymEP transformer. The system should be a bounded non-conservative equilibrium machine, not an accidental oscillator. diff --git a/ep_run/GPT55_BUG_HUNT.md b/ep_run/GPT55_BUG_HUNT.md new file mode 100644 index 0000000..241adba --- /dev/null +++ b/ep_run/GPT55_BUG_HUNT.md @@ -0,0 +1,249 @@ +# GPT55 EP/AEP Correctness Bug Hunt + +Scope: static review of `lt_ep_train.py`, `holo_ep.py`, `asym_probe.py`, with context from `../EP_BELOW210_DIAGNOSIS_FIX.md`. I did not modify training code and did not run training or GPU jobs. I used only tiny CPU checks for formula consistency. + +## Executive Summary + +The main suspect for the reported `cos(g_EP, g_transpose) ~= 0.94` plateau that is flat over `hr` is not beta noise. The code has no nudged-phase/adjoint convergence check. A tight free-phase residual does not imply the two-phase AEP contrast has converged to the fixed-point adjoint. In the linearized corrected dynamics, finite-`T2` error is independent of `hr`, exactly matching the clue. + +I did not find a sign flip in the deterministic AEP correction. The real two-phase contrast, holomorphic two-phase contrast, and exact-adjoint probe sign conventions are mutually consistent. I did find several real correctness hazards: the plain-EP AEP correction clip invalidates the transpose correction, stochastic `fnoise` makes JVP/VJP not derivatives of the same force realization, `t1max` trains at a refined fixed point while evaluation/BPTT use the finite `T1` state, and `asym_probe.py` has probe-specific footguns that can mislead the diagnosis. + +## Ranked Findings + +### 1. Finite nudged/adjoint relaxation is unchecked; this explains an `hr`-flat cosine plateau + +Files/lines: +- `lt_ep_train.py:163-180` runs plain nudges for exactly `T2` steps. +- `lt_ep_train.py:181-197` computes holomorphic/AEP `a` through `holo_a_track`, `holo_a_select2`, `holo_a_select`, or `holo_a`, but ignores whether the nudged contrast actually converged. +- `holo_ep.py:179-185` updates the two real phases for one finite step at a time. +- `holo_ep.py:193-211` selects a snapshot by smallest inter-snapshot increment, not by an adjoint residual. +- `holo_ep.py:229-254` does the same for common-mode tracking. +- `asym_probe.py:727-743` changes `beta/hr` in the diagnostic sweep but keeps the nudged relaxation budget fixed. +- `asym_probe.py:582-595` interprets beta-sweep behavior without a finite-`T2` branch. + +What is wrong: +The estimator validity logic checks/refines the free phase, but not the nudged phase. For the corrected local dynamics, with `M = I + eps * J^T` and `ell = dL/dz`, the two-phase contrast approximately obeys: + +```text +a_{t+1} = M a_t + eps * ell +fixed point: J^T a = -ell +``` + +The finite-`T2` error is `M^T2` times the initial adjoint error and is independent of the nudge radius `r/hr` in the linear regime. Therefore a cosine plateau that is flat across `hr=0.04..0.8` is a signature of under-converged nudged/adjoint relaxation, not evidence that beta noise or finite-beta bias has been ruled out. + +Why it matters: +This corrupts training gradients directly. It also corrupts the diagnosis because the probe can report a tight free-phase residual while the AEP contrast is still a truncated adjoint solve. As the operator hardens below CE about 2.1, the adjoint relaxation can slow down even if the free phase is tight. + +Severity: corrupts TRAINING and DIAGNOSIS/probe. + +Confidence: high. The line-level behavior matches the `hr`-insensitive clue exactly. + +Minimal fix: +Add a nudged/adjoint convergence criterion. At minimum, return and log `inc_min / (||a_best|| + eps)` from `holo_a_select2` and `holo_a_track`, and sweep `t2sel` in `asym_probe.py` at fixed `hr`. Better: compute an adjoint residual proxy at the selected `a`, e.g. `||J^T a + ell|| / (||ell|| + eps)` using VJP of the full thick force at the free state, and keep nudging until it is below tolerance. Treat `hr` sweeps as inconclusive unless `T2/t2sel` convergence is also demonstrated. + +### 2. Plain-EP AEP correction is norm-clipped, which invalidates the transpose correction + +Files/lines: +- `lt_ep_train.py:172-178` computes `corr = Jv - JTv`, then replaces it by `corr * (fn / cn)` when `||corr|| > ||f||`. + +What is wrong: +The AsymEP correction is algebraic: subtract `Jv - J^T v` so the local Jacobian becomes `J^T`. Scaling that correction by a state-dependent factor changes the corrected dynamics to something like `J - alpha(J - J^T)`, with `alpha < 1` exactly when asymmetry is large. That is no longer the transpose dynamics. + +Why it matters: +This can create a systematic estimator bias in plain EP (`holo=0`). It is especially dangerous near the regime where the antisymmetric correction is large, which is the regime this correction is supposed to fix. + +Severity: corrupts TRAINING for the plain real two-phase path. It does not affect the current `holo_a_select2`/`holo_a_track` path, which does not use this clip. + +Confidence: high. + +Minimal fix: +Remove the correction clip. If stability is needed, clip the total update, reduce `eps`, reduce `beta/hr`, or reject/halve the nudged step while logging that the estimator left its validity region. Do not scale only the antisymmetric correction. + +### 3. With `fnoise > 0`, JVP/VJP are not derivatives of the same force realization + +Files/lines: +- `lt_ep_train.py:87-90` samples fresh multiplicative noise inside `_noisy`. +- `lt_ep_train.py:92-97` applies `_noisy` inside `nc_force`. +- `lt_ep_train.py:171-178` evaluates the noisy force and then separately calls JVP/VJP through `nc_force`. +- `holo_ep.py:150-151` injects fresh noise into `rforce`. +- `holo_ep.py:176-185` and `holo_ep.py:224-239` use `blk.nc_force` for correction JVP/VJP, which can sample different noise again. + +What is wrong: +When `fnoise > 0`, the forward force, JVP force, and VJP force are separate random functions. The correction is no longer `Jv - J^T v` for the same operator used in the state update, and JVP and VJP are not even transposes of the same sampled Jacobian. + +Why it matters: +This corrupts AEP training in the noisy hardware simulation path and makes `navg` average a mixture of stochastic bias and stochastic noise. With `fnoise=0`, this issue is inactive. + +Severity: corrupts TRAINING when `--fnoise > 0`; otherwise inactive. + +Confidence: high. + +Minimal fix: +Do not sample inside differentiable force functions. Sample a fixed noise mask/device realization outside and pass it into both the forward force and the JVP/VJP force, or keep dynamic per-pass noise out of AEP correction and use deterministic mismatch for differentiable hardware probes. + +### 4. `t1max` trains the EP task gradient at a refined state while eval/BPTT use the finite-`T1` state + +Files/lines: +- `lt_ep_train.py:143-152` first computes `zT` at `T1`, then optionally refines `zs` up to `t1max`. +- `lt_ep_train.py:203-210` computes the EP block gradient and readout gradient at refined `zs`. +- `lt_ep_train.py:260-265` BPTT differentiates exactly `T1` unrolled steps. +- `lt_ep_train.py:279-286` validation evaluates exactly `T1` relaxed steps. + +What is wrong: +With `t1max > T1`, EP optimizes the refined fixed-point state, while the reported validation objective and BPTT reference use the finite-`T1` state. That is a real objective mismatch. + +Why it matters: +If `z_T1` drifts away from the refined fixed point, EP can improve the wrong state while validation and the practical finite-time model degrade. This was also identified in `../EP_BELOW210_DIAGNOSIS_FIX.md:11-20`. + +Severity: corrupts TRAINING when `t1max > T1` and the finite-`T1` state is the real objective. + +Confidence: high. + +Minimal fix: +Choose one objective and make all paths use it. If the objective is finite `T1`, compute the EP gradient/readout gradient at `zT` or add a principled finite-time/contraction term. If the objective is the fixed point, evaluate and compare BPTT against the same refined state. + +### 5. `resreg` is hard-wired to thick `tforce` and its scaling includes already-added non-task gradients + +Files/lines: +- `lt_ep_train.py:220-224` computes the residual penalty with `blk.tforce(zT, xin0)`. +- `lt_ep_train.py:225-228` scales by `gtask` after prior task and `jacreg` gradients may already be in `grads`. + +What is wrong: +`blk.tforce` is the thick-block force only (`lt_ep_train.py:81-85`). If `resreg` is used with `attn_mode` other than `thick`, the residual penalty is for the wrong dynamics. Also, `gtask` is described as task-gradient norm but includes any gradients already added to `grads`, including `jacreg` from `lt_ep_train.py:211-219`. + +Why it matters: +This can apply a residual penalty in the wrong direction for non-thick modes and makes the `resreg` ratio slightly different from its stated meaning. + +Severity: corrupts TRAINING conditionally: non-thick `resreg` is high risk; thick with `jacreg` is a smaller scaling bug. + +Confidence: high. + +Minimal fix: +Guard `resreg` with `assert blk.attn_mode == 'thick'` or compute the residual through `blk.force(..., cg=True)` for the active mode. Capture the pure task-gradient norm before adding `jacreg` and `resreg`. + +### 6. Complex masked softmax is numerically unstable because masked logits affect the row shift + +Files/lines: +- `holo_ep.py:26-29` computes `c = a.real.amax(...)` before masking, then multiplies `exp(a - c)` by `mask`. +- `holo_ep.py:48` passes the causal mask as a complex tensor into this helper. +- Real attention masks before softmax at `lt_ep_train.py:66-68`. + +What is wrong: +Mathematically, the row shift cancels if arithmetic is exact. Numerically, a large masked future logit can dominate `c`, causing all valid entries to underflow or lose precision. The real path masks before softmax and does not have this issue. + +Why it matters: +With `qknorm` enabled this is mitigated because logits are bounded, but without `qknorm` it can bias or NaN holomorphic phases. + +Severity: corrupts TRAINING conditionally in complex holomorphic paths, especially without `--qknorm`. + +Confidence: medium. + +Minimal fix: +Keep the mask boolean and compute the shift over valid entries only: + +```python +c = a.real.masked_fill(~mask, -float("inf")).amax(-1, keepdim=True) +w = torch.exp(a - c).masked_fill(~mask, 0) +``` + +### 7. Holomorphic EP helpers silently implement only the thick force, but `ep_step` allows them for any mode + +Files/lines: +- `lt_ep_train.py:181-197` calls `holo_ep` whenever `holo > 0`, without checking `blk.attn_mode`. +- `holo_ep.py:36-51` implements `cforce` as thick LN + attention + FFN. +- `holo_ep.py:134-152` implements `rforce` as the same thick real-axis force. +- `lt_ep_train.py:349` default `--attn_mode` is `real`, while `lt_ep_train.py:358-359` allow `--holo`. + +What is wrong: +If a user runs `--holo` with `attn_mode=real`, `energy`, or `mono`, the nudged force used to estimate `a` is not the model force. + +Why it matters: +This silently corrupts training for a legal CLI flag combination. + +Severity: corrupts TRAINING for non-thick `--holo` runs. + +Confidence: high. + +Minimal fix: +Add a hard guard in `ep_step`: `if holo > 0 and blk.attn_mode != 'thick': raise ValueError(...)`, or implement holomorphic force extensions for the other modes. + +### 8. `asym_probe.py` hard-codes model construction choices that may not match the checkpoint + +Files/lines: +- `asym_probe.py:31-50` exposes `--gelu`, `--T1`, `--T2`, `--hr`, etc. +- `asym_probe.py:105-119` forces `attn_mode="thick"`, `c=1.0`, `qknorm=True`, `fnoise=0.0`, `track=True`, and assigns `blk.gelu = cfg.gelu`. +- `lt_ep_train.py:81-120` never reads `blk.gelu`; GELU is hard-coded to tanh-form in the active thick force. + +What is wrong: +The probe can analyze a different model than the checkpoint was trained with. The `--gelu` flag is especially misleading because assigning `blk.gelu` has no effect in the current `EQBlock`. + +Why it matters: +For the current qknorm/thick/c=1/tanh runs this is probably harmless. For c-bump, non-qknorm, non-thick, or historical erf/tanh comparisons, it can make `g_transpose`, `g_BPTT`, and `g_EP` refer to the wrong dynamics. + +Severity: DIAGNOSIS/probe only, unless probe conclusions are used to choose training changes. + +Confidence: high. + +Minimal fix: +Save the training config in checkpoints and load `attn_mode`, `c`, `qknorm`, GELU mode, and relevant flags from it. Remove `--gelu` or implement it in `EQBlock`. + +### 9. `asym_probe.py` labels `ep_step`'s returned residual as estimator/free-phase convergence, but it is the pre-refinement `T1` residual + +Files/lines: +- `lt_ep_train.py:143-152` computes `res` at `T1`, then may refine `zs` and store `res_used`. +- `lt_ep_train.py:232` returns `res`, not `res_used`. +- `asym_probe.py:840` prints that value as `EP estimator free-phase residual from ep_step`. +- `asym_probe.py:505-522` separately computes and prints the refined exact-reference residual. + +What is wrong: +The probe can conflate three different residuals: `T1` residual, refined free-phase residual, and nudged/adjoint residual. Only the last one diagnoses whether the EP contrast has converged. + +Why it matters: +This can make a run look "tightly converged" or "not tightly converged" depending on which print line the reader tracks. It also reinforces the wrong conclusion that free-phase convergence alone validates the estimator. + +Severity: DIAGNOSIS/probe only. + +Confidence: high. + +Minimal fix: +Return both `res_T1` and `res_refined` from `ep_step`, print both in the probe, and add a separate nudged/adjoint residual for `a`. + +### 10. `holo_ep.py` self-test/debug main is broken by unreachable code and an undefined function + +Files/lines: +- `holo_ep.py:257-280` defines `holo_a_lockin` and returns. +- `holo_ep.py:281-290` contains unreachable code that looks like a missing `holo_grads` function body. +- `holo_ep.py:329-332` calls `holo_grads`, which is not defined. + +What is wrong: +Running `python holo_ep.py` as a diagnostic script will fail. + +Why it matters: +This does not affect `lt_ep_train.py` imports of `holo_a`, `holo_a_select2`, or `holo_a_track`, but it can break or mislead standalone estimator checks. + +Severity: DIAGNOSIS/probe only. + +Confidence: high. + +Minimal fix: +Move `holo_ep.py:281-290` into a real `def holo_grads(...)` or delete the stale self-test. + +## Checked And Found Correct + +- AEP correction sign: `lt_ep_train.py:171-178`, `holo_ep.py:181-185`, and `holo_ep.py:233-239` subtract `Jv - J^T v`, which is the correct sign for making the local differential dynamics use `J^T`. +- Two-phase contrast sign: `lt_ep_train.py:199-200` and `holo_ep.py:193-195` compute `(z_- - z_+) / (2 beta/r)`, which matches `lambda` solving `J^T lambda = -dL/dz`. +- Exact-adjoint probe sign: `asym_probe.py:443-445` solves `J^T lambda = -ell`, and `asym_probe.py:457-465` computes `lambda^T F_theta`. That is the correct implicit fixed-point gradient. +- Deterministic force consistency for thick mode: `lt_ep_train.py:81-85`, `lt_ep_train.py:102-106`, `holo_ep.py:134-152`, and `holo_ep.py:36-51` match on the real axis. Tiny CPU check with qknorm gave max `|tforce-rforce| = 1.19e-7` and max `|tforce-cforce.real| = 2.09e-7`. +- GELU consistency in current code: `lt_ep_train.py:84`, `lt_ep_train.py:96`, `lt_ep_train.py:105`, `holo_ep.py:32-33`, and `holo_ep.py:148` all use tanh-form GELU. Tiny CPU check found max difference from `F.gelu(..., approximate='tanh')` of `2.38e-7`. +- qknorm consistency in current thick paths: real attention uses qknorm at `lt_ep_train.py:63-65`, complex force at `holo_ep.py:44-46`, and real nudged force at `holo_ep.py:142-144`. +- Common-mode AEP anchor: `holo_ep.py:231-239` correctly computes `zbar`, duplicates it as the anchor, and applies the antisymmetric correction to `Z - zbar`. +- Block-parameter gradient scope: `lt_ep_train.py:203-205` computes `grad((a * f).sum(), blk.block)`, and `asym_probe.py:457-465` uses the same clamp-gradient path for token/position parameters. This is the right scope for force parameters. +- Readout-head gradient: `lt_ep_train.py:208-210` computes only the direct CE gradient for `Wh`. Since `Wh` is not in the force, there is no missing implicit force term and no double-counting. +- BPTT probe unroll: `asym_probe.py:819` uses `bptt_step`, which unrolls the same `blk.force` update as training at `lt_ep_train.py:260-265`. +- `no_grad` around JVP/VJP is not itself a bug. A tiny CPU check confirmed both `torch.func.jvp/vjp` and `torch.autograd.functional.jvp/vjp` still return derivatives inside surrounding `torch.no_grad()` blocks. + +## Highest-Value Next Static/CPU Checks + +1. Add a CPU-sized linearized test that compares `a_T` from `holo_a_track` against a direct solve of `J^T a = -ell` while sweeping `T2` and `hr`. Prediction: the 0.94-style error should move with `T2`, not `hr`. +2. In `asym_probe.py`, add a `--t2-sweep` diagnostic at fixed `hr`, and print `||a_t - a_{t-K}|| / ||a_t||` plus, if affordable, `||J^T a + ell|| / ||ell||`. +3. Re-run the existing beta/hr sweep only after proving the selected `a` is converged for each point. diff --git a/ep_run/adaptive_eps_calib.py b/ep_run/adaptive_eps_calib.py new file mode 100644 index 0000000..475e296 --- /dev/null +++ b/ep_run/adaptive_eps_calib.py @@ -0,0 +1,40 @@ +import torch, pickle, math +from pathlib import Path +import lt_ep_train as L +from lt_ep_train import EQBlock +L.DD=Path('data/tinystories_bpe'); L.vocab=pickle.load(open(L.DD/'meta.pkl','rb'))['vocab_size'] +dev='cuda'; B=8; T=256 +torch.manual_seed(1234); idx,y=L.get_batch('val',B,T); idx=idx.to(dev) if hasattr(idx,'to') else idx +ck=torch.load('runs/redx_traj/s3200.pt',map_location=dev) +def mkblk(): + blk=EQBlock(512,16,256,256,s=1.0,c=1.0,attn_mode='thick'); blk.qknorm=True + with torch.no_grad(): + for p,w in zip(blk.allp,ck['allp']): p.copy_(w.to(dev)) + return blk +def force_g(blk,z,xin): + f=blk.force(z,xin).detach(); return f, f.norm().item()/(z.norm().item()+1e-9) +def run(adaptive, e0=0.1, emin=0.005, emax=0.1, down=0.5, up=1.1, theta=0.98, N=10000): + blk=mkblk() + with torch.no_grad(): + xin=blk.embed(idx).detach(); z=xin.clone(); eps=e0; prev=None; gs=[]; eh=[] + for t in range(N): + f,g=force_g(blk,z,xin); gs.append(g); eh.append(eps) + if adaptive and prev is not None: + if g>theta*prev: eps=max(emin,eps*down) + elif g<0.9*prev: eps=min(emax,eps*up) + prev=g; z=z+eps*f + tail=gs[-500:] + return dict(gmin=min(tail), gmean=sum(tail)/len(tail), avg_eps=sum(eh)/len(eh), final_eps=eh[-1]) +print("=== adaptive-eps controller CALIBRATION on s3200 (cycling op) ===") +print("ground truth: fixed eps=0.1 cycles (g~0.23); fixed eps=0.01 converges (g~0.09)") +print("-- benchmarks (fixed eps) --") +for e in (0.1, 0.01): + r=run(False, e0=e, emin=e, emax=e); print(f" fixed eps={e}: g_tail[min={r['gmin']:.4f} mean={r['gmean']:.4f}]") +print("-- adaptive configs (want: g <= 0.01-benchmark, avg_eps as HIGH as possible = fewer effective steps) --") +for name,kw in [("C1 cons", dict(down=0.5,up=1.1,theta=0.98)), + ("C2 mod", dict(down=0.7,up=1.2,theta=0.98)), + ("C3 caut", dict(down=0.5,up=1.05,theta=0.99)), + ("C4 aggr", dict(down=0.6,up=1.3,theta=0.95))]: + r=run(True, **kw); + print(f" {name} {kw}: g_tail[min={r['gmin']:.4f} mean={r['gmean']:.4f}] avg_eps={r['avg_eps']:.4f} final_eps={r['final_eps']:.4f}") +print("=== DONE ===") diff --git a/ep_run/adaptive_eps_calib2.py b/ep_run/adaptive_eps_calib2.py new file mode 100644 index 0000000..ab78f7e --- /dev/null +++ b/ep_run/adaptive_eps_calib2.py @@ -0,0 +1,39 @@ +import torch, pickle +from pathlib import Path +import lt_ep_train as L +from lt_ep_train import EQBlock +L.DD=Path('data/tinystories_bpe'); L.vocab=pickle.load(open(L.DD/'meta.pkl','rb'))['vocab_size'] +dev='cuda'; B=8; T=256 +torch.manual_seed(1234); idx,y=L.get_batch('val',B,T); idx=idx.to(dev) if hasattr(idx,'to') else idx +CK={'s3200':torch.load('runs/redx_traj/s3200.pt',map_location=dev), + 's2000':torch.load('runs/redx_traj/s2000.pt',map_location=dev)} +def mkblk(name): + blk=EQBlock(512,16,256,256,s=1.0,c=1.0,attn_mode='thick'); blk.qknorm=True + with torch.no_grad(): + for p,w in zip(blk.allp,CK[name]['allp']): p.copy_(w.to(dev)) + return blk +def fg(blk,z,xin): + f=blk.force(z,xin).detach(); return f, f.norm().item()/(z.norm().item()+1e-9) +# corrected controller: shrink on OVERSHOOT (g rose), grow otherwise +def run(name, e0=0.05, emin=0.003, emax=0.1, up=1.05, down=0.7, tol=1.0, N=8000): + blk=mkblk(name) + with torch.no_grad(): + xin=blk.embed(idx).detach(); z=xin.clone(); eps=e0; prev=None; gs=[]; eh=[] + for t in range(N): + f,g=fg(blk,z,xin); gs.append(g); eh.append(eps) + if prev is not None: + if g > prev*tol: eps=max(emin, eps*down) # residual climbed -> eps too big + else: eps=min(emax, eps*up) # contracting -> grow for speed + prev=g; z=z+eps*f + tail=gs[-500:] + return dict(gmin=min(tail), gmean=sum(tail)/len(tail), avg_eps=sum(eh)/len(eh), final_eps=eh[-1]) +print("=== corrected adaptive-eps (shrink on OVERSHOOT) — calibrate on stiff + smooth ===") +print("target: s3200 converges (g~0.09) at avg_eps>0.005 (faster than naive); s2000 stays eps~0.1") +for name in ('s3200','s2000'): + print(f"-- {name} --") + for tag,kw in [("A up1.05 dn0.7", dict(up=1.05,down=0.7)), + ("B up1.1 dn0.5", dict(up=1.1,down=0.5)), + ("C up1.03 dn0.8 tol1.02", dict(up=1.03,down=0.8,tol=1.02))]: + r=run(name,**kw) + print(f" {tag}: g_tail[min={r['gmin']:.4f} mean={r['gmean']:.4f}] avg_eps={r['avg_eps']:.4f} final_eps={r['final_eps']:.4f}") +print("=== DONE ===") diff --git a/ep_run/alert.sh b/ep_run/alert.sh new file mode 100755 index 0000000..6fe5929 --- /dev/null +++ b/ep_run/alert.sh @@ -0,0 +1,13 @@ +#!/bin/bash +LOG=ep_run/runs/ep_resreg_warm.log +cd /home/yurenh2/ept +while true; do + sleep 900 + if [ -n "$(find "$LOG" -mmin +45 2>/dev/null)" ]; then echo "LOG STALE >45min (resreg_warm dead/stuck)"; break; fi + LAST=$(grep -E "val CE" "$LOG" | tail -1) + BEST=$(echo "$LAST" | grep -oE "best [0-9.]+" | grep -oE "[0-9.]+$") + EMA=$(echo "$LAST" | grep -oE "ema=[0-9.]+" | grep -oE "[0-9.]+$") + awk "BEGIN{exit !($BEST < 2.02)}" 2>/dev/null && { echo "NEW BEST <2.02 (full recovery + improvement): $LAST"; break; } + awk "BEGIN{exit !($EMA > 4.0)}" 2>/dev/null && { echo "RE-COLLAPSE ema>4: $LAST"; break; } +done +echo "FIRED: $LAST" diff --git a/ep_run/analogET_extracted.txt b/ep_run/analogET_extracted.txt new file mode 100644 index 0000000..b139640 --- /dev/null +++ b/ep_run/analogET_extracted.txt @@ -0,0 +1,1861 @@ + Dense Associative Memories with Analog Circuits + Marc Gong Bacvanski1 , Xincheng You2 , John Hopfield3 , and Dmitry Krotov4 + 1 + MIT + 2 + Independent Researcher + 3 + Princeton University + 4 + IBM Research + + December 16 2025 +arXiv:2512.15002v1 [cs.NE] 17 Dec 2025 + + + + + Abstract: The increasing computational demands of modern AI systems have exposed fundamental + limitations of digital hardware, driving interest in alternative paradigms for efficient large-scale inference. + Dense Associative Memory (DenseAM) is a family of models that offers a flexible framework for repre- + senting many contemporary neural architectures, such as transformers and diffusion models, by casting + them as dynamical systems evolving on an energy landscape. In this work, we propose a general method + for building analog accelerators for DenseAMs and implementing them using electronic RC circuits, cross- + bar arrays, and amplifiers. We find that our analog DenseAM hardware performs inference in constant + time independent of model size. This result highlights an asymptotic advantage of analog DenseAMs + over digital numerical solvers that scale at least linearly with the model size. We consider three settings + of progressively increasing complexity: XOR, the Hamming (7,4) code, and a simple language model + defined on binary variables. We propose analog implementations of these three models and analyze the + scaling of inference time, energy consumption, and hardware. Finally, we estimate lower bounds on the + achievable time constants imposed by amplifier specifications, suggesting that even conservative existing + analog technology can enable inference times on the order of tens to hundreds of nanoseconds. By har- + nessing the intrinsic parallelism and continuous-time operation of analog circuits, our DenseAM-based + accelerator design offers a new avenue for fast and scalable AI hardware. + + + 1 Introduction + The unprecedented growth of artificial intelligence (AI) has driven demand for increasingly large and + powerful models. At present, the field of generative AI is primarily driven by two settings: autore- + gressive transformers [1] and diffusion models [2]. While these settings have demonstrated remarkable + capabilities, they do so at a substantial computational cost. Their current implementations utilize digital + computation, which faces fundamental challenges in energy efficiency, scalability, and latency, especially + as model sizes and deployment demands continue to grow [3, 4, 5]. These limitations have prompted + interest in alternative computational paradigms that can efficiently handle the demands of modern AI + workloads [6]. + Dense Associative Memories (DenseAMs) [7, 8], a promising class of AI models which generalize + Hopfield networks [9], offer a new angle for tackling these problems. Unlike conventional feed-forward + models, DenseAM inference can be defined through the temporal evolution of a state vector that is + governed by a system of differential equations [10]. The state vector can be thought of as a particle + exploring the surface of a high-dimensional energy landscape, which is the Lyapunov function of these + dynamical equations. DenseAMs have been demonstrated to be flexible and expressive computational + frameworks, capable of representing many primitives of modern AI architectures, such as attention + mechanism [11], transformers [12], and diffusion models [13, 14, 15]. Furthermore, DenseAMs are error- + correcting systems [16], a property ensuring that small perturbations of the desired temporal evolution + of the state vector are corrected away by the dynamics of the network itself, rather than accumulated + in time. Finally, DenseAMs are asymptotically stable—during the course of temporal evolution the + computation happens during a finite transient period of time, which is followed by a steady state of + Code available at https://github.com/mbacvanski/AnalogET. + + + + 1 + neural activities. This asymptotic stabilization of dynamical trajectories removes the requirement to read +out the “answer” to the computation problem at a precise moment of time, making DenseAMs robust +to several classes of hardware imperfections. The confluence of the above properties makes DenseAMs +appealing networks for analog hardware implementations that, on the one hand, are grounded in the +physics of stable error-correcting dynamical systems and, on the other hand, are capable of representing +computation in state-of-the-art AI networks. + In 1989, Hopfield argued that analog neural hardware can exceed the efficiency of digital implemen- +tations when the device physics directly instantiate the computational dynamics of the model itself [17]. +Here, we revisit this idea with DenseAM models: we propose an analog circuit-based hardware accel- +erator design whose dynamics directly realize those of the DenseAM. We find that analog DenseAM +hardware enables constant-time inference independent of model size, which is in stark contrast to GPU +solvers and digital implementations. This intrinsic property makes DenseAM a natural fit for analog AI +accelerators, and it highlights our circuit architecture as a viable hardware path to realize them. Using +component specifications already demonstrated in fabricated devices, analog DenseAM hardware may +achieve inference times on the order of tens to hundreds of nanoseconds, several orders of magnitude +faster than digital systems. + By leveraging the natural dynamics of analog systems, this work establishes a new design of fast and +scalable AI accelerators. The framework of DenseAMs and their efficient analog hardware implementa- +tions suggest a pathway for fundamentally redesigning the hardware-software interface for AI, enabling +a new paradigm for fast, energy-efficient, and scalable computation. + + +2 Dense Associative Memory basics +The DenseAM framework [10, 18] provides a model that has straightforward neuronal dynamics, yet is +surprisingly expressive in its ability to represent AI models including transformer attention, diffusion +models, and associative memories. In its simplest form it is defined by two sets of neurons (typically +called visible and hidden neurons) and a system of coupled non-linear differential equations governing +their behavior, see Figure 1. The visible neurons are characterized by their internal states vi and their +outputs gi , index i = 1 . . . Nv ; while the hidden neurons have internal states hµ and outputs fµ , index +µ = 1 . . . Nh . From the AI perspective, one can think about internal state of the neuron as a pre-activation +of that neuron, and the output as a post-activation, which is obtained by applying an activation function +to the pre-activation. From the biological perspective, one can think about the internal state of the +neuron as a membrane voltage potential, and the output of that neuron as an axonal output, or a firing +rate of that neuron. This framework admits both neuron-wise activation functions (gi = g(vi ), where +g(·) is some continuous function, e.g., a ReLU), and collective activation functions such as softmax or +layer normalization, which depend on the states of multiple neurons. + The network parameters are stored in the synaptic weights ξ ∈ RNh ×Nv , whose matrix elements +denoted by ξµi can be either hand-engineered or learned. The time decay constants for the two groups +of neurons are τv and τh . With these conventions, the temporal evolution of the two groups of neurons +can be expressed as  Nh +  dvi X + τ = ξµi fµ + ai − vi +  +  v dt +  +  +  + µ=1 + (1) + Nv + dh +  + µ +  X + τh dt = ξµi gi + bµ − hµ +  +  +  + i=1 + +This forms a bipartite graph of neuronal connections, where the state of the hidden neurons is updated +by the state of the visible neurons, and vice versa. Importantly, the same matrix ξ appears in both +equations, once as ξ and again as ξ ⊤ . Although this is sometimes described as using “symmetric” +weights, ξ is not assumed to be symmetric in the linear-algebraic sense; it is simply the same matrix +used in both directions. Finally, ai and bµ denote biases, which are additional weights of the system and +whose values may be hard-coded or learned depending on the application. + The most important aspect of this model is the existence of a global energy function (Lyapunov +function) that describes neuronal dynamics. To demonstrate this, it is most convenient to use the +Lagrangian formalism [10, 18, 16]. Each set of neurons is defined through a Lagrangian function of their +internal states. The activation functions are defined as partial derivatives of that Lagrangian with respect +to internal states. The total energy is the sum of energies of each set of neurons, plus the interaction + + + + 2 + Figure 1: Top left: Bipartite neural network formulation, where hidden neurons hµ and visible neurons +vi are connected via symmetric synaptic weights ξ. Top right: Circuit realization of symmetric weight +matrix via resistive crossbar array. Each crosspoint encodes a weight ξµi by its resistance Rµi = 1/ξµi . +Lower right: Circuit schematic of a single hidden neuron. It drives its row of the crossbar array with +a voltage according to its activation fµ , and its internal dynamics are driven by the incoming current +flowing into it from the crossbar array. Lower left: Softmax activation function built from bipolar +junction transistors (some components not shown). + + +energy. The energy of each set of neurons is a Legendre transformation of the corresponding Lagrangian +(plus the term proportional to the bias). Thus, the global energy of Equation 1 is given by + Nv + X  Nh + X  Nh X + X Nv + E= gi (vi − ai ) − Lv + fµ (hµ − bµ ) − Lh − fµ ξµi gi (2) + i=1 µ=1 µ=1 i=1 + | {z } | {z } | {z } + energy of visible neurons energy of hidden neurons interaction energy + +where the activation functions are defined as partial derivatives of the Lagrangians + ∂Lv ∂Lh + gi = , fµ = + ∂vi ∂hµ +For convex Lagrangians this global energy decreases with time on the dynamical trajectories of Equa- +tion 1. If, additionally, the activation functions (and corresponding Lagrangians) are chosen in such a +way that this energy is bounded from below, the dynamical trajectories are guaranteed to arrive at a +stable fixed point of activations. The dynamical equations typically have many asymptotic fixed points, +which correspond to local minima of the energy function in Equation 2. Both properties above (convexity +of Lagrangians and lower-bounded energy) are satisfied for all settings studied in this paper. By picking +different nonlinear activation functions (or corresponding Lagrangians), this system yields a variety of +models that can describe associative memory, softmax attention, and other commonly used settings in +AI [10, 11, 18, 19, 20]. + A particularly relevant example for modern sequence modeling is the Energy Transformer (ET) [12], +which reformulates transformer’s inference pass as a gradient flow on an energy function defined over the + + + 3 + set of tokens. The ET block contains two contributions to the energy function: attention energy and the +Hopfield network. The energy attention module routes the information between the tokens, while the +Hopfield module aligns the tokens with the manifold of token embeddings. In our implementation, the +context tokens act as a set of dynamically instantiated memories that interact with the predicted token +through a DenseAM-like energy. In section 6 we exploit this connection to construct an Analog Energy +Transformer (Analog ET) whose continuous-time dynamics are implemented directly in hardware using +our DenseAM circuit primitives. + + +3 Related work +Early analog implementations of associative memories focused on the classical Hopfield network. Founda- +tional designs, such as continuous-time analog circuits [21, 22] and later demonstrations using amorphous- +silicon resistors [23], memristive devices [24, 25], and phase-change memories [26], targeted the quadratic +Hopfield energy function. These works emphasize device engineering and memory-cell design rather than +system-level dynamics, and inherit the limited storage capacity and representational power of traditional +Hopfield networks. That line of research is largely concerned with how to fabricate programmable re- +sistance elements themselves; our work assumes programmable conductances as a given primitive and +focuses on the continuous-time dynamics that operate on top of them. Our work also differs from these +works by addressing DenseAMs with higher-order energy functions and continuous-valued states. + Another direction is the use of cavity-QED systems for associative memory. Marsh et al. [27] analyze +a confocal cavity implementation of a quadratic Hopfield network and show that the cavity dynamics +induce a descent-like relaxation rule on spin states. Their model remains restricted to quadratic energies +and binary spins, and operates in a cryogenic, cavity-QED setting. Our work instead targets higher-order +DenseAMs with continuous states, and emphasizes scalable, room-temperature analog microelectronics +with explicit hardware-aware dynamical analysis. + More recent physical implementations move beyond purely quadratic energies. Musa et al. [28] +propose a free-space optical realization of the higher-order DenseAM energy. Their system constructs a +static physical representation of the energy landscape, but inference relies on an external digital controller +that performs iterative spin-flip updates. Thus, the hardware computes energies, while the optimization +dynamics remain digital. In contrast, our analog microelectronic architecture embeds the gradient flow +itself into hardware: inference is performed by a single continuous-time evolution rather than by discrete +digital updates. + + +4 DenseAM circuit design +Here, we introduce a novel architecture for a class of analog electronic hardware accelerators that models +Equation 1’s system of nonlinear differential equations using time evolution. Our DenseAM design +shown in Figure 1 is comprised of two sets of neurons that interact through a resistive crossbar array. +The resistive crossbar array turns voltage differences between neurons into currents flowing between the +neurons according to synaptic weights, and each neuron’s internal circuitry converts those currents into +dynamics that reproduce Equation 1. + +Resistive weights as a crossbar array. The crossbar array construction is a canonical design of +matrix-vector multiplication using analog electronics [17, 29], and is a natural fit for the weight matrix +ξ in our model. Traditionally, each crosspoint between a row and column line is connected by a resistor +(often memristors, RRAM, or other analog memories that produce resistances), a vector of input voltages +is applied at row lines, and the column lines are held at ground typically via a transimpedance amplifier. +By Ohm’s law, each resistive crosspoint produces a current that multiplies the row’s input voltage by +the inverse of the resistance. Because currents add along each column line, the total current output at a +column is the inner product between the vector of input voltages and the column’s conductance vector. +Thus, the array as a whole implements a parallel analog matrix multiplication of the form Iout = GVin , +where G is the matrix of conductances (inverse of resistances). + Unlike a traditional crossbar array whose rows are driven at a fixed voltage and whose columns +are held at ground, our DenseAM circuit design uses each weight bidirectionally, exactly representing +the bidirectional connections between visible and hidden neurons. As a result, the current flowing into +each neuron corresponds to the weighted sum of the differences P between visible and hidden neuron +activations. For example, for hidden neuron µ, this current is i ξµi (gi − fµ ). This construction enables + + + 4 + (1, 0) (1, 1) + 1 g3 0.4 + Neurons + Visible + + + + + Energy + 0.2 + 0 + + 1 f3 0.0 + Neurons + Hidden + + + + + (0, 0) (0, 1) + 0 0.4 + + + + + Energy + 0.5 + Energy + + + + + 0.2 + + 0.0 0.0 + 0.0 0.5 1.0 1.5 2.0 2.5 3.0 + 0 1 0 1 + Time (s) + v3 v3 + +Figure 2: Solving XOR with a DenseAM. Visible Figure 3: XOR energy landscape of neuron v3 un- +neuron g3 = v3 serves as the output, while the two der different settings of visible input neurons v1 and +input neurons (unlabeled, thin lines) are clamped v2 . Minima in the energy function correspond to +at 1 and 0 for True and False. Output v3 is initial- stationary points of the dynamics. Gradient flow +ized at 0.5 and converges to a positive prediction of dynamics bring v3 to these attractor points, result- +1. The activation of the hidden neuron f3 for the ing in correct XOR outputs. +truth-table row (1, 0, 1) becomes highly activated, +with others (fine lines) are suppressed by softmax. +Energy (2), or equivalently (5), decreases monoton- +ically along the inference trajectory. + + +weight symmetry to be enforced by hardware sharing: both forward and reverse weights are realized by +the same resistive elements. Importantly, as long as weights are represented as conductances, they must +be non-negative. + +Design of a single neuron. Each neuron in the circuit computes its dynamics by integrating the cur- +rents it receives from the crossbar array, which represent weighted differences between its own activation +and those of connected neurons. Considering a hidden neuron (the design for visible neurons is symmet- +ric by design), the neuron’s internal voltage hµ is stored on capacitor C1 and evolves in continuous time, +while the neuron’s activation fµ is obtained by passing hµ through a nonlinear function (e.g. ReLU or +softmax). + The current flowing into hidden neuron µ is produced by its interaction with all visible neurons via +the synaptic weights ξµi for P i = 1, . . . , Nv . Specifically, this is as a weighted sum of the differences +between neuron P activations: i ξµi (gi − fµ ). Inside each neuron, a “self” path scales fµ to produceP the +voltage sµ = fµ i ξµi . This term is added to the value of the incoming current so that the −fµ i ξµi +term is cancelled inside each neuron. As a result, the hidden state, represented as the voltage across +capacitor C1 , integrates only the desired weighted input plus any external stimulus bµ . Its dynamics +reduce to the canonical DenseAM form with a time constant of R2 C1 : + Nv + dhµ X + R2 C 1 = ξµi gi + bµ − hµ (3) + dt i=1 + +Elementwise (or vectorized) nonlinearities then produce activations gi = g(vi ) and fµ = f (hµ ) (e.g., +ReLU, softmax) across the visible and hidden neurons. See Appendix A for the full circuit derivation. + + +5 Analog DenseAM Examples +We begin by studying two examples of the proposed design: the XOR task, and the (7,4) error-correcting +Hamming code. + + + + + 5 + 5.1 XOR +The XOR problem is a canonical test for nonlinear representation and inference, as it cannot be solved +by any linear model. We show a minimal DenseAM model for the XOR task, illustrating how energy- +based dynamics can solve this simple task with a continuous-time analog system. The network consists +of Nv = 3 visible neurons, and Nh = 4 hidden neurons. At t = 0 visible neurons v1 and v2 are initialized +at their input values corresponding to the input bits. The last visible neuron v3 is initialized at v3 = 0.5. +The hidden neurons are initialized at zero. The two input visible neurons remain clamped during the +dynamics, while the third output visible neuron and the hidden neurons evolve in time according to (1). +Each row of the memory matrix ξ corresponds to a row of the XOR truth table. The visible neurons +use an identity activation function where gi = vi , and the hidden neurons use a softmax activation. The +biases are set as + N v + 1X 2 + ai = 0, bµ = − ξµi + 2 i=1 + + Figure 2 shows the temporal evolution of visible and hidden neuron activations, as well as the total +energy, during inference on the XOR input (1, 0). The output visible neuron’s activation g3 gradually +converges to the correct prediction of 1, while the hidden neuron associated with that memory, f3 , +becomes strongly activated and the remaining hidden neurons are suppressed by the softmax nonlinearity. +The system’s energy decreases monotonically throughout the trajectory and stabilizes once the network +reaches its fixed-point prediction. Figure 3 depicts the system’s energy landscape as a function of output +neuron v3 for different clamped input configurations (v1 , v2 ). In each case, the energy exhibits a clear +convex minimum at the correct XOR output, demonstrating that gradient flow along the energy surface +drives v3 reliably toward the correct prediction. As shown in Appendix C, we validate our circuit design +and dynamics using SPICE simulation. + τh → 0. Since the second equation in + To analyze this DenseAM, it is instructive to consider the limit P + Nh +(1) is linear in hidden units hµ , they can be integrated out. With µ=1 fµ = 1, the resulting dynamics +of the visible neurons can be written as + Nh Nv + dvi X  βX  + (ξµi − vi )2 +  + τv = ξµi − vi fµ where fµ = softmax − (4) + dt µ=1 + 2 i=1 + +The effective energy on the visible neurons can be written as + Nh Nv + 1 X h βX i + E eff (v) = − log exp − (ξµi − vi )2 (5) + β µ=1 + 2 i=1 + +Intuitively, each hidden neuron computes a squared Euclidean distance between the visible state and its +stored pattern ξ µ . The softmax nonlinearity assigns higher weight to the pattern closest to the current +state of the visible neurons. The resulting visible neuron dynamics are gradient flow for this effective +energy. It is important to note that memories in this implementation are represented by conductances +of the crossbar array, which are always positive. For this reason, matrix elements of memories ξµi must +be positive, necessitating the use of the bias terms, which are just voltage sources that can be arbitrarily +signed. + While a time constant of τh = 0 is impossible to physically construct due to finite conductances +and nonzero capacitances, setting τh ≪ τv realizes the same adiabatic limit in practice. When hidden +neurons evolve much faster than visible ones, they reach their steady state almost instantaneously for each +configuration of visible neurons. The result is an adiabatic elimination of hidden dynamics, yielding the +effective visible-only dynamics above. In practice, for the XOR task, even a relatively modest τh = τv /10 +ratio yields perfect performance. + +5.2 Hamming (7,4) code +The Hamming (7,4) code is an error-correcting code that encodes 4 data bits into a 7-bit codeword by +adding 3 parity bits. The resulting 7-bit strings are special: only certain patterns are valid codewords, +and they are spaced apart so that if a single bit is flipped, the error can be detected and corrected [30]. +Table 1 lists the 16 codewords corresponding to four arbitrary data bits. + + + 6 + 1 + g5 + Neurons + Visible + Data bits (d1 d2 d3 d4 ) Codeword (c1 c2 c3 c4 c5 c6 c7 ) + + 0 + 0000 0000000 + 0001 0001111 + 1 f7 0010 0010110 + Neurons + Hidden + + + + + 0011 0011001 + 0100 0100101 + 0 + 0101 0101010 + 0.5 0110 0110011 + Energy + + + + + 0111 0111100 + 1000 1000011 + 0.0 1001 1001100 + 0 1 2 3 4 5 + 1010 1010101 + Time (s) + 1011 1011010 + 1100 1100110 + 1101 1101001 +Figure 4: Correcting a bit error in a Hamming 1110 1110000 +(7,4) code. Visible neuron g5 flips indicating the 1111 1111111 +bit flip error happened on the 5th codeword bit. f7 +is the hidden neuron corresponding to the memory Table 1: Valid codewords of the Hamming(7,4) +of the correct codeword. Thin lines correspond to code, ordered by their 4-bit data payload. +the other neuron activations. + + + Unlike the XOR case where the only evolving neuron is the readout bit, the Hamming (7,4) code may +require flipping the value of any one of the visible neurons. During inference, the visible neurons are +initialized to the corrupted 7-bit input word. All neurons are left free to evolve, and the dynamics relax +the state toward the nearest stored codeword. Energy minima are located at the valid codewords, so the +network converges to the correct code provided the error is within the Hamming radius of 1. Thus, the +DenseAM replicates the standard decoding property of the Hamming (7,4) code: any single-bit flip is +corrected automatically. Figure 4 illustrates a case where a flipped bit g5 is restored during convergence. + The Hamming (7,4) model’s 7 visible neurons, each corresponding to a codeword bit, are connected +to 16 hidden neurons, each representing one valid codeword. The weight matrix ξ ∈ {0, 1}16×7 is formed +by stacking the 16 codewords as its rows. Visible neurons have the identity activation, hidden neurons +use a softmax activation, and biases are chosen as in the XOR case to give the same integrated-out +visible dynamics as (4). + + +6 Analog Energy Transformer (Analog ET) via DenseAM +Our DenseAM circuit construction can be used to build more complex energy-based models, such as +the transformer-like architecture proposed in the Energy Transformer paper [12]. For causal next-token +prediction with a single attention head, the Energy Transformer’s energy function can be written as the +following (See Appendix J for full derivation): +  ⊤ ⊤  ⊤ attn ⊤ hopf + E = 12 ∥v − a∥2 − v⊤ ξ attn f attn + ξ hopf f hopf + f attn − b + f hopf +   + h h −c + − Lattn hattn − Lhopf hhopf +   + (6) + +with the activation functions and their Lagrangians defined as + L + X + fAattn = softmax(βhattn )A , Lattn (h) = β1 log eβhA (7) + A=1 + M h + X i2 + fµhopf = ReLU(hhopf + µ ), Lhopf (h) = 21 ReLU(hµ ) (8) + µ=1 + +where a, b, and c correspond to the biases of the visible neurons, attention hidden neurons, and Hopfield +network hidden neurons, respectively. The L context tokens are indexed by A, and the M hidden neurons +of the Hopfield network are indexed by µ. Because the visible units use an identity activation function, + + + 7 + Figure 5: Analog ET circuit demonstrating the autoregressive inference procedure. A newly inferenced +token is decoded, sampled, and re-embedded to obtain the weight vector ξ attn + L+1 , which is set as the weight +vector for a new hidden neuron hattn + L+1 in the energy attention block (light gray on right). For this layout +we have flipped the crossbar array, so that indices A and µ run horizontally and index i runs vertically. + + +gi = vi using the languge of Equation 1, the gradient flow of the energy yields the dynamics: + ∂E ⊤  ⊤ + τv v̇ = − = ξ attn f attn + ξ hopf f hopf + a − v (9) + ∂v + ∂E + τh ḣattn + = − attn = ξattn v + b − hattn (10) + ∂f + ∂E + τh ḣhopf = − hopf = ξhopf v + c − hhopf (11) + ∂f +In this formulation, v represents the embedding of the output (next) token, and its evolution is driven by +two terms: one term from the energy attention with weights ξattn and hidden neuron activations f attn , +and one term from the Hopfield network with weights ξ hopf and hidden neuron activations f hopf . The +weights of the energy attention DenseAM are dependent on the context: for a token dimension D, context +length L, and the task of predicting the token at index L + 1, the weights ξ attn ∈ RL×D are generated +by embedding each token of the context via a learned embedding matrix applied to each context token. +In contrast, the Hopfield network weights ξ hopf are learned during training and fixed at inference. The +number of memories in the Hopfield network is a hyperparameter M , such that ξ hopf ∈ RM ×D . + This system suggests a hardware implementation where v interacts with two independent DenseAMs, +one for the energy attention and one for the Hopfield term, which can share the same physical crossbar +structure. Figure 5 shows that the circuit structure remains a crossbar array (like Figure 1), but with +two distinct classes of hidden neurons. Because of the summation of currents along each row of the +crossbar array, the incoming current to visible neuron vi is the sum of contributions from the energy +attention block and from the Hopfield network block. The energy attention hidden neurons hattn use a +softmax activation function, while the Hopfield network hidden neurons hhopf use a ReLU activation. + +6.1 Analog Energy Transformer on the parity task +We build and evaluate the Analog ET on the L-bit parity task, which can  + P be thought of as an elementary + L +“language model”: given bits bit1 , . . . , bitL , predict bitL+1 = A=1 bitA mod 2. Parity is instructive +because it requires a representation of a global, order-L interaction, precluding linear and shallow models +from representing it efficiently. A successful model must be able to form high-order interactions in order +to generalize. We formulate parity as a next-token prediction problem: given an L-bit string as context, +predict its parity in the next token. + We train the Analog ET model digitally using backpropagation through time [31] implemented with +Jax’s automatic differentiation. The resulting weights can be deployed onto the analog hardware; in + + + 8 + 11001010 0 01000110 1 + + 4 +Visible neurons + + + 2 + 0 + 1 +Prediction + + + + + 0 + 10 +Energy + + + + + 20 + 30 + 0.0 0.2 0.4 0.6 0.8 1.0 0.0 0.2 0.4 0.6 0.8 1.0 + Time t Time t +Figure 6: Inference of parity Analog ET on two example 8-bit strings. Top row plots the visible neurons vi +over time, middle row plots the decoded token prediction, bottom row plots the energy that monotonically +decreases during inference. After a transient period of computation, the network arrives at a steady- +state, making the result of the computation robust against the precise timing of the readout. + + +our experiments we simulate the dynamics of hardware with the Diffrax [32] ODE solver library. On +the 8-bit parity task, our model achieves 100% accuracy on the hold-out validation set of 52 bit strings, +demonstrating clear generalization capabilities. See Appendix H.1 for more details on training and model +design. + Figure 6 shows the dynamics of the visible neurons and energy during two example inference runs +of the Analog ET. Notably, the visible neuron values are constant by the end of the inference period, +meaning that the inference remains highly stable to mismatch and delay in timing during readout. A +single sample-and-hold and switching circuit would enable a single Analog-Digital Converter (ADC) to +read out all the visible neurons at convergence, significantly reducing mismatch, and drastically saving +device area, complexity, and energy. The intrinsic stability of attractor points arises uniquely from +the continuous-time dynamics of the DenseAM, making these models particularly well suited to analog +hardware. + +6.2 Autoregressive inference +Dashed lines in Figure 5 illustrate the autoregressive inference procedure of the Analog ET. To generate +the L-th token given context tokens x(1) , . . . , x(L−1) , each token is first embedded and concatenated to +form the attention weight matrix +  (1)  + e +  e(2)  + ξ attn,(L−1) =  .  ∈ R(L−1)×D +   +  ..  + e(L−1) + +These rows are loaded into the Analog ET’s energy attention weight matrix ξ attn by programming the +corresponding crossbar resistances. During inference, the visible state v(t) evolves according to the +Analog ET dynamics until convergence. A decoder readout (e.g., a linear layer) applied to the converged +v(t = T ) values produces logits, from which the next token x(L) is sampled. This token is then embedded +to form e(L) , and appended to the existing context. The cycle repeats with the updated attention weight + + + 9 + matrix +  attn,(L−1)  + ξ + ξ attn,(L) = ∈ RL×D + e(L) + +which now includes the new embedding e(L) . In hardware, this corresponds to connecting an additional +hidden neuron in the energy attention block of Figure 5, and setting its resistive weights with e(L) . +Because the physical order of hidden neurons does not affect the energy function, this new neuron can +be placed in any position among the hidden neurons. When the context length is fixed, the hidden +neuron corresponding to the earliest token can simply be reprogrammed with the new vector of weights +e(L) , resulting in the hardware equivalent of a sliding-window context. In practice, an external digital +controller, e.g., an Field-Programmable Gate Array (FPGA) or Application-Specific Integrated Circuit +(ASIC) would orchestrate crossbar programming and token decoding, while the DenseAM dynamics +perform the far more substantial workload of computing each next-token embedding. + This procedure is analogous to key-value (KV) caching in standard transformer inference [33]. Context +tokens x(1) , . . . , x(L−1) produce key and value vectors k(1) , . . . , k(L−1) and v(1) , . . . , v(L−1) respectively. +When new token x(L) is generated, its corresponding k(L) and v(L) vectors are appended to the cache, +allowing all previous k(