From b83947778e2c776f757a07d4719b7ce961d7ed55 Mon Sep 17 00:00:00 2001 From: Yuren Hao Date: Fri, 3 Jul 2026 05:56:50 -0500 Subject: =?UTF-8?q?Initial=20commit:=20ept=20=E2=80=94=20backprop-free=20e?= =?UTF-8?q?quilibrium=20transformer=20(EP)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Code (ep_run/), organized docs (docs/{method,campaign,hardware,outreach,paper}), analysis scripts (scripts/), ONBOARDING.md entry point. Large data/checkpoints git-ignored (share separately). Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_014FAPDWQ49M5Ye3NpTndTpn --- .gitignore | 13 + ONBOARDING.md | 95 + assets/ept_method_intro.tex | 606 ++++ assets/frozen_vs_adaptive.png | Bin 0 -> 217748 bytes docs/COLAB_50M.md | 117 + docs/campaign/C512_PLATEAU_CAMPAIGN.md | 159 + docs/campaign/C512_ROUND2_ABCD.md | 118 + docs/campaign/EP_BELOW210_DIAGNOSIS_FIX.md | 260 ++ docs/campaign/FINDINGS.md | 609 ++++ ...SESSION_2026-06-24_HOPF_DIAGNOSIS_RESREG_FIX.md | 81 + docs/hardware/COLLABORATOR_BRIEF.md | 46 + docs/hardware/FUGU_CODEX_PHYSICS_ANSWER.md | 125 + docs/hardware/GPT_PRO_PHYSICS_ANSWER.md | 46 + docs/hardware/HW_RESEARCH_FINDINGS.md | 98 + .../PHYSICS_QUESTIONS_FOR_DEEP_REASONING.md | 65 + docs/hardware/SCALING_AND_HARDWARE_PLAN.md | 58 + docs/method/ARCHITECTURE.md | 117 + docs/method/EP_DERIVATION.md | 215 ++ docs/method/METHODS.md | 576 ++++ docs/method/READING.md | 58 + docs/method/READING_EN.md | 54 + docs/outreach/EMAIL_DRAFT_BEN.md | 42 + docs/outreach/OUTREACH_TARGETS.md | 199 ++ docs/outreach/SCELLIER_OUTREACH.md | 62 + docs/paper/PAPER_A_OUTLINE.md | 57 + ep_run/CODEX_VERDICT.md | 151 + ep_run/EP_DIAGNOSIS_DOSSIER.md | 99 + ep_run/FUGU_OPTIONS_VERDICT.md | 263 ++ ep_run/FUGU_Q1_VERDICT.md | 123 + ep_run/FUGU_Q_OPTIONS.md | 29 + ep_run/FUGU_VERDICT_FULL.md | 160 + ep_run/GPT55_BUG_HUNT.md | 249 ++ ep_run/adaptive_eps_calib.py | 40 + ep_run/adaptive_eps_calib2.py | 39 + ep_run/alert.sh | 13 + 
ep_run/analogET_extracted.txt | 1861 +++++++++++ ep_run/analyze.py | 95 + ep_run/analyze_all.py | 86 + ep_run/analyze_ln_jacobian.py | 166 + ep_run/analyze_softmax_jacobian.py | 168 + ep_run/anderson_control.py | 67 + ep_run/asym_probe.py | 922 ++++++ ep_run/auto_probe.py | 25 + ep_run/bench_gpu2.py | 33 + ep_run/bf16_dbg.py | 29 + ep_run/bf16_dbg2.py | 30 + ep_run/bias_var.py | 63 + ep_run/bp_charlm.py | 78 + ep_run/compile_bench.py | 44 + ep_run/cos_monitor.py | 49 + ep_run/cos_sweep.log | 15 + ep_run/cos_sweep.py | 35 + ep_run/data_prep.log | 4 + ep_run/diag_cos.py | 45 + ep_run/drift_diag.py | 87 + ep_run/eig_control.py | 50 + ep_run/eig_jacreg.py | 38 + ep_run/eig_probe.py | 43 + ep_run/ep_ajr_check.py | 22 + ep_run/ep_c3_watch.py | 19 + ep_run/ep_c_check.py | 22 + ep_run/ep_eps05_grid.py | 29 + ep_run/ep_eps05_track.py | 24 + ep_run/ep_eps05_track2.py | 26 + ep_run/ep_eps05_watch.py | 20 + ep_run/ep_fast_check.py | 23 + ep_run/ep_fast_timing.py | 20 + ep_run/ep_jacreg_binary.py | 22 + ep_run/ep_jacreg_grid.py | 27 + ep_run/ep_jacreg_spike.py | 26 + ep_run/ep_resreg_check.py | 23 + ep_run/ep_resreg_grid.py | 24 + ep_run/ep_rr_check.py | 22 + ep_run/ep_sn_monitor.py | 43 + ep_run/ep_t2fix_watch.py | 20 + ep_run/epmc.json | 1 + ep_run/eps_sweep_s3200.py | 29 + ep_run/eval_relax_s3200.py | 23 + ep_run/extracted_paper.txt | 2039 ++++++++++++ ep_run/factorized_exit.py | 330 ++ ep_run/fast_probe.py | 41 + ep_run/gcalib.py | 38 + ep_run/gen_ept.py | 32 + ep_run/grad_quality.py | 64 + ep_run/holo_ep.py | 332 ++ ep_run/jnc_scaling.py | 46 + ep_run/knockout_s3200.py | 27 + ep_run/local_layers.py | 305 ++ ep_run/lt_ep_anderson.py | 54 + ep_run/lt_ep_attention.py | 129 + ep_run/lt_ep_compare.py | 69 + ep_run/lt_ep_diag.py | 57 + ep_run/lt_ep_ffn.py | 119 + ep_run/lt_ep_stack.py | 165 + ep_run/lt_ep_train.py | 630 ++++ ep_run/mdpi_paper.html | 10 + ep_run/model.py | 156 + ep_run/model_local.py | 470 +++ ep_run/oracle_adjoint_train.py | 368 +++ ep_run/prepare_tinystories.py 
| 40 + ep_run/prepare_tinystories_bpe.py | 49 + ep_run/probe_geometry.py | 162 + ep_run/profile_ep.log | 21 + ep_run/profile_ep.py | 40 + ep_run/ra_mlp.py | 287 ++ ep_run/rearm_203.sh | 8 + ep_run/redx_freezer.py | 19 + ep_run/redx_freezer2.py | 21 + ep_run/redx_trajprobe.py | 47 + ep_run/resreg_probe.py | 62 + ep_run/resreg_warm_probe_loop.py | 49 + ep_run/sample_eq.py | 70 + ep_run/scurria_nonconservative.txt | 1708 ++++++++++ ep_run/solver_wall.py | 61 + ep_run/spec_bifurcation.py | 38 + ep_run/spec_check.py | 15 + ep_run/spec_rho_vs_c.py | 29 + ep_run/speed_probe.py | 63 + ep_run/stiefel_feedback.py | 126 + ep_run/t2fix_freezer.py | 21 + ep_run/t2fix_rho_prober.py | 52 + ep_run/test_aselect_deepdive.py | 323 ++ ep_run/test_compile_aselect.py | 23 + ep_run/track_probe.py | 77 + ep_run/train.py | 183 ++ ep_run/train_local.py | 300 ++ ep_run/train_local_ce.py | 580 ++++ ep_run/train_recon.py | 322 ++ ep_run/train_stiefel.py | 211 ++ ep_run/verify_aep_manual.py | 62 + ep_run/watch_all.sh | 15 + ep_run/watch_clean.py | 36 + ep_run/watch_contraction.py | 41 + ep_run/watch_hr.py | 33 + ep_run/watch_runs.py | 39 + refs/fre_rnn_full.txt | 1945 ++++++++++++ refs/hw_groups_claims.json | 103 + refs/hw_research_claims.json | 134 + refs/hw_research_claims2.json | 85 + refs/paper_2603.12934.txt | 2039 ++++++++++++ refs/scurria_2602.03670v2.txt | 3311 ++++++++++++++++++++ scripts/aep_attention.py | 157 + scripts/aep_characterize.py | 157 + scripts/aep_contractive.py | 52 + scripts/aep_contractive2.py | 41 + scripts/aep_depth.py | 30 + scripts/aep_option1.py | 115 + scripts/aep_projected.py | 125 + scripts/ask_fugu.py | 24 + scripts/bp_transformer.py | 141 + scripts/cet_aep.py | 272 ++ scripts/cet_mvp.py | 372 +++ scripts/plot_jr_cmp.py | 20 + 153 files changed, 29377 insertions(+) create mode 100644 .gitignore create mode 100644 ONBOARDING.md create mode 100644 assets/ept_method_intro.tex create mode 100644 assets/frozen_vs_adaptive.png create mode 100644 docs/COLAB_50M.md 
create mode 100644 docs/campaign/C512_PLATEAU_CAMPAIGN.md create mode 100644 docs/campaign/C512_ROUND2_ABCD.md create mode 100644 docs/campaign/EP_BELOW210_DIAGNOSIS_FIX.md create mode 100644 docs/campaign/FINDINGS.md create mode 100644 docs/campaign/SESSION_2026-06-24_HOPF_DIAGNOSIS_RESREG_FIX.md create mode 100644 docs/hardware/COLLABORATOR_BRIEF.md create mode 100644 docs/hardware/FUGU_CODEX_PHYSICS_ANSWER.md create mode 100644 docs/hardware/GPT_PRO_PHYSICS_ANSWER.md create mode 100644 docs/hardware/HW_RESEARCH_FINDINGS.md create mode 100644 docs/hardware/PHYSICS_QUESTIONS_FOR_DEEP_REASONING.md create mode 100644 docs/hardware/SCALING_AND_HARDWARE_PLAN.md create mode 100644 docs/method/ARCHITECTURE.md create mode 100644 docs/method/EP_DERIVATION.md create mode 100644 docs/method/METHODS.md create mode 100644 docs/method/READING.md create mode 100644 docs/method/READING_EN.md create mode 100644 docs/outreach/EMAIL_DRAFT_BEN.md create mode 100644 docs/outreach/OUTREACH_TARGETS.md create mode 100644 docs/outreach/SCELLIER_OUTREACH.md create mode 100644 docs/paper/PAPER_A_OUTLINE.md create mode 100644 ep_run/CODEX_VERDICT.md create mode 100644 ep_run/EP_DIAGNOSIS_DOSSIER.md create mode 100644 ep_run/FUGU_OPTIONS_VERDICT.md create mode 100644 ep_run/FUGU_Q1_VERDICT.md create mode 100644 ep_run/FUGU_Q_OPTIONS.md create mode 100644 ep_run/FUGU_VERDICT_FULL.md create mode 100644 ep_run/GPT55_BUG_HUNT.md create mode 100644 ep_run/adaptive_eps_calib.py create mode 100644 ep_run/adaptive_eps_calib2.py create mode 100755 ep_run/alert.sh create mode 100644 ep_run/analogET_extracted.txt create mode 100644 ep_run/analyze.py create mode 100644 ep_run/analyze_all.py create mode 100644 ep_run/analyze_ln_jacobian.py create mode 100644 ep_run/analyze_softmax_jacobian.py create mode 100644 ep_run/anderson_control.py create mode 100644 ep_run/asym_probe.py create mode 100644 ep_run/auto_probe.py create mode 100644 ep_run/bench_gpu2.py create mode 100644 ep_run/bf16_dbg.py create mode 
100644 ep_run/bf16_dbg2.py create mode 100644 ep_run/bias_var.py create mode 100644 ep_run/bp_charlm.py create mode 100644 ep_run/compile_bench.py create mode 100644 ep_run/cos_monitor.py create mode 100644 ep_run/cos_sweep.log create mode 100644 ep_run/cos_sweep.py create mode 100644 ep_run/data_prep.log create mode 100644 ep_run/diag_cos.py create mode 100644 ep_run/drift_diag.py create mode 100644 ep_run/eig_control.py create mode 100644 ep_run/eig_jacreg.py create mode 100644 ep_run/eig_probe.py create mode 100644 ep_run/ep_ajr_check.py create mode 100644 ep_run/ep_c3_watch.py create mode 100644 ep_run/ep_c_check.py create mode 100644 ep_run/ep_eps05_grid.py create mode 100644 ep_run/ep_eps05_track.py create mode 100644 ep_run/ep_eps05_track2.py create mode 100644 ep_run/ep_eps05_watch.py create mode 100644 ep_run/ep_fast_check.py create mode 100644 ep_run/ep_fast_timing.py create mode 100644 ep_run/ep_jacreg_binary.py create mode 100644 ep_run/ep_jacreg_grid.py create mode 100644 ep_run/ep_jacreg_spike.py create mode 100644 ep_run/ep_resreg_check.py create mode 100644 ep_run/ep_resreg_grid.py create mode 100644 ep_run/ep_rr_check.py create mode 100644 ep_run/ep_sn_monitor.py create mode 100644 ep_run/ep_t2fix_watch.py create mode 100644 ep_run/epmc.json create mode 100644 ep_run/eps_sweep_s3200.py create mode 100644 ep_run/eval_relax_s3200.py create mode 100644 ep_run/extracted_paper.txt create mode 100644 ep_run/factorized_exit.py create mode 100644 ep_run/fast_probe.py create mode 100644 ep_run/gcalib.py create mode 100644 ep_run/gen_ept.py create mode 100644 ep_run/grad_quality.py create mode 100644 ep_run/holo_ep.py create mode 100644 ep_run/jnc_scaling.py create mode 100644 ep_run/knockout_s3200.py create mode 100644 ep_run/local_layers.py create mode 100644 ep_run/lt_ep_anderson.py create mode 100644 ep_run/lt_ep_attention.py create mode 100644 ep_run/lt_ep_compare.py create mode 100644 ep_run/lt_ep_diag.py create mode 100644 ep_run/lt_ep_ffn.py create 
mode 100644 ep_run/lt_ep_stack.py create mode 100644 ep_run/lt_ep_train.py create mode 100644 ep_run/mdpi_paper.html create mode 100644 ep_run/model.py create mode 100644 ep_run/model_local.py create mode 100644 ep_run/oracle_adjoint_train.py create mode 100644 ep_run/prepare_tinystories.py create mode 100644 ep_run/prepare_tinystories_bpe.py create mode 100644 ep_run/probe_geometry.py create mode 100644 ep_run/profile_ep.log create mode 100644 ep_run/profile_ep.py create mode 100644 ep_run/ra_mlp.py create mode 100644 ep_run/rearm_203.sh create mode 100644 ep_run/redx_freezer.py create mode 100644 ep_run/redx_freezer2.py create mode 100644 ep_run/redx_trajprobe.py create mode 100644 ep_run/resreg_probe.py create mode 100644 ep_run/resreg_warm_probe_loop.py create mode 100644 ep_run/sample_eq.py create mode 100644 ep_run/scurria_nonconservative.txt create mode 100644 ep_run/solver_wall.py create mode 100644 ep_run/spec_bifurcation.py create mode 100644 ep_run/spec_check.py create mode 100644 ep_run/spec_rho_vs_c.py create mode 100644 ep_run/speed_probe.py create mode 100644 ep_run/stiefel_feedback.py create mode 100644 ep_run/t2fix_freezer.py create mode 100644 ep_run/t2fix_rho_prober.py create mode 100644 ep_run/test_aselect_deepdive.py create mode 100644 ep_run/test_compile_aselect.py create mode 100644 ep_run/track_probe.py create mode 100644 ep_run/train.py create mode 100644 ep_run/train_local.py create mode 100644 ep_run/train_local_ce.py create mode 100644 ep_run/train_recon.py create mode 100644 ep_run/train_stiefel.py create mode 100644 ep_run/verify_aep_manual.py create mode 100755 ep_run/watch_all.sh create mode 100644 ep_run/watch_clean.py create mode 100644 ep_run/watch_contraction.py create mode 100644 ep_run/watch_hr.py create mode 100644 ep_run/watch_runs.py create mode 100644 refs/fre_rnn_full.txt create mode 100644 refs/hw_groups_claims.json create mode 100644 refs/hw_research_claims.json create mode 100644 refs/hw_research_claims2.json create 
mode 100644 refs/paper_2603.12934.txt create mode 100644 refs/scurria_2602.03670v2.txt create mode 100644 scripts/aep_attention.py create mode 100644 scripts/aep_characterize.py create mode 100644 scripts/aep_contractive.py create mode 100644 scripts/aep_contractive2.py create mode 100644 scripts/aep_depth.py create mode 100644 scripts/aep_option1.py create mode 100644 scripts/aep_projected.py create mode 100644 scripts/ask_fugu.py create mode 100644 scripts/bp_transformer.py create mode 100644 scripts/cet_aep.py create mode 100644 scripts/cet_mvp.py create mode 100644 scripts/plot_jr_cmp.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d648a33 --- /dev/null +++ b/.gitignore @@ -0,0 +1,13 @@ +# large data & checkpoints — share separately (HF / drive), never in git +ep_run/data/ +ep_run/runs/ +runs/ +archive/ +assets/*.pdf +*.pt +*.state +*.bin +*.tar.gz +__pycache__/ +*.pyc +.ipynb_checkpoints/ diff --git a/ONBOARDING.md b/ONBOARDING.md new file mode 100644 index 0000000..dfa758b --- /dev/null +++ b/ONBOARDING.md @@ -0,0 +1,95 @@ +# ept — onboarding for a new collaborator (algorithm / experiments side) + +*Single entry point, kept current. Deeper docs linked at the bottom. — 2026-07-03* + +## 1. What this project is (in three sentences) +We train a **transformer as a fixed-point (equilibrium) system with Equilibrium Propagation (EP) — no +backpropagation**. The forward pass is a damped relaxation `z ← z + ε·F(z)` that settles to a fixed point +`z*`; the weight update is **local**, computed from the contrast between a *free* settle and a slightly +*nudged* settle (no backward pass, no stored activations, no weight transport). This is exactly the +computation analog in-memory hardware does natively — so the north star is a **backprop-free, on-chip-trainable +path to language models**, with the GPU work here de-risking the algorithm before hardware. + +## 2. 
Architecture (current — the `thick` block in `ep_run/lt_ep_train.py`) +One block = one dynamical system on the token state `z ∈ R^{B,T,C}`: +``` +F(z) = −(z − x_in) + Attn(LN(z)) + FFN(LN(z)) − c·z + └ input clamp ┘ └ causal softmax self-attn ┘ └ untied 4×GELU FFN ┘ └ damping (contraction) ┘ +``` +- `x_in = tok[idx] + pos`, clamped as a boundary condition. Forward = relax T1 steps → `z*`; readout `logits = z*·W_h`. +- **This is a standard pre-LN transformer block run as a Deep-Equilibrium fixed point**, with a `−c·z` leak for contraction. +- It is **non-conservative**: the attention (untied Q/K/V/O) is not the gradient of any energy → the Jacobian is + non-normal (we measure `|Jv−Jᵀv|/|Jv| ≈ 1.4`). *That non-conservativity is the source of both the expressivity and + the central difficulty (below).* (An older energy-formulation variant is in `docs/method/ARCHITECTURE.md`, now superseded.) +- **EP training loop** (`ep_step`): free settle → *nudged* settle (output pulled toward the target by `β`) with an + **AsymEP correction** (an antisymmetric-Jacobian term that makes the gradient *exact* for the non-conservative + operator); the update is the state-difference contracted with `∂F/∂θ`. The nudged phase uses a fast + adaptive-`T2` "holo a-select" estimator (`holo_ep.py`). + +## 3. Where we are (results, C512, TinyStories-BPE) +- **EP gradient ≈ exact BPTT gradient** (cosine ≈ 0.92–0.99 per component when the free phase is converged) — the + learning rule is validated, not approximate. +- **Best val cross-entropy 1.9313** (vs a same-parameter BP transformer ~1.79); generates coherent children's stories. +- The recipe **trains stably and matches/approaches BP** at this scale. Model = C512 / H16 / T256, damped DEQ block. +- ⚠️ **The 1.93 number is warm-started** from a stable early checkpoint (`s2000`); a single from-scratch run currently + plateaus at **~2.10** (see §5, the crux). + +## 4. 
The one hard problem (and the paper it spun off) +The binding constraint is **NOT the gradient** — it's **forward fixed-point STABILITY during training**. As +optimization makes attention more expressive/non-conservative, the operator loses contraction, a complex-eigenvalue +pair of its Jacobian crosses the imaginary axis (**a supercritical Hopf bifurcation**), the relaxation stops +converging (residual → 0.1+), and training breaks. Controls that hold it: **`resreg`** (penalize the T1 residual), +**`jacreg`** (penalize the Jacobian norm), and the new **`eigreg`** (leading-abscissa / log-norm control, §5). +> This stability question generalized into a **standalone paper** — *"Dynamics and Convergence of Equilibrium +> Learning"* (the report we shared with Ben Scellier is that spin-off, in `/home/yurenh2/aep-dynamics/`): the Hopf bifurcation + +> a leading-spectral-signal cure, shown across MLP/CNN/RNN and across learning rules (EP and DEQ/RBP). ept is the +> language-model-scale instance of the same phenomenon. + +## 5. Open problems — where you can plug in (ranked) +1. **★ Crack from-scratch below 2.0 (the crux).** We *ultimately need* from-scratch (no magic warm checkpoint) for a + real / hardware result. Diagnosis (via the new `--fingerprint`): the warm source `s2000` is a **deeply contractive** + operator (numerical abscissa −10) with a well-aligned EP gradient; a from-scratch plateau operator sits **near the + Hopf boundary** (abscissa +1.11) with a modestly worse gradient — and *training drifts the operator toward the + boundary as it learns* (val 3.16→2.24 tracks abscissa −10→+1.11). **Hypothesis to test:** hold the operator + deeply-contractive from scratch with `--eigreg` (leading-abscissa control) → crack the plateau without a warm start. + Tools are built and default-off: `diag_cos.py` (`--diag_cos N`, `--fingerprint`), `eig_control.py` (`--eigreg`). +2. **Scaling** to hundreds-of-M / small-LLM (gated on cloud compute — a Scellier/AWS path is in progress). +3. 
**Speed** (`ep_run/profile_ep.py`, `cos_sweep.py`): the holo a-select is ~56% of the step; `t2sel` is a + cosine-preserving speed lever (160→80 ≈ 1.8× free); multi-GPU data-parallel EP is untried. +4. **Analog realism**: device noise / low-bit quantization / asymmetric update in the simulator (not yet added; the + Yu-Neng Wang hardware conversation is about exactly this device model). + +## 6. Codebase map (`ep_run/`) +- **`lt_ep_train.py`** — everything: the block, `ep_step` (EP training), `bptt_step` (exact-gradient control), + `relax`, `evaluate`, the residual/jacreg controllers, the training loop. The one file to read first. +- **`holo_ep.py`** — the adaptive-T2 nudged-phase estimator (`holo_a_select`, `holo_a_track`). +- **`diag_cos.py`** (new) — `cos(EP, BPTT)` trajectory + operator `fingerprint` (res / cos / numerical-abscissa / val). +- **`eig_control.py`** (new) — the `--eigreg` leading-abscissa control (power-iteration, scalable, analog-compatible). +- `eig_probe.py`, `cos_sweep.py`, `profile_ep.py`, `../scripts/bp_transformer.py` (BP baseline; lives in `scripts/`, not `ep_run/`) — probes / baselines. +- `data/` (TinyStories-BPE, ~712M) and `runs/` (~8G checkpoints) — **git-ignored; get these separately.** + +## 7. How to run +Canonical C512 recipe (one block, EP, ~holo fast path): +``` +python3 lt_ep_train.py --mode ep --attn_mode thick --B 24 --C 512 --H 16 --T 256 --c 1.0 \ + --jacreg 0.1 --jr_floor 0.1 --jr_max 16 --holo 2 --hr 0.02 --t2sel 80 --track --pema 0.999 \ + --t1max 150 --res_est 1e-4 --resreg 0.2 --qknorm --T1 150 --T2 20 --lr 6e-4 --wsd 0.25 \ + --steps 32000 --data data/tinystories_bpe --ckpt runs/myrun.pt --state runs/myrun.state +``` +Diagnostics: add `--diag_cos 500` (log cos-to-BPTT over training) · `--init_ckpt --fingerprint` (print an +operator's 4-D fingerprint) · `--eigreg 0.1 --eig_margin 1.0` (leading-abscissa control, alt to `--jacreg`). +BP baseline (fair control): `--mode bptt`. **All experiment processes must use `nohup`.** + +## 8. 
Deeper docs (organized under `docs/`) +- **`docs/method/`** — `METHODS.md`, `EP_DERIVATION.md` (the EP/AsymEP gradient derivation), `ARCHITECTURE.md` + (implementation detail; older energy-formulation, partly superseded by §2 above), `READING.md`. +- **`docs/campaign/`** — `FINDINGS.md` (running log of what worked / didn't) + the full plateau history: + `C512_PLATEAU_CAMPAIGN.md`, `C512_ROUND2_ABCD.md`, `EP_BELOW210_DIAGNOSIS_FIX.md`, + `SESSION_2026-06-24_HOPF_DIAGNOSIS_RESREG_FIX.md` (the Hopf diagnosis + resreg fix). +- **`docs/hardware/`** — `SCALING_AND_HARDWARE_PLAN.md` (scaling + analog end goal), `COLLABORATOR_BRIEF.md` + (hardware-collaborator one-pager), `HW_RESEARCH_FINDINGS.md`, the physics Q&A docs. +- **`docs/outreach/`** — `OUTREACH_TARGETS.md`, `SCELLIER_OUTREACH.md`, `EMAIL_DRAFT_BEN.md`. +- **`docs/paper/PAPER_A_OUTLINE.md`** — the ept paper outline. The dynamics spin-off lives in `../aep-dynamics/`. + +*Repo layout:* `ep_run/` = code (start at `lt_ep_train.py`) · `docs/` = the above · `scripts/` = standalone +analysis/probe scripts · `assets/` = PDFs/figures · `refs/` = external paper texts · `archive/` = stale snapshots. 
diff --git a/assets/ept_method_intro.tex b/assets/ept_method_intro.tex new file mode 100644 index 0000000..31d458d --- /dev/null +++ b/assets/ept_method_intro.tex @@ -0,0 +1,606 @@ +\documentclass[11pt]{article} + +\usepackage[margin=1in]{geometry} +\usepackage{amsmath,amssymb} +\usepackage{bm} +\usepackage[round]{natbib} +\usepackage{enumitem} +\usepackage{booktabs} +\usepackage[colorlinks=true,linkcolor=blue,citecolor=blue,urlcolor=blue]{hyperref} + +% --- light-weight notation --------------------------------------------------- +\newcommand{\R}{\mathbb{R}} +\newcommand{\C}{\mathbb{C}} +\renewcommand{\Re}{\operatorname{Re}} +\newcommand{\xin}{x_{\mathrm{in}}} +\newcommand{\zstar}{z^{\ast}} +\newcommand{\zbar}{\bar{z}} +\newcommand{\Fnc}{F_{\mathrm{nc}}} +\newcommand{\Jnc}{J_{\mathrm{nc}}} +\newcommand{\half}{\tfrac12} +\newcommand{\grad}{\nabla} +\newcommand{\dd}{\,\mathrm{d}} +\newcommand{\inner}[2]{\langle #1,\, #2\rangle} +\DeclareMathOperator{\Attn}{Attn} +\DeclareMathOperator{\FFN}{FFN} +\DeclareMathOperator{\softmax}{softmax} +\DeclareMathOperator{\LSE}{LSE} +\DeclareMathOperator{\LN}{LN} +\DeclareMathOperator{\jvp}{jvp} +\DeclareMathOperator{\vjp}{vjp} +\DeclareMathOperator*{\argmin}{arg\,min} + +\title{\bf Training a Transformer Language Model with Equilibrium Propagation:\\ +from energy-based EP to non-conservative, holomorphic, tracking-AEP} +\author{Method introduction (internal)} +\date{2026-06-21} + +\begin{document} +\maketitle + +\begin{abstract} +We train a transformer-class language model in which \emph{both} attention and the +feed-forward network learn \emph{without backpropagation through the computation}, +using Equilibrium Propagation (EP). This note is written for a reader who knows +\emph{classic} energy-based EP \citep{scellier2017} --- the two-phase free/nudged +relaxation of a conservative, symmetric-Jacobian system --- but has not met the +non-conservative / asymmetric / holomorphic extensions. 
We first recall why classic +EP \emph{requires} a conservative system, then show that softmax self-attention +breaks that requirement (independent $Q,K,V$ give an asymmetric Jacobian). We then +introduce, from first principles, the pieces that repair this: the +\emph{asymmetric / adjoint} EP correction $J\!\to\!J^{\!\top}$ +\citep{scurria2026}; the \emph{holomorphic} EP estimator \citep{laborieux2022}; +the \emph{Convergent Energy Transformer} (CET) route \citep{hoier2026} that +sidesteps the problem by making attention conservative; and finally \emph{our} +recipe: a damped non-conservative equilibrium-transformer block, trained with +\emph{tracking-AEP} (re-linearizing the correction at the moving common-mode +midpoint) plus a residual-driven stabilization stack. We report what is solidly +validated --- component gradients match backprop at cosine $0.99$--$1.0$, and EP +trains the block stably and competitively with a backprop transformer at equal +parameters on a character-level LM --- and clearly mark the larger-scale work +(the $C{=}512$ ``residual-defense'' line) as \emph{ongoing}. +\end{abstract} + +\tableofcontents + +%============================================================================== +\section{Recap: classic energy-based EP and why it needs a conservative system} +\label{sec:classic} + +\paragraph{Setup.} +Classic EP \citep{scellier2017} trains a dynamical system whose state +$z\in\R^{d}$ relaxes, under a fixed input/clamp, to the minimum of a scalar +\emph{energy} $E(z,\theta)$. Two ideas make it a learning rule. + +\paragraph{Two phases.} +\begin{itemize}[leftmargin=1.4em,itemsep=2pt] + \item \emph{Free phase.} Run the gradient dynamics $\dot z=-\grad_z E(z,\theta)$ + to the free equilibrium $\zstar=\argmin_z E(z,\theta)$, + in practice an Euler relaxation to a fixed point. 
+ \item \emph{Nudged phase.} Add the task loss to the energy with a small strength + $\beta$, $E_\beta = E + \beta\,\ell(z)$, and relax to the nudged + equilibrium $z_\beta$. +\end{itemize} + +\paragraph{The contrastive gradient.} +EP's central identity is that the loss gradient w.r.t.\ any parameter is the +\emph{contrastive difference of $\partial E/\partial\theta$ across the two phases}: +\begin{equation} + \frac{\partial \mathcal{L}}{\partial \theta} + \;\approx\; + \frac{1}{\beta}\!\left[ + \frac{\partial E}{\partial\theta}(z_\beta,\theta) + -\frac{\partial E}{\partial\theta}(\zstar,\theta) + \right] + \qquad(\text{one-sided, bias }O(\beta)). + \label{eq:ep-onesided} +\end{equation} +Centered / symmetric nudging \citep{laborieux2021} uses $\pm\beta$ and averages, +reducing the estimator bias to $O(\beta^2)$: +\begin{equation} + \frac{\partial \mathcal{L}}{\partial \theta} + \;\approx\; + \frac{1}{2\beta}\!\left[ + \frac{\partial E}{\partial\theta}(z_{+\beta}) + -\frac{\partial E}{\partial\theta}(z_{-\beta}) + \right]. + \label{eq:ep-centered} +\end{equation} +The update is \emph{local}: each parameter reads only the two equilibria of the +terms it touches; there is no backward pass and no weight transport. As +$\beta\!\to\!0$ with a converged free phase, the EP estimate equals the +implicit/equilibrium gradient, and (in an RNN with static input) it equals the +step-wise BPTT gradient \citep{ernoult2019}. + +\paragraph{Why this needs a conservative / symmetric-Jacobian system.} +Equations \eqref{eq:ep-onesided}--\eqref{eq:ep-centered} are only valid because +the dynamics are the \emph{gradient} of a scalar energy. Write the force as +$F(z) = -\grad_z E(z)$ and its Jacobian as $J=\partial F/\partial z$. If $F$ +descends an energy, then $J = -\,\partial^2 E/\partial z^2$ is a Hessian and is +therefore \emph{symmetric}, $J=J^{\!\top}$. 
This symmetry is exactly what makes the +nudged perturbation a faithful surrogate for the loss \emph{adjoint}: linearizing +the nudged relaxation around $\zstar$ produces a response governed by +$J^{-1}$, and because $J=J^{\!\top}$ this self-adjoint operator is the same one +the true gradient (which involves $(J^{\!\top})^{-1}$) requires. We therefore +record the four implicit premises of classic EP --- the transformer will break all +four, and each fix below targets exactly one of them: +\begin{description}[leftmargin=2.6em,itemsep=2pt] + \item[(A) Conservative / symmetric.] A scalar energy $E$ exists, so $J=J^{\!\top}$. + \item[(B) Free phase converged.] The readout sits at the true fixed point; + residual $\approx 0$. + \item[(C) Small-$\beta$ linear response, clean nudge.] $\beta\!\to\!0$ is a mere + perturbation, and no non-analytic ``clamp'' contaminates the estimate. + \item[(D) The fixed point stays stable throughout training.] After every weight + update the free phase still relaxes to a stable fixed point. +\end{description} + +%============================================================================== +\section{The gap: softmax attention is non-conservative} +\label{sec:gap} + +A pre-LN transformer block computes, for a state $z$, +\begin{equation} + \Attn(z) = \softmax\!\Big(\tfrac{Q(z)K(z)^{\!\top}}{\sqrt{d}},\ \text{causal}\Big)V(z)\,W_O, + \qquad + Q=zW_Q,\ K=zW_K,\ V=zW_V, + \label{eq:attn} +\end{equation} +with \emph{independent} projections $W_Q,W_K,W_V$. The query--key coupling +$i\!\to\!j$ is governed by $W_QW_K^{\!\top}$, while $j\!\to\!i$ is governed by +$W_KW_Q^{\!\top}$; these differ, and $V$ is a third independent map. Consequently +the attention Jacobian is \emph{asymmetric}, $J_{\Attn}\neq J_{\Attn}^{\!\top}$, and +\emph{no scalar energy has this gradient}. An untied $4\times$ FFN +($W_2\,\mathrm{GELU}(W_1\cdot)$ with $W_2\neq W_1^{\!\top}$) is non-conservative for +the same reason. Premise~(A) fails. +
+ +Empirically this is not a cosmetic issue: with an asymmetric $J$ the nudged phase +relaxes under $J$ but the correct loss adjoint needs $J^{\!\top}$, so the raw EP +contrast is \emph{biased}. Measured against the true backprop gradient, uncorrected +EP gives an attention-parameter cosine of only $\approx 0.25$ (essentially the +wrong direction), even though the loss-adjacent output projection looks fine. (This +is the same pathology that limits feedback alignment, which only trains the layer +right before the loss and leaves $Q/K/V$ at cosine $\approx 0.25$ and the upstream +FFN at $\approx -0.01$.) + +There are two ways out, and we will use the second: +\begin{enumerate}[leftmargin=1.6em,itemsep=2pt] + \item \textbf{Energy route} (make attention conservative): fold attention into a + scalar energy with a \emph{tied} value, so $F=-\grad E$ and classic EP is + exactly valid. This is the CET route (\S\ref{sec:cet-energy}); it costs the + $Q\!\neq\!K$ asymmetry and the free value that make attention expressive. + \item \textbf{Force route} (keep real attention, repair the \emph{estimator}): + leave \eqref{eq:attn} as a non-conservative \emph{force} and add a + correction that turns $J$ into $J^{\!\top}$ in the nudged phase. This is the + AEP route (\S\ref{sec:aep}), and it is what our block uses. +\end{enumerate} + +%============================================================================== +\section{AEP, holomorphic EP, and the force-form readout} +\label{sec:aep} + +\subsection{Force-form (vector-field) EP} +\label{sec:vf} +The first step is to drop the energy and write the dynamics directly as a force +$F(z)$, relaxing $\dot z=F(z)$ to a fixed point $\zstar$. 
The parameter gradient is +then read off a \emph{vector-field} (VF) contrast \citep{scurria2026}: +\begin{equation} + \frac{\partial\mathcal{L}}{\partial\theta} + \;\approx\; + \frac{\partial}{\partial\theta}\,\big\langle a,\ F(\zstar;\theta)\big\rangle, + \qquad + a \;=\; \frac{z_{-\beta}-z_{+\beta}}{2\beta}\ \approx\ -\frac{\dd \zstar}{\dd\beta}, + \label{eq:vf} +\end{equation} +where $a$ is the centered contrast (the ``adjoint state'') read from the two nudged +equilibria, and the right-hand side is \emph{one} autograd call evaluated at the +fixed point only --- per-term local bookkeeping, \emph{not} backprop through the +relaxation steps. Every term of the block (attention, FFN, LayerNorm affines, and +the embeddings, which enter through the input clamp $-(z-\xin)$) is a term of the +same $F$, so \eqref{eq:vf} trains them jointly with no per-module schedule. + +\paragraph{Attribution / honest caveat.} +The force-form VF readout \eqref{eq:vf} is \emph{not ours}: it is the baseline of +\citet{scurria2026}. Crucially it \emph{collapses on its own} for a non-conservative +system (their CIFAR-10 VF reaches chance, $10\%$; MNIST $64\%$ vs.\ $92.7\%$), +exactly mirroring our measured cosine $\approx 0.25$ for uncorrected attention. VF +is therefore the ``starting point that fails''; what rescues it is the next step. + +\subsection{The AEP correction: \texorpdfstring{$J\!\to\!J^{\!\top}$}{J to J transpose}} +\label{sec:aep-corr} +For a non-conservative $F$, the nudged relaxation linearized at $\zstar$ runs under +$J=\partial F/\partial z$, but the true adjoint requires $J^{\!\top}$. \emph{Asymmetric +EP} (AsymEP) \citep{scurria2026} repairs this by adding to the nudged force a term +that subtracts twice the antisymmetric part of the Jacobian. 
With +$v=z-\zstar$ and $\Jnc$ the Jacobian of the \emph{non-conservative} part $\Fnc$, +\begin{equation} + \mathrm{corr}(z) \;=\; \Jnc\,v - \Jnc^{\!\top} v + \;=\; (\Jnc-\Jnc^{\!\top})\,v + \;=\; 2\,A_J\,v, + \qquad + A_J \equiv \tfrac12\big(\Jnc-\Jnc^{\!\top}\big), + \label{eq:aep} +\end{equation} +Subtracting it from the force is \emph{mathematically identical} to adding their $-2A_J(\zstar)(z-\zstar)$: the +nudged force becomes $f \;=\; F(z) \mp \beta\,\grad_z\ell(z) - \mathrm{corr}(z)$, +so the attention part of the nudged linearization is replaced as +\begin{equation} + J\,v \;-\; (J-J^{\!\top})\,v \;=\; J^{\!\top} v , +\end{equation} +i.e.\ \emph{$J$ is turned into $J^{\!\top}$}, restoring the correct adjoint and hence the +exact gradient for $Q\!\neq\!K$ attention. Two structural facts make this cheap and +local: +\begin{itemize}[leftmargin=1.4em,itemsep=2pt] + \item \emph{The symmetric (conservative) parts cancel.} The damping $-c\,z$ has + Jacobian $-cI$ (symmetric), the FFN-as-Hopfield-energy and the input clamp + are symmetric, so they contribute $0$ to $A_J$. Thus a \emph{single} + correction on the attention term repairs the \emph{whole} block; FFN/clamp + ride along in the conservative part and are already exact under VF. + \item \emph{It is matrix-free.} We never build $\Jnc$. Each nudged step uses one + Jacobian-vector product and one vector-Jacobian product, + $\Jnc v=\jvp(\Fnc,\zstar,v)$ and $\Jnc^{\!\top} v=\vjp(\Fnc,\zstar,v)$. +\end{itemize} + +\paragraph{Attribution.} +The correction \eqref{eq:aep} is \citet{scurria2026}'s, \emph{not} ours. Their scope +is feedforward / Hopfield nets on static MNIST/CIFAR with an \emph{explicitly +constructed} Jacobian, no attention, no sequence model, and no stability controller. 
+\emph{Ours on this line} is: (i) the matrix-free $\jvp/\vjp$ form (their explicit +Jacobian is infeasible at transformer state dimension $B\!\cdot\!T\!\cdot\!C$); +(ii) the application to data-dependent \emph{softmax attention}; (iii) the +combination with holomorphic estimation (\S\ref{sec:holo}); (iv) the common-mode +\emph{tracking} variant (\S\ref{sec:tracking}); and (v) the transformer-LM +application together with the stability stack (\S\ref{sec:stab}). + +\paragraph{Validity window.} +The correction is linearized \emph{at $\zstar$}, so the nudged trajectory must stay +inside the linear-response window. At $\varepsilon{=}0.1$ a nudge horizon +$T_2\!\approx\!20$ is comfortably inside; $T_2\gtrsim 60$ can leave it (\S\ref{sec:stab}). + +\subsection{Holomorphic EP: variance-reduced, higher-order estimates} +\label{sec:holo} +The $\pm\beta$ contrast trades bias against noise: small $\beta$ shrinks the +$O(\beta^2)$ bias but amplifies the $1/\beta$ noise on $(z_{-\beta}-z_{+\beta})/2\beta$. +Holomorphic EP \citep{laborieux2022} removes this trade-off by replacing the two +real points with $N$ points on a \emph{complex circle}, +$\beta_k = r\,e^{2\pi i k/N}$, relaxing the \emph{holomorphically extended} dynamics +and reading the contrast off a discrete Cauchy integral: +\begin{equation} + a \;=\; -\,\Re\!\left[\frac{1}{Nr}\sum_{k=0}^{N-1} e^{-i\phi_k}\,(z_k-\zstar)\right], + \qquad \phi_k=\tfrac{2\pi k}{N}, + \label{eq:holo} +\end{equation} +whose bias is $O(r^{N})$ instead of $O(r^{2})$ --- so $r$ may be $5$--$10\times$ +larger at equal bias, cutting the $1/\beta$ noise by the same factor. The +holomorphic extension is built by hand (complex LayerNorm with non-conjugate +variance, softmax as a ratio of exponentials, the $\tanh$-form GELU which is +holomorphic near the real axis); the AEP correction \eqref{eq:aep} is \emph{linear in $v$ with a real matrix}, so applying it +to the real and imaginary parts separately is complex-linear and preserves holomorphy. 
+No clamps appear inside the holomorphic nudge --- clamps are non-analytic and would +destroy the $O(r^N)$ bias order. This addresses premise~(C). \citet{laborieux2022} +is the source; we add only the combination with the AEP correction and with softmax +attention. + +%============================================================================== +\section{The equilibrium-transformer block (and the CET alternative)} +\label{sec:block} + +\subsection{Our damped, non-conservative block (\texttt{thick})} +\label{sec:thick} +The state is $z\in\R^{B\times T\times C}$, one vector per token position. Inference +is a relaxation to a fixed point under a \emph{single force} $F$, +$z\leftarrow z+\varepsilon F(z)$ for $T_1$ steps ($\varepsilon{=}0.1$, $T_1{\approx}150$), +after which logits $=\zstar W_h$. The force is a pre-LN transformer block written as +a force rather than a layer stack: +\begin{equation} + F(z) = + \underbrace{-(z-\xin)}_{\text{input clamp}} + +\underbrace{\Attn(\LN_1(z))}_{\text{causal MHSA},\ W_Q,W_K,W_V,W_O} + +\underbrace{W_2\mathrm{GELU}(W_1\LN_2(z)+b_1)+b_2}_{\text{untied }4\times\text{ FFN}} + -\underbrace{c\,z}_{\text{damping}}. + \label{eq:thick} +\end{equation} +Here $\xin=\mathrm{tok}[\mathrm{idx}]+\mathrm{pos}$ is the (trained) input +embedding, clamped as a boundary condition through the $-(z-\xin)$ term; this is the +same fixed-point map a Deep Equilibrium model \citep{bai2019} uses. The block is +strongly non-conservative ($Q\!\neq\!K$, untied FFN), and AEP makes EP exact for it. + +\paragraph{Why the $-c\,z$ damping is the key recipe move.} +Raw attention at high gain has \emph{no} fixed point: the residual floors at +$\sim\!3\times10^{-2}$ and the relaxation never settles, so the entire EP family +(corrected or not) cannot even start (there is no $\zstar$ to nudge around). 
Adding +$-c\,z$ ($c\!\geq\!1$) makes the map contractive enough to \emph{create a stable +fixed point at any attention strength}, while leaving the map non-conservative +(independent $Q/K/V$ are untouched). Critically, the damping's Jacobian $-cI$ is +symmetric, so it \emph{cancels in $A_J$} \eqref{eq:aep}: it buys a fixed point +without polluting the AEP correction, which still sees only attention's +non-reciprocal part. Together, ``damping $+$ AEP'' is the minimal recipe that makes +real attention EP-trainable, taking the attention-parameter cosine from +$\approx 0.25$ (uncorrected) to $0.99$--$1.0$ even at high gain. + +\paragraph{A subtlety for LN-inside blocks.} +Because LayerNorm sits \emph{inside} \eqref{eq:thick} and its Jacobian scales like +$1/\sigma(z)$, large damping shrinks $\|\zstar\|$ and thereby \emph{inflates} the +effective Jacobian (measured: plain-relax residual $8.8\times10^{-3}$ at $c{=}0$ +vs.\ $3.4\times10^{-2}$ at $c{=}2$). So for \texttt{thick} we keep $c$ small ($c{=}1$) +and the actual stabilizer is the Jacobian-norm penalty of \S\ref{sec:stab}, not the +damping. (For a simpler ``thin'' variant whose FFN is an energy-based modern-Hopfield +memory and whose attention is a raw damped force, the damping \emph{is} required.) + +\subsection{The CET / energy route (the conservative alternative)} +\label{sec:cet-energy} +\textbf{CET} here means the \emph{Convergent Energy Transformer} of +\citet{hoier2026} --- an energy-based transformer block, trained with EP, that we +reproduced (on masked image completion) as the prior SOTA for ``EP $+$ attention''. 
+Its trick is to make attention \emph{conservative} so classic EP applies with +\emph{no} correction: attention is folded into a scalar energy +\begin{equation} + E_{\mathrm{att}}(z) \;=\; + -\frac{1}{\gamma}\sum_{\text{heads},\,i} + \LSE_{j}\!\big(\gamma\, q_i\!\cdot\!k_j\big) + \quad(\text{causal-masked}), + \label{eq:cet} +\end{equation} +whose force \emph{ties the value to the key} ($v\!\equiv\!k$), plus a confinement +$\tfrac12 c\|z\|^2$ (because $E_{\mathrm{att}}$ is unbounded below) and a +modern-Hopfield memory energy $E_{\mathrm{mem}}(z)=-\sum\mathrm{relu}(zW_m)^2$ +playing the role of the FFN (its force is a \emph{tied}-weight squared-ReLU MLP). On +this energy $F=-\grad E$ exactly, so classic EP is valid with symmetric Jacobian and +no AEP. In our reproduction EP matched truncated-BPTT (``EP $\approx$ TBPTE'', +gradient cosine $0.99$). The trade-off is expressivity: the tied value and +reciprocal coupling are the least expressive form of attention. Under \emph{exact} +gradients on the LM, this conservative route (and a monotone-DEQ variant +\citep{winston2020}) costs $\approx 0.15$--$0.2$ CE relative to the non-conservative +\texttt{thick} block --- which is precisely why we pay for the AEP machinery and keep +real attention. + +%============================================================================== +\section{Our recipe: tracking-AEP and the stabilization stack} +\label{sec:recipe} + +\subsection{Tracking-AEP: re-linearize at the moving common mode} +\label{sec:tracking} +The AEP correction \eqref{eq:aep} is frozen at $\zstar$. Near a good solution this +becomes the binding error: as the model sharpens, the true gradient shrinks below +the \emph{bias floor} of the frozen linearization, and the highly non-normal block +Jacobian makes that floor large (we measure $\|\Jnc v-\Jnc^{\!\top} v\|/\|\Jnc v\|=1.37$ +at $\zstar$). 
The fix is to re-linearize the antisymmetric correction not at the +frozen $\zstar$ but at the \emph{instantaneous common mode} of the two nudged +trajectories, +\begin{equation} + \zbar \;=\; \half\big(z_{+}+z_{-}\big), + \qquad + \mathrm{corr}(z) \;=\; \Jnc(\zbar)\,v - \Jnc(\zbar)^{\!\top} v, + \quad v = z-\zbar, + \label{eq:track} +\end{equation} +evaluated step-by-step as $\zbar$ moves with the nudge (run the $+$ and $-$ phases in +lockstep, recompute $\jvp/\vjp$ about the running $\zbar$). This is exact transposed +differential dynamics with no compounding linearization error, and it is loose-tolerant +(it does not demand an ultra-tight free phase). At a plateau checkpoint where the +frozen estimator had collapsed (gradient cosine vs.\ BPTT $-0.045$, batch-to-batch +self-coherence $-0.27$, magnitude ratio $\sim\!4000\times$), tracking-AEP restores +cosine $0.997$, self-coherence $+0.95$, magnitude ratio $0.9$. Tracking-AEP and the +common-mode formulation \eqref{eq:track} are \emph{ours}. + +\subsection{The validity threshold and the residual as the health signal} +\label{sec:stab} +The governing empirical fact is that the EP estimator has a \emph{validity threshold} +in the free-phase relative residual +\begin{equation} + \mathrm{res} \;=\; \frac{\|z^{+}-\zstar\|}{\|\zstar\|} + \qquad(\text{one extra relaxation step}), +\end{equation} +which is the load-bearing health signal (premise~(B)). Gradient cosine vs.\ the exact +reference degrades sharply with res: $\approx 0.85$ at $\mathrm{res}\!\sim\!5\times10^{-5}$, +batch-dependent $0.2$--$0.9$ at $10^{-3}$, and noise at $10^{-2}$. BPTT has no such +threshold (it differentiates the actual finite unroll, converged or not); \emph{this +asymmetry, and nothing deeper, is the EP-specific difficulty}. Accordingly the free +phase is run adaptively: relax to $T_1{=}150$, then continue in chunks until +$\mathrm{res}\!\le\!10^{-4}$ before nudging. 
We emphasize there is \emph{no} structural +``EP ceiling'': an early ``EP caps at $\sim\!2.5$'' verdict was traced to two +undertrained/invalid-regime runs and retracted. + +\subsection{The stabilization stack} +Training pushes the dynamics off the contractive manifold (premise~(D)) --- and not +only for EP: even \emph{exact} BPTT on this architecture walks off the manifold on +long horizons (residual $\to 4.7\times10^{-2}$, val CE $\to 3.0$). The stack that +keeps the system valid: +\begin{itemize}[leftmargin=1.4em,itemsep=3pt] + \item \textbf{Frozen / controlled Jacobian-norm penalty (\texttt{jacreg}).} A soft + penalty $\lambda\,\|\Jnc(\zstar)\|_F^2$, estimated matrix-free by Hutchinson + (one $\jvp$ on a random probe, differentiated w.r.t.\ $\theta$). This is + \citet{bai2021}'s DEQ-stabilization penalty, \emph{not} ours. It keeps the + free phase contractive and hence the estimator inside its validity region. + A continuous controller drives it, + $\lambda \leftarrow \mathrm{clip}\big(\lambda\,(\mathrm{res}_{\mathrm{EMA}}/\mathrm{target})^{0.3}\big)$, + on an EMA-smoothed residual (the raw residual is noisy and a multiplicative + controller on it random-walks). A key hard lesson: the controller \emph{floor} + is load-bearing and must never anneal to zero --- two independent + $\lambda\!\to\!0$ runs died identically (val CE $60$--$77$, $\mathrm{res}\!\equiv\!0$), + which post-mortem is an \emph{explosion disguised as convergence by + floating-point absorption} ($\varepsilon F<\mathrm{ulp}(z)$ freezes the + relaxation), not a benign dead state. + \item \textbf{Residual, not spectral radius, as the control signal.} The block + Jacobian is highly non-normal, so transient growth is invisible to + eigenvalues (measured $\rho(J){=}0.94$ ``stable'' while the relaxation + diverged to $\mathrm{res}\,0.21$). The one-step residual \emph{is} the + transient; we control on it. 
+ \item \textbf{Validity gate.} When the residual exceeds a gate, the EP update is + mathematically undefined, so we apply only the homeostat (jacreg) and skip the + nudge --- a fast recovery step. At larger scale this gate is load-bearing + (off-equilibrium EP updates poison the weights). + \item \textbf{Adaptive $T_2$ by hindsight snapshot selection.} On slow-mixing + batches a long nudge phase can diverge through non-normal transient growth, + and step-size early-stopping \emph{fails} (the transient triggers it + spuriously). Instead, run to $T_{2\max}$ in lockstep, snapshot the contrast + $a_t$ every few steps, and return the \emph{most settled} snapshot (smallest + increment of $a_t$); judging by increments of the \emph{quantity of interest} + rather than step sizes makes transient growth harmless. This is ours; it + lifts probe cosine from $0.871$ to $0.932$. +\end{itemize} + +\subsection{Ongoing: the residual-defense term (\texttt{resreg}) --- under validation} +\label{sec:resreg} +At larger width ($C{=}512$) we observe a distinct, \emph{still-open} failure that we +call the below-$2.10$ wall: frozen-jacreg, tracking-AEP EP descends to best +$\approx 2.09$ and then bifurcates within $\sim\!200$ steps (residual +$5\!\times\!10^{-3}\!\to\!0.15$, gradient cosine $0.98\!\to\!0$, CE $\to\!4{+}$), +while \emph{exact} BPTT with the identical recipe sails past to $1.72$. The diagnosed +root cause is an \emph{objective mismatch}: EP optimizes the (refined) fixed point and +never defends the finite-step residual that evaluation actually uses, whereas BPTT +differentiates the finite unroll and so implicitly rewards contraction. The diverged +state is a forward bifurcation to a \emph{limit cycle}, so more relaxation steps cannot +fix it; only a residual \emph{cost} can. 
The proposed fix is an explicit T1-residual +penalty on the \emph{evaluated} state $z_{150}=\mathrm{relax}(\xin,T_1)$ taken before +any refinement, +\begin{equation} + R_{\mathrm{res}} \;=\; \frac{\|\varepsilon F(z_{150})\|^2}{\|z_{150}\|^2+\varepsilon}, + \qquad + \text{gradient w.r.t.\ }\theta\text{ with }z_{150}\text{ detached}, + \label{eq:resreg} +\end{equation} +scaled task-relative and added to the EP gradient (run with the validity gate off, so +the penalty is not bypassed exactly when the residual is high). \textbf{Status: this is +ongoing.} The residual-defense term \eqref{eq:resreg} held the residual pinned at +$1$--$5\times10^{-4}$ and reached best $2.0573$ (past the wall) through only step +$\sim\!1000$ before a storage cleanup deleted the run; full re-validation toward the +$\approx 1.8$ BPTT ceiling is pending. We present it as a diagnosis $+$ proposed fix, +\emph{not} a finished result. (The objective-mismatch diagnosis, the common-mode +tracking estimator, the residual-driven controller and validity gate, and this +residual-defense term are ours.) + +%============================================================================== +\section{Established results (and what is still open)} +\label{sec:results} + +\paragraph{Solidly validated.} +\begin{itemize}[leftmargin=1.4em,itemsep=3pt] + \item \textbf{EP/AEP component gradients match backprop.} On the character LM, + AEP gives causal-attention parameters cosine $0.99$, the (Hopfield) FFN + $1.00$, and the full LM block $0.99$ vs.\ the true backprop gradient + --- versus feedback alignment at $Q/K/V\approx 0.25$, FFN $\approx -0.01$. + On the CET reproduction, global cosine $0.99$ and EP $\approx$ TBPTE on + masked-image completion. + \item \textbf{EP trains the equilibrium transformer stably, without backprop.} + With the stabilization stack, end-to-end EP runs $10\text{k}+$ steps with + zero non-finite steps. 
+ \item \textbf{It matches/beats a BP transformer at equal parameters.} On + Shakespeare character-LM (single block, $C{=}128$), at a fully controlled + $14$k-step comparison (Table~\ref{tab:results}): EP reaches val CE + \textbf{1.676} (multi-seed $1.680\pm0.005$, $3$ seeds); the like-for-like + standard BP transformer (matched in parameter \emph{shape} to the thick + block) reaches $1.610$; EP \emph{beats} the thinner BP baseline ($1.689$). + The total gap of $0.066$ decomposes into an architecture tax $\approx 0.025$ + (BPTT on the identical block $1.635$) and an EP-rule tax $\approx 0.041\pm0.005$ + --- real, tightly reproducible, and consistent with the measured estimator + misalignment (cosine $0.85$--$0.93$). +\end{itemize} + +\begin{table}[t] + \centering + \small + \begin{tabular}{llc} + \toprule + \textbf{training rule} & \textbf{architecture / recipe} & \textbf{best val CE}\\ + \midrule + BP & standard transformer (like-for-like for \texttt{thick}) & \textbf{1.610}\\ + BPTT $+$ $\lambda$-controller $+$ param-EMA & \texttt{thick} (exact grad, same stabilizer) & 1.635\\ + \textbf{EP} & \texttt{thick}; tracking-AEP $+$ adaptive $T_1/T_2$ & \textbf{1.676}\\ + BP & standard transformer (thin-matched) & 1.689\\ + BPTT (exact grad) & \texttt{thick}, unregularized & 2.021 (destabilizes late)\\ + random & --- & 4.174\\ + \bottomrule + \end{tabular} + \caption{Fully-controlled $14$k-step comparison on Shakespeare char-LM + (random $=\ln 65$). EP matches the architecture-controlled exact-gradient + run to within $0.041$ and beats the thin-matched BP baseline. 
``BPTT as + ablation'' separates the training-rule cost (EP$-$BPTT) from the + architecture cost (BPTT$-$BP).} + \label{tab:results} +\end{table} + +\paragraph{Honest framing of the controlled comparison.} +EP beats \emph{bare} BPTT, but the controlled table shows most of that win is EP's +\emph{mandatory} stabilization loop doubling as regularization: bare exact-gradient +training walks off the contractive manifold at $14$k, and the same controller that EP +cannot live without also lifts BPTT to $1.635$. The contraction controller is good for +the equilibrium architecture regardless of training rule; EP merely forced its +discovery. + +\paragraph{Ongoing / under validation.} +The $C{=}512$ work is \emph{not} a finished result. (i) The $2.40$ plateau there is +diagnosed as a late-training EP estimator bias-floor / batch-incoherence, which +tracking-AEP breaks in training ($2.40\!\to\!2.16$, still descending in a $2500$-step +warm-start test). (ii) The below-$2.10$ wall is diagnosed as the objective mismatch of +\S\ref{sec:resreg}; the residual-defense term \eqref{eq:resreg} validated res-tight and +past the wall (best $2.0573$) \emph{only through step $\sim\!1000$} before the run was +lost, and a full re-run toward the $\approx 1.8$ BPTT ceiling is pending. These should +be read as diagnoses with promising partial evidence, not as established numbers. + +%============================================================================== +\section*{Attribution summary} +\addcontentsline{toc}{section}{Attribution summary} + +\begin{description}[leftmargin=2.2em,itemsep=2pt] + \item[Theirs.] 
Classic energy-based EP and centered nudging + \citep{scellier2017,laborieux2021}; EP $\equiv$ BPTT in the converged, $\beta\!\to\!0$ + limit \citep{ernoult2019}; holomorphic EP \citep{laborieux2022}; the asymmetric/AEP + correction $J\!\to\!J^{\!\top}$ \emph{and} the force-form VF readout + \citep{scurria2026}; the Jacobian-norm penalty \citep{bai2021}; DEQ + \citep{bai2019} and monotone DEQ \citep{winston2020}; the Convergent Energy + Transformer / CET \citep{hoier2026}. + \item[Ours.] The transformer application of the force route and the damping recipe + (damping $+$ AEP making real attention EP-trainable at any gain); the matrix-free + $\jvp/\vjp$ form of the correction at transformer scale and its combination with + holomorphic estimation and softmax attention; \emph{tracking-AEP} (common-mode + re-linearization, Eq.~\ref{eq:track}); the residual-driven controller, the validity + gate, and adaptive-$T_2$ snapshot selection; and the (ongoing) residual-defense term + \texttt{resreg} (Eq.~\ref{eq:resreg}) with its objective-mismatch diagnosis. +\end{description} + +%============================================================================== +\begin{thebibliography}{9} +\bibitem[Bai et al., 2019]{bai2019} + S.~Bai, J.~Z.~Kolter, V.~Koltun. + \emph{Deep Equilibrium Models}. NeurIPS 2019. + +\bibitem[Bai et al., 2021]{bai2021} + S.~Bai, V.~Koltun, J.~Z.~Kolter. + \emph{Stabilizing Equilibrium Models by Jacobian Regularization}. ICML 2021. + +\bibitem[Ernoult et al., 2019]{ernoult2019} + M.~Ernoult, J.~Grollier, D.~Querlioz, Y.~Bengio, B.~Scellier. + \emph{Updates of Equilibrium Prop Match Gradients of Backprop Through Time in an + RNN with Static Input}. NeurIPS 2019. + +\bibitem[H{\o}ier et al., 2026]{hoier2026} + R.~H{\o}ier, K.~Kerjan, B.~Scellier. + \emph{Training a Convergent Energy Transformer with Equilibrium Propagation} (CET). + ICLR 2026 Associative Memory workshop; OpenReview \texttt{Qrfml76eWJ}. 
+ +\bibitem[Laborieux et al., 2021]{laborieux2021} + A.~Laborieux, M.~Ernoult, B.~Scellier, Y.~Bengio, J.~Grollier, D.~Querlioz. + \emph{Scaling Equilibrium Propagation to Deep ConvNets by Drastically Reducing its + Gradient Estimator Bias} (centered/symmetric nudging). Frontiers in Neuroscience, 2021. + +\bibitem[Laborieux \& Zenke, 2022]{laborieux2022} + A.~Laborieux, F.~Zenke. + \emph{Holomorphic Equilibrium Propagation Computes Exact Gradients Through Finite Size + Oscillations}. NeurIPS 2022. + +\bibitem[Scellier \& Bengio, 2017]{scellier2017} + B.~Scellier, Y.~Bengio. + \emph{Equilibrium Propagation: Bridging the Gap between Energy-Based Models and + Backpropagation}. Frontiers in Computational Neuroscience, 2017. + +\bibitem[Scurria et al., 2026]{scurria2026} + A.~Scurria, P.~Vanden Abeele, B.~Mognetti, S.~Massar. + \emph{Equilibrium Propagation for Non-Conservative Systems} (AsymEP). + arXiv:2602.03670, 2026. + +\bibitem[Winston \& Kolter, 2020]{winston2020} + E.~Winston, J.~Z.~Kolter. + \emph{Monotone Operator Equilibrium Networks} (monotone DEQ). NeurIPS 2020. +\end{thebibliography} + +\end{document} diff --git a/assets/frozen_vs_adaptive.png b/assets/frozen_vs_adaptive.png new file mode 100644 index 0000000..e45e77b Binary files /dev/null and b/assets/frozen_vs_adaptive.png differ diff --git a/docs/COLAB_50M.md b/docs/COLAB_50M.md new file mode 100644 index 0000000..9236049 --- /dev/null +++ b/docs/COLAB_50M.md @@ -0,0 +1,117 @@ +# Colab 50M EP run — self-contained notebook cells + +Goal: train the 50M (C=2048) EP energy-transformer on TinyStories-BPE on a Colab A100/H100, +which fixes the two things timan1's A6000 can't: fp32 throughput and the big-width init instability +(more headroom to tune the curriculum + bigger batch). Checkpoints to Google Drive with full-state +resume, so the 12 h Colab timeout never loses progress — just re-run the training cell to continue. 
+ +PREP (once, on your laptop): download from timan1 `~/ept/ept_colab.tar.gz` (16 KB, the code) and +upload it to your Google Drive root as `ept_colab.tar.gz`. Data is regenerated in-notebook (or +upload `~/ept/lt_ep_code/.../tinystories_bpe` bins to Drive to skip the ~40 min prep — optional). + +──────────────────────────────────────────────────────────────────────── +## Cell 1 — setup, Drive, GPU, deps +```python +import torch, subprocess, os +print(torch.__version__, torch.cuda.get_device_name(0)) +assert torch.__version__ >= "2.1", "need torch>=2.1 for torch.func/compile" +print(subprocess.run(["nvidia-smi","--query-gpu=name,memory.total","--format=csv,noheader"], + capture_output=True,text=True).stdout) +from google.colab import drive; drive.mount('/content/drive') +!pip -q install tokenizers +WORK="/content/work"; DRIVE="/content/drive/MyDrive"; os.makedirs(WORK, exist_ok=True) +!tar xzf {DRIVE}/ept_colab.tar.gz -C {WORK} +print("code:", os.listdir(WORK)) +``` + +## Cell 2 — data (regenerate, cached to Drive; skip if bins already uploaded) +```python +import os +DATA="/content/drive/MyDrive/ept_data/tinystories_bpe" +if os.path.exists(f"{DATA}/train.bin"): + print("BPE bins found on Drive — reusing.") +else: + os.makedirs("/content/drive/MyDrive/ept_data/tinystories", exist_ok=True) + %cd /content/drive/MyDrive/ept_data/tinystories + !test -f train.txt || wget -q -O train.txt https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStoriesV2-GPT4-train.txt + !test -f valid.txt || wget -q -O valid.txt https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStoriesV2-GPT4-valid.txt + # point the prep script at the Drive paths + import re; src=open(f"{WORK}/prepare_tinystories_bpe.py").read() + src=src.replace("/tmp/lt_ep/data/tinystories","/content/drive/MyDrive/ept_data/tinystories") + src=src.replace("/tmp/lt_ep/data/tinystories_bpe",DATA) + open(f"{WORK}/prep_bpe_colab.py","w").write(src) + %cd {WORK} + !python 
prep_bpe_colab.py +print("data:", os.listdir(DATA)) +``` + +## Cell 3 — STABILITY SMOKE (always run first; ~15 min). Must survive past warmup without abort. +```python +%cd {WORK} +# C=2048 starting curriculum (muP-scaled from C=1024's lr 4e-4 -> ~2e-4; longer warmup; gentler resinit) +!python lt_ep_train.py --mode ep --attn_mode thick --B 16 --C 2048 --H 16 --T 512 \ + --c 1.0 --jacreg 1.0 --jr_floor 0.1 --res_target 1.5e-3 --jr_max 64 --res_ema 0.9 \ + --holo 2 --hr 0.02 --pema 0.999 --t1max 300 --res_est 1e-4 --t2sel 60 --res_gate 5e-3 \ + --qknorm --resinit 0.05 --warmup 2500 --compile --T1 150 --T2 20 --lr 2e-4 \ + --steps 1200 --log 100 --data {DATA} +# READ THE OUTPUT: if it ABORTs or res spikes >0.1 repeatedly through steps 600-1200, the curriculum +# is still too hot -> lower lr to 1e-4 and/or resinit 0.03 and/or warmup 4000, re-run this cell. +# If res stays <1e-2 and val descends past step 1000, the curriculum is good -> go to Cell 4. +``` + +## Cell 3b — KEEP-ALIVE (run once, then it auto-clicks connect every 60s to beat the ~90min idle kill) +Open the browser JS console (F12 → Console) on the Colab tab and paste: +```javascript +function keepAlive(){ + document.querySelector("colab-connect-button")?.shadowRoot?.querySelector("#connect")?.click(); +} +setInterval(keepAlive, 60000); +``` +This beats ONLY the idle timeout. The HARD cap (free 12h / Pro 24h, and Pro+ background execution +is unreliable in 2026) is unbeatable — which is why Cell 4 is built to RESUME. When Colab drops you, +reconnect, re-run Cells 1–2 if the runtime was reset (to remount Drive, re-extract the code and redefine `WORK`/`DATA`), then re-run Cell 4; it continues from the last `--save_every` checkpoint on Drive. + +## Cell 4 — FULL RUN with Drive full-state resume. Re-run this exact cell after EVERY disconnect (after Cells 1–2 if the runtime was reset). 
+```python +%cd {WORK} +ST="/content/drive/MyDrive/ept_ckpt/s4_50m.state"; CK="/content/drive/MyDrive/ept_ckpt/s4_50m.best.pt" +os.makedirs("/content/drive/MyDrive/ept_ckpt", exist_ok=True) +# --resume loads ST (weights+optimizer+sched+step+jr+best) if present -> idempotent across timeouts. +# --save_every 100 = atomic full-state save every 100 steps -> a kill loses at most ~100 steps. +!python lt_ep_train.py --mode ep --attn_mode thick --B 16 --C 2048 --H 16 --T 512 \ + --c 1.0 --jacreg 1.0 --jr_floor 0.1 --res_target 1.5e-3 --jr_max 64 --res_ema 0.9 \ + --holo 2 --hr 0.02 --pema 0.999 --t1max 300 --res_est 1e-4 --t2sel 60 --res_gate 5e-3 \ + --qknorm --resinit 0.05 --warmup 2500 --compile --T1 150 --T2 20 --lr 2e-4 \ + --steps 24000 --log 200 --save_every 100 --data {DATA} --ckpt {CK} --state {ST} --resume +# IMPORTANT: match every flag here to the curriculum that PASSED Cell 3 (esp. lr/warmup/resinit). +# On the FIRST run ST won't exist (fresh start, prints init residual); every re-run prints "[resume] from ...". +``` + +### Checkpointing guarantees (tested on timan1) +- `--state` writes the FULL state (weights + AdamW moments + LR-schedule position + step + λ + best) + to `ST.tmp` then `os.replace` → **atomic**: a kill mid-write leaves the previous good `ST` intact. +- `--resume` continues the LR schedule and optimizer momentum exactly (not a cold warm-start): + verified step 150 → resumed 151 with val still descending monotonically. +- State size at 50M ≈ ~1 GB (weights+pema+opt); `--save_every 100` ≈ a 1 GB Drive write every + ~20 min of A100 wall-clock (well under Drive's daily quota). Lower to 50 if you want ≤10-min loss. +- `--ckpt` (CK) separately keeps the best-val weights for sampling (Cell 5), updated only on improvement. 
+ +## Cell 5 — sample stories from the best checkpoint (run anytime; reads CK from Drive) +```python +%cd {WORK} +CK="/content/drive/MyDrive/ept_ckpt/s4_50m.best.pt" +!python sample_eq.py --ckpt {CK} --data {DATA} --C 2048 --H 16 --T 512 --use_pema --n 4 \ + --prompt "Once upon a time" --temp 0.8 --topk 40 +``` +Note: sample_eq.py reads vocab from meta.pkl; for BPE it prints token ids unless decoded — if it +shows numbers not text, ping me and I'll add the BPE decode (tokenizer.json is in {DATA}). + +──────────────────────────────────────────────────────────────────────── +NOTES +- The curriculum in Cells 3/4 is a STARTING GUESS for C=2048 (we never got it stable on timan1). + Cell 3 is there precisely to dial it in fast on the better GPU before committing Cell 4's long run. +- Full-state resume tested on timan1 (step 150 → resumed 151, optimizer/schedule intact). +- Expected cost: A100 fp32 ~2-3x an A6000 → ~0.06-0.1 it/s → 24k steps ~3-4 days of wall-clock + ACROSS resumes (so leave it, re-run Cell 4 whenever Colab drops you). H100 faster. +- sample_eq.py BPE-decode gap is the one known rough edge; tell me if Cell 5 prints ids. +``` diff --git a/docs/campaign/C512_PLATEAU_CAMPAIGN.md b/docs/campaign/C512_PLATEAU_CAMPAIGN.md new file mode 100644 index 0000000..6066c21 --- /dev/null +++ b/docs/campaign/C512_PLATEAU_CAMPAIGN.md @@ -0,0 +1,159 @@ +# C512 "Why slow + plateau" diagnostic campaign (2026-06-17) + +Standalone record of the 7-experiment campaign the user proposed to diagnose why EP at C=512 BPE +plateaus at val CE 2.40 (orphan) and never reaches the BP capability band (1.0–1.5). Self-contained; +companion logs in `/tmp/lt_ep/`, general arc in `FINDINGS.md`. + +--- + +## 0. 
Setup and the user's hypothesis + +- **Models** (TinyStories BPE-4096, C=512 H=16 T=256 B=24; random ln4096 = 8.318): + - EP "orphan" (lr 8e-4, warmup 800, resinit 0.1, holo N=2, t2sel 40, jr_floor 0.1, res_target + 1.5e-3, jr_max 32, pema 0.999): **best val CE 2.4037**, ran full 20k, zero excursions. + - Standard BP transformer (same C/H/T, mlp=4): **best val CE 1.6953**. + - Target: BP capability band **1.0–1.5** (BP-C512 itself is 1.70; BP needs C≥1024 to enter the band). + +- **User's hypothesis (verbatim intent):** the "full-time slow + plateau" is **not** simple EP + LR-inequivalence; it is a conflict between the **equilibrium architecture's contractive operating + point** and **long-context mixing**, with LR merely expressing it as "8e-4 lives, 9e-4 explodes." + Supporting prior: at a matched tight residual operating point EP ≥ BPTT; the apparent gap is + EP-tight vs loose-BPTT (non-contractive), and the gap grows with T. + +- **Three plateau types the user distinguished:** (1) invalid plateau (res ~1e-2, gate skipping, + λ pinned); (2) valid-but-over-contracted plateau (res tiny, cos high, gate-skip low, still far + from BP); (3) controller-fight / estimator-controller mismatch. + +- **Priority order requested:** exp3 (BPTT+ctl) → exp2 (LR-sweep) → exp4 (warmup/gate decouple) + → exp5 (λ grid) → exp6/exp7 (branch + mixing). exp1 (triangulation) foundational. + +--- + +## 1. The seven experiments — spec, what ran, result, read + +### exp3 — C512 BPTT+controller (the decisive matched cut) +- **Spec:** replicate the orphan EP recipe (resinit, qk-norm, warmup, jr_floor, res_target, gate, + pema, lr 8e-4) but swap the EP task gradient for exact BPTT. If it plateaus ~2.2–2.4 → tax is + architecture/controller; if ~1.9 → EP estimator/noise. +- **Ran:** `--mode bptt`, all else matched, 20k steps, fuse on. 
+- **Result:** descended cleanly to **best 3.85 @step 1400**, then **destabilized at step 1600** + (jr→32, res→0.07) and lodged in a broken basin (val 6.27, res 0.068 — just under the 0.1 fuse, + no abort) for 8400+ steps. Killed at step ~12.8k. **Worse than the EP orphan (3.85 vs 2.40).** +- **Read:** the answer came back neither 2.2–2.4 nor 1.9 — BPTT **broke**. At C512 the contraction + controller does **not** keep the exact gradient on the manifold; EP's implicit contraction-bias is + what kept the orphan alive. Inverts the S0 result (BPTT+ctl 1.635 < EP 1.676). *Caveat: single + lr/seed; BPTT might be stable at another lr.* + +### exp2 — one-step loss-decrease LR-sweep at the plateau ckpt +- **Spec:** fixed ckpt + batch, gradients {BPTT, EP-task, EP-task+jac}, one-step update over an lr + grid, same-graph eval ΔCE. Answers "is it LR-inequivalence." +- **Result** (ΔCE, negative = better): BPTT best **−0.161** (lr 1e-2); EP best **−0.042** (lr 1e-4), + **diverges for lr ≥ 3e-4**. cos(g_EP,g_BPTT) 0.107, k=|g_EP|/|g_BPTT| ≈ 486. +- **Read:** **not LR.** EP's best one-step descent over *all* lr is 4× below BPTT's; a magnitude + rescale can't fix a 0.10-aligned direction. + +### exp1 — gradient triangulation, multi-checkpoint (foundational) +- **Spec:** at step 0/200/800/2000/plateau, per-group cos of {g_EP, g_BPTT-150, g_BPTT-400}, + norm-ratio k, batch-to-batch variance, + res. 
+- **Result:** + + | ckpt | val | res | EP·BPTT150 | BPTT150·400 | k | EP self-cos | BPTT self-cos | + |---|---|---|---|---|---|---|---| + | step0 (random) | 9.90 | 2.1e-2 | 0.261 | 0.459 | 0.319 | +0.654 | +0.617 | + | step200 | 6.07 | 4.2e-9 | **0.989** | 1.000 | 1.000 | +0.371 | +0.376 | + | step800 | 4.11 | 3.9e-4 | 0.964 | 0.935 | 0.844 | +0.755 | +0.721 | + | step2000 | 3.56 | 2.9e-4 | 0.885 | 0.845 | 0.675 | +0.445 | +0.358 | + | **plateau (2.40)** | 2.40 | 2.6e-5 | **−0.045** | **1.000** | **4223** | **−0.273** | **+0.957** | + +- **Read (decisive):** the EP estimator is **faithful while descending** (cos 0.99→0.89, steps + 200–2000) and undergoes an **SNR/coherence collapse at the optimum** — cos→0, k→4000, and crucially + the **batch self-cos goes −0.27** (EP gradients on different batches anti-correlate) while BPTT is + +0.96. BPTT-150≡BPTT-400 at the plateau (cos 1.00; 0.46–0.94 at steps 0/800/2000) ⇒ the true gradient is well-defined there. + **The 2.40 plateau is an EP estimator bias-floor / batch-incoherence near the optimum** — not + horizon ambiguity (ruled out here), not LR (exp2). + +### exp4 — warmup / validity-gate decouple (3 arms) +- **Spec:** armA current warmup; armB full-strength contraction early + task LR still warmup; + armC no-warmup + smaller resinit / lower branch LR. Metric: gate-skip / res-under-gate / early val. +- **Result:** armA (warmup): res stuck ~1.5e-2 for ~50 steps → gate skips the nudge → **no task + learning for ~50–60 steps** (cos≈0, the reported grad is pure jacreg); cos jumps to 0.99 once + res