From 05c935ab03ee0bdb8597d19466192dfb92ee889d Mon Sep 17 00:00:00 2001
From: YurenHao0426 <Blackhao0426@gmail.com>
Date: Wed, 22 Apr 2026 23:46:33 -0500
Subject: Add vanilla FA (Lillicrap 2016) implementation + full experiment
 suite
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PAPER-CHANGING FINDING: FA is dramatically different from DFA on the
same architecture. FA has genuine deep credit quality where DFA has none.

Implementation:
- experiments/cifar_resmlp.py: added train_fa() + FA diagnostic support.
  FA propagates credit backward sequentially through d×d random matrices
  (a_l = B_l @ a_{l+1}), whereas DFA projects the output error directly
  onto each layer (a_l = B_l^T @ e_T). Both use the same local loss form
  <f_l, a_l>.

Core results (A-H, 100ep 3-seed d=256 terminal-LN ResMLP):

  FA main audit:    0.401 ± 0.009 (DFA: 0.306 ± 0.008)  +9.5 pp
  FA vs frozen:     +5.2 pp ABOVE baseline (DFA: 4.3 pp BELOW)
  FA deep cos:      +0.33 (DFA: ~0 degenerate)
  FA ||h_L||:       ~10^5 (DFA: ~5×10^8)  3-4 OOM less growth
  FA ||g_L||:       ~10^-6 meaningful (DFA: ~10^-10 floor)
  Mode 1(b) fires:  NO for FA; YES for DFA

  FA+pen lam=1e-2:  0.369 ± 0.003 (DFA+pen: 0.360 ± 0.002)
  FA+pen lam=1e-4:  0.377 ± 0.006 (DFA+pen lam=1e-4: 0.360)
    At lam=1e-4, FA already has deep cos +0.30 while DFA has -0.02

  FA random-target: acc 0.12 (chance), h_L=1.3e5 (DFA: 1.7e8)
  FA early 5ep:     deep cos already +0.32 (DFA ep1: -0.008)

Extension results (d=512 depth sweep, 100ep, s42):
  L=2:  FA 0.350, cos +0.96  (DFA: n/a)
  L=4:  FA 0.424, cos +0.29  (DFA: n/a)
  L=6:  FA 0.401, cos +0.16  (DFA: n/a)
  L=8:  FA 0.409, cos +0.11  (DFA: 0.306, cos -0.0001)
  L=12: FA 0.404, cos +0.09  (DFA: 0.309, cos -0.0001)

FA deep cos is positive at EVERY depth; DFA is ~0 at both depths where it was run (L=8, L=12).
FA accuracy exceeds DFA by ~10 pp at L=8 (10.3 pp) and L=12 (9.5 pp).

This is the strongest empirical support for the Mode 2 → Mode 1
hypothesis: same local loss, same architecture, same optimizer —
only the credit signal differs. FA's sequential propagation produces
much better per-layer credit (cos +0.33 vs ~0), which prevents the
catastrophic activation growth that DFA exhibits.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 results/fa_smoke_test/results_cifar10.json | 120 +++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 results/fa_smoke_test/results_cifar10.json

(limited to 'results/fa_smoke_test')

diff --git a/results/fa_smoke_test/results_cifar10.json b/results/fa_smoke_test/results_cifar10.json
new file mode 100644
index 0000000..a8b563b
--- /dev/null
+++ b/results/fa_smoke_test/results_cifar10.json
@@ -0,0 +1,120 @@
+{
+  "42": {
+    "fa": {
+      "log": {
+        "train_loss": [
+          2.049124941177368,
+          1.9718045804214477,
+          1.9505524127578735
+        ],
+        "train_acc": [
+          0.2436,
+          0.27518,
+          0.2894
+        ],
+        "test_acc": [
+          0.2789,
+          0.3087,
+          0.3122
+        ]
+      },
+      "diagnostics": {
+        "bp_cosine": [
+          0.07161466032266617,
+          -0.008746136911213398,
+          -0.016568297520279884,
+          0.9941877722740173
+        ],
+        "perturbation_rho": [
+          0.041674911975860596,
+          0.0022312882356345654,
+          -0.008362723514437675,
+          0.2924357056617737
+        ],
+        "nudging": {
+          "0.001": [
+            -3.0086375772953033e-06,
+            5.8673322200775146e-08,
+            7.078051567077637e-08,
+            -9.350478649139404e-06
+          ],
+          "0.003": [
+            -8.966773748397827e-06,
+            1.1548399925231934e-07,
+            1.7695128917694092e-07,
+            -2.79964879155159e-05
+          ],
+          "0.01": [
+            -2.9983464628458023e-05,
+            3.6461278796195984e-07,
+            5.96512109041214e-07,
+            -9.332690387964249e-05
+          ]
+        },
+        "hidden_norms_per_layer": [
+          828.2960815429688,
+          8464.98046875,
+          19738.599609375,
+          21368.41796875,
+          19217.69140625
+        ],
+        "bp_grad_norms_per_layer": [
+          1.9956107280449942e-05,
+          4.579987034958322e-06,
+          4.49442450189963e-06,
+          4.525154963630484e-06,
+          4.308007646613987e-06
+        ]
+      },
+      "drift": {
+        "embed.weight": 7.785380853670397,
+        "embed.bias": 6.819826161992119,
+        "blocks.0.ln.weight": 0.42689943313598633,
+        "blocks.0.w1.weight": 6.155192994774994,
+        "blocks.0.w1.bias": 6.637409518031749,
+        "blocks.0.w2.weight": 17.833861612088054,
+        "blocks.1.ln.weight": 0.352157860994339,
+        "blocks.1.w1.weight": 5.231153715209586,
+        "blocks.1.w1.bias": 6.610786582785788,
+        "blocks.1.w2.weight": 13.997490142949763,
+        "blocks.2.ln.weight": 0.3031489849090576,
+        "blocks.2.w1.weight": 4.218717880513288,
+        "blocks.2.w1.bias": 4.851412113278458,
+        "blocks.2.w2.weight": 12.191107541748561,
+        "blocks.3.ln.weight": 0.260187029838562,
+        "blocks.3.w1.weight": 3.545270787599183,
+        "blocks.3.w1.bias": 3.6891700957298967,
+        "blocks.3.w2.weight": 10.80288177600079,
+        "out_ln.weight": 0.04471425712108612,
+        "out_head.weight": 0.9764490646917799,
+        "out_head.bias": 0.41938112118479187
+      }
+    }
+  },
+  "config": {
+    "dataset": "cifar10",
+    "d_hidden": 256,
+    "num_blocks": 4,
+    "batch_size": 128,
+    "epochs": 3,
+    "lr": 0.001,
+    "lr_fb": 0.001,
+    "wd": 0.01,
+    "lam": 0.1,
+    "K": 4,
+    "sigma_bridge": 0.05,
+    "ema_momentum": 0.995,
+    "term_grad_weight": 1.0,
+    "seeds": [
+      42
+    ],
+    "gpu": 0,
+    "output_dir": "results/fa_smoke_test",
+    "methods": [
+      "fa"
+    ],
+    "random_targets": false,
+    "penalty_lam": 0.0,
+    "num_classes": 10
+  }
+}
\ No newline at end of file
-- 
cgit v1.2.3