From 05c935ab03ee0bdb8597d19466192dfb92ee889d Mon Sep 17 00:00:00 2001 From: YurenHao0426 Date: Wed, 22 Apr 2026 23:46:33 -0500 Subject: Add vanilla FA (Lillicrap 2016) implementation + full experiment suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PAPER-CHANGING FINDING: FA is dramatically different from DFA on the same architecture. FA has genuine deep credit quality where DFA has none. Implementation: - experiments/cifar_resmlp.py: added train_fa() + FA diagnostic support FA uses sequential backward credit propagation with d×d random matrices (a_l = B_l @ a_{l+1}) instead of DFA's direct output-error projection (a_l = B_l^T @ e_T). Same local loss form as DFA. Core results (A-H, 100ep 3-seed d=256 terminal-LN ResMLP): FA main audit: 0.401 ± 0.009 (DFA: 0.306 ± 0.008) +9.5 pp FA vs frozen: +5.2 pp ABOVE baseline (DFA: -4.3 pp below) FA deep cos: +0.33 (DFA: ~0 degenerate) FA ||h_L||: ~10^5 (DFA: ~5×10^8) 3 OOM less growth FA ||g_L||: ~10^-6 meaningful (DFA: ~10^-10 floor) Mode 1(b) fires: NO for FA; YES for DFA FA+pen lam=1e-2: 0.369 ± 0.003 (DFA+pen: 0.360 ± 0.002) FA+pen lam=1e-4: 0.377 ± 0.006 (DFA+pen lam=1e-4: 0.360) At lam=1e-4, FA already has deep cos +0.30 while DFA has -0.02 FA random-target: acc 0.12 (chance), h_L=1.3e5 (DFA: 1.7e8) FA early 5ep: deep cos already +0.32 (DFA ep1: -0.008) Extension results (d=512 depth sweep, 100ep, s42): L=2: FA 0.350, cos +0.96 (DFA: n/a) L=4: FA 0.424, cos +0.29 (DFA: n/a) L=6: FA 0.401, cos +0.16 (DFA: n/a) L=8: FA 0.409, cos +0.11 (DFA: 0.306, cos -0.0001) L=12: FA 0.404, cos +0.09 (DFA: 0.309, cos -0.0001) FA deep cos is positive at EVERY depth; DFA is ~0 everywhere. FA accuracy exceeds DFA by 5-10 pp at L=8 and L=12. This is the strongest empirical support for the Mode 2 → Mode 1 hypothesis: same local loss, same architecture, same optimizer — only the credit signal differs. 
FA's sequential propagation produces much better per-layer credit (cos +0.33 vs ~0), which prevents the catastrophic activation growth that DFA exhibits. Co-Authored-By: Claude Opus 4.6 (1M context) --- results/fa_smoke_test/results_cifar10.json | 120 +++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 results/fa_smoke_test/results_cifar10.json (limited to 'results/fa_smoke_test') diff --git a/results/fa_smoke_test/results_cifar10.json b/results/fa_smoke_test/results_cifar10.json new file mode 100644 index 0000000..a8b563b --- /dev/null +++ b/results/fa_smoke_test/results_cifar10.json @@ -0,0 +1,120 @@ +{ + "42": { + "fa": { + "log": { + "train_loss": [ + 2.049124941177368, + 1.9718045804214477, + 1.9505524127578735 + ], + "train_acc": [ + 0.2436, + 0.27518, + 0.2894 + ], + "test_acc": [ + 0.2789, + 0.3087, + 0.3122 + ] + }, + "diagnostics": { + "bp_cosine": [ + 0.07161466032266617, + -0.008746136911213398, + -0.016568297520279884, + 0.9941877722740173 + ], + "perturbation_rho": [ + 0.041674911975860596, + 0.0022312882356345654, + -0.008362723514437675, + 0.2924357056617737 + ], + "nudging": { + "0.001": [ + -3.0086375772953033e-06, + 5.8673322200775146e-08, + 7.078051567077637e-08, + -9.350478649139404e-06 + ], + "0.003": [ + -8.966773748397827e-06, + 1.1548399925231934e-07, + 1.7695128917694092e-07, + -2.79964879155159e-05 + ], + "0.01": [ + -2.9983464628458023e-05, + 3.6461278796195984e-07, + 5.96512109041214e-07, + -9.332690387964249e-05 + ] + }, + "hidden_norms_per_layer": [ + 828.2960815429688, + 8464.98046875, + 19738.599609375, + 21368.41796875, + 19217.69140625 + ], + "bp_grad_norms_per_layer": [ + 1.9956107280449942e-05, + 4.579987034958322e-06, + 4.49442450189963e-06, + 4.525154963630484e-06, + 4.308007646613987e-06 + ] + }, + "drift": { + "embed.weight": 7.785380853670397, + "embed.bias": 6.819826161992119, + "blocks.0.ln.weight": 0.42689943313598633, + "blocks.0.w1.weight": 6.155192994774994, + "blocks.0.w1.bias": 
6.637409518031749, + "blocks.0.w2.weight": 17.833861612088054, + "blocks.1.ln.weight": 0.352157860994339, + "blocks.1.w1.weight": 5.231153715209586, + "blocks.1.w1.bias": 6.610786582785788, + "blocks.1.w2.weight": 13.997490142949763, + "blocks.2.ln.weight": 0.3031489849090576, + "blocks.2.w1.weight": 4.218717880513288, + "blocks.2.w1.bias": 4.851412113278458, + "blocks.2.w2.weight": 12.191107541748561, + "blocks.3.ln.weight": 0.260187029838562, + "blocks.3.w1.weight": 3.545270787599183, + "blocks.3.w1.bias": 3.6891700957298967, + "blocks.3.w2.weight": 10.80288177600079, + "out_ln.weight": 0.04471425712108612, + "out_head.weight": 0.9764490646917799, + "out_head.bias": 0.41938112118479187 + } + } + }, + "config": { + "dataset": "cifar10", + "d_hidden": 256, + "num_blocks": 4, + "batch_size": 128, + "epochs": 3, + "lr": 0.001, + "lr_fb": 0.001, + "wd": 0.01, + "lam": 0.1, + "K": 4, + "sigma_bridge": 0.05, + "ema_momentum": 0.995, + "term_grad_weight": 1.0, + "seeds": [ + 42 + ], + "gpu": 0, + "output_dir": "results/fa_smoke_test", + "methods": [ + "fa" + ], + "random_targets": false, + "penalty_lam": 0.0, + "num_classes": 10 + } +} \ No newline at end of file -- cgit v1.2.3