refs/fre_rnn_full.txt


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945

                                        Published as a conference paper at ICLR 2026


                                        T OWARD P RACTICAL E QUILIBRIUM P ROPAGATION :
                                        B RAIN - INSPIRED R ECURRENT N EURAL N ETWORK
                                        WITH F EEDBACK R EGULATION AND R ESIDUAL C ON -
                                        NECTIONS

                                         Zhuo Liu                                               Tao Chen ∗
                                         School of Microelectronics                             School of Microelectronics
                                         University of Science and Technology of China          University of Science and Technology of China
                                         Hefei 230026, Anhui, China                             Hefei 230026, Anhui, China
arXiv:2508.11659v2 [cs.NE] 7 May 2026


                                         zhuoliu00@mail.ustc.edu.cn                             tchen@ustc.edu.cn


                                                                                     A BSTRACT
                                                    Brain-like intelligent systems need brain-like learning methods. Equilibrium
                                                    Propagation (EP) is a biologically plausible learning framework with strong poten-
                                                    tial for brain-inspired computing hardware. However, existing implementations
                                                    of EP suffer from instability and prohibitively high computational costs. Inspired
                                                    by the structure and dynamics of the brain, we propose a biologically plausible
                                                    Feedback-regulated REsidual recurrent neural network (FRE-RNN) and study its
                                                    learning performance in the EP framework. Feedback regulation enables rapid
                                                    convergence by attenuating feedback signals and reducing the disturbance of feed-
                                                    back paths to feedforward paths. The improvement in the convergence property
                                                    reduces the computational cost and training time of EP by orders of magnitude,
                                                    delivering performance on par with backpropagation (BP) in benchmark tasks.
                                                    Meanwhile, residual connections with brain-inspired topologies help alleviate the
                                                    vanishing gradient problem that arises when feedback pathways are weak in deep
                                                    RNNs. Our approach substantially enhances the applicability and practicality of
                                                    EP. The techniques developed here also offer guidance for implementing in-situ
                                                    learning in physical neural networks.


                                        1       I NTRODUCTION
                                        Backpropagation (BP) has been the driving force behind the success of artificial intelligence (AI)
                                        across a wide variety of tasks, ranging from image recognition to natural language processing
                                        (Rumelhart et al., 1986; Lecun, 1988; He et al., 2016; Vaswani et al., 2017). Despite these tri-
                                        umphs, BP’s reliance on non-local error signals and weight transport lacks biological plausibility
                                        (Journé et al., 2023; Ororbia, 2023). The brain does not appear to implement the gradient computa-
                                        tions performed by BP, in particular the explicit derivative of the activation function, which demands
                                        precise access to the rate of change in neuronal activities at specific operating points (Ororbia, 2023).
                                        Moreover, implementing BP in neuromorphic systems incurs enormous overhead (Kudithipudi et al.,
                                        2025). Drawing inspiration from the topology and dynamics of the brain is a viable approach to ad-
                                        vancing biologically plausible learning mechanisms and to promoting energy-efficient computing
                                        systems for AI.
                                        Equilibrium Propagation (EP) (Scellier & Bengio, 2017; Ernoult et al., 2019; Laborieux et al., 2021)
                                        presents a compelling and hardware-friendly alternative. It leverages naturally settling dynamics in
                                        RNN for credit assignment, and eliminates the need for explicit activation derivatives. EP operates
                                        in two phases with nearly identical dynamics, and the synaptic adjustments depend only on local
                                        information (Ackley et al., 1985; Movellan, 1991; Ernoult et al., 2020). In EP, the output layer is
                                        softly nudged by the prediction error toward configurations that incrementally minimize the loss
                                        function, a regime termed weak supervision (Millidge et al., 2023). A major drawback of EP is
                                            ∗
                                                Corresponding Author


                                                                                            1
Published as a conference paper at ICLR 2026


its notably slow training speed and instability. An RNN often requires dozens or even hundreds
of iterations to reach a stable state (Scellier & Bengio, 2017). Previous attempts to optimize EP’s
performance have led to markedly more complicated procedures (O’Connor et al., 2019; Laborieux
& Zenke, 2024).
In this paper, we draw inspiration from the brain and propose a Feedback-regulated REsidual recur-
rent neural network (FRE-RNN). We substantially improve the convergence properties of the RNNs
and training speed of EP while achieving performance comparable to BP. Our contributions are as
follows:

       • By scaling down the feedback strength of RNNs, we enhance the robustness of EP and
         accelerate the training and inference speed by orders of magnitude because of the improved
         convergence properties.
       • To counteract the gradient vanishing problem caused by weak feedback, we introduce resid-
         ual connections into the layered RNNs, enabling the training of deep networks that previ-
         ously challenged EP and achieving performance closer to BP.
       • The feedback regulation and residual connections in RNNs of arbitrary graph topologies
         mirror the multi-scale recurrence in biological neural networks. Our work fosters EP’s bio-
         logical plausibility and extends its applicability in brain-inspired computational hardware.


2     BACKGROUND

2.1   C ONVERGENT RNN S WITH S TATIC I NPUT

Consider an RNN as a dynamical system driven by a static input x:

                                        s[t + 1] = F (x, s[t], θ),                                    (1)

where F is the transition function, s[t] is the network state at time step t(t = 0, 1, 2, . . . , T ), and
θ denotes the parameters. Assuming that the network state stabilizes in T steps, the RNN reaches
a stable point s[T ]. Its convergence is typically guaranteed by either symmetric connections with
asynchronous updates or by a sufficiently small spectral radius of asymmetric connections with
synchronous updates (Hopfield, 1982; Yildiz et al., 2012; Liu et al., 2026). That said, other factors,
e.g. activation function, also influence the dynamical properties of RNNs (Miller & Hardt, 2019).

2.2   S CALING A DJACENCY M ATRIX TO T UNE N ETWORK DYNAMICS

Scaling the spectral radius (SR) of the adjacency matrix, the largest eigenvalue of the weight matrix,
is a common method to tune the dynamics of RNN (Bai et al., 2012; Nakajima et al., 2024; Liu et al.,
2026). A SR less than one yields stable and convergent dynamics. In this case, injected signals tend
to decay over time, which manifests as short-term memory. A SR exceeding one can give rise to
expansive or even chaotic behavior in which small perturbations are amplified. By adjusting SR,
one can bias the RNN toward convergent, oscillatory, or edge-of-chaos regimes, thereby tuning
computational properties, such as convergence speed or long-term memory capacity. (Jaeger &
Haas, 2004; Legenstein & Maass, 2007; Miller & Hardt, 2019).

2.3   E QUILIBRIUM P ROPAGATION

Equilibrium propagation is a learning framework initially based on energy-based models. It proceeds
in two phases: a free (first) phase and a weakly clamped (second) phase. For the first phase, the
RNN converges to a steady state s0 under the stimulation of input alone. In the clamped phase, the
network is gently nudged by the prediction error and settles to a new stable state sβ . The weight
update can be simplified to a contrastive learning compatible with spiking-time-dependent plasticity
(STDP) (Scellier et al., 2018). EP has been further generalized to asymmetric RNNs governed by
vector field dynamics (Scellier et al., 2018). Recent work shows that asymmetry in skew-symmetric
Hopfield models can improve classification performance (Høier et al., 2024).


                                                    2
Published as a conference paper at ICLR 2026


2.4   F EEDBACK R EGULATION AND N ETWORK S TRUCTURE IN THE B RAIN

Cortical areas in the brain feature dynamic regulation of feedforward and feedback connections
(Felleman & Van Essen, 1991; Mejias et al., 2016; Michalareas et al., 2016; Semedo et al., 2022;
Fişek et al., 2023; Wang et al., 2023). In the visual system, for instance, feedforward signals domi-
nate immediately following the onset of external stimulus, whereas feedback signals become promi-
nent during spontaneous activity. Dynamically regulating the strength of feedback allows the brain
to optimize information integration, ensuring efficient perception and decision-making. In mam-
malian neocortices, information processing involves not only feedforward synaptic chains but also
extensive lateral and feedback loops that interconnect disparate regions, forming a richly recursive
network rather than a strictly layered structure. This topology implies short average path length
between neurons and efficient information flow (Watts & Strogatz, 1998; Markov et al., 2013; Lynn
& Bassett, 2019; Kulkarni & Bassett, 2025). In deep neural networks, residual connections reflect
the long-range skip-layer projections observed in cortical circuits (Perich & Rajan, 2020; Holk &
Mejias, 2024). They mitigate the vanishing gradient by providing skip pathways that preserve gra-
dient (He et al., 2016).

3     ACCELERATING EP WITH B RAIN - INSPIRED N ETWORK P ROPERTIES

                         (a)
                           𝑠0                𝑠1                  𝑠2              Predict: 𝑠𝑝
                                  𝑊0               𝛼1 𝑊1               𝑊𝑓                      Label: 𝑠𝑡


                                                   𝛽1 𝐵1              𝛽𝑓 𝐵𝑓
                                                                                 Error: 𝑒𝑝 = 𝑠𝑡 − 𝑠𝑝
                         (b)
                           𝑠0                𝑠1                  𝑠2         Predict: 𝑠𝑝
                                𝐶𝑜𝑛𝑣0             𝑃1 , 𝐶𝑜𝑛𝑣1          𝑃2 , 𝑊𝑓                  Label: 𝑠𝑡
                                (32,5,1,0)        2 , 64,5,1,0        (2)


                                                                            𝑇
                                             𝐶𝑜𝑛𝑣𝑇1 , 𝑃1−1 , 𝛽1       𝑊𝑓        , 𝑃2−1 , 𝛽𝑓
                                                                                   Error: 𝑒𝑝 = 𝑠𝑡 − 𝑠𝑝

Figure 1: Illustration of feedback and feedforward regulation. (a) Layered architecture of RNN. The
feedforward weights Wi and feedback weights Bi are rescaled by coefficients αi and βi respectively.
The dashed box encloses an RNN formed by layers s1 and s2 with feedforward and feedback path-
ways. βf is the nudging factor, which essentially scales the feedback strength of prediction error.
(b) Embedding convolutional architecture in RNN. Convolutional parameter (32,5,1,0) is written
as (channels, kernels, stride, padding). Parameter (2) in (b) denotes max-pooling with stride 2.
ConvTi represents transpose convolution, the inverse process of the convolution, and Pi−1 means
max-unpooling (Ernoult et al., 2019). Model architectures and training process are given in Ap-
pendix D.


3.1   P ROTOTYPICAL SETTING OF EQUILIBRIUM PROPAGATION

Unlike the prototypical setting of equilibrium propagation (P-EP) (Ernoult et al., 2019), we separate
the input and output layer from the recurrent network (Figure 1a). This separation allows the output
layer to adopt the SoftMax activation commonly used in feedforward networks, which facilitates
performance comparison (Laborieux & Zenke, 2024). For clarity, the RNN (black dashed box in
Figure 1a) shown here only contains two hidden layers s1 and s2 , but the approach applies to deeper
structures (see below). The states of the RNN evolve for T discrete steps until they converge. The
dynamics of the whole RNN can be formulated as:
                           sβf [t + 1] = F (sβf [t], b) = ρ(W · sβf [t] + b),
                                    b = [W0 · s0 , βf · Bf · ep ],                                         (2)


                                                                 3
Published as a conference paper at ICLR 2026


where sβf [t] is the state of the RNN at time t, ρ is the activation function, W is the forward weight
matrix of the RNN, and b combines the feedforward input and the error-nudging term. Note that
           β    β
sβf = [s1 f , s2 f ]. For each sample-label pair (x, star ), we run the free phase (βf = 0) for te
iterations, obtain the prediction sp = SoftMax(Wf · s2 ), and compute the prediction error ep =
star −sp . During the clamped phase, the error nudges the RNN through the feedback weights Bf and
scaling coefficient βf = βf 1 (βf 1 = 0.1 for layered architecture and βf 1 = 0.25 for convolutional
architecture by default). The network evolves for K further iterations under clamping to another
state. The weights (W0 , W1 ) are then updated with an STDP-compatible rule:
                                                                                           β
                                   ∆Wi = dsi+1 · (s0i )⊤ ,                        f1
                                                                        dsi+1 = si+1 − s0i+1 ,                           (3)
where dsi is the offset of stable point caused by the error nudging (Scellier et al., 2018). Similarly,
the final weight for output is updated:

                                                 ∆Wf = (star − s0p ) · (s02 )⊤ .                                         (4)
We also consider an RNN embedded with convolutional architecture in its forward paths (2 convo-
lution layers, 2 max-pooling layers and 1 fully connected layer) shown in Figure 1b. The forward
convolutional structure follows the architecture of existing convolutional neural networks (CNN)
(Krizhevsky et al., 2012; Simonyan & Zisserman, 2015), in which a pooling layer is placed after
the activation of the convolution layer. We transform the CNN to an RNN by adding feedback con-
nections symmetric with the feed-forward connections (See Appendix D for the pseudocode and
schematics).

3.2   F EEDBACK R EGULATION IN L AYERED RNN FOR FAST C ONVERGENCE

                  (a) 0                                                (b) 0
                 Index


                                                                    Index


                         100                                                100
                  (c) 0                                      100
                                                                       (d) 0                                      100
                 Index


                                                                    Index


                         100                                                100                                   10 2
                  (e) 0                                      10 1
                                                                       (f) 0
                                                                                                                  10 4
                 Index


                                                                    Index


                    100                                                  100
                  (g) 0                                      10 2
                                                                       (h) 0                                      10 6
                 Index


                                                                    Index


                         100                                                100
                               0   20   40       60   80                          0   20       40       60   80
                                             t                                                      t


Figure 2: Convergence dynamics and speed versus feedback scaling βi . All neurons in all hidden
layers are indexed (s1 :0-63; s2 :64-127). Colors indicate neuronal activity (a,c,e,g) and changes in
activity (b,d,f,h). (a) The state evolution of RNN with symmetric weights and βi = 0.1; (b) The
one-step difference of neural states in (a). (c, d) Symmetric weights with βi = 2; (e, f) Asymmetric
weights with βi = 0.1; (g, h) Asymmetric weights with βi = 4. In both symmetric and asymmetric
feedback cases, down-scaling feedback connections tends to stabilize the network. See Figure 5d
for the statistical robustness.

Although the SR can tune the RNN dynamics, scaling forward weights Wi distorts forward signal
propagation, which is harmful to performance (see below). Therefore, we turn to another choice,
namely, scaling only the feedback strength with βi . This coefficient scales the gradients, in the same
way as the nudging factor βf . We consider both symmetric (Bi = (Wi )⊤ ) and asymmetric (Bi ̸=
(Wi )⊤ ) recurrent connections in the study, and compare the results with FNNs of the same size
trained by BP (feedback connections removed) or feedback alignment (FA) (Lillicrap et al., 2016)
that uses random weights Bi ̸= (Wi )⊤ to feedback the error. Note that, after scaling, the overall
weight matrix W of a symmetric RNN is no longer strictly symmetric. Therefore, we started from
the vector field setting of EP rather then the energy-based setting in the first place. The feedforward
and feedback weights are multiplied by coefficients αi and βi respectively. Figure 2a-d shows
convergence speed for different βi . With asymmetric weights, the network can converge to a fixed
point (Figure 2e, f), exhibit cyclical oscillation (Figure 2g, h), or even become chaotic. The feedback
weights stay fixed during training process, which differs from EP in vector field dynamics (Scellier


                                                                   4
Published as a conference paper at ICLR 2026


et al., 2018). The pseudocode of learning procedure with a 2-hidden-layer RNN shown in Figure 1(a)
is provided in Algorithm 1.

Algorithm 1 EP with Feedforward and Feedback Scaling
Require: Input: (x, star )
Require: Parameters: θ = [W0 , W1 , Wf , Bf , B1 , α1 , β1 , βf 1 ]
Ensure: Updated parameters θ
 1: function F IRST- PHASE(θ, star )
 2:    s0 ← x
 3:    for t = 1 to T do
 4:         h1 ← W0 · s0 + β1 · B1 · s02
 5:         h2 ← α1 · W1 · s01
 6:         hp ← Wf · s02
 7:         s01 , s02 , s0p ← ρ(h1 ), ρ(h2 ), SoftMax(hp )
 8:    end for
 9:    Λ1 ← [s0i ], i = 0, 1, 2, p
10:    return Λ1
11: end function
12: function S ECOND - PHASE(θ, Λ1 , star )
         β        β        β
13:    s1 f 1 , s2 f 1 , sp f 1 ← s01 , s02 , s0p
14:    for t = 1 to K do
                                   β
15:         ep ← star − sp f 1
                                                  β
16:         h1 ← W0 · s0 + β1 · B1 · s2 f 1
                                     βf 1
17:         h2 ← α1 · W1 · s1 + βf · Bf · ep
                                 β
18:         hp ← Wf · s2 f 1
              βf 1      βf 1    βf 1
19:         s1 , s2 , sp ← ρ(h1 ), ρ(h2 ), SoftMax(hp )
20:    end for
                    β
21:    dsi ← si f 1 − s0i , i = 1, 2
22:    Λ2 ← [ds1 , ds2 ]
23:    return Λ2
24: end function
25: function U PDATING -W EIGHTS(θ, Λ1 , Λ2 , star )
26:    ∆Wi ← dsi+1 · (s0i )⊤ , i = 0, 1
27:    ∆Wf ← (star − s0p ) · (s02 )⊤
28: end function


3.3    R ESIDUAL C ONNECTIONS TO AVOID VANISHING G RADIENTS

In our 10-hidden-layer RNN with symmetric connections, we add cross layer residual links (Fig-
ure 3a-b) and carry out ablation study on their effects in performance. The three long-range bidi-
rectional connections bypass adjacent layers to reduce gradient decay. For RNN with asymmet-
ric connections, we introduce skip-layer connections between non-adjacent layers with P = 20%
probability, creating an RNN with arbitrary graph topologies (AGT) where any pair of layers form
connections stochastically (Figure 3c) (Salvatori et al., 2022). (See Algorithm S3 in Appendix D for
training detail)

4     E XPERIMENTS
We evaluated our RNN models on MNIST and CIFAR-10 datasets and compared the results with P-
EP and BP. The MNIST dataset consists of 70,000 grayscale handwritten digit images (28×28 pixels)
split into 60,000 training and 10,000 test samples. CIFAR-10 contains 60,000 RGB images (32×32
pixels) of 10 categories, divided into 50,000 training and 10,000 test samples. Pre-processing, net-
work structures and additional training details are in Appendix D.

4.1    I NFLUENCE OF F EEDFORWARD S CALING AND F EEDBACK S CALING

Figure 4 compares the effects of feedforward scaling αi and feedback scaling βi . In general, relative
small feedback scaling (βi = 0.1) yields high MNIST accuracy (Figure 4). In deeper RNNs, overly


                                                         5
Published as a conference paper at ICLR 2026


Figure 3: (a) A 10-hidden-layer RNN model with residual connections. The solid blue wires and
the dashed orange wires represent forward and feedback residual connections respectively. The
bidirectional connections are symmetric. (b) Adjacency matrix of (a). The blocks (green) other than
the sub-diagonals indicate residual connections. (c) Adjacency matrix for an RNN with arbitrary
graph topology.


                 (a)               2HL-Test accuracy                 (b)              3HL-Test accuracy                (c)                5HL-Test accuracy
                                                                                                                                                                                           1.00
                         0.001 0.9659 0.9730 0.9756 0.9753                 0.001 0.9348 0.9692 0.9718 0.9555                 0.001 0.3844 0.7980 0.9338 0.8048
                                                                                                                                                                                           0.95
                          0.01 0.9666 0.9723 0.9760 0.9758                  0.01 0.9246 0.9679 0.9765 0.9715                  0.01 0.2515 0.9365 0.9575 0.8583                             0.90
                     i


                                                                      i


                                                                                                                        i


                           0.1 0.9624 0.9711 0.9725 0.9694                   0.1 0.6323 0.9497 0.9741 0.9702                      0.1 0.2104 0.8386 0.9757 0.9332                          0.85

                                                                                                                                                                                           0.80
                           1.0 0.8925 0.9127 0.9300 0.7978                   1.0 0.4862 0.8739 0.5249 0.1676                      1.0 0.2096 0.3891 0.2500 0.1324
                                                                                                                                                                                           0.75
                                0.01   0.1       1.0         4.0                    0.01   0.1       1.0         4.0                   0.01   0.1                      1.0   4.0
                                             i                                                   i                                                                 i


Figure 4: The influence of feedforward scaling αi and feedback scaling βi on accuracy of MNIST
classification. (a) 2 hidden layers; (b) 3 hidden layers; (c) 5 hidden layers. Each layer has 64
neurons. By default, T = 10×NHiddenLayer , K = T /2. Each result is averaged over five repetitive
experiments.


   (a)                                                 (b)                                           (c)                                       (d)
               100                                                                                           1
                                                                                                                                                                   102
                                                               8                                             0
                                                                                                                                                Convergence time


                                                               7
                                                               6                                             1
 Test Error


                                                                                                     FTMLE


              10 1                                             5
                                                         SR


                                                               4                                             2                                                     101         symm-before training
                                                               3                                                                                                               asymm-before training
                                                               2                                             3                                                                 symm-after training
                                                               1                                                                                                               asymm-after training
              10 2                                             0                                             4                                                     100
                       01
                         1
                      0.1

                         5
                         1
                         2
                         4


                                                                     01
                                                                       1
                                                                    0.1

                                                                       5
                                                                       1
                                                                       2
                                                                       4


                                                                                                                   01
                                                                                                                     1
                                                                                                                  0.1

                                                                                                                     5
                                                                                                                     1
                                                                                                                     2
                                                                                                                     4


                                                                                                                                                                           01
                                                                                                                                                                             1
                                                                                                                                                                          0.1

                                                                                                                                                                             5
                                                                                                                                                                             1
                                                                                                                                                                             2
                                                                                                                                                                             4
                     0.0

                     0.2


                                                                   0.0

                                                                   0.2


                                                                                                                 0.0

                                                                                                                 0.2


                                                                                                                                                                         0.0

                                                                                                                                                                         0.2
                0.0


                                                               0.0


                                                                                                             0.0


                                                                                                                                                                       0.0


                                   i                                            i                                             i                                                        i


Figure 5: The test error, SR, FTMLE, and convergence time versus feedback scaling βi . The results
are obtained from a 3-hidden-layer (64 neurons per layer) model trained on MNIST dataset. Note
that the network does not converge under certain conditions, resulting in missing value in d.


                                                                                                 6
Published as a conference paper at ICLR 2026


low feedback scaling βi jeopardizes the performance, which we attribute to vanishing gradients
(Figure 4c, right two columns). In contrast, down-scaling the feedforward weights degrades perfor-
mance, as the inference signals are weakened through the layers (see rows of Figure 4a). However,
up-scaling αi can also be detrimental, as this easily leads to saturation of neural state. The best per-
formance of a 5-hidden-layer RNN is achieved without feedforward scaling αi = 1 and a trade-off
in feedback scaling at βi = 0.1. These results suggest that balancing the feedforward and feedback
strengths is critical for better performance, not only accuracy but also speed (see Table 1).
To further investigate the influence of feedback scaling βi , we plot the error, SR, finite time max-
imum Lyapunov exponent (FTMLE) (Shadden et al., 2005; Kanno & Uchida, 2014) and conver-
gence time against feedback scaling coefficient before and after training of a 3-hidden-layer RNN
on MNIST (Figure 5). It shows that larger feedback scaling βi decreases accuracy (Figure 5a). As
expected, SR is positively correlated to βi (see Figure 5b), and large SR can lead to instability of
an RNN indicated by the FTMLE shown in Figure 5c, which in turn explains the results in Figure
5a. In general, down-scaling the feedback (βi < 1) reduces the convergence time of RNN, which is
favorable. Note that up-scaling of feedback βi >1 can also decrease FTMLE and convergence time.
However, this is attributed to the saturation of neural state, and will also lower the performance.
Additionally, one might suspect that the gradient signals in the lower layers are not fulfilling their
intended role. In reservoir computing, where only the last layer is trained, the network can also
reach high accuracy as long as the output dimension is large enough. However, this is unlikely in
our case, as each layer in our network has only 64 neurons by default (other than the results in Table
1). To further confirm that the learning in lower layers is meaningful, we performed training with
the weights of lower layers frozen—details of these experiments are included in Appendix C.5. The
results clearly show that getting comparable results to BP requires effective training of lower layers.

      (a) 0                      i=0.01                             (b) 0                           i=0.1                             (c) 0                             i=1.0                         (d) 0                              i=4.0
                 10                                                             10                                                               10                                                               10
Testing Error


                                                               Testing Error


                                                                                                                                Testing Error


                10 1                                                           10 1                                                             10 1                                             Testing Error   10 1


                                10           50                                                10             50                                                   10            50                                                 10            50
                                20           100                                               20             100                                                  20            100                                                20            100
                10 2                                                           10 2                                                             10 2                                                             10 2
                       0   10   20    30           40     50                          0   10   20    30             40     50                          0   10      20    30            40   50                          0   10      20    30            40   50
                                 Epoch                                                          Epoch                                                               Epoch                                                            Epoch
      (e) 0                     2 Layers                               (f) 0                   3 Layers                              (g) 0                         5 Layers                           (h) 0                         10 Layers
                 10                                                             10                                                               10                                                               10


                                     0.001              1.0                                           0.001              1.0
Testing Error


                                                               Testing Error


                                                                                                                                Testing Error


                                                                                                                                                                                                 Testing Error


                                     0.01               2.0                                           0.01               2.0
                10 1                 0.1                4.0                    10 1                   0.1                4.0                    10 1                                                             10 1
                                     0.25                                                             0.25
                                                                                                                                                           0.001           1.0                                              0.001           1.0
                                                                                                                                                           0.01            2.0                                              0.01            2.0
                                                                                                                                                           0.1             4.0                                              0.1             4.0
                                                                                                                                                           0.25                                                             0.25
                10 2                                                           10 2                                                             10 2                                                             10 2
                       0   10   20    30           40     50                          0   10   20    30             40     50                          0   10      20    30            40   50                          0   10      20    30            40   50
                                 Epoch                                                          Epoch                                                               Epoch                                                            Epoch


Figure 6: Test error with different hyperparameters. The curves of different T (10, 20, 50, 100) with
2 hidden layers (64 neurons per hidden layer) and (a) βi = 0.01; (b) βi = 0.1; (c) βi = 1; (d)
βi = 4. The curves of different βi (0.001, 0.01, 0.1, 0.25, 1, 2, 4) with (e) 2 hidden layers; (f) 3
hidden layers; (g) 5 hidden layers; (h) 10 hidden layers. The shaded areas represent deviations of
five repeated experiments. By default, T = 10 × NHiddenLayer , K = T /2. See Appendix A for
more information.


4.2                    D OWN - SCALING F EEDBACK L EADS TO FASTER C ONVERGENCE

Figure 6a-d plots the error versus the number of epochs with different iteration steps T . Under the
condition of βi = 0.01 (Figure 6a), the model with T = 10 and K = 5 works as well as the model
with T = 100 and K = 50, suggesting possibility of speedup in training. Larger βi requires more
iterations to achieve a certain level of performance (See Figure 6b, c, d). Larger βi means larger SR
and FTMLE, thus requiring more iterations to settle the RNN as shown in Figure 2 and Figure 5(b-


                                                                                                                               7
Published as a conference paper at ICLR 2026


Table 1: Comparison with P-EP and BP in accuracy and cost. The results of P-EP come from
previous work (Ernoult et al., 2019). For BP results, we used a network with the same number
of layers and number of nodes/channels. Each experiment is repeated five times, and the standard
deviation is given. By default, βi = 0.01 in our results, the feedback weights are symmetric with
the feedforward weights for P-EP and Ours, and the learning rate in all layers are the same except
for Ours-DLR (different learning rate), which uses varying learning rates identical to that of P-EP.
For 2HL (two hidden layers) and 3HL (three hidden layers), there are 512 nodes per hidden layer.
See Appendix D for more details.
                                                                Epoch / Batch size Wall Clock Time
 Architecture   Training approach        Testing (Training)
                                                                       -T/K         (HH:MM:SS)
                 P-EP (sigmoid-s)      98.05%±0.10% (99.86%)      50/20-100/20          1:56:-
     2HL         Ours (tanh, Adam)    98.39%±0.04% (100.00%)       50/500-10/10        0:01:16
                  BP (tanh, Adam)     98.26%±0.06% (100.00%)        50/500-1/1         0:00:18
                 P-EP (sigmoid-s)      97.99%±0.18% (99.90%)      100/20-180/20         8:27:-
                 Ours-DLR (tanh)       97.65%±0.08% (98.93%)       100/20-18/10        1:01:14
     3HL            Ours (tanh)       97.83%±0.13% (99.98%)        100/20-18/10        1:01:54
                 Ours (tanh, Adam)    98.36%±0.06% (100.00%)       50/500-18/10        0:02:11
                  BP (tanh, Adam)     98.36%±0.08% (100.00%)        50/500-1/1         0:00:24
                P-EP (hard-sigmoid)    98.98%±0.04% (99.46%)      40/20-200/10          8:58:-
     Conv       Ours (hard-sigmoid)    99.14%±0.02% (99.78%)       40/128-20/10        0:12:28
                 BP (hard-sigmoid)    98.93%±0.18% (99.43%)         40/128-1/1         0:01:01


Table 2: Comparison with BP and FA and ablation study of residual connection. For layered
architecture, there are 64 nodes per hidden layer and we chose T = 10 × NHiddenLayer , and
K = 5 × NHiddenLayer , which guarantees saturation of accuracy at βi = 0.1. For convolutional
architectures, βi = 0.01. By default, the Adam optimizer is used. Each experiment is repeated five
times. See Appendix D for more training details.
  Architecture
                  Training approach MNIST-Testing (Training) CIFAR-10-Testing (Training)
  -connections
                         BP             97.69%±0.10% (100.00%)       49.23%±0.81% (56.72%)
     5-symm
                         Ours            97.64%±0.10% (99.98%)       50.72%±0.17% (57.02%)
                          FA             96.44%±0.10% (98.96%)       37.97%±2.18% (38.92%)
    5-asymm
                         Ours            96.37%±0.11% (97.99%)       45.27%±0.73% (46.79%)
                         BP              97.61%±0.04% (99.93%)       48.23%±1.26% (55.37%)
    10-symm              Ours            92.49%±0.32% (95.27%)       34.90%±0.38% (34.64%)
                    Ours-Residual        97.49%±0.05% (99.77%)       44.46%±0.51% (48.67%)
                          FA             94.52%±0.26% (95.54%)       30.16%±6.12% (30.20%)
    10-asymm             Ours            87.37%±0.49% (87.95%)       30.37%±1.09% (29.97%)
                      Ours-AGT           96.87%±0.11% (99.45%)       30.94%±4.90% (31.36%)
                         BP              97.48%±0.07% (99.74%)       47.35%±1.49% (54.59%)
    20-symm
                    Ours-Residual        95.95%±0.18% (98.20%)       43.61%±1.17% (44.26%)
                         BP              99.34%±0.04% (99.97%)       75.45%±0.46% (83.61%)
      Conv
                         Ours            99.27%±0.07% (99.78%)       75.04%±0.51% (80.79%)


d). Or even worse, the gradient signal is completely distorted. At βi = 4, even T=100 fails to exceed
95% accuracy. Figure 6e-h shows that while shallow networks benefit from low βi , deeper networks
(3, 5 and 10 layers) lose accuracy. In all cases, training performance peaks at certain βi dependent
on the network depth. Additional results are provided in Table S1 in Appendix B.
Table 1 compares our approach with P-EP, BP, and FA. Our model supersedes P-EP in training
speed by at least one order of magnitude for both convolutional architecture and layered architecture.
Importantly, our accuracy is comparable to BP and FA for the shallow architectures (5-hidden-layer
and conv model, see also Table 2). In consideration of the improved stability (Figure 6) via feedback
regulation, we anticipate that physical implementations of RNN can achieve performance on par


                                                  8
Published as a conference paper at ICLR 2026


with BP. Additionally, for layered architecture, we also adopt the same training parameters (learning
rate, batch size and epochs) as P-EP, differing only in feedback scaling (‘ours-DLR’ in Table 1). The
results present clear evidence of speedup, which mainly stems from the reduced number of iterations
required for convergence.


4.3   D OWN - SCALED F EEDBACK C OORDINATES P LASTICITY OF D IFFERENT L AYERS

It is hypothesized that the brain requires different plasticity in different areas due to their varying
functional roles (Atallah et al., 2004; Lowet et al., 2020). The variability in plasticity can be realized
explicitly by adjusting learning rates or implicitly by modulating the intensity of gradient. Previ-
ous work postulated that EP with weak feedback necessitates learning rates differing by orders of
magnitude across layers (Scellier & Bengio, 2017). Here, we found that due to gradient differences
across different layers induced by weak feedback, a 3-hidden-layer RNN at βi = 0.01 (Table 1,
‘ours (tanh)’) learns well with a uniform learning rate. This result suggests that the feedback scaling
alone is able to regulate gradient strength of different layers, pointing to another possible mechanism
to coordinate plasticity.


4.4   R ESIDUAL C ONNECTIONS OVERCOME THE G RADIENT VANISHING IN D EEP RNN S

Weak feedback exacerbates vanishing gradient in deeper layered RNN (Figures S5–S6 in Ap-
pendix B). Adding residual connections restores gradient flow (Figure S7 in Appendix B). As a
result, a 10-hidden-layer network sees substantial performance gains (Table 2), 5% increase in ac-
curacy for MNIST and 9% for CIFAR-10. Even 20-hidden-layer model can be trained. As shown
in Table 2, without residual connections, an asymmetric RNN trained by EP falls short of FA in
accuracy, but arbitrary residual links surpass the accuracy of FA for the MNIST classification (See
ablation study on connection probability in Appendix B.). For more complex dataset CIFAR-10, the
10-hidden-layer asymmetric model with residual random feedback connections achieves accuracy
nearly 14% below the symmetric model. A possible reason is that the gradient signal through mul-
tiple random fixed feedback connections becomes too distorted by error to coordinate the forward
weight learning.


5     D ISCUSSION

We have applied the feedback scaling to RNN to speed up the convergence and to accelerate training
with EP with negligible overhead. To counteract the vanishing gradient in deep architectures, we
have added residual connections to non-adjacent layers of deep RNNs, partly restoring classification
performance. In principle, the residual connections make credit assignment pathways shorter (Veit
et al., 2016). The training exhibits remarkable resilience to noise on weight and neural state. Our
structural modification is compatible with other algorithmic speed-ups (Scellier et al., 2023), thereby
expanding the design space for efficient EP implementations.
Recent work on credit assignment in brain-inspired networks, e.g. adjoint propagation (Liu et al.,
2026), partitions a large network into local RNNs with random internal connections of low SR
for fast convergence and dynamic resource allocation, yielding speed and accuracy similar to this
work. This work, however, adopts the feedback scaling to solve the stability issue and accelerate
convergence of EP.
Weak feedback is often considered in biologically plausible learning algorithms (Sacramento et al.,
2018; Haider et al., 2021; Meulemans et al., 2021). It has been shown that contrastive Hebbian
learning with weak feedback approximates backpropagation while converging quickly (Xie & Se-
ung, 2003). More recently, local representation alignment (LRA) likewise employed weak feedback
(Ororbia et al., 2023) and skip connections from the output to deep layers for efficient training. The
EP framework also approximates BP (Scellier & Bengio, 2017; Millidge et al., 2023), but under
the weak clamping condition (weak supervision) (Laborieux et al., 2021; Millidge et al., 2023). We
have shown that, at the infinitesimal inference limit, namely weak supervision and weak feedback
(Millidge et al., 2023), EP is equivalent to LRA and BP (Appendix C). In other words, the dynamics
of FRE-RNN is more like the feedforward neural network due to its weak feedback.


                                                    9
Published as a conference paper at ICLR 2026


However, there are still a few limitations to our approaches for large-scale neural networks that
underpin artificial intelligence. For complex datasets like CIFAR-10, there exists a notable perfor-
mance gap compared to BP, using deep fully connected neural networks. We attribute this gap to
the inaccurate approximation to the true gradient as computed by BP (See Appendix C.4). There-
fore, although EP can be extended to deep fully connected network (20-hidden-layers) and shallow
CNNs, its applicability for deep CNN remains to be explored. For deep architectures with asymmet-
ric connections, the accuracy decreases faster with increasing depth due to the inaccurate random
error feedback. More in-depth investigation on residual connection topology is required to scale
up the methodology to large scale deep architectures. Besides, the hyperparameters are optimized
empirically. We find a feedback scaling in the range of 0.01-0.1 is favorable for shallow networks
(less than 4 layers) and 0.1-0.25 for deeper architectures. Finding a general way to determine these
parameters is still on-going. Additionally, existing research on EP converging naturally continues
to focus primarily on static-input settings (Laborieux et al., 2021; Ernoult et al., 2019; Laborieux
& Zenke, 2024). Extending naturally converging RNN trained by EP to sequence tasks remains a
challenge.
From a neurobiological perspective, residual connections, particularly the randomly generated arbi-
trary graph topologies, yield cortex-like connectivity patterns in the brain. The feedback-regulated
residual RNNs equip the biologically plausible learning framework, EP, with biologically plausible
network architecture. Although it currently runs on GPUs, it can exploit the natural convergence of
physical RNNs and facilitate efficient learning and inference on dedicated neuromorphic hardware.

ACKNOWLEDGEMENTS
This work was supported by the National Key R&D Program of China (Grant No.
2024YFA1208804). Additional financial support from the University of Science and Technology
of China and the Chinese Academy of Sciences is also gratefully acknowledged.

C ODE AVAILABILITY
The code used in this work is available at https://github.com/Zero0Hero/
FRE-RNN-EP.

R EPRODUCIBILITY STATEMENT
The code necessary to reproduce the main results is provided as Jupyter Notebooks in the Supple-
mentary Materials. Researchers can directly run them to reproduce the results. Further details on
data pre-processing and training process are available within the provided code and in Appendix D.

T HE U SE OF L ARGE L ANGUAGE M ODELS (LLM S )
In the preparation of this work, the authors used GPT-5 and DeepSeek solely for the purpose of
polishing and improving the linguistic fluency and readability of the text. This includes tasks such
as correcting grammar and rephrasing sentences. After using the model, the authors have reviewed
and edited all content extensively and take full responsibility for all ideas, claims, and the final
language presented in this paper.

R EFERENCES
David H. Ackley, Geoffrey E. Hinton, and Terrence J. Sejnowski. A learning algorithm for boltz-
  mann machines. Cognitive Science, 9(1):147–169, 1985.
Hisham E. Atallah, Michael J. Frank, and Randall C. O’Reilly. Hippocampus, cortex, and basal
  ganglia: Insights from computational models of complementary learning systems. Neurobi-
  ology of Learning and Memory, 82(3):253–267, 2004. ISSN 1074-7427. doi: https://doi.
  org/10.1016/j.nlm.2004.06.004. URL https://www.sciencedirect.com/science/
  article/pii/S1074742704000693.
Zhang Bai, D. J. Miller, and Wang Yue. Nonlinear system modeling with random matrices: Echo
  state networks revisited. IEEE Transactions on Neural Networks and Learning Systems, 23(1):
  175–182, 2012. ISSN 2162-237X 2162-2388. doi: 10.1109/tnnls.2011.2178562.


                                                10
Published as a conference paper at ICLR 2026


Maxence Ernoult, Julie Grollier, Damien Querlioz, Yoshua Bengio, and Benjamin Scellier. Updates
 of equilibrium prop match gradients of backprop through time in an rnn with static input. In
 Proceedings of the 33rd International Conference on Neural Information Processing Systems, pp.
 Article 636. Curran Associates Inc., 2019.
Maxence Ernoult, Julie Grollier, Damien Querlioz, Yoshua Bengio, and Benjamin Scellier. Equilib-
 rium propagation with continual weight updates, 2020. URL https://openreview.net/
 forum?id=H1xJhJStPS.
D. J. Felleman and D. C. Van Essen. Distributed hierarchical processing in the primate cerebral
  cortex. Cerebral Cortex, 1(1):1–47, 1991. ISSN 1047-3211. doi: 10.1093/cercor/1.1.1. URL
  <GotoISI>://WOS:000208047200002.
Mehmet Fişek, Dustin Herrmann, Alexander Egea-Weiss, Matilda Cloves, Lisa Bauer, Tai-Ying
 Lee, Lloyd E. Russell, and Michael Häusser. Cortico-cortical feedback engages active den-
 drites in visual cortex. Nature, 617(7962):769–776, 2023. ISSN 1476-4687. doi: 10.1038/
 s41586-023-06007-6. URL https://doi.org/10.1038/s41586-023-06007-6.
Paul Haider, Benjamin Ellenberger, Laura Kriener, Jakob Jordan, Walter Senn, and Mihai A. Petro-
  vici. Latent equilibrium: a unified learning theory for arbitrarily fast computation with arbitrarily
  slow neurons. In Proceedings of the 35th International Conference on Neural Information Pro-
  cessing Systems, pp. Article 1365. Curran Associates Inc., 2021.
K. He, X. Zhang, S. Ren, and J. Sun. Deep residual learning for image recognition. In 2016 IEEE
  Conference on Computer Vision and Pattern Recognition (CVPR), pp. 770–778, 2016. ISBN
  1063-6919. doi: 10.1109/CVPR.2016.90.
Maya van Holk and Jorge F Mejias. Biologically plausible models of cognitive flexibility: merging
 recurrent neural networks with full-brain dynamics. Current Opinion in Behavioral Sciences, 56:
 101351, 2024. doi: 10.1016/j.cobeha.2024.101351. van Holk, Maya; Mejias, Jorge F. ORCIDs:
 0000-0003-4930-1472 2352-1546 2352-1554.
J. J. Hopfield. Neural networks and physical systems with emergent collective computational abili-
   ties. Proceedings of the National Academy of Sciences, 79(8):2554–2558, 1982. ISSN 0027-8424
   1091-6490. doi: 10.1073/pnas.79.8.2554.
Rasmus Høier, Kirill Kalinin, Maxence Ernoult, and Christopher Zach. Dyadic learning in recur-
  rent and feedforward models. In NeurIPS 2024 Workshop Machine Learning with new Compute
  Paradigms, 2024. URL https://openreview.net/forum?id=LNfWowAErI.
Herbert Jaeger and Harald Haas. Harnessing nonlinearity: Predicting chaotic systems and sav-
  ing energy in wireless communication. Science, 304(5667):78–80, 2004. doi: 10.1126/science.
  1091277. URL https://doi.org/10.1126/science.1091277. doi: 10.1126/sci-
  ence.1091277.
Adrien Journé, Hector Garcia Rodriguez, Qinghai Guo, and Timoleon Moraitis. Hebbian deep learn-
  ing without feedback. In The Eleventh International Conference on Learning Representations,
  2023. URL https://openreview.net/forum?id=8gd4M-_Rj1.
Kazutaka Kanno and Atsushi Uchida. Finite-time lyapunov exponents in time-delayed nonlinear
  dynamical systems. Physical Review E, 89(3):032918, 2014. ISSN 1539-3755 1550-2376. doi:
  10.1103/PhysRevE.89.032918.
Diederik P. Kingma and Jimmy Ba. Adam: A method for stochastic optimization. In International
  Conference for Learning Representations. ICLR, 2015. doi: doi.org/10.48550/arXiv.1412.6980.
Alex Krizhevsky, Ilya Sutskever, and Geoffrey E. Hinton. Imagenet classification with deep convo-
  lutional neural networks. In Advances in Neural Information Processing Systems, 2012.
Dhireesha Kudithipudi, Catherine Schuman, Craig M. Vineyard, Tej Pandit, Cory Merkel, Rajkumar
  Kubendran, James B. Aimone, Garrick Orchard, Christian Mayr, Ryad Benosman, Joe Hays, Cliff
  Young, Chiara Bartolozzi, Amitava Majumdar, Suma George Cardwell, Melika Payvand, Sonia
  Buckley, Shruti Kulkarni, Hector A. Gonzalez, Gert Cauwenberghs, Chetan Singh Thakur, Anand
  Subramoney, and Steve Furber. Neuromorphic computing at scale. Nature, 637(8047):801–812,
  2025. ISSN 0028-0836 1476-4687. doi: 10.1038/s41586-024-08253-8.


                                                  11
Published as a conference paper at ICLR 2026


Suman Kulkarni and Dani S. Bassett.      Toward principles of brain network organiza-
  tion and function.   Annual Review of Biophysics, 54(Volume 54, 2025):353–378,
  2025.   ISSN 1936-1238.   doi: https://doi.org/10.1146/annurev-biophys-030722-110624.
  URL         https://www.annualreviews.org/content/journals/10.1146/
  annurev-biophys-030722-110624.
Axel Laborieux and Friedemann Zenke. Improving equilibrium propagation without weight sym-
  metry through jacobian homeostasis. In The Twelfth International Conference on Learning Rep-
  resentations. ICLR, 2024. URL https://openreview.net/forum?id=kUveo5k1GF.
Axel Laborieux, Maxence Ernoult, Benjamin Scellier, Yoshua Bengio, Julie Grollier, and Damien
  Querlioz. Scaling equilibrium propagation to deep convnets by drastically reducing its gradient
  estimator bias. Frontiers in Neuroscience, 15:633674, 2021. ISSN 1662-453X. doi: 10.3389/
  fnins.2021.633674.
Yann Lecun. A theoretical framework for back-propagation. In Proceedings of the 1988 Connec-
  tionist Models Summer School, 1988.
Robert Legenstein and Wolfgang Maass. Edge of chaos and prediction of computational perfor-
  mance for neural circuit models. Neural Networks, 20(3):323–334, 2007. ISSN 08936080. doi:
  10.1016/j.neunet.2007.04.017.
Timothy P. Lillicrap, Daniel Cownden, Douglas B. Tweed, and Colin J. Akerman. Random synaptic
  feedback weights support error backpropagation for deep learning. Nature Communications, 7
  (1):13276, 2016. ISSN 2041-1723. doi: 10.1038/ncomms13276.
Zhuo Liu, Hao Shu, Linmiao Wang, Xu Meng, Yousheng Wang, Xuancheng Li, Wei Wang, and
  Tao Chen. Adjoint propagation of error signal through modular recurrent neural networks for
  biologically plausible learning. eLife, 15:e108237, 2026. ISSN 2050-084X. doi: 10.7554/eLife.
  108237. URL https://doi.org/10.7554/eLife.108237.
Adam S. Lowet, Qiao Zheng, Sara Matias, Jan Drugowitsch, and Naoshige Uchida. Distribu-
  tional reinforcement learning in the brain. Trends in Neurosciences, 43(12):980–997, 2020.
  ISSN 0166-2236. doi: https://doi.org/10.1016/j.tins.2020.09.004. URL https://www.
  sciencedirect.com/science/article/pii/S0166223620301983.
Christopher W. Lynn and Danielle S. Bassett. The physics of brain network structure, function
  and control. Nature Reviews Physics, 1(5):318–332, 2019. ISSN 2522-5820. doi: 10.1038/
  s42254-019-0040-8.
Nikola T. Markov, Mária Ercsey-Ravasz, David C. Van Essen, Kenneth Knoblauch, Zoltán
  Toroczkai, and Henry Kennedy. Cortical high-density counterstream architectures. Science, 342
  (6158), 2013. ISSN 0036-8075 1095-9203. doi: 10.1126/science.1238406.
Jorge F. Mejias, John D. Murray, Henry Kennedy, and Xiao-Jing Wang. Feedforward and feedback
  frequency-dependent interactions in a large-scale laminar network of the primate cortex. Science
  Advances, 2(11):e1601335, 2016. doi: doi:10.1126/sciadv.1601335. URL https://www.
  science.org/doi/abs/10.1126/sciadv.1601335.
Jan Melchior and Laurenz Wiskott. Hebbian-descent, 2019. URL https://arxiv.org/abs/
  1905.10585.
Alexander Meulemans, Matilde Tristany Farinha, Javier Garcia Ordonez, Pau Vilimelis Aceituno,
  João Sacramento, and Benjamin F. Grewe. Credit assignment in neural networks through deep
  feedback control. In M. Ranzato, A. Beygelzimer, Y. Dauphin, P.S. Liang, and J. Wortman
  Vaughan (eds.), Advances in Neural Information Processing Systems, volume 34, pp. 4674–4687.
  Curran Associates, Inc., 2021. URL https://proceedings.neurips.cc/paper_
  files/paper/2021/file/25048eb6a33209cb5a815bff0cf6887c-Paper.pdf.
Georgios Michalareas, Julien Vezoli, Stan van Pelt, Jan-Mathijs Schoffelen, Henry Kennedy, and
  Pascal Fries. Alpha-beta and gamma rhythms subserve feedback and feedforward influences
  among human visual cortical areas. Neuron, 89(2):384–397, 2016. ISSN 0896-6273. doi:
  https://doi.org/10.1016/j.neuron.2015.12.018. URL https://www.sciencedirect.com/
  science/article/pii/S0896627315011204.


                                               12
Published as a conference paper at ICLR 2026


John Miller and Moritz Hardt. Stable recurrent models. In International Conference on Learning
  Representations, 2019. URL https://openreview.net/forum?id=Hygxb2CqKm.
Beren Millidge, Yuhang Song, Tommaso Salvatori, Thomas Lukasiewicz, and Rafal Bogacz. Back-
  propagation at the infinitesimal inference limit of energy-based models: Unifying predictive cod-
  ing, equilibrium propagation, and contrastive hebbian learning. In The Eleventh International
  Conference on Learning Representations, 2023. URL https://openreview.net/forum?
  id=nIMifqu2EO.
Javier R. Movellan. Contrastive Hebbian Learning in the Continuous Hopfield Model, pp.
  10–17. Morgan Kaufmann, 1991. ISBN 978-1-4832-1448-1. doi: https://doi.org/10.1016/
  B978-1-4832-1448-1.50007-X. URL https://www.sciencedirect.com/science/
  article/pii/B978148321448150007X.
Mitsumasa Nakajima, Yongbo Zhang, Katsuma Inoue, Yasuo Kuniyoshi, Toshikazu Hashimoto,
 and Kohei Nakajima. Reservoir direct feedback alignment: deep learning by physical dynamics.
 Communications Physics, 7(1):411, 2024. ISSN 2399-3650. doi: 10.1038/s42005-024-01895-0.
Arild Nøkland.     Direct feedback alignment provides learning in deep neural net-
  works. In D. Lee, M. Sugiyama, U. Luxburg, I. Guyon, and R. Garnett (eds.), Ad-
  vances in Neural Information Processing Systems, volume 29. Curran Associates, Inc.,
  2016. URL https://proceedings.neurips.cc/paper_files/paper/2016/
  file/d490d7b4576290fa60eb31b5fc917ad1-Paper.pdf.
Peter O’Connor, Efstratios Gavves, and Max Welling. Initialized equilibrium propagation for
  backprop-free training. In International Conference on Learning Representations. ICLR, 2019.
  URL https://openreview.net/forum?id=B1GMDsR5tm.
Alexander Ororbia and Ankur Mali. Biologically motivated algorithms for propagating local target
  representations. In Proceedings of the AAAI Conference on Artificial Intelligence, volume 33, pp.
  4651–4658. AAAI, 2019.
Alexander Ororbia, Patrick Haffner, David Reitter, and C. Lee Giles. Learning to adapt by minimiz-
  ing discrepancy, 2017.
Alexander G. Ororbia. Brain-inspired machine intelligence: A survey of neurobiologically-plausible
  credit assignment, 2023. URL https://arxiv.org/abs/2312.09257.
Alexander G. Ororbia, Ankur Mali, Daniel Kifer, and C. Lee Giles. Backpropagation-free deep
  learning with recursive local representation alignment. In Proceedings of the AAAI Conference on
  Artificial Intelligence, volume 37, pp. 9327–9335. AAAI, 2023. URL https://ojs.aaai.
  org/index.php/AAAI/article/view/26118.
Matthew G. Perich and Kanaka Rajan. Rethinking brain-wide interactions through multi-region
 ‘network of networks’ models. Current Opinion in Neurobiology, 65:146–151, 2020. ISSN
 09594388. doi: 10.1016/j.conb.2020.11.003.
D. E. Rumelhart, G. E. Hinton, and R. J. Williams. Learning representations by back-propagating
  errors. Nature, 323(6088):533–536, 1986. ISSN 0028-0836. doi: 10.1038/323533a0. URL
  <GotoISI>://WOS:A1986E327500055. E3275 Times Cited:16725 Cited References
  Count:4.
João Sacramento, Rui Ponte Costa, Yoshua Bengio, and Walter Senn.     Dendritic corti-
   cal microcircuits approximate the backpropagation algorithm. In S. Bengio, H. Wal-
   lach, H. Larochelle, K. Grauman, N. Cesa-Bianchi, and R. Garnett (eds.), Ad-
   vances in Neural Information Processing Systems, volume 31. Curran Associates, Inc.,
   2018. URL https://proceedings.neurips.cc/paper_files/paper/2018/
   file/1dc3a89d0d440ba31729b0ba74b93a33-Paper.pdf.
Tommaso Salvatori, Luca Pinchetti, Beren Millidge, Yuhang Song, Tianyi Bao, Rafal Bogacz, and
  Thomas Lukasiewicz. Learning on arbitrary graph topologies via predictive coding. In Alice H.
  Oh, Alekh Agarwal, Danielle Belgrave, and Kyunghyun Cho (eds.), Advances in Neural Informa-
  tion Processing Systems, 2022. URL https://openreview.net/forum?id=dqO59nI_
  R9A.


                                                13
Published as a conference paper at ICLR 2026


Benjamin Scellier and Yoshua Bengio. Equilibrium propagation: Bridging the gap between energy-
  based models and backpropagation. Frontiers in Computational Neuroscience, 11:24, 2017. ISSN
  1662-5188. doi: 10.3389/fncom.2017.00024.
Benjamin Scellier, Anirudh Goyal, Jonathan Binas, Thomas Mesnard, and Yoshua Bengio. Gener-
  alization of equilibrium propagation to vector field dynamics, 2018. URL https://arxiv.
  org/abs/1808.04873.
Benjamin Scellier, Maxence Ernoult, Jack Kendall, and Suhas Kumar. Energy-based learn-
  ing algorithms for analog computing: a comparative study.    In A. Oh, T. Naumann,
  A. Globerson, K. Saenko, M. Hardt, and S. Levine (eds.), Advances in Neural In-
  formation Processing Systems, volume 36, pp. 52705–52731. Curran Associates, Inc.,
  2023. URL https://proceedings.neurips.cc/paper_files/paper/2023/
  file/a52b0d191b619477cc798d544f4f0e4b-Paper-Conference.pdf.
João D. Semedo, Anna I. Jasper, Amin Zandvakili, Aravind Krishna, Amir Aschner, Christian K.
   Machens, Adam Kohn, and Byron M. Yu. Feedforward and feedback interactions between visual
   cortical areas use different population activity patterns. Nature Communications, 13(1):1099,
   2022. ISSN 2041-1723. doi: 10.1038/s41467-022-28552-w. URL https://doi.org/10.
   1038/s41467-022-28552-w.
Shawn C. Shadden, Francois Lekien, and Jerrold E. Marsden. Definition and properties of la-
  grangian coherent structures from finite-time lyapunov exponents in two-dimensional aperi-
  odic flows. Physica D: Nonlinear Phenomena, 212(3-4):271–304, 2005. ISSN 01672789. doi:
  10.1016/j.physd.2005.10.007.
Karen Simonyan and Andrew Zisserman. Very deep convolutional networks for large-scale image
  recognition. In International Conference on Learning Representations, 2015.
Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez,
  Ł. ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. Von
  Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett (eds.), Ad-
  vances in Neural Information Processing Systems, volume 30. Curran Associates, Inc.,
  2017. URL https://proceedings.neurips.cc/paper_files/paper/2017/
  file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf.
Andreas Veit, Michael J Wilber, and Serge Belongie. Residual networks behave like ensem-
  bles of relatively shallow networks. In D. Lee, M. Sugiyama, U. Luxburg, I. Guyon, and
  R. Garnett (eds.), Advances in Neural Information Processing Systems, volume 29. Curran
  Associates, Inc., 2016. URL https://proceedings.neurips.cc/paper_files/
  paper/2016/file/37bc2f75bf1bcfe8450a1a41c200364c-Paper.pdf.
Ran Wang, Xupeng Chen, Amirhossein Khalilian-Gourtani, Leyao Yu, Patricia Dugan, Daniel Fried-
  man, Werner Doyle, Orrin Devinsky, Yao Wang, and Adeen Flinker. Distributed feedforward
  and feedback cortical processing supports human speech production. Proceedings of the Na-
  tional Academy of Sciences, 120(42):e2300255120, 2023. doi: 10.1073/pnas.2300255120. URL
  https://doi.org/10.1073/pnas.2300255120. doi: 10.1073/pnas.2300255120.
D. J. Watts and S. H. Strogatz. Collective dynamics of ’small-world’ networks. Nature, 393
  (6684):440–442, 1998. ISSN 0028-0836. doi: Doi10.1038/30918. URL <GotoISI>://WOS:
  000074020000035. Zr842 Times Cited:29354 Cited References Count:27.
Alan Wolf, Jack B. Swift, Harry L. Swinney, and John A. Vastano. Determining lyapunov
  exponents from a time series. Physica D: Nonlinear Phenomena, 16(3):285–317, 1985.
  ISSN 0167-2789. doi: https://doi.org/10.1016/0167-2789(85)90011-9. URL https://www.
  sciencedirect.com/science/article/pii/0167278985900119.
X. Xie and H. S. Seung. Equivalence of backpropagation and contrastive hebbian learning in a
  layered network. Neural Computation, 15(2):441–454, 2003. ISSN 0899-7667. doi: 10.1162/
  089976603762552988.
Izzet B. Yildiz, Herbert Jaeger, and Stefan J. Kiebel. Re-visiting the echo state property. Neural
   Networks, 35:1–9, 2012. ISSN 08936080. doi: 10.1016/j.neunet.2012.07.005.


                                               14
Published as a conference paper at ICLR 2026


A    T HE DYNAMICS OF THE RNN
We quantify the convergence property of the recurrent neural network (RNN) with maximum Lya-
punov exponent (MLE) (Wolf et al., 1985), and finite time maximum Lyapunov exponent (FTMLE)
(Kanno & Uchida, 2014). When the MLE/FTMLE is large, the RNN converges slow or even not
at all. To compute MLE and FTMLE, we first initialize a random perturbation vector δ0 . Then we
record the sequence of states s0 [t] with t = 0, 1, 2, . . . , Te − 1 corresponding to the last sample of a
training set (see Figure 2 in the main text), and run the following steps:

      1. Normalize perturbation vectors to unit length:
                                                             δt
                                                    δt ←
                                                            ∥δt ∥
      2. Calculate the Jacobian matrix:
                                                           ∂F (s0 [t], b)
                                             J(s0 [t]) =
                                                             ∂s0 [t]
      3. Update the perturbation:
                                               δt+1 = J(s0 [t]) · δt
      4. Record
                                                  ri = ln ∥δt+1 ∥
                                                                      PTe −1
The maximum Lyapunov exponent is computed as λmax = T1e                     t=0   ri for a sufficiently large Te
(default Te = 500). The results at any T < Te are the FTMLE.
Figure S1–S2 show the FTMLE, MLE, training accuracy and test accuracy versus epochs of dif-
ferent models. In all cases, smaller βi usually yields smaller (FT)MLE, whereas larger βi do not
always lead to larger (FT)MLE because the activation function saturates. The saturation diminishes
perturbation.
For 2-hidden-layer RNN, smaller feedback scaling βi yields steady training progress and better ac-
curacy. Figure S3 plots the FTMLE and test accuracy against feedback scaling for different numbers
of hidden layers. It shows that smaller βi is favorable for shallow networks, because the RNN is eas-
ier to converge (indicated by FTMLE). But for deeper networks (5-hidden-layer or more), smaller
βi degrades performance because of vanishing gradient.
Further comparison between our FRE-RNN that incorporates convolutional structure with previous
work (Ernoult et al., 2019) are also plotted in Figure S4. These results suggest that small feedback
scaling (βi = 0.01) leads to a smoother training process.

B    G RADIENT VANISHING AND THE RESIDUAL CONNECTIONS
Figure S5 and S6 plot the error of each neuron versus epoch at different βi . For a 2-hidden-layer
RNN, the best performance is obtained at βi = 0.001. In this situation, the error of the first hidden
layer is at least two orders of magnitude less than the second hidden layer. At βi = 2, the error also
decreases from higher (high index neurons, closer to output layer) to lower layers, which is attributed
to the saturation of the activation function. In general, the training progresses more steadily for
smaller βi despite the vanishing gradient, which also applies to deeper networks (up to 10-hidden-
layer).
To eliminate the vanishing gradient in EP, direct feedback from the higher layers or local amplifi-
cation (with higher learning rate) is unavoidable (Nøkland, 2016; Ororbia et al., 2023). Figure S7
shows the effect of residual connections. βi = 0.1 yield the best accuracy 97.5%, due to the balance
between gradient flow and convergence.
Figure S8 shows the testing accuracy varies with the connection probability P of AGT with 10
hidden layers. Except for the connections in layered model, the connection between any two hidden
layers is generated with probability P , i.e., we first use P to decide if the connections between any
two layers will be established. As P increases, the accuracy rises first, peaks at 0.2 and decreases
around 1. However, the reason behind is yet to be explored.


                                                    15
Published as a conference paper at ICLR 2026


Figure S1: The FTMLE, MLE, training accuracy and testing accuracy of symmetric RNNs versus
epochs with different feedback scaling βi (legend). First row: 2 hidden layers; Second row: 3 hidden
layers; Third row: 5 hidden layers; Fourth row: 10 hidden layers. The activation is tanh. Each case
is repeated 5 times.


                                                16
Published as a conference paper at ICLR 2026


Figure S2: The FTMLE, MLE, training accuracy and testing accuracy of asymmetric RNNs versus
epochs with different feedback scaling βi (legend). First row: 2 hidden layers; Second row: 3 hidden
layers; Third row: 5 hidden layers; Fourth row: 10 hidden layers. The activation is tanh. Each case
is repeated 5 times.


Figure S3: The FTMLE and testing accuracy versus feedback scaling βi with different numbers of
hidden layers. (a) Symmetry weights; (b) Asymmetric weights. The FTMLE and testing accuracy
given here correspond to their maxima in all epochs. Note that the 5-hidden-layer asymmetric RNN
with large βi diverged and resulted in missing data points in (b). Each case is repeated 5 times.


                                                17
Published as a conference paper at ICLR 2026


Figure S4: Comparison of RNN embedded with convolutional structure on the MNIST between
P-EP (a) (Ernoult et al., 2019) and our approach at different βi (b-d). We used the same parameters
as the EP reference (Ernoult et al., 2019).


                                                18
Published as a conference paper at ICLR 2026


Figure S5: For 2-hidden-layer RNN, the mean error of each neuron in the last batch and testing
accuracy versus epochs at different βi . All neurons in the hidden layers and the output layer are
indexed from the input to the output layer.


                                               19
Published as a conference paper at ICLR 2026


Figure S6: For the 10-hidden-layer model, the mean error of each neuron in the last batch and
testing accuracy versus epochs at different βi . All neurons in the hidden layers and the output layer
are indexed from the input to the output layer.


                                                 20
Published as a conference paper at ICLR 2026


Figure S7: For the 10-hidden-layer model with residual connections, the mean error of each neuron
in the last batch and testing accuracy versus epochs at different βi . All neurons in the hidden layers
and the output layer are indexed from the input to the output layer.


Figure S8: The testing accuracy on MNIST varies with the connection probability P of AGT with
10 hidden layers. The experiments are repeated 5 times.


                                                  21
Published as a conference paper at ICLR 2026


Table S1: Testing accuracy (mean of 5 repeated experiments) with different feedback scaling βi . By
default, T = 10 × NHiddenLayer , K = 5 × NHiddenLayer . Each hidden layer has 64 nodes.
 Architecture-connections   βi = 0.001   βi = 0.01     βi = 0.1   βi = 0.25   βi = 1   βi = 2   βi = 4
        2HL-symm              97.69%      97.57%       97.25%      96.22%     93.12%   66.04%   40.92%
        3HL-symm              97.22%      97.64%       97.41%      96.60%     55.86%   32.64%   22.11%
        5HL-symm              93.54%      95.54%       97.60%      90.63%     25.31%   17.88%   14.61%
       10HL-symm              87.15%      89.99%       92.54%      41.84%     14.07%   14.30%   14.23%
  10HL-Residual-symm             –        97.52%       97.46%         –       95.51%      –        –
        conv-symm                –        99.15%       98.71%         –       11.35%      –        –
       2HL-asymm              96.96%      96.97%       96.88%      96.79%     93.88%   91.81%   89.91%
       3HL-asymm              95.17%      96.91%       96.76%      96.66%     91.21%   54.65%   26.72%
       5HL-asymm              91.14%      92.34%       96.41%      96.35%     17.15%   11.35%   13.07%
      10HL-asymm              84.27%      85.83%       87.79%      90.97%     16.13%   14.21%   16.67%
    10HL-AGT-asymm               –        96.37%       96.75%         –       33.31%      –        –


C     E QUIVALENCE WITH EP AND BP UNDER THE CONDITION OF
      INFINITESIMAL INFERENCE LIMIT


Figure S9: A layered network model used to illustrate the process of backpropagation (BP), local
representation alignment (LRA), and EP. Note that the final prediction layer ·p corresponds to the
third layer with subindex ·3 . For LRA, we use βLRA instead of β1 and βf . For BP, the feedback
(orange) paths are absent.

In this section, we will use the infinitesimal inference limit (Millidge et al., 2023) to derive the
equivalence of EP with LRA and BP.

C.1   BACKPROPAGATION

When we remove the feedback connection of a 2-hidden-layer RNN shown in Figure S9, a feedfor-
ward network is left and can be trained with BP. The forward process of BP is described by:


                                      s1 = ρ(h1 ), h1 = W0 · s0 ,
                                      s2 = ρ(h2 ), h2 = W1 · s1 ,                                  (S1)
                                       sp = hp , hp = Wf · s2 .

Defining a loss LBP = 21 (sp −star )2 , the weights adjust according to the gradient of the loss. Taking
∆W0 as an example:
                             ∂LBP
                 ∆W0 = −
                             ∂W0
                       = −ρ′ (h1 ) ⊙ W1⊤ · ρ′ (h2 ) ⊙ Wf⊤ · (sp − star ) · (s0 )⊤ ,
                                                                            
                                                                                                    (S2)

where “⊙” means Hadamard product (element-wise product), “·” means scalar or matrix multipli-
cation. For two vectors/matrices, “⊙” requires identical dimensions and computes element-wise
products. Broadcasting rules may apply (e.g., a column vector vm×1 ⊙ Am×n scales each column
of A by v).


                                                  22
Published as a conference paper at ICLR 2026


C.2   L OCAL R EPRESENTATION A LIGNMENT

LRA is an alternative training method following the principle of discrepancy reduction (Ororbia
et al., 2017; Ororbia & Mali, 2019). It can be divided into two phases: 1) the network runs the
forward process, producing latent representations of the input samples. 2) the weights adjust in the
direction of reducing the mismatch between current latent representations and target representations
in each layer.
The forward process is the same as BP:
                                  s01 = ρ(h01 ),         h01 = W0 · s0 ,
                                     s02 = ρ(h02 ),      h02 = W1 · s01 ,                            (S3)
                                      s0p = h0p ,       h0p = Wf · s02 .
where s0i are interpreted as the latent representations. The prediction error is ep = star − s0p . Then
we can get the target representations of the second hidden layer:
                    sβ2 LRA = ρ(hβ2 LRA ),    hβ2 LRA = W1 · s01 + βLRA · Bf · ep ,                  (S4)
The same goes for the first hidden layer:
          sβ1 LRA = ρ(hβ1 LRA ),   hβ1 LRA = W1 · s0 + βLRA · B1 · e2 ,       e2 = sβ2 LRA − s02 ,   (S5)
LRA defines the loss as the total discrepancy between latent representations and target representa-
tions:
                                 L                             L
                                X                             X   1 0
                      LLRA =         ki Li (s0i , sβi LRA ) =       (si − sβi LRA )2 ,         (S6)
                                i=1                           i=1
                                                                  2
The weight Wi adjusts according to the local mismatch between s0i+1 and sβi+1
                                                                           LRA
                                                                               :
                                         ∂ki Li (s0i+1 , sβi+1
                                                            LRA
                                                                )
                              ∆Wi = −
                                                 ∂Wi
                                   = (sβi+1
                                         LRA
                                             − s0i+1 ) ⊙ f ′ (h0i+1 ) · (s0i )⊤
                                     ≈ (sβi+1
                                           LRA
                                               − s0i+1 ) · (s0i )⊤ ,                             (S7)
where the derivative of the activation function is omitted in the last row, a useful practice common
in LRA (Melchior & Wiskott, 2019; Ororbia & Mali, 2019; Ororbia et al., 2023). When βLRA → 0,
sβi LRA → s0i and hβi LRA → h0i , then


                   ei = sβi LRA − s0i = ρ(hβi LRA ) − ρ(h0i )
                      = ρ(h0i + βLRA · Bi · ei+1 ) − ρ(h0i )
                      ≈ [ρ(h0i ) + ρ′ (h0i ) ⊙ (βLRA · Bi · ei+1 ) − ρ(h0i ))]βLRA →0 ,              (S8)
                          ′
                      = ρ (h0i ) ⊙ (βLRA · Bi · ei+1 )

The approximation in Equation S8 is based on a first-order Taylor expansion of ρ(h0i + ∆h) around
h0i , where ∆h = βLRA · Bi · ei+1 . For a small perturbation ∆h → 0, the Taylor expansion gives:
                          ρ(h0i + ∆h) = ρ(h0i ) + ρ′ (h0i ) · ∆h + O(∆h2 ),                          (S9)
                                                    2
When βLRA → 0, higher order terms O(∆h ) are negligible, leaving only the linear terms. We
arrive at the last row after canceling out ρ(h0i ). There we can express the weight adjustments as
   ∆W0 = e1 · (s00 )⊤
                                                                        
             ′ 0                     ′ 0
                                                                       
                                                                            · (s0 )⊤ B =(W )⊤
                                                                                    
         = ρ (h1 ) ⊙ βLRA · B1 · ρ (h2 ) ⊙ βLRA · Bf · (star − sp )
                                                                                      i   i

                            ′ 0        ⊤     ′ 0         ⊤                     ⊤
                                                                       
         = −βLRA · βLRA · ρ (h1 ) ⊙ W1 · ρ (h2 ) ⊙ Wf · (sp − star ) · (s0 ) ,              (S10)
which is the same as BP (Equation S2) except for a constant. Thus, LRA at weak feedback limit
approximates BP. An LRA algorithm for a 2-hidden-layer network is described in Algorithm S1. The
feedback weights in LRA need not be learned here, but can be kept symmetric with the feedforward
weights.


                                                        23
Published as a conference paper at ICLR 2026


C.3    E QUILIBRIUM P ROPAGATION

We can also formulate EP in terms of discrepancy reduction. In EP (Algorithm 1 in the main text),
the network states evolve as follows (β = 0 for the first phase and β = βf for the second phase):


                                             hβ1 = W0 · sβ0 + β1 · B1 · sβ2 ,
                                            hβ2 = W1 · sβ1 + βf · Bf · ep ,
                                                       hβp = Wf · sβ2 ,
                                            sβ1 , sβ2 , sβp = ρ(hβ1 ), ρ(hβ2 ), hβp ,                   (S11)


where ep = star − s0p is the predicting error. The network converges to final states h01 , h02 , s01 , s02 in
the free phase. The error of s2 neurons can be described by:

                                                        β
                                         ds2 = [ρ(h2 f )]βf →0 − [ρ(h02 )]βf =0
                                                ≈ ρ′ (h02 ) ⊙ (βf · Bf · ep ),                          (S12)

where only the first-order infinitesimal term is retained as β1 → 0. The same goes for the first
hidden layer:

                                            β
                              ds1 = [ρ(h1 f )]βf →0 − [ρ(h01 )]βf =0
                                   ≈ ρ′ (h01 ) ⊙ (β1 · B1 · (ρ′ (h02 ) ⊙ (βf · Bf · ep ))),             (S13)

The weight W0 can be updated by:

                                  ds1 · (s00 )⊤
                  ∆W0 =                         = ρ′ (h01 ) ⊙ B1 · (ρ′ (h02 ) ⊙ Bf · ep ) · (s00 )⊤ ,   (S14)
                                    β1 · β f

With Bi = Wi⊤ ,

         ds1 = βf · β1 · ρ′ (h01 ) ⊙ W1⊤ · (ρ′ (h02 ) ⊙ Wf⊤ · −(sp − star )),                           (S15)
                 ds1 · (s00 )⊤
                               = −ρ′ (h01 ) ⊙ W1⊤ · ρ′ (h02 ) ⊙ Wf⊤ · (sp − star ) · (s00 )⊤ .
                                                                                  
       ∆W0 =                                                                                            (S16)
                   β1 · β f

Note that compared with the weight update in the main text, 1/(β1 ·βf ) is added to recover a gradient
amplitude similar to BP. Further, if we assume that the high-order infinitesimal in the first phase can
be omitted, the dynamics of RNN is governed by:

                      s01      = ρ(hβ1 ),       h01 = [W0 · s0 + β1 · B1 · s02 ]β1 →0 ≈ W0 · s0 ,       (S17)
                s02         = ρ(h02 ),   h02 = [W1 · s01 + βf · Bf · ep ]β1 →0,βf =0 ≈ W1 · s01 ,       (S18)
                                            s0p = h0p , h0p = Wf · s02 .                                (S19)


The information flow of RNN degenerates into that of a feedforward network. This does not affect
the error information dsi , thus Equation S16 approximates Equation S2 for BP. Meanwhile, it re-
sembles LRA with low βLRA , which turns explicit error into implicit error. Hitherto, we have shown
that although the errors are obtained differently in EP, LRA, and BP, they are equivalent under the
assumption of weak supervision and weak feedback.


                                                              24
Published as a conference paper at ICLR 2026


Algorithm S1 Local Representation Alignment (LRA)
Input: (x, star )
Parameter: θ = [W0 , W1 , W2 , B2 , B1 , βLRA ]
Output: θ
 1: function F ORWARD(θ, x)
 2:    s0 ← x
 3:    s01 ← ρ(h1 ), h1 ← W0 · s0
 4:    s02 ← ρ(h2 ), h2 ← W1 · s01
 5:    s0p ← Wf · s02
 6:    Λ1 ← [s0i ], i = 0, 1, 2, p
 7:    return Λ1
 8: end function
 9: function F EEDBACK(θ, Λ1 , star )
10:    ep ← star − s0p
11:    sβ2 LRA ← ρ(h2 ), h2 ← W1 · s01 + βLRA · Bf · ep
12:    e2 ← sβ2 LRA − s02
13:    sβ1 LRA ← ρ(h1 ), h1 ← W0 · s0 + βLRA · B1 · e2
14:    e1 ← sβ1 LRA − s01
15:    Λ2 ← [e1 , e2 , ep ]
16:    return Λ2
17: end function
18: function U PDATING -W EIGHTS(θ, Λ1 , Λ2 )
19:    ∆Wi ← ei+1 · (s0i )T , i = 0, 1
20:    ∆Wf ← ep · (s02 )T
21: end function


C.4   E XPERIMENTS FOR EQUIVALENCE WITH EP AND BP

Prior works have shown that EP can be equalized to BPTT in specific conditions and can achieve
comparable performance (Ernoult et al., 2019; Laborieux et al., 2021). As discussed in the previ-
ous section, although the overall architecture forms an RNN, the network behaves similarly to a
feedforward model due to weak feedback connections.
To experimentally show the equivalence of EP and BP, we can further compare our model with
FNN with same feedforward weights trained by BP. We mainly compare cosine similarity of states,
bias gradients and weight gradients for the first batch (batch size is 200) as given in Figure S10.
Figure S10(a-c) shows similarity under the conditions of βi = 1 with different iterations. For the the
bias gradients, i.e., dsi , the cosine similarity declines rapidly, indicating no similarity between our
model and BP. With weak feedback βi = 0.1, as shown in Figure S10(d-f), the similarity of states
approaches 1 and the similarity of bias gradient of last 6(4) layers exceeds 0.5 with T = 500/50
(T = 20). These results provide further evidence that EP is equivalent to BP under the condition of
weak feedback.
We further studied the influence of βi on the cosine similarity. Figure S10(g) shows that larger
βi leads to lower similarity of states. Figure S10(h) shows that lower βi = 0.01 also leads to
the decrease in similarity, which may caused by insufficient precision of data storage (float32 by
default). Therefore, we use datatype float64 to repeat experiments. Figure S10(k,l) shows that the
similarity of gradient signal remains around 1 with βi = 0.1. This indicates that weak feedback does
indeed lead to an exponential decline in gradient signals, thus requiring higher relative accuracy.


                                                  25
 Published as a conference paper at ICLR 2026


        (a)                            states                      (b)                       bias gradients                      (c)                    weight gradients
                    1.0                                                         1.0                                                         1.0
                                                                                                                                                         T=500,K=200
cosine similarity


                                                            cosine similarity


                                                                                                                        cosine similarity
                                                                                                                                                         T=50,K=20
                    0.5                                                         0.5                                                         0.5          T=20,K=8
                    0.0                                                         0.0                                                         0.0

                    0.5                                                         0.5                                                         0.5
                          fc1 fc2 fc3 fc4 fc5 fc6 fc7 fc8 out                         fc1 fc2 fc3 fc4 fc5 fc6 fc7 fc8 out                         fc1 fc2 fc3 fc4 fc5 fc6 fc7 fc8 out
       (d)                             states                       (e)                      bias gradients                        (f)                  weight gradients
                    1.0                                                         1.0                                                         1.0
cosine similarity


                                                            cosine similarity


                                                                                                                        cosine similarity
                    0.5                                                         0.5                                                         0.5

                    0.0                                                         0.0                                                         0.0                    T=500,K=200
                                                                                                                                                                   T=50,K=20
                                                                                                                                                                   T=20,K=8
                    0.5                                                         0.5                                                         0.5
                          fc1 fc2 fc3 fc4 fc5 fc6 fc7 fc8 out                         fc1 fc2 fc3 fc4 fc5 fc6 fc7 fc8 out                         fc1 fc2 fc3 fc4 fc5 fc6 fc7 fc8 out
       (g)                             states                       (h)                      bias gradients                         (i)                 weight gradients
                    1.0                                                         1.0                                                         1.0
                                                                                                                                                          i=0.0              i=0.5
cosine similarity


                                                            cosine similarity


                                                                                                                        cosine similarity

                                                                                                                                                          i=0.01             i=1
                    0.5                                                         0.5                                                         0.5           i=0.1

                    0.0                                                         0.0                                                         0.0

                    0.5                                                         0.5                                                         0.5
                          fc1 fc2 fc3 fc4 fc5 fc6 fc7 fc8 out                         fc1 fc2 fc3 fc4 fc5 fc6 fc7 fc8 out                         fc1 fc2 fc3 fc4 fc5 fc6 fc7 fc8 out
            (j)                        states                       (k)                      bias gradients                         (l)                 weight gradients
                    1.0                                                         1.0                                                         1.0
cosine similarity


                                                            cosine similarity


                                                                                                                        cosine similarity


                    0.5                                                         0.5                                                         0.5                            i=0.0
                                                                                                                                                                           i=0.01
                                                                                                                                                                           i=0.1
                    0.0                                                         0.0                                                         0.0
                                                                                                                                                                           i=0.5
                                                                                                                                                                           i=1
                    0.5                                                         0.5                                                         0.5
                          fc1 fc2 fc3 fc4 fc5 fc6 fc7 fc8 out                         fc1 fc2 fc3 fc4 fc5 fc6 fc7 fc8 out                         fc1 fc2 fc3 fc4 fc5 fc6 fc7 fc8 out

 Figure S10: The cosine similarity of gradients and states between our model and feedforward model
 trained by BP in an 8-hidden-layer FNN (states: s0i ; bias gradients: dsi ; weight gradients: ∆Wi ).
 The axis x is the layers of the model. Error propagates from the last layer ”out” to the first hidden
 layer ”fc1” layer by layer. (a-c), with different numbers of iterations under feedback scaling βi = 1.
 (d-f), with different numbers of iterations under small feedback scaling βi = 0.1. (g-i), with different
 feedback scaling (T=50,K=20). (j-l), Repeat (g-i) with datatype float64 (float32 by default).


                                                                                                26
Published as a conference paper at ICLR 2026


C.5                         V ERIFYING THE EFFECTIVENESS OF WEAK FEEDBACK IN E QUILIBRIUM P ROPAGATION


                                                                       1.00
                                                                                                                i=0.01
                                                                                                                i=0.1
                                                                       0.95


                                                       Test accuracy
                                                                       0.90

                                                                       0.85

                                                                       0.80
                                                                                                 1                            3                                                 5
                                                                                                               Only last nll layers learning

Figure S11: The testing accuracy on MNIST with different βi varies with nll . The experiments are
repeated 5 times.

To demonstrate that the lower few layers of our model are indeed receiving meaningful credit signals,
we report the test accuracy of only updating the last nll layer (i.e., freezing the weights of Layers
1−nll ) in Figure S11. For a 5-hidden layer model with βi = 0.1, updating only the final layer yields
a test accuracy of about 85%. As nll increases to 5, the accuracy also reaches around 97.5%. A
similar trend is observed for the model with βi = 0.01. These results show that achieving over 97%
accuracy requires effective gradient propagation to all layers, confirming that our model successfully
delivers usable credit signals throughout the entire network.

C.6                         ROBUSTNESS TO THE NOISE

               (a)                        2HL                                                 (b)                        3HL                                       (c)                        5HL
                                                                        1.00                                                               1.00                                                                 1.00
                            0 0.972 0.972 0.972 0.904                                                      0 0.974 0.974 0.967 0.752                                          0 0.976 0.969 0.814 0.469
Weight noise intensity


                                                                               Weight noise intensity


                                                                                                                                                  Weight noise intensity


                                                                        0.95                                                               0.95                                                                 0.95
                         0.001 0.972 0.972 0.972 0.896                  0.90                            0.001 0.973 0.975 0.964 0.749      0.90                            0.001 0.973 0.965 0.812 0.464        0.90

                         0.01 0.917 0.917 0.912 0.689                   0.85                            0.01 0.903 0.904 0.838 0.485       0.85                            0.01 0.876 0.815 0.520 0.299         0.85
                                                                        0.80                                                               0.80                                                                 0.80
                          0.1 0.194 0.206 0.196 0.217                                                    0.1 0.167 0.180 0.174 0.167                                        0.1 0.134 0.135 0.134 0.135
                                                                        0.75                                                               0.75                                                                 0.75
                                0      1e-05 0.001 0.1                                                         0      1e-05 0.001 0.1                                               0      1e-05 0.001 0.1
                                    State noise intensity                                                          State noise intensity                                                State noise intensity


Figure S12: The maximum test accuracy model with different noise intensity on weights and states
added both in training and test. (a) With 2 hidden layers; (b) With 3 hidden layers; (c) With 5 hidden
layers. The model is trained for 50 epochs and the experiments are repeated 5 times.

To evaluate the robustness of the model, we introduce noise on weights and time-varying noise
on states, which are random Gauss noise imposed at each weight update or at each state update,
respectively. The noise on weights is directly added to the weight, while the noise on states is added
as the bias b in Equation 3.1. The mean absolute values of non-zero weights and neural activations
after noiseless training are approximately 0.09 and 0.76 respectively.
The accuracy of the model with two hidden layers varies with two types of noises as presented in Fig-
ure S12(a). It maintains satisfactory performance when the standard deviation of state noise reaches
0.1 or the standard deviation of weight noise reaches 0.01. In deeper structures (Figure S12(b,c)),
the results are consistent with the aforementioned observations for weight noise, demonstrating ex-
cellent robustness. However, the tolerance to time-varying state noise degrades significantly, which
we attribute to the layer-wise noise accumulation and the distortion of weak gradient signal by the
noise in the training process. To confirm our hypothesis, we impose the noises only in the test,


                                                                                                                         27
Published as a conference paper at ICLR 2026


                      (a)                   2HL                                 (b)                                 3HL                                         (c)                            5HL
                                                                1.00                                                                       1.00                                                                       1.00
                             0 0.973 0.973 0.973 0.971 0.944                                       0 0.974 0.976 0.975 0.974 0.940                                            0 0.975 0.977 0.977 0.977 0.919
                                                                0.95                                                                       0.95                                                                       0.95


                                                                       Weight noise intensity


                                                                                                                                                  Weight noise intensity
 Weight noise intensity
                          0.001 0.971 0.971 0.974 0.970 0.945   0.90                            0.001 0.975 0.973 0.973 0.972 0.945        0.90                            0.001 0.973 0.973 0.973 0.973 0.928        0.90

                           0.01 0.919 0.916 0.919 0.918 0.912   0.85                             0.01 0.908 0.903 0.908 0.902 0.894        0.85                             0.01 0.885 0.876 0.880 0.873 0.842        0.85
                                                                0.80                                                                       0.80                                                                       0.80
                            0.1 0.199 0.187 0.194 0.219 0.177                                     0.1 0.176 0.182 0.152 0.167 0.142                                          0.1 0.133 0.128 0.143 0.163 0.142
                                                                0.75                                                                       0.75                                                                       0.75
                                  0   0.001 0.001 0.1 0.                                                0   0.001 0.001 0.1          0.5                                           0   0.001 0.001 0.1          0.5
                                      State noise intensity                                                  State noise intensity                                                      State noise intensity


Figure S13: The maximum test accuracy model with different noise intensity on weights and states
(the state noise is added only in test). (a) With 2 hidden layers; (b) With 3 hidden layers; (c) With 5
hidden layers. The model is trained for 50 epochs in a single experiment.


and the test accuracy almost remains unaffected (Figure S13). Therefore, the network is potentially
resilient to noise. However, how to improve resilience in the training process requires further study.


D                            T RAINING DETAILS

Table S2 provides the parameters of the Adam optimizer that are used in Tables S1–S2 (Kingma &
Ba, 2015). The training details for Table 1 are given in Table S3. For convolutional architectures
in EP, the training process can be described by Algorithm S2. The training sample is fed into the
network through Conv0 . Then the state of the first layer goes through max pooling MaxPool1 and
convolution Conv1 sequentially to reach the second layer. The second layer also feedbacks its states
to the first layer through transposed convolution ConvT1 and max-unpooling MaxUnpool1 . With
T iterations, the RNN converges to the steady states and produces outputs through MaxPool2 and
a fully connected layer. Then the prediction error is computed and used to nudge the RNN by
the reverse of the fully connected layer and max-unpooling MaxUnpool2 . Note that the unpooling
MaxUnpooli requires the indices from the corresponding pooling MaxPooli .
For Table S2, Adam optimizer is used for all experiments. The activation functions sigmoid-s
                                              1
and hard-sigmoid are defined as ρ(x) = 1+e−4(x−0.5) , ρ(x) = max(min(x, 0), 1), respectively
(Ernoult et al., 2019). For 5-HL, 10-HL and 20-HL architectures, the Adam optimizer parameters
are as shown in Table S2 (epoch: 50, batch size: 500). The inference details of the architecture
shown in Figure 3b are described by Algorithm S3. The details for convolutional architectures are
given in Table S3 and Figure S14–S15. The cosine-annealing scheduler is used in convolutional
architectures for CIFAR-10 (Tmax = 50, ηmin = 10−6 ).
For MNIST, no pre-processing is used. For the CIFAR-10 dataset, we follow ref. (Scellier
et al., 2023) to pre-process the images. We normalize the input images using mean µ =
(0.4914, 0.4822, 0.4465) and standard deviation σ = 3 × (0.2023, 0.1944, 0.2010).
The results for comparison of time consumption were obtained in a virtualized Windows 11 envi-
ronment with Intel Xeon Gold 6238R CPU, 16 GB RAM, and Nvidia RTX A5000 (24 GB VRAM).
Other results were obtained on a Windows 11 environment with Intel Core i5-12490F, 32 GB RAM,
and Nvidia GTX 1650 (4 GB VRAM) or a Windows 11 environment with AMD R7-7700, 32 GB
RAM, and Nvidia RTX 4070 (12 GB VRAM). The default numerical precision is float32 (single-
precision float).


                                                     Table S2: The parameters of the Adam optimizer.
                                                           Parameter Name                     Default Value
                                                   Learning rate (MNIST / CIFAR-10)          10−3 /2 × 10−4
                                             First-order moment estimation decay rate (β1 )        0.9
                                            Second-order moment estimation decay rate (β2 )       0.999
                                                Small constant for numerical stability (ϵ)        10−8


                                                                                                                    28
Published as a conference paper at ICLR 2026


Table S3: Training details for Table 1 and Table 2. The results of EB-EP and P-EP come from
previous work (Ernoult et al., 2019). SGD refers to Stochastic Gradient Descent with mini-batches.
                                                                        Epoch / Batch size
    Architecture             Training approach           Optimizer                                       Learning rate         Weight decay
                                                                               -T/K
                               P-EP (sigmoid-s)             SGD            50/20-100/20               [0.005, 0.05, 0.2]           None
        2HL
                            Proposed (tanh, Adam)           Adam           50/500-10/10             [0.001, 0.001, 0.001]          None
                               P-EP (sigmoid-s)             SGD           100/20-180/20            [0.002, 0.01, 0.05, 0.2]        None
                             Proposed-DLR (tanh)            SGD           100/20-18/10             [0.002, 0.01, 0.05, 0.2]        None
        3HL                     Proposed (tanh)             SGD            100/20-18/10               [0.1, 0.1, 0.1, 0.1]         None
                            Proposed (tanh, Adam)           Adam           50/500-18/10                      10−3                  None
                               BP (tanh, Adam)              Adam            50/500-1/1                       10−3                  None
                             P-EP (hard-sigmoid)            SGD            40/20-200/10              [0.015, 0.035, 0.15]          None
        Conv
                           Proposed (hard-sigmoid)          SGD           40/128-20/10                 [0.15, 0.35, 0.9]           10−5
      (Table 1)
                              BP (hard-sigmoid)             SGD             40/128-1/1                [0.001, 0.02, 0.4]           10−5
         Conv              Proposed (hard-sigmoid)          Adam          40/128-20/10                    2 × 10−4                 10−6
   (Table 2 MNIST)            BP (hard-sigmoid)             Adam            40/128-1/1                    2 × 10−4                 10−6
        Conv               Proposed (hard-sigmoid)          Adam          50/128-40/10                   2.5 × 10−4              2 × 10−4
 (Table 2 CIFAR-10)           BP (hard-sigmoid)             Adam            50/128-1/1                   2.5 × 10−4              2 × 10−4


                                                                   64@8x8                     64@4x4
                           32@24x24                32@12x12
         1@28x28                                                                                                                1x10


                       Conv1                       MaxPool1                   Conv2                        MaxPool2           Dense

                                 Figure S14: Convolutional architectures for MNIST.


                                                                               128@8x8                 128@4x4
                                                 64@16x16      64@8x8
                      32@32x32        32@16x16
         3@32x32                                                                                                                 1x10


                   Conv1           MaxPool1      Conv2             MaxPool2           Conv3                      MaxPool3

                             Figure S15: Convolutional architectures for CIFAR-10.


                                                                   29
Published as a conference paper at ICLR 2026


Algorithm S2 Two phases in EP training process for convolution architecture
Input: Sample-label pairs (x, star )
Parameter: θ = [W0 , W1 , Wf , Bf , B1 , α1 , β1 , βf 1 ]
Output: θ
 1: function F IRST- PHASE(θ, star )
 2:    s0 ← x
 3:    for t ← 1 to T do
 4:        h1 ← Conv0 (s0 ) + β1 · MaxUnpool1 (ConvT1 (s02 ))
 5:        h2 ← Conv1 (MaxPool1 (s01 ))
 6:        hp ← Wf · Flatten(MaxPool2 (s02 ))
 7:        s01 , s02 , s0p ← ρ(h1 ), ρ(h2 ), SoftMax(hp )
 8:    end for
 9:    Λ1 ← [s0i ], i = 0, 1, 2, p
10:    return Λ1
11: end function
12: function S ECOND - PHASE(θ, Λ1 , star )
             β        β        β
13:    s0 , s1 f 1 , s2 f 1 , sp f 1 ← x, s01 , s02 , s0p
14:    for t ← 1 to K do
                                    β
15:         ep ← star − sp f 1
                                                                     β
16:            h1 ← Conv0 (s0 ) + β1 · MaxUnpool1 (ConvT1 (s2 f 1 ))
                                        β
17:            h2 ← Conv1 (MaxPool1 (s1 f 1 )) + βf · MaxUnpool2 (Unflatten(Wf⊤ ep ))
                                                            β
18:        hp ← Wf · Flatten(MaxPool2 (s2 f 1 ))
            β        β        β
19:        s1 f 1 , s2 f 1 , sp f 1 ← ρ(h1 ), ρ(h2 ), SoftMax(hp )
20:    end for
21: end function


                                                                30
Published as a conference paper at ICLR 2026


Algorithm S3 EP with feedback scaling and residual connections (Figure 3b)
Input: (x, star )
Parameter: θ = [W0 , Wi , Wf , Bf , Bi , βi , βf 1 ]
Output: θ
 1: function I TERATION(θ, Λ1 , star )
 2:    for t ← 1 to K do
 3:         if Nudging Phase then
 4:              βf ← βf 1
                       β        β        β
 5:              s0 , s1 f 1 , s2 f 1 , sp f 1 ← x, s01 , s02 , s0p
 6:         else
 7:              βf ← 0
 8:              s0 ← x
 9:         end if
                                              β                   β
10:         h1 ← W0 s0 + β1 B1 s2 f + β4,1 B4,1 s4 f
                            βf                  βf
11:         h2 ← W1 s1 + β2 B2 s3
                            β                   β
12:         h3 ← W2 s2 f + β3 B3 s4 f
                            βf                   β              β   β
13:         h4 ← W3 s3 + β14 B4 s5 f + W1,4 s1 f + β7,4 B7,4 s7 f
                            β                   β
14:         h5 ← W4 s4 f + β5 B5 s6 f
                            βf                  β
15:         h6 ← W5 s5 + β6 B6 s7 f
                            β                   β             β      β
16:         h7 ← W6 s6 f + β7 B7 s8 f + W4,7 s4 f + β10,7 B10,7 s10f
                            βf                  βf
17:         h8 ← W7 s7 + β8 B8 s9
                            β                   β
18:         h9 ← W8 s8 f + β9 B9 s10f
                              βf                  β               β
19:         h10 ← W9 s9 + βf Bf ep f + W7,10 s7 f
                            β
20:         hp ← Wf s10f
              βf
21:         si ← ρ(hi ), i = 0, 1, 2, . . . , 10
              β
22:         sp f ← SoftMax(hp )
23:    end for
24: end function


                                               31