1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
|
{
"description": "Nudging test 3-seed values for \u00a74 \u00b64 cross-method functional dissociation. Each value is the test-loss change after a single step of size eta=0.01 along the per-layer credit direction at the converged checkpoint.",
"eta": 0.01,
"epochs": 30,
"arch": "4-block d=256 pre-LN ResMLP",
"penalty_lam": 0.01,
"methods": {
"state_bridge": {
"per_seed": {
"42": {
"per_block": [
-0.0035966814029961824,
-0.0023783869110047817,
-0.0017011994495987892,
-0.0012592736165970564
],
"deep_mean": -0.0017796199924002092,
"note": "deep = blocks l1, l2, l3 (excluding l0)"
},
"123": {
"per_block": [
-0.003914414905011654,
-0.002520129084587097,
-0.0018878313712775707,
-0.0014582867734134197
],
"deep_mean": -0.0019554157430926957,
"note": "deep = blocks l1, l2, l3 (excluding l0)"
},
"456": {
"per_block": [
-0.004026009701192379,
-0.0028109778650105,
-0.001904117758385837,
-0.0014447440626099706
],
"deep_mean": -0.002053279895335436,
"note": "deep = blocks l1, l2, l3 (excluding l0)"
}
},
"three_seed_deep_mean": -0.0019294385436094469,
"three_seed_deep_std_ddof0": 0.00011322116053216024
},
"credit_bridge": {
"per_seed": {
"42": {
"per_block": [
-0.0005118446424603462,
-0.0004658599500544369,
-0.00044331286335363984,
-0.00042594311526045203
],
"deep_mean": -0.0004450386428895096,
"note": "deep = blocks l1, l2, l3 (excluding l0)"
},
"123": {
"per_block": [
-0.00045391899766400456,
-0.00041642854921519756,
-0.00038977732765488327,
-0.00037192515446804464
],
"deep_mean": -0.00039271034377937514,
"note": "deep = blocks l1, l2, l3 (excluding l0)"
},
"456": {
"per_block": [
-0.00048537791008129716,
-0.00046242878306657076,
-0.00043993344297632575,
-0.00042223266791552305
],
"deep_mean": -0.0004415316313194732,
"note": "deep = blocks l1, l2, l3 (excluding l0)"
}
},
"three_seed_deep_mean": -0.000426426872662786,
"three_seed_deep_std_ddof0": 2.3884137309066322e-05
},
"dfa": {
"per_seed": {
"42": {
"per_block": [
-0.00013115769252181053,
-5.455967038869858e-05,
-5.524198058992624e-05,
-5.596294067800045e-05
],
"deep_mean": -5.525486388554176e-05,
"note": "deep = blocks l1, l2, l3 (excluding l0)"
},
"123": {
"per_block": [
-0.00010294892126694322,
-3.057112917304039e-05,
-4.6447094064205885e-05,
-5.68098621442914e-05
],
"deep_mean": -4.460936179384589e-05,
"note": "deep = blocks l1, l2, l3 (excluding l0)"
},
"456": {
"per_block": [
-0.00014482333790510893,
-4.394981078803539e-05,
-4.4448592234402895e-05,
-5.99866034463048e-05
],
"deep_mean": -4.946166882291436e-05,
"note": "deep = blocks l1, l2, l3 (excluding l0)"
}
},
"three_seed_deep_mean": -4.9775298167434004e-05,
"three_seed_deep_std_ddof0": 4.3516626110321815e-06
}
},
"ratios_3seed_means": {
"SB / CB": 4.5246645258569975,
"SB / DFA": 38.76297309398745,
"CB / DFA": 8.56703803618358
}
}
|