summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYurenHao0426 <Blackhao0426@gmail.com>2026-04-08 05:57:53 -0500
committerYurenHao0426 <Blackhao0426@gmail.com>2026-04-08 05:57:53 -0500
commitbe39c2b5ebec37f993b1a862459455a98cf39eb2 (patch)
tree0b373ccfd983ae866f12c9029db3bfd863a8e2fd
parent52693a9be4349c2820ac79e3e3d9af53813a7412 (diff)
Round 35: SB and CB also show data-agnostic Mode 1 growth on random targets
- experiments/cifar_resmlp.py: add --methods filter and --random_targets flag; extend compute_diagnostics to log hidden_norms_per_layer and bp_grad_norms_per_layer - paper/main.tex §3 ¶1: broaden random-target finding to all 3 fixed-feedback methods (DFA: ||h_L||=14510, SB: ||h_L||=6225, CB: ||h_L||=19974 at ep 3, all at chance acc) - paper/main.tex Appendix J: extended with cross-method smoke-test table. This generalizes the §3 mechanism story from 'DFA-specific' to 'all 3 audited fixed-feedback local-credit methods'. Combined with rounds 32-34, the proximate cause of Mode 1 (a) is now well-localized: - Does not require residual skip (round 33 H2 walkback) - Does not require task signal (round 34 random targets, DFA) - Not DFA-specific (round 35 random targets, SB+CB) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
-rw-r--r--experiments/cifar_resmlp.py110
-rw-r--r--paper/main.pdfbin461963 -> 464384 bytes
-rw-r--r--paper/main.tex22
3 files changed, 88 insertions, 44 deletions
diff --git a/experiments/cifar_resmlp.py b/experiments/cifar_resmlp.py
index 1582f6d..4324e9e 100644
--- a/experiments/cifar_resmlp.py
+++ b/experiments/cifar_resmlp.py
@@ -99,6 +99,8 @@ def train_bp(model, train_loader, test_loader, device, args):
for x, y in train_loader:
x = x.view(x.size(0), -1).to(device)
y = y.to(device)
+ if getattr(args, 'random_targets', False):
+ y = torch.randint(0, args.num_classes, y.shape, device=device)
logits = model(x)
loss = F.cross_entropy(logits, y)
optimizer.zero_grad()
@@ -160,6 +162,8 @@ def train_dfa(model, train_loader, test_loader, device, args):
for x, y in train_loader:
x = x.view(x.size(0), -1).to(device)
y = y.to(device)
+ if getattr(args, 'random_targets', False):
+ y = torch.randint(0, args.num_classes, y.shape, device=device)
batch = x.size(0)
# Forward pass (no grad for hidden states)
@@ -262,6 +266,8 @@ def train_state_bridge(model, train_loader, test_loader, device, args):
for x, y in train_loader:
x = x.view(x.size(0), -1).to(device)
y = y.to(device)
+ if getattr(args, 'random_targets', False):
+ y = torch.randint(0, args.num_classes, y.shape, device=device)
batch = x.size(0)
with torch.no_grad():
@@ -418,6 +424,8 @@ def train_credit_bridge(model, train_loader, test_loader, device, args):
for x, y in train_loader:
x = x.view(x.size(0), -1).to(device)
y = y.to(device)
+ if getattr(args, 'random_targets', False):
+ y = torch.randint(0, args.num_classes, y.shape, device=device)
batch = x.size(0)
with torch.no_grad():
@@ -595,10 +603,16 @@ def compute_diagnostics(model, method_name, test_loader, device, args,
e_T[torch.arange(batch), y] -= 1
s = e_T.detach()
+ # Per-layer hidden norms (median across batch) and BP grad norms (per-sample L2, median)
+ hidden_norms_per_layer = [float(hiddens[l].detach().norm(dim=-1).median().item()) for l in range(L + 1)]
+ bp_grad_norms_per_layer = [float(bp_grads[l].norm(dim=-1).median().item()) for l in range(L + 1)]
+
results = {
'bp_cosine': [],
'perturbation_rho': [],
'nudging': {'0.001': [], '0.003': [], '0.01': []},
+ 'hidden_norms_per_layer': hidden_norms_per_layer,
+ 'bp_grad_norms_per_layer': bp_grad_norms_per_layer,
}
for l in range(L):
@@ -673,56 +687,62 @@ def run_experiment(args):
seed_results = {}
+ methods_to_run = getattr(args, 'methods', ['bp', 'dfa', 'state_bridge', 'credit_bridge'])
+
# ---- BP ----
- print("\n--- BP ---")
- model_bp = ResidualMLP(input_dim, args.d_hidden, num_classes, args.num_blocks).to(device)
- init_bp = {n: p.clone().detach() for n, p in model_bp.named_parameters()}
- bp_log = train_bp(model_bp, train_loader, test_loader, device, args)
- bp_diag = compute_diagnostics(model_bp, 'bp', test_loader, device, args)
- bp_drift = feature_drift(init_bp, {n: p.detach() for n, p in model_bp.named_parameters()})
- seed_results['bp'] = {'log': bp_log, 'diagnostics': bp_diag, 'drift': bp_drift}
- print(f" Final test acc: {bp_log['test_acc'][-1]:.4f}")
+ if 'bp' in methods_to_run:
+ print("\n--- BP ---")
+ model_bp = ResidualMLP(input_dim, args.d_hidden, num_classes, args.num_blocks).to(device)
+ init_bp = {n: p.clone().detach() for n, p in model_bp.named_parameters()}
+ bp_log = train_bp(model_bp, train_loader, test_loader, device, args)
+ bp_diag = compute_diagnostics(model_bp, 'bp', test_loader, device, args)
+ bp_drift = feature_drift(init_bp, {n: p.detach() for n, p in model_bp.named_parameters()})
+ seed_results['bp'] = {'log': bp_log, 'diagnostics': bp_diag, 'drift': bp_drift}
+ print(f" Final test acc: {bp_log['test_acc'][-1]:.4f}")
# ---- DFA ----
- print("\n--- DFA ---")
- torch.manual_seed(seed)
- np.random.seed(seed)
- torch.cuda.manual_seed_all(seed)
- model_dfa = ResidualMLP(input_dim, args.d_hidden, num_classes, args.num_blocks).to(device)
- init_dfa = {n: p.clone().detach() for n, p in model_dfa.named_parameters()}
- dfa_log, dfa_Bs = train_dfa(model_dfa, train_loader, test_loader, device, args)
- dfa_diag = compute_diagnostics(model_dfa, 'dfa', test_loader, device, args, dfa_Bs=dfa_Bs)
- dfa_drift = feature_drift(init_dfa, {n: p.detach() for n, p in model_dfa.named_parameters()})
- seed_results['dfa'] = {'log': dfa_log, 'diagnostics': dfa_diag, 'drift': dfa_drift}
- print(f" Final test acc: {dfa_log['test_acc'][-1]:.4f}")
+ if 'dfa' in methods_to_run:
+ print("\n--- DFA ---")
+ torch.manual_seed(seed)
+ np.random.seed(seed)
+ torch.cuda.manual_seed_all(seed)
+ model_dfa = ResidualMLP(input_dim, args.d_hidden, num_classes, args.num_blocks).to(device)
+ init_dfa = {n: p.clone().detach() for n, p in model_dfa.named_parameters()}
+ dfa_log, dfa_Bs = train_dfa(model_dfa, train_loader, test_loader, device, args)
+ dfa_diag = compute_diagnostics(model_dfa, 'dfa', test_loader, device, args, dfa_Bs=dfa_Bs)
+ dfa_drift = feature_drift(init_dfa, {n: p.detach() for n, p in model_dfa.named_parameters()})
+ seed_results['dfa'] = {'log': dfa_log, 'diagnostics': dfa_diag, 'drift': dfa_drift}
+ print(f" Final test acc: {dfa_log['test_acc'][-1]:.4f}")
# ---- State Bridge ----
- print("\n--- State Bridge ---")
- torch.manual_seed(seed)
- np.random.seed(seed)
- torch.cuda.manual_seed_all(seed)
- model_sb = ResidualMLP(input_dim, args.d_hidden, num_classes, args.num_blocks).to(device)
- init_sb = {n: p.clone().detach() for n, p in model_sb.named_parameters()}
- sb_log, state_pred = train_state_bridge(model_sb, train_loader, test_loader, device, args)
- sb_diag = compute_diagnostics(model_sb, 'state_bridge', test_loader, device, args,
- state_predictor=state_pred)
- sb_drift = feature_drift(init_sb, {n: p.detach() for n, p in model_sb.named_parameters()})
- seed_results['state_bridge'] = {'log': sb_log, 'diagnostics': sb_diag, 'drift': sb_drift}
- print(f" Final test acc: {sb_log['test_acc'][-1]:.4f}")
+ if 'state_bridge' in methods_to_run:
+ print("\n--- State Bridge ---")
+ torch.manual_seed(seed)
+ np.random.seed(seed)
+ torch.cuda.manual_seed_all(seed)
+ model_sb = ResidualMLP(input_dim, args.d_hidden, num_classes, args.num_blocks).to(device)
+ init_sb = {n: p.clone().detach() for n, p in model_sb.named_parameters()}
+ sb_log, state_pred = train_state_bridge(model_sb, train_loader, test_loader, device, args)
+ sb_diag = compute_diagnostics(model_sb, 'state_bridge', test_loader, device, args,
+ state_predictor=state_pred)
+ sb_drift = feature_drift(init_sb, {n: p.detach() for n, p in model_sb.named_parameters()})
+ seed_results['state_bridge'] = {'log': sb_log, 'diagnostics': sb_diag, 'drift': sb_drift}
+ print(f" Final test acc: {sb_log['test_acc'][-1]:.4f}")
# ---- Credit Bridge ----
- print("\n--- Credit Bridge ---")
- torch.manual_seed(seed)
- np.random.seed(seed)
- torch.cuda.manual_seed_all(seed)
- model_cb = ResidualMLP(input_dim, args.d_hidden, num_classes, args.num_blocks).to(device)
- init_cb = {n: p.clone().detach() for n, p in model_cb.named_parameters()}
- cb_log, vnet, vnet_ema = train_credit_bridge(model_cb, train_loader, test_loader, device, args)
- cb_diag = compute_diagnostics(model_cb, 'credit_bridge', test_loader, device, args,
- value_net=vnet)
- cb_drift = feature_drift(init_cb, {n: p.detach() for n, p in model_cb.named_parameters()})
- seed_results['credit_bridge'] = {'log': cb_log, 'diagnostics': cb_diag, 'drift': cb_drift}
- print(f" Final test acc: {cb_log['test_acc'][-1]:.4f}")
+ if 'credit_bridge' in methods_to_run:
+ print("\n--- Credit Bridge ---")
+ torch.manual_seed(seed)
+ np.random.seed(seed)
+ torch.cuda.manual_seed_all(seed)
+ model_cb = ResidualMLP(input_dim, args.d_hidden, num_classes, args.num_blocks).to(device)
+ init_cb = {n: p.clone().detach() for n, p in model_cb.named_parameters()}
+ cb_log, vnet, vnet_ema = train_credit_bridge(model_cb, train_loader, test_loader, device, args)
+ cb_diag = compute_diagnostics(model_cb, 'credit_bridge', test_loader, device, args,
+ value_net=vnet)
+ cb_drift = feature_drift(init_cb, {n: p.detach() for n, p in model_cb.named_parameters()})
+ seed_results['credit_bridge'] = {'log': cb_log, 'diagnostics': cb_diag, 'drift': cb_drift}
+ print(f" Final test acc: {cb_log['test_acc'][-1]:.4f}")
all_results[seed] = seed_results
@@ -767,6 +787,10 @@ def main():
parser.add_argument('--seeds', type=int, nargs='+', default=[42, 123, 456])
parser.add_argument('--gpu', type=int, default=1)
parser.add_argument('--output_dir', type=str, default='results/cifar10')
+ parser.add_argument('--methods', type=str, nargs='+', default=['bp', 'dfa', 'state_bridge', 'credit_bridge'],
+ help='Subset of methods to run.')
+ parser.add_argument('--random_targets', action='store_true',
+ help='Replace each minibatch label with i.i.d. random class targets (Mode 1 data-agnostic test).')
args = parser.parse_args()
run_experiment(args)
diff --git a/paper/main.pdf b/paper/main.pdf
index 0ba5ae7..a62416c 100644
--- a/paper/main.pdf
+++ b/paper/main.pdf
Binary files differ
diff --git a/paper/main.tex b/paper/main.tex
index 8bb6857..49cefc8 100644
--- a/paper/main.tex
+++ b/paper/main.tex
@@ -76,7 +76,7 @@ When we compare each method to a frozen-blocks baseline matched to the same arch
\section{Failure Mode 1: Measurement Degeneracy}
\label{sec:mode1}
-The first failure mode is a scale pathology, not yet an alignment pathology. On the audited 4-block pre-LayerNorm ResMLP ($d{=}256$, CIFAR-10, 100 epochs, 3 seeds), DFA optimizes block-local objectives of the form $\langle f_l(h_l),\, e_T B_l^\top\rangle$ with no explicit scale constraint on $f_l$, so for any direction in which increasing $\|f_l(h_l)\|$ improves alignment with the fixed feedback target $B_l^\top e_T$, the local objective rewards larger output magnitude. In a pre-LN residual stack, larger block outputs directly increase residual-stream scale; terminal LayerNorm then removes task-loss sensitivity to that scale at the output, so the architecture provides no global restraint on the local growth incentive \citep{launay2020direct}. In the same runs, each block's $w_1$ and $w_2$ grows by roughly $200\times$ in relative delta, their norm product reaches about $5\times 10^4$ per block, and the terminal hidden-state norm $\|h_L\|$ rises monotonically from about $9$ at random initialization to about $4\times 10^8$ by epoch 100 (Figure~\ref{fig:temporal_cross_arch}). Most of that growth appears immediately: $\|h_L\|$ already reaches about $10^6$ by epoch 5. As a direct test of whether this growth needs task signal at all, we re-ran DFA on the same backbone with i.i.d.\ random class targets refreshed every minibatch, so the labels carry no information; under random targets the network does not learn (test accuracy stays at chance), yet $\|h_L\|$ still grows from about $9$ to about $1.45\times 10^4$ within three epochs, and $\|g_L\|$ already drops to about $5.6\times 10^{-7}$, so Mode~1 is essentially data-agnostic on this architecture (Appendix~\ref{app:random_targets}). Once the residual stream reaches this regime, the backpropagation reference vector no longer behaves like a healthy target.
+The first failure mode is a scale pathology, not yet an alignment pathology. On the audited 4-block pre-LayerNorm ResMLP ($d{=}256$, CIFAR-10, 100 epochs, 3 seeds), DFA optimizes block-local objectives of the form $\langle f_l(h_l),\, e_T B_l^\top\rangle$ with no explicit scale constraint on $f_l$, so for any direction in which increasing $\|f_l(h_l)\|$ improves alignment with the fixed feedback target $B_l^\top e_T$, the local objective rewards larger output magnitude. In a pre-LN residual stack, larger block outputs directly increase residual-stream scale; terminal LayerNorm then removes task-loss sensitivity to that scale at the output, so the architecture provides no global restraint on the local growth incentive \citep{launay2020direct}. In the same runs, each block's $w_1$ and $w_2$ grow by roughly $200\times$ in relative delta, their norm product reaches about $5\times 10^4$ per block, and the terminal hidden-state norm $\|h_L\|$ rises monotonically from about $9$ at random initialization to about $4\times 10^8$ by epoch 100 (Figure~\ref{fig:temporal_cross_arch}). Most of that growth appears immediately: $\|h_L\|$ already reaches about $10^6$ by epoch 5. As a direct test of whether this growth needs task signal at all, we re-ran DFA, State Bridge, and Credit Bridge on the same backbone with i.i.d.\ random class targets refreshed every minibatch, so the labels carry no information; under random targets all three methods stay at chance accuracy, yet $\|h_L\|$ still grows from about $9$ to about $1.45\times 10^4$ for DFA, $6.2\times 10^3$ for State Bridge, and $2.0\times 10^4$ for Credit Bridge within three epochs, and DFA's $\|g_L\|$ already drops to about $5.6\times 10^{-7}$, so Mode~1 is essentially data-agnostic on this architecture across the three audited fixed-feedback local-credit methods (Appendix~\ref{app:random_targets}). Once the residual stream reaches this regime, the backpropagation reference vector no longer behaves like a healthy target.
The measurement failure occurs at the point where the hidden-layer BP gradient ceases to be a meaningful reference direction. In terminal-LayerNorm architectures, the LayerNorm Jacobian scales as $\partial \mathrm{LN}(h)/\partial h \propto 1/\|h\|$ in expectation, so the same residual-stream inflation is accompanied by collapse of the hidden-layer BP reference norm: on DFA-trained ResMLP, $\|g_L\|$ falls from about $9.8\times 10^{-4}$ at random initialization to about $5\times 10^{-10}$ by epoch 100, a six-order-of-magnitude drop, while the reported cosine remains mathematically defined only because \texttt{F.cosine\_similarity} clamps the denominator at $\varepsilon{=}10^{-8}$ (Table~\ref{tab:main_audit}; Figure~\ref{fig:audit_hero}). At that endpoint the reference norm is about $20\times$ below the clamp, so the quantity being reported is effectively $(a\cdot b)/(\|a\|\max(\|b\|,10^{-8}))$ rather than a comparison to an informative BP direction. At that point, reporting a cosine is no longer evidence about credit quality.
@@ -438,6 +438,26 @@ $3$ & $14{,}510$ & $5.62\times 10^{-7}$ & $0.071$ & $-0.025$ \\
This ablation answers the natural counterargument that DFA's residual-stream growth might be a side-effect of the network adapting to genuine task signal in a particularly bad local minimum: it is not. With no task signal at all, DFA on this architecture still inflates the residual stream by more than three orders of magnitude in the first three epochs and pushes the deepest BP reference gradient to the floor of $10^{-7}$ in the same window. The local DFA objective $\langle f_l(h_l),\, e_T B_l^\top\rangle$ contains no penalty on $\|f_l(h_l)\|$, so any direction in which a larger block output increases inner-product alignment with the fixed feedback target is rewarded; the random-target run isolates exactly this geometric incentive, free of any task-driven feature pressure. The full $100$-epoch trajectory of this random-target run is reported as a confirmatory check rather than a primary claim.
+We then asked whether this data-agnostic growth is specific to DFA or generalizes to other fixed-feedback local-credit methods, by repeating the random-target ablation under State Bridge and Credit Bridge with the same architecture, hyperparameters, and seed. Both methods also exhibit data-agnostic activation growth in the same three-epoch window, with $\|h_L\|$ rising from about $9$ to about $6.2\times 10^3$ (State Bridge) and about $2.0\times 10^4$ (Credit Bridge), while their test accuracies remain at chance ($0.10$ and $0.09$, respectively):
+
+\begin{table}[h]
+\centering
+\small
+\caption{Random-target ablation across the three audited fixed-feedback local-credit methods on the standard residual ResMLP-d256, seed 42, three epochs of training with i.i.d.\ random class targets. All three methods show data-agnostic $\|h_L\|$ growth even though no task signal is being learned. State Bridge (SB) grows more slowly than DFA in absolute magnitude, whereas Credit Bridge (CB) grows slightly faster, so bridge-style normalization provides at most partial scale damping and does not prevent growth.}
+\label{tab:random_targets_sbcb_smoke}
+\begin{tabular}{lrrr}
+\toprule
+method & $\|h_L\|$ at ep 3 & $\|g_L\|$ at ep 3 & test acc \\
+\midrule
+DFA & $14{,}510$ & $5.6\times 10^{-7}$ & $0.071$ \\
+State Bridge & $6{,}225$ & $1.0\times 10^{-5}$ & $0.104$ \\
+Credit Bridge & $19{,}974$ & $3.2\times 10^{-6}$ & $0.092$ \\
+\bottomrule
+\end{tabular}
+\end{table}
+
+The cross-method version of the test rules out the explanation that the random-target growth is specific to DFA's particular feedback projection. State Bridge and Credit Bridge use bridge constructions with target normalization and stop-gradients, so any residual-stream growth they exhibit cannot be attributed to a simple absence of normalization. Their $\|g_L\|$ values at three epochs are still well above the $10^{-7}$ floor used by diagnostic~(b), so the gradient-collapse part of Mode~1 does not yet appear at this horizon for SB/CB; the activation-growth part of Mode~1 is already present. We treat this as evidence that the local-credit growth incentive is not unique to DFA but is shared by the audited family of fixed-feedback methods.
+
\section{Reproducibility}
\label{app:reproducibility}