Round 34 random-target ablation: Mode 1 fires under random labels too

Codex round 34 picked OPTION A (i.i.d. random class targets per minibatch) over the analytic-only OPTION D as the most discriminating test of 'is (a) intrinsic to DFA update geometry or task-driven?'.

Smoke test result is unambiguous:

    ep 0: ||h_L||=8.9    ||g_L||=9.8e-4
    ep 1: ||h_L||=1616   ||g_L||=5.1e-6
    ep 2: ||h_L||=9768   ||g_L||=8.5e-7
    ep 3: ||h_L||=14510  ||g_L||=5.6e-7
    (test acc still at chance ~0.07)

Three orders of magnitude growth in ||h_L|| in 3 epochs, and three orders of magnitude collapse in ||g_L|| over the same 3 epochs, with NO task signal whatsoever — DFA's local-loss geometry is the proximate driver, not data adaptation.

- experiments/snapshot_evolution_residual_explosion.py: add --random_targets and --skip_bp flags
- paper/main.tex §3 ¶1: replace 'no explicit scale constraint' framing with codex round 34's 6-line geometric argument and the random-target empirical falsifier
- paper/main.tex Appendix J: full smoke-test table + interpretation
- v2.3: 14 pages total, main content still 8 pages

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
author: YurenHao0426 <Blackhao0426@gmail.com> 2026-04-08 05:47:47 -0500
committer: YurenHao0426 <Blackhao0426@gmail.com> 2026-04-08 05:47:47 -0500
commit: 52693a9be4349c2820ac79e3e3d9af53813a7412 (patch)
tree: a198bbd855bce25b6a37fb730eb8de1fc3e29765 /experiments
parent: 8dd65b2ec3df32749adabbf62c55101d5b00ae7b (diff)
1 files changed, 20 insertions, 9 deletions
diff --git a/experiments/snapshot_evolution_residual_explosion.py b/experiments/snapshot_evolution_residual_explosion.py
index 86de4a4..1dc09f2 100644
--- a/experiments/snapshot_evolution_residual_explosion.py
+++ b/experiments/snapshot_evolution_residual_explosion.py
@@ -150,7 +150,8 @@ def train_bp(model, train_loader, x_eval, y_eval, device, epochs, lr, wd, log_ev
     return log
 
 
-def train_dfa(model, train_loader, x_eval, y_eval, device, epochs, lr, wd, log_every=1):
+def train_dfa(model, train_loader, x_eval, y_eval, device, epochs, lr, wd, log_every=1,
+              random_targets: bool = False):
     d_hidden = model.d_hidden
     L = model.num_blocks
     C = 10
@@ -172,6 +173,9 @@ def train_dfa(model, train_loader, x_eval, y_eval, device, epochs, lr, wd, log_e
         for x, y in train_loader:
             x = x.view(x.size(0), -1).to(device)
             y = y.to(device)
+            if random_targets:
+                # iid random class targets refreshed every minibatch (codex round 34 sharper variant)
+                y = torch.randint(0, 10, y.shape, device=device)
             batch = x.size(0)
             with torch.no_grad():
                 logits, hiddens = model(x, return_hidden=True)
@@ -222,6 +226,10 @@ def main():
                    help='Replace h = h + f with h = f (non-residual stack of LN-W1-GELU-W2 blocks).')
     p.add_argument('--w2_std', type=float, default=0.01,
                    help='Init std for w2 in each block. Bump to 0.05 for non-residual stack.')
+    p.add_argument('--random_targets', action='store_true',
+                   help='Replace each minibatch label with iid random class targets (codex round 34 OPTION A).')
+    p.add_argument('--skip_bp', action='store_true',
+                   help='Only train DFA, skip BP. Useful for cheap DFA-only ablations.')
     args = p.parse_args()
 
     os.makedirs(args.output_dir, exist_ok=True)
@@ -235,13 +243,15 @@ def main():
 
     L, d, C = args.depth, args.d_hidden, 10
 
-    print("\n=== BP training ===", flush=True)
-    torch.manual_seed(args.seed); np.random.seed(args.seed); torch.cuda.manual_seed_all(args.seed)
-    bp_model = ResidualMLP(3072, d, C, L,
-                           residual_add=not args.no_residual_add,
-                           w2_std=args.w2_std).to(device)
-    bp_log = train_bp(bp_model, train_loader, x_eval, y_eval, device,
-                      args.epochs, args.lr, args.wd, log_every=args.log_every)
+    bp_log = None
+    if not args.skip_bp:
+        print("\n=== BP training ===", flush=True)
+        torch.manual_seed(args.seed); np.random.seed(args.seed); torch.cuda.manual_seed_all(args.seed)
+        bp_model = ResidualMLP(3072, d, C, L,
+                               residual_add=not args.no_residual_add,
+                               w2_std=args.w2_std).to(device)
+        bp_log = train_bp(bp_model, train_loader, x_eval, y_eval, device,
+                          args.epochs, args.lr, args.wd, log_every=args.log_every)
 
     print("\n=== DFA training ===", flush=True)
     torch.manual_seed(args.seed); np.random.seed(args.seed); torch.cuda.manual_seed_all(args.seed)
@@ -249,7 +259,8 @@ def main():
                             residual_add=not args.no_residual_add,
                             w2_std=args.w2_std).to(device)
     dfa_log = train_dfa(dfa_model, train_loader, x_eval, y_eval, device,
-                        args.epochs, args.lr, args.wd, log_every=args.log_every)
+                        args.epochs, args.lr, args.wd, log_every=args.log_every,
+                        random_targets=args.random_targets)
 
     out = {
         'config': vars(args),
author	YurenHao0426 <Blackhao0426@gmail.com>	2026-04-08 05:47:47 -0500
committer	YurenHao0426 <Blackhao0426@gmail.com>	2026-04-08 05:47:47 -0500
commit	52693a9be4349c2820ac79e3e3d9af53813a7412 (patch)
tree	a198bbd855bce25b6a37fb730eb8de1fc3e29765 /experiments
parent	8dd65b2ec3df32749adabbf62c55101d5b00ae7b (diff)