From 1118b7457c261de36ead6103503c00c321c75f9b Mon Sep 17 00:00:00 2001
From: YurenHao0426 <Blackhao0426@gmail.com>
Date: Sun, 14 Jun 2026 20:32:31 -0500
Subject: Depth-utility ladder: trainable-block sweep (BP/FA/DFA) on ResMLP
 CIFAR-10

Appendix experiment triangulating the depth-utility diagnostic (D3) by varying
the number of trainable residual blocks k (last-k trainable, first L-k frozen at
init; embed/LN/head always trained).

- d=256 L=4 and d=512 L=2, 3 seeds, recipe identical to the main audit.
- BP climbs monotonically (+22-23pp); DFA peaks at the frozen baseline (k=0) and
  declines once any deep block is trained; FA shows partial/no net depth utility.
- Cross-checks reproduce existing anchors (BP 0.617, DFA 0.301, FA 0.402, frozen 0.349).
- frozen_init_identity_check shows that the frozen stack acts as a
  near-norm-preserving random feature map (per-block ||f||/||h|| ~ 0.10, stack
  cos 0.981), which explains the above-chance k=0 rung.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 experiments/frozen_init_identity_check.py | 82 +++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 experiments/frozen_init_identity_check.py

(limited to 'experiments/frozen_init_identity_check.py')

diff --git a/experiments/frozen_init_identity_check.py b/experiments/frozen_init_identity_check.py
new file mode 100644
index 0000000..3f58d7d
--- /dev/null
+++ b/experiments/frozen_init_identity_check.py
@@ -0,0 +1,82 @@
+"""
+Frozen-init identity check (supporting measurement for the depth-utility ladder).
+
+Quantifies how close a randomly-initialized, frozen ResidualMLP block stack is to
+the identity map. This grounds the footnote explaining why the k=0 rung of the
+ladder (all blocks frozen at init) already sits well above chance: the trained
+embedding + readout are composed with a fixed, near-norm-preserving random feature
+map, i.e. effectively a trained (near-)linear classifier on pixels.
+
+Reports, at random init, on a CIFAR-10 test batch (mean over seeds):
+  - per-block residual ratio   ||f_l(h_l)|| / ||h_l||         (median over batch)
+  - whole-stack deviation      ||h_L - h_0|| / ||h_0||        (median over batch)
+  - whole-stack direction      cos(h_L, h_0)                  (median over batch)
+
+Usage:
+    python experiments/frozen_init_identity_check.py   # run from the repo root; nothing here is moved to a GPU, so CUDA_VISIBLE_DEVICES is optional
+"""
+import os, sys, json
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torchvision
+import torchvision.transforms as transforms
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # parent of this script's directory goes first on sys.path so the `models` import resolves
+from models.residual_mlp import ResidualMLP
+
+
+def main():  # Measure, at random init, how far the block stack moves h away from embed(x); print per-seed and mean stats and save them as JSON.
+    d_hidden, L, C, n = 256, 4, 10, 256  # saved below as config 'd_hidden', 'L', 'num_classes', 'batch'
+    seeds = [42, 123, 456]
+    tf = transforms.Compose([transforms.ToTensor(),
+                             transforms.Normalize((0.4914, 0.4822, 0.4465),
+                                                  (0.2470, 0.2435, 0.2616))])  # per-channel (mean, std); presumably the CIFAR-10 statistics -- confirm against the main audit recipe
+    ds = torchvision.datasets.CIFAR10('./data', train=False, download=True, transform=tf)  # test split; './data' is relative to the CWD and is downloaded if missing
+    x = torch.stack([ds[i][0] for i in range(n)]).view(n, -1)  # first n test images, each flattened to one row; the same batch is reused for every seed
+    # Per-seed accumulators: list of per-block ratios, whole-stack relative deviation, whole-stack cosine.
+    per_block, rel_dev, cos_dev = [], [], []
+    seed_rows = {}
+    for seed in seeds:
+        torch.manual_seed(seed); np.random.seed(seed)  # reseed before constructing the model so each seed gets its own reproducible init
+        m = ResidualMLP(32 * 32 * 3, d_hidden, C, L).eval()  # fresh, untrained model in eval mode; args presumably (input dim, width, classes, depth) -- confirm in models/residual_mlp.py
+        with torch.no_grad():  # measurement only, no gradients needed
+            h0 = m.embed(x); h = h0; ratios = []  # h0 is the embed output, kept as the reference for the whole-stack stats
+            for blk in m.blocks:
+                f = blk(h)  # NOTE(review): assumes blk(h) returns only the residual branch (no skip inside the block) -- confirm in ResidualMLP
+                ratios.append(float((f.norm(dim=-1) / h.norm(dim=-1)).median()))  # per-sample ||f||/||h||, then median over the batch (torch.median picks the lower middle value when n is even)
+                h = h + f  # skip connection is applied here, outside the block call
+            rel = float(((h - h0).norm(dim=-1) / h0.norm(dim=-1)).median())  # whole-stack ||h_L - h_0|| / ||h_0||, median over the batch
+            cos = float(F.cosine_similarity(h, h0, dim=-1).median())  # whole-stack cos(h_L, h_0), median over the batch
+        per_block.append(ratios); rel_dev.append(rel); cos_dev.append(cos)
+        seed_rows[str(seed)] = {'per_block_ratio': ratios, 'rel_dev': rel, 'cos': cos}  # keyed by the seed as a string in the saved JSON
+        print(f"seed {seed}: per-block ||f||/||h|| = "
+              f"{['%.4f' % r for r in ratios]}  "
+              f"||h_L-h_0||/||h_0|| = {rel:.3f}  cos(h_L,h_0) = {cos:.4f}", flush=True)
+    # Aggregate across seeds: pb has one row per seed and one column per block iterated above.
+    pb = np.array(per_block)
+    summary = {
+        'config': {'d_hidden': d_hidden, 'L': L, 'num_classes': C, 'batch': n,
+                   'dataset': 'cifar10-test', 'seeds': seeds},
+        'per_seed': seed_rows,
+        'per_block_ratio_mean': pb.mean(0).tolist(),  # mean over seeds, one value per block
+        'per_block_ratio_grand_mean': float(pb.mean()),  # mean over all seeds and blocks
+        'rel_dev_mean': float(np.mean(rel_dev)),
+        'rel_dev_std': float(np.std(rel_dev, ddof=1)),  # sample std (ddof=1) across seeds
+        'cos_mean': float(np.mean(cos_dev)),
+        'cos_std': float(np.std(cos_dev, ddof=1)),  # sample std (ddof=1) across seeds
+    }
+    print(f"\nMEAN over {len(seeds)} seeds: "
+          f"per-block ratio ≈ {summary['per_block_ratio_grand_mean']:.3f}, "
+          f"||h_L-h_0||/||h_0|| = {summary['rel_dev_mean']:.3f} ± {summary['rel_dev_std']:.3f}, "
+          f"cos = {summary['cos_mean']:.4f} ± {summary['cos_std']:.4f}", flush=True)
+    # The output path is relative to the current working directory; its directory is created if missing.
+    out = 'results/depth_ladder/frozen_init_identity.json'
+    os.makedirs(os.path.dirname(out), exist_ok=True)
+    with open(out, 'w') as f:  # note: rebinds `f`, which held a block output inside the seed loop; harmless because that tensor is no longer used
+        json.dump(summary, f, indent=2)
+    print(f"Saved -> {out}", flush=True)
+
+
+if __name__ == '__main__':  # run the measurement only when executed as a script, not on import
+    main()
-- 
cgit v1.2.3