"""
Frozen-init identity check (supporting measurement for the depth-utility ladder).

Quantifies how close a randomly-initialized, frozen ResidualMLP block stack is to
the identity map. This grounds the footnote explaining why the k=0 rung of the
ladder (all blocks frozen at init) already sits well above chance: the trained
embedding + readout are composed with a fixed, near-norm-preserving random feature
map, i.e. effectively a trained (near-)linear classifier on pixels.

Reports, at random init, on a CIFAR-10 test batch (mean over seeds):
  - per-block residual ratio   ||f_l(h_l)|| / ||h_l||     (median over batch)
  - whole-stack deviation      ||h_L - h_0|| / ||h_0||    (median over batch)
  - whole-stack direction      cos(h_L, h_0)              (median over batch)

Usage (run from the repository root; the dataset and result paths are relative):
    CUDA_VISIBLE_DEVICES=2 python experiments/frozen_init_identity_check.py

Note: nothing in this script moves a tensor or module to a GPU, so the
CUDA_VISIBLE_DEVICES prefix is optional.
"""
# Provenance (recovered from the git patch this file was extracted from):
#   commit 1118b7457c261de36ead6103503c00c321c75f9b
#   From: YurenHao0426, Sun, 14 Jun 2026 20:32:31 -0500
#   Subject: Depth-utility ladder: trainable-block sweep (BP/FA/DFA) on ResMLP
#            CIFAR-10
#   New file (mode 100644, blob 3f58d7d): experiments/frozen_init_identity_check.py
#
#   Appendix experiment triangulating the depth-utility diagnostic (D3) by
#   varying the number of trainable residual blocks k (last-k trainable, first
#   L-k frozen at init; embed/LN/head always trained).
#   - d=256 L=4 and d=512 L=2, 3 seeds, recipe identical to the main audit.
#   - BP climbs monotonically (+22-23pp); DFA peaks at the frozen baseline (k=0)
#     and declines once any deep block is trained; FA shows partial/no net depth
#     utility.
#   - Cross-checks reproduce existing anchors (BP 0.617, DFA 0.301, FA 0.402,
#     frozen 0.349).
#   - frozen_init_identity_check quantifies frozen stack as a near-norm-preserving
#     random feature map (per-block ||f||/||h||~0.10, stack cos 0.981), explaining
#     the above-chance k=0 rung.
#   Co-Authored-By: Claude Opus 4.8 (1M context)

import json
import os
import sys

import numpy as np
import torch
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

# The script lives in experiments/; put its parent directory on sys.path so the
# sibling `models` package can be imported when the file is run directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from models.residual_mlp import ResidualMLP  # noqa: E402  (needs the path tweak above)


def _load_test_batch(n):
    """Return the first ``n`` CIFAR-10 test images, normalized and flattened.

    The images are taken in dataset order (indices 0..n-1), converted with
    ``ToTensor``, normalized with the per-channel mean/std constants below and
    reshaped to ``(n, -1)``. The dataset is downloaded into ``./data`` if it
    is not already there.
    """
    tf = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2470, 0.2435, 0.2616)),
    ])
    ds = torchvision.datasets.CIFAR10('./data', train=False, download=True,
                                      transform=tf)
    return torch.stack([ds[i][0] for i in range(n)]).view(n, -1)


def _identity_gap(model, x):
    """Measure how far ``model``'s block stack moves the embedded batch ``x``.

    Returns:
        (ratios, rel, cos) where
        ratios -- list with one float per block: batch median of
                  ||block(h)|| / ||h|| at that block's input h,
        rel    -- batch median of ||h_L - h_0|| / ||h_0||,
        cos    -- batch median of cos(h_L, h_0),
        with h_0 = model.embed(x) and h_L the state after all blocks.

    NOTE(review): this treats ``block(h)`` as the residual branch f(h) only and
    adds the skip connection here (``h = h + branch``) -- confirm against
    models/residual_mlp.py that the blocks do not add the skip themselves.
    """
    with torch.no_grad():
        h0 = model.embed(x)
        h = h0
        ratios = []
        for block in model.blocks:
            branch = block(h)
            # torch's median of an even-sized batch is the lower of the two
            # middle values (not their average, as numpy.median would give).
            ratios.append(float((branch.norm(dim=-1) / h.norm(dim=-1)).median()))
            h = h + branch
        rel = float(((h - h0).norm(dim=-1) / h0.norm(dim=-1)).median())
        cos = float(F.cosine_similarity(h, h0, dim=-1).median())
    return ratios, rel, cos


def _summarize(config, seed_rows, per_block, rel_dev, cos_dev):
    """Assemble the JSON-serializable summary: per-seed rows plus seed aggregates.

    ``per_block`` is a list (one entry per seed) of per-block ratio lists;
    ``rel_dev`` and ``cos_dev`` hold one float per seed. Standard deviations
    use ``ddof=1`` (sample standard deviation across seeds).
    """
    pb = np.array(per_block)
    return {
        'config': config,
        'per_seed': seed_rows,
        'per_block_ratio_mean': pb.mean(0).tolist(),
        'per_block_ratio_grand_mean': float(pb.mean()),
        'rel_dev_mean': float(np.mean(rel_dev)),
        'rel_dev_std': float(np.std(rel_dev, ddof=1)),
        'cos_mean': float(np.mean(cos_dev)),
        'cos_std': float(np.std(cos_dev, ddof=1)),
    }


def main():
    """Run the identity check for every seed, print the results and save them.

    For each seed a fresh ``ResidualMLP`` is built at random init, put in eval
    mode and measured on the same fixed CIFAR-10 test batch. Per-seed and
    aggregate numbers are printed and written to
    ``results/depth_ladder/frozen_init_identity.json``.
    """
    d_hidden, L, C, n = 256, 4, 10, 256
    seeds = [42, 123, 456]

    # Load the batch once, before any seeding, so every seed sees the same inputs.
    x = _load_test_batch(n)

    per_block, rel_dev, cos_dev = [], [], []
    seed_rows = {}
    for seed in seeds:
        # Seed before construction: the quantity of interest is the model's
        # random initialization.
        torch.manual_seed(seed)
        np.random.seed(seed)
        model = ResidualMLP(32 * 32 * 3, d_hidden, C, L).eval()
        ratios, rel, cos = _identity_gap(model, x)

        per_block.append(ratios)
        rel_dev.append(rel)
        cos_dev.append(cos)
        seed_rows[str(seed)] = {'per_block_ratio': ratios, 'rel_dev': rel, 'cos': cos}
        print(f"seed {seed}: per-block ||f||/||h|| = "
              f"{['%.4f' % r for r in ratios]} "
              f"||h_L-h_0||/||h_0|| = {rel:.3f} cos(h_L,h_0) = {cos:.4f}", flush=True)

    config = {'d_hidden': d_hidden, 'L': L, 'num_classes': C, 'batch': n,
              'dataset': 'cifar10-test', 'seeds': seeds}
    summary = _summarize(config, seed_rows, per_block, rel_dev, cos_dev)
    print(f"\nMEAN over {len(seeds)} seeds: "
          f"per-block ratio ≈ {summary['per_block_ratio_grand_mean']:.3f}, "
          f"||h_L-h_0||/||h_0|| = {summary['rel_dev_mean']:.3f} ± {summary['rel_dev_std']:.3f}, "
          f"cos = {summary['cos_mean']:.4f} ± {summary['cos_std']:.4f}", flush=True)

    out = 'results/depth_ladder/frozen_init_identity.json'
    os.makedirs(os.path.dirname(out), exist_ok=True)
    # json.dump escapes non-ASCII by default, so the encoding does not change the
    # bytes written; it is stated explicitly to be independent of the locale.
    with open(out, 'w', encoding='utf-8') as fh:
        json.dump(summary, fh, indent=2)
    print(f"Saved -> {out}", flush=True)


if __name__ == '__main__':
    main()