"""Frozen-init identity check (supporting measurement for the depth-utility ladder).

Quantifies how close a randomly-initialized, frozen ResidualMLP block stack is
to the identity map. This grounds the footnote explaining why the k=0 rung of
the ladder (all blocks frozen at init) already sits well above chance: the
trained embedding + readout are composed with a fixed, near-norm-preserving
random feature map, i.e. effectively a trained (near-)linear classifier on
pixels.

Reports, at random init, on a CIFAR-10 test batch (mean over seeds):
  - per-block residual ratio ||f_l(h_l)|| / ||h_l|| (median over batch)
  - whole-stack deviation ||h_L - h_0|| / ||h_0|| (median over batch)
  - whole-stack direction cos(h_L, h_0) (median over batch)

Usage:
    CUDA_VISIBLE_DEVICES=2 python experiments/frozen_init_identity_check.py
"""
import json
import os
import sys

import numpy as np
import torch
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

# Make the repository root (the parent of this script's directory) importable
# so that the project-local `models` package resolves when run as a script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from models.residual_mlp import ResidualMLP  # noqa: E402


def _median_norm_ratio(numerator, denominator):
    """Return the batch median of the row-wise ratio ||numerator|| / ||denominator||.

    Norms are taken over the last dimension. For an even number of rows
    ``torch.Tensor.median`` returns the lower of the two middle values rather
    than their average.
    """
    row_ratio = numerator.norm(dim=-1) / denominator.norm(dim=-1)
    return float(row_ratio.median())


def main():
    """Measure how far a freshly initialized ResidualMLP block stack is from identity.

    Takes the first 256 CIFAR-10 test images (downloaded to ``./data`` when
    missing), flattens them, and for every seed builds a new ResidualMLP in
    eval mode, then records the per-block residual ratio, the whole-stack
    relative deviation and the whole-stack cosine similarity. Per-seed values
    are printed as they are computed; the aggregate summary is printed and
    written to ``results/depth_ladder/frozen_init_identity.json``.

    No explicit device placement is done in this function.
    """
    hidden_dim, depth, num_classes, batch_size = 256, 4, 10, 256
    seeds = [42, 123, 456]

    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2470, 0.2435, 0.2616))
    preprocess = transforms.Compose([transforms.ToTensor(), normalize])
    test_set = torchvision.datasets.CIFAR10('./data', train=False, download=True,
                                            transform=preprocess)
    # Fixed evaluation batch: the first `batch_size` test images, one flat row each.
    images = [test_set[idx][0] for idx in range(batch_size)]
    pixels = torch.stack(images).view(batch_size, -1)

    ratio_rows, deviations, cosines = [], [], []
    per_seed = {}
    for seed in seeds:
        # Seed before construction so the random initialization is reproducible.
        torch.manual_seed(seed)
        np.random.seed(seed)
        model = ResidualMLP(32 * 32 * 3, hidden_dim, num_classes, depth).eval()

        ratios = []
        with torch.no_grad():
            embedded = model.embed(pixels)
            hidden = embedded
            for block in model.blocks:
                # NOTE(review): assumes each block returns only the residual
                # branch f_l(h) and that the skip connection is applied here --
                # confirm against models.residual_mlp.
                residual = block(hidden)
                # The ratio uses the block's input, so measure before updating.
                ratios.append(_median_norm_ratio(residual, hidden))
                hidden = hidden + residual
            rel = _median_norm_ratio(hidden - embedded, embedded)
            cos = float(F.cosine_similarity(hidden, embedded, dim=-1).median())

        ratio_rows.append(ratios)
        deviations.append(rel)
        cosines.append(cos)
        per_seed[str(seed)] = {'per_block_ratio': ratios, 'rel_dev': rel, 'cos': cos}
        print(f"seed {seed}: per-block ||f||/||h|| = "
              f"{['%.4f' % r for r in ratios]} "
              f"||h_L-h_0||/||h_0|| = {rel:.3f} cos(h_L,h_0) = {cos:.4f}", flush=True)

    # Rows are seeds, columns are blocks.
    ratio_table = np.array(ratio_rows)
    config = {
        'd_hidden': hidden_dim,
        'L': depth,
        'num_classes': num_classes,
        'batch': batch_size,
        'dataset': 'cifar10-test',
        'seeds': seeds,
    }
    summary = {
        'config': config,
        'per_seed': per_seed,
        'per_block_ratio_mean': ratio_table.mean(0).tolist(),
        'per_block_ratio_grand_mean': float(ratio_table.mean()),
        'rel_dev_mean': float(np.mean(deviations)),
        # ddof=1: sample standard deviation across seeds.
        'rel_dev_std': float(np.std(deviations, ddof=1)),
        'cos_mean': float(np.mean(cosines)),
        'cos_std': float(np.std(cosines, ddof=1)),
    }
    print(f"\nMEAN over {len(seeds)} seeds: "
          f"per-block ratio ≈ {summary['per_block_ratio_grand_mean']:.3f}, "
          f"||h_L-h_0||/||h_0|| = {summary['rel_dev_mean']:.3f} ± {summary['rel_dev_std']:.3f}, "
          f"cos = {summary['cos_mean']:.4f} ± {summary['cos_std']:.4f}", flush=True)

    out = 'results/depth_ladder/frozen_init_identity.json'
    os.makedirs(os.path.dirname(out), exist_ok=True)
    with open(out, 'w') as fh:
        json.dump(summary, fh, indent=2)
    print(f"Saved -> {out}", flush=True)


if __name__ == '__main__':
    main()