From 1118b7457c261de36ead6103503c00c321c75f9b Mon Sep 17 00:00:00 2001
From: YurenHao0426 <Blackhao0426@gmail.com>
Date: Sun, 14 Jun 2026 20:32:31 -0500
Subject: Depth-utility ladder: trainable-block sweep (BP/FA/DFA) on ResMLP
 CIFAR-10

Appendix experiment triangulating the depth-utility diagnostic (D3) by varying
the number of trainable residual blocks k (last-k trainable, first L-k frozen at
init; embed/LN/head always trained).

- d=256 L=4 and d=512 L=2, 3 seeds, recipe identical to the main audit.
- BP climbs monotonically (+22-23pp); DFA peaks at the frozen baseline (k=0) and
  declines once any deep block is trained; FA shows partial/no net depth utility.
- Cross-checks reproduce existing anchors (BP 0.617, DFA 0.301, FA 0.402, frozen 0.349).
- frozen_init_identity_check quantifies frozen stack as a near-norm-preserving
  random feature map (per-block ||f||/||h||~0.10, stack cos 0.981), explaining the
  above-chance k=0 rung.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 experiments/plot_depth_ladder.py | 63 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 experiments/plot_depth_ladder.py

(limited to 'experiments/plot_depth_ladder.py')

diff --git a/experiments/plot_depth_ladder.py b/experiments/plot_depth_ladder.py
new file mode 100644
index 0000000..a5709bf
--- /dev/null
+++ b/experiments/plot_depth_ladder.py
@@ -0,0 +1,63 @@
+"""
+Plot the depth-utility ladder: test accuracy vs number of trainable blocks k,
+one curve per method (BP / FA / DFA), one panel per architecture.
+
+Usage:
+    python experiments/plot_depth_ladder.py
+"""
+import os, sys, json
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+
+CONFIGS = [  # one entry per panel: (results JSON path, panel title, L handed to agg())
+    ('results/depth_ladder/ladder_d256_L4_cifar10.json', 'ResMLP d=256, L=4', 4),
+    ('results/depth_ladder/ladder_d512_L2_cifar10.json', 'ResMLP d=512, L=2', 2),
+]
+METHODS = [('bp', 'BP', 'tab:green', 'o'),  # (key under the JSON 'results', legend label, colour, marker)
+           ('fa', 'FA', 'tab:orange', 's'),
+           ('dfa', 'DFA', 'tab:red', '^')]
+
+
+def agg(path, L):
+    """Map each METHODS key to (k values, mean final_acc, sample std) for k = 0..L, read from path."""
+    with open(path, encoding='utf-8') as fh:  # closes the handle; a bare open() left it dangling
+        results = json.load(fh)['results']
+    out = {}
+    for m, *_ in METHODS:
+        # results[m][str(k)] is a dict of runs (presumably one per seed), each carrying 'final_acc'
+        runs = [[v['final_acc'] for v in results[m][str(k)].values()] for k in range(L + 1)]
+        sd = [np.std(a, ddof=1) if len(a) > 1 else 0.0 for a in runs]  # ddof=1 needs >= 2 runs
+        out[m] = (np.array(range(L + 1)), np.array([np.mean(a) for a in runs]), np.array(sd))
+    return out
+
+
+def main():
+    """Render one panel per CONFIGS entry, one errorbar curve per METHODS entry, then save the PNG."""
+    # squeeze=False always yields a 2-D Axes array, so a single config needs no special case
+    fig, axes = plt.subplots(1, len(CONFIGS), figsize=(11, 4.2), squeeze=False)
+    for panel, (json_path, panel_title, depth) in zip(axes.ravel(), CONFIGS):
+        curves = agg(json_path, depth)
+        for key, legend_name, colour, marker in METHODS:
+            k_vals, mean_acc, std_acc = curves[key]
+            panel.errorbar(k_vals, mean_acc, yerr=std_acc, marker=marker, color=colour,
+                           label=legend_name, capsize=3, lw=2, ms=7)
+        # dotted guide at 0.10, labelled 'chance' (the k=0 rung is a plotted point, not this line);
+        panel.axhline(0.10, ls=':', color='gray', lw=1)
+        # the y-axis transform places the label at an axes-fraction x and a data-unit y
+        panel.text(0.02, 0.105, 'chance', color='gray', fontsize=8, transform=panel.get_yaxis_transform())
+        panel.set_xlabel('trainable blocks $k$ (last $k$ of $L$)')
+        panel.set_ylabel('CIFAR-10 test accuracy')
+        panel.set_title(panel_title)
+        panel.set_xticks(range(depth + 1))
+        panel.grid(alpha=0.3)
+        panel.legend(loc='center right')
+    fig.suptitle('Depth-utility ladder: does training deeper blocks raise accuracy?', y=1.02)
+    out = 'results/depth_ladder/depth_ladder.png'
+    fig.tight_layout(); fig.savefig(out, dpi=150, bbox_inches='tight')
+    print(f"Saved -> {out}")
+
+
+if __name__ == '__main__':  # render the figure only when executed as a script, not on import
+    main()
-- 
cgit v1.2.3