From d1c22697a99c894f07db972acb5a1a9229b0276a Mon Sep 17 00:00:00 2001 From: YurenHao0426 Date: Wed, 8 Apr 2026 20:17:43 -0500 Subject: paper v2.35: add Figure 2 - cross-method cos-vs-accuracy dissociation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User said "you don't need to worry about page count for now", which freed up the page budget for substantive additions. Highest-yield substantive addition: a visual figure for the §4 ¶4 cross-method dissociation that the user previously flagged as the paper's strongest new observation but is currently text-only. New figure: paper/figures/fig_cos_acc_dissociation.pdf - Parallel-coordinates / slope-chart style - 4 columns: deep cos | accuracy | |nudging| | training-loss decrease - 3 lines: SB+pen (blue), CB+pen (red), DFA+pen (gray) - Each metric normalized to [0, 1] with raw values annotated - Shaded "cos: CB top" region on the left vs labeled "accuracy / nudging / training-loss: SB top" on the right - The X-pattern between cos and accuracy makes the dissociation visually immediate: SB rises from middle (cos) to top (functional), CB falls from top (cos) to tied with DFA (functional) Inserted between §4 ¶4 (Mode 2 mechanism) and §5 (intervention). Referenced from the §4 ¶4 functional measurements paragraph as "Figure 2". Why this figure replaces the prose-only argument's burden of proof: the X-pattern visualization is a single glance vs paragraph parsing. Reviewers will see "deep cosine ranks differently from 3 functional metrics" without needing to track the numbers. Important design choice: did NOT include deep ρ in the figure, even though it's in §4 ¶2, because ρ ranks CB > SB > DFA (same as cos), not the SB > CB > DFA pattern of the functional metrics. ρ groups with cos as a "directional alignment" metric, while the functional triad (accuracy, nudging, training-loss) groups around forward-state usefulness. 
The figure caption notes this distinction implicitly by listing only the three functional metrics. Page impact: total 18 → 19 pages, main content §1-§7 now spans p1-p10 (was p1-p9). Per user's relaxed constraint, page count is no longer the binding constraint. Figure auto-shifts the figure numbering: cos_acc_dissoc is now Figure 2, temporal_cross_arch becomes Figure 3, penalty_rescue → Figure 4, cross_arch_summary → Figure 5. All figure references use \ref{} so they auto-update. Co-Authored-By: Claude Opus 4.6 (1M context) --- paper/figures/fig_cos_acc_dissociation.pdf | Bin 0 -> 30309 bytes paper/figures/render_fig_cos_acc_dissociation.py | 93 +++++++++++++++++++++++ paper/main.pdf | Bin 504573 -> 530297 bytes paper/main.tex | 9 ++- 4 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 paper/figures/fig_cos_acc_dissociation.pdf create mode 100644 paper/figures/render_fig_cos_acc_dissociation.py diff --git a/paper/figures/fig_cos_acc_dissociation.pdf b/paper/figures/fig_cos_acc_dissociation.pdf new file mode 100644 index 0000000..41db52e Binary files /dev/null and b/paper/figures/fig_cos_acc_dissociation.pdf differ diff --git a/paper/figures/render_fig_cos_acc_dissociation.py b/paper/figures/render_fig_cos_acc_dissociation.py new file mode 100644 index 0000000..fff7f65 --- /dev/null +++ b/paper/figures/render_fig_cos_acc_dissociation.py @@ -0,0 +1,93 @@ +"""Render Figure: cos-vs-accuracy cross-method dissociation. + +Shows the v2.33 finding: under matched penalty rescue (lam=1e-2, 30ep, 3 seeds) +on the audited 4-block d=256 ResMLP, three independent functional metrics +(headline accuracy, single-step nudging, integrated training-loss decrease) +all rank SB ≫ CB ≈ DFA, while deep cosine ranks CB > SB > DFA — the only +ordering that disagrees with the functional ranking. 
+ +Sources (all 3-seed): + results/round38_sb_penalty_30ep_s{42,123,456}/results_cifar10.json + results/round38_cb_penalty_30ep_s{42,123,456}/results_cifar10.json + results/round41_dfa_penalty_30ep{,_s{123,456}}/results_cifar10.json + results/nudging_test_3seed_summary.json + results/training_loss_decrease_3seed.json +""" +import os +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import numpy as np + +REPO_ROOT = "/home/yurenh2/fa" + +# Three-seed values from the saved JSONs (cross-checked against §4 ¶4 prose) +methods = ["SB+pen", "CB+pen", "DFA+pen"] +colors = {"SB+pen": "#1f77b4", "CB+pen": "#d62728", "DFA+pen": "#7f7f7f"} + +# Each entry: (raw values per method, with std if available) +# §4 ¶4 lists the three functional metrics as accuracy, nudging, training-loss +# trajectory. Deep ρ is intentionally NOT shown here because ρ ranks CB > SB > DFA +# (same as cos), not SB > CB > DFA — ρ groups with cos as a "directional alignment" +# metric, while the functional triad below groups around forward-state usefulness. +metrics = { + "deep cos": [0.322, 0.679, 0.151], + "accuracy": [0.453, 0.360, 0.360], + "|nudging|": [1.929e-3, 4.264e-4, 4.978e-5], + "loss decrease": [0.447, 0.121, 0.095], +} +metric_stds = { + "deep cos": [0.007, 0.008, 0.025], + "accuracy": [0.003, 0.003, 0.001], + "|nudging|": [0.113e-3, 0.024e-3, 0.0044e-3], + "loss decrease": [0.008, 0.003, 0.007], +} + +# Normalize each metric to [0, 1] where 1 = max across the 3 methods. +# This makes the parallel-coordinates lines comparable. 
+metric_names = list(metrics.keys()) +norm = {} +for m, vals in metrics.items(): + mx = max(vals) + norm[m] = [v / mx for v in vals] + +fig, ax = plt.subplots(figsize=(6.0, 3.4)) + +x = np.arange(len(metric_names)) + +for i, method in enumerate(methods): + y = [norm[m][i] for m in metric_names] + ax.plot(x, y, "o-", color=colors[method], lw=2.2, markersize=9, label=method) + # Annotate each point with the raw value + for xi, yi, m in zip(x, y, metric_names): + raw = metrics[m][i] + if "nudg" in m: + label = f"{raw*1e3:.2f}e-3" + elif "cos" in m: + label = f"+{raw:.3f}" if raw >= 0 else f"{raw:.3f}" + else: + label = f"{raw:.3f}" + # Place label slightly offset based on method ordering at this x + ax.annotate(label, (xi, yi), textcoords="offset points", + xytext=(8, 0), fontsize=7, color=colors[method], + ha="left", va="center") + +ax.set_xticks(x) +ax.set_xticklabels(metric_names, fontsize=9) +ax.set_ylabel("normalized score (max = 1 across the 3 methods)", fontsize=9) +ax.set_ylim(-0.05, 1.18) +ax.set_title("Cross-method functional dissociation (3 seeds, 30 ep, $\\lambda{=}10^{-2}$)\n" + "all 3 functional metrics rank SB $\\gg$ CB $\\approx$ DFA; deep cos is the only one that disagrees", + fontsize=9) +ax.legend(loc="upper right", fontsize=8, framealpha=0.95) +ax.grid(True, axis="y", alpha=0.3) + +# Visual guide: shade the "cos column disagrees" region +ax.axvspan(-0.4, 0.4, color="#fff3e0", alpha=0.5, zorder=0) +ax.text(0, 1.13, "cos: CB top", ha="center", fontsize=7, color="#cc4400", style="italic") +ax.text(2.5, 1.13, "accuracy / nudging / training-loss decrease: SB top", ha="center", fontsize=7, color="#1f5f9f", style="italic") + +plt.tight_layout() +out = os.path.join(REPO_ROOT, "paper/figures/fig_cos_acc_dissociation.pdf") +plt.savefig(out, bbox_inches="tight", dpi=200) +print(f"Saved {out}") diff --git a/paper/main.pdf b/paper/main.pdf index f6dac9a..611c97e 100644 Binary files a/paper/main.pdf and b/paper/main.pdf differ diff --git a/paper/main.tex 
b/paper/main.tex index 7b125e4..90a06b3 100644 --- a/paper/main.tex +++ b/paper/main.tex @@ -98,7 +98,14 @@ A second metric with different numerical failure modes tells the same story. Cos Per-layer reporting is therefore not cosmetic. In ResMLP under vanilla DFA, the headline aggregate alignment $\Gamma \approx 0.07$--$0.10$ can look mildly positive only because layer $0$ remains strongly aligned while the deep network is not: at the same epoch-1 checkpoints where layers $1$--$4$ are essentially zero, layer $0$ has cosine $+0.42$, $+0.44$, and $+0.42$ across seeds (Table~\ref{tab:mode_validation}; per-seed values in Appendix~\ref{app:layer0_dominance}). The resulting average can therefore be driven by the embedding layer even when the interior blocks are effectively unaligned, so aggregate reporting obscures the very distinction needed to separate ``measurement collapse'' from ``poor credit direction.'' This layer-$0$ dominance is specific to the ResMLP DFA setting; on ViT-Mini DFA, all layers are near zero, which strengthens the broader methodological point that alignment should be reported per layer rather than only in aggregate. With the two modes separated observationally, the remaining question is whether intervention can move them independently. -Mode~2 has method-dependent severity within the audited fixed-feedback family once Mode~1 is alleviated. Applying the same $\lambda{=}10^{-2}$ scale-control penalty to SB, CB, and DFA on the audited 4-block $d{=}256$ ResMLP for $30$ epochs (three seeds) gives, in order, test accuracies $0.453 \pm 0.003$, $0.360 \pm 0.003$, $0.360 \pm 0.001$ and deep mean cosines $+0.322 \pm 0.007$, $+0.679 \pm 0.008$, $+0.151 \pm 0.025$ (deep mean $\rho$ $+0.402$, $+0.464$, $+0.080$ and full $\|h_L\|/\|g_L\|$ in Appendix~\ref{app:sb_penalty}), all in the meaningful-measurement regime. 
SB+penalty is the first audited non-BP method whose trained deep blocks beat the frozen-blocks baseline ($0.349$), by $+10.4$ pp---comparable to BP+penalty's $+18.3$ pp. Within this rescued regime the three methods reveal a clean cosine-versus-accuracy dissociation, and two independent functional measurements rule out the interpretation that cosine is just noisy. \emph{Nudging:} a single step $\eta{=}0.01$ along each method's per-layer credit $a_l$ at the converged checkpoint changes the deep-block test loss by $-1.93 \pm 0.11 \times 10^{-3}$ (SB+pen), $-4.26 \pm 0.24 \times 10^{-4}$ (CB+pen), and $-4.98 \pm 0.44 \times 10^{-5}$ (DFA+pen) across three seeds (per-seed values in Appendix~\ref{app:sb_penalty}): SB moves the loss $\approx\!4.5\times$ more than CB and $\approx\!39\times$ more than DFA, even though CB has the highest deep cosine with BP. \emph{Training-loss trajectory:} the integrated 30-epoch training loss decrease across three seeds ranks SB ($-0.447 \pm 0.008$) $\gg$ CB ($-0.121 \pm 0.003$) $\approx$ DFA ($-0.095 \pm 0.007$). All three functional metrics (accuracy, nudging, training-loss trajectory) agree on SB $\gg$ CB $\approx$ DFA; the deep-cosine ordering CB $>$ SB $>$ DFA is the only one that disagrees. We therefore frame the Mode~2 reading as a three-part proposition. \emph{Observation}: CB has $4\times$ DFA's deep cosine yet matches DFA's accuracy, while SB attains the best accuracy with intermediate cosine. \emph{Inference}: layerwise cosine is necessary to rule out grossly wrong credit signals (distinguishing the rescued regime from the clamp-dominated vanilla one) but not sufficient to certify usable credit for depth. 
\emph{Mechanism hypothesis}: usefulness depends on whether the local update induces useful forward-state change across blocks, not merely on the angle to the BP gradient---CB supplies a gradient-direction surrogate that aligns in angle without translating to coordinated forward-state improvement, while SB supplies a state-level teaching signal that preserves aspects of useful credit which cosine does not measure. The same hypothesis casts Mode~1 as a downstream symptom of Mode~2 rather than a parallel failure: when $a_l$ cannot drive useful per-block forward-state change, the only easy way to increase $\langle f_l, a_l\rangle$ is to inflate $\|f_l\|$ along the cheap random direction set by $a_l$, producing Mode~1(a) growth and (via terminal LN) Mode~1(b) collapse; the per-block penalty then breaks the chain by capping $\|f_l\|$ without fixing credit quality, consistent with the observed asymmetry that it alleviates Mode~1 fully but only partially fixes Mode~2. We state this as a hypothesis because we have measured the angle-to-accuracy gap and two functional proxies but not the full per-block forward-state-change content; the reporting rule that follows is robust to either interpretation. This cross-method dissociation strengthens the methodological point that alignment must be reported jointly with measurement validity and a depth-utilization baseline rather than as a single headline number. +Mode~2 has method-dependent severity within the audited fixed-feedback family once Mode~1 is alleviated. Applying the same $\lambda{=}10^{-2}$ scale-control penalty to SB, CB, and DFA on the audited 4-block $d{=}256$ ResMLP for $30$ epochs (three seeds) gives, in order, test accuracies $0.453 \pm 0.003$, $0.360 \pm 0.003$, $0.360 \pm 0.001$ and deep mean cosines $+0.322 \pm 0.007$, $+0.679 \pm 0.008$, $+0.151 \pm 0.025$ (deep mean $\rho$ $+0.402$, $+0.464$, $+0.080$ and full $\|h_L\|/\|g_L\|$ in Appendix~\ref{app:sb_penalty}), all in the meaningful-measurement regime. 
SB+penalty is the first audited non-BP method whose trained deep blocks beat the frozen-blocks baseline ($0.349$), by $+10.4$ pp---comparable to BP+penalty's $+18.3$ pp. Within this rescued regime the three methods reveal a clean cosine-versus-accuracy dissociation, and two independent functional measurements rule out the interpretation that cosine is just noisy. \emph{Nudging:} a single step $\eta{=}0.01$ along each method's per-layer credit $a_l$ at the converged checkpoint changes the deep-block test loss by $(-1.93 \pm 0.11) \times 10^{-3}$ (SB+pen), $(-4.26 \pm 0.24) \times 10^{-4}$ (CB+pen), and $(-4.98 \pm 0.44) \times 10^{-5}$ (DFA+pen) across three seeds (per-seed values in Appendix~\ref{app:sb_penalty}): SB moves the loss $\approx\!4.5\times$ more than CB and $\approx\!39\times$ more than DFA, even though CB has the highest deep cosine with BP. \emph{Training-loss trajectory:} the integrated 30-epoch training loss decrease across three seeds ranks SB ($-0.447 \pm 0.008$) $\gg$ CB ($-0.121 \pm 0.003$) $\approx$ DFA ($-0.095 \pm 0.007$). All three functional metrics (accuracy, nudging, training-loss trajectory) agree on SB $\gg$ CB $\approx$ DFA; the deep-cosine ordering CB $>$ SB $>$ DFA is the only one that disagrees (Figure~\ref{fig:cos_acc_dissoc}). We therefore frame the Mode~2 reading as a three-part proposition. \emph{Observation}: CB has $4\times$ DFA's deep cosine yet matches DFA's accuracy, while SB attains the best accuracy with intermediate cosine. \emph{Inference}: layerwise cosine is necessary to rule out grossly wrong credit signals (distinguishing the rescued regime from the clamp-dominated vanilla one) but not sufficient to certify usable credit for depth. 
\emph{Mechanism hypothesis}: usefulness depends on whether the local update induces useful forward-state change across blocks, not merely on the angle to the BP gradient---CB supplies a gradient-direction surrogate that aligns in angle without translating to coordinated forward-state improvement, while SB supplies a state-level teaching signal that preserves aspects of useful credit which cosine does not measure. The same hypothesis casts Mode~1 as a downstream symptom of Mode~2 rather than a parallel failure: when $a_l$ cannot drive useful per-block forward-state change, the only easy way to increase $\langle f_l, a_l\rangle$ is to inflate $\|f_l\|$ along the cheap random direction set by $a_l$, producing Mode~1(a) growth and (via terminal LN) Mode~1(b) collapse; the per-block penalty then breaks the chain by capping $\|f_l\|$ without fixing credit quality, consistent with the observed asymmetry that it alleviates Mode~1 fully but only partially fixes Mode~2. We state this as a hypothesis because we have measured the angle-to-accuracy gap and two functional proxies but not the full per-block forward-state-change content; the reporting rule that follows is robust to either interpretation. This cross-method dissociation strengthens the methodological point that alignment must be reported jointly with measurement validity and a depth-utilization baseline rather than as a single headline number. + +\begin{figure}[t] +\centering +\includegraphics[width=0.85\linewidth]{figures/fig_cos_acc_dissociation.pdf} +\caption{Cross-method functional dissociation under matched penalty rescue ($\lambda{=}10^{-2}$, $30$ epochs, $3$ seeds, 4-block $d{=}256$ pre-LayerNorm ResMLP). Each line tracks one method across four metrics, normalized so that the maximum across methods equals $1.0$ in each column; raw values are annotated. 
Deep cosine to the BP gradient ranks the three methods $\mathrm{CB}{>}\mathrm{SB}{>}\mathrm{DFA}$, but the three functional metrics (test accuracy, single-step nudging-test loss decrease, and integrated 30-epoch training-loss decrease) all rank them $\mathrm{SB}{\gg}\mathrm{CB}{\approx}\mathrm{DFA}$. The X-pattern between deep cos and accuracy is the cross-method cos-versus-accuracy dissociation: SB rises from middle (cos) to top (functional), CB drops from top (cos) to tied with DFA (functional). Of the three non-accuracy metrics, deep cosine is the only one whose method ranking disagrees with the accuracy ranking.} +\label{fig:cos_acc_dissoc} +\end{figure} \section{Intervention and Cross-Architecture Evidence} \label{sec:validation} -- cgit v1.2.3