""" Phase 10A.6: Structured vs Semantic Auxiliary Dissection. Core question: What kind of structure in the auxiliary signal actually drives the blend gain observed in Phase 10A? 9 branches from the same DFA checkpoint at t0=5: 1. continue_DFA — pure DFA baseline 2. blend_random_trainable — random Vec, trained online (same as 10A.5) 3. blend_shuffled_trainable — Vec trained but targets shuffled within batch 4. blend_zero_target — Vec trained with ||a_aux||^2 loss (learns zero) 5. blend_fresh_random_target — Vec trained with fresh i.i.d. random targets each step 6. blend_time_only — auxiliary only sees t=l/L (no h_l) 7. blend_constant_input — auxiliary learns a fixed per-block template (no input) 8. blend_prefit60_frozen — Vec prefit for 60 epochs, then FROZEN 9. blend_prefit60_trainable — Vec prefit for 60 epochs, then continue training """ import os import sys import json import argparse import numpy as np import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim from torch.utils.data import DataLoader import torchvision import torchvision.transforms as transforms import copy sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from models.residual_mlp import ResidualMLP from models.value_net import SinusoidalTimeEmbed from metrics.credit_metrics import cosine_similarity_batch, perturbation_correlation # --------------------------------------------------------------------------- # Auxiliary network architectures # --------------------------------------------------------------------------- class VectorCreditNet(nn.Module): """Standard Vec: takes (h, t, s) and outputs d_hidden credit vector.""" def __init__(self, d_hidden, s_dim, time_embed_dim=32, hidden_dim=256, num_layers=3): super().__init__() self.ln = nn.LayerNorm(d_hidden) self.time_embed = SinusoidalTimeEmbed(time_embed_dim) input_dim = d_hidden + time_embed_dim + s_dim layers = [] for i in range(num_layers): in_d = input_dim if i == 0 else hidden_dim layers.append(nn.Linear(in_d, hidden_dim)) layers.append(nn.GELU()) layers.append(nn.Linear(hidden_dim, d_hidden)) self.net = nn.Sequential(*layers) def forward(self, h, t, s): return self.net(torch.cat([self.ln(h), self.time_embed(t), s], dim=-1)) class TimeOnlyNet(nn.Module): """Auxiliary that only takes t=l/L as input (no h_l, no s). Shared MLP on sinusoidal time embedding -> d_hidden. """ def __init__(self, d_hidden, time_embed_dim=32, hidden_dim=256, num_layers=3): super().__init__() self.time_embed = SinusoidalTimeEmbed(time_embed_dim) layers = [] for i in range(num_layers): in_d = time_embed_dim if i == 0 else hidden_dim layers.append(nn.Linear(in_d, hidden_dim)) layers.append(nn.GELU()) layers.append(nn.Linear(hidden_dim, d_hidden)) self.net = nn.Sequential(*layers) def forward(self, h, t, s): # h and s are ignored — only t is used return self.net(self.time_embed(t)) class ConstantNet(nn.Module): """Auxiliary with NO input at all: learns a per-block fixed output template. Each block has an nn.Parameter of shape (d_hidden,). forward(h, t, s) just expands the per-block param to match batch size. Block index is passed as an integer via a separate call to set_block(). """ def __init__(self, d_hidden, num_blocks): super().__init__() # One template per block self.templates = nn.ParameterList( [nn.Parameter(torch.zeros(d_hidden)) for _ in range(num_blocks)] ) self._block_idx = 0 def set_block(self, l): self._block_idx = l def forward(self, h, t, s): batch = h.size(0) return self.templates[self._block_idx].unsqueeze(0).expand(batch, -1) # --------------------------------------------------------------------------- # Data # --------------------------------------------------------------------------- def get_cifar10(batch_size=128): transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))]) trainset = torchvision.datasets.CIFAR10( root='./data', train=True, download=True, transform=transform_train) testset = torchvision.datasets.CIFAR10( root='./data', train=False, download=True, transform=transform_test) return (DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True), DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)) # --------------------------------------------------------------------------- # Evaluation helpers # --------------------------------------------------------------------------- def evaluate(model, test_loader, device): model.eval(); c, t = 0, 0 with torch.no_grad(): for x, y in test_loader: x = x.view(x.size(0), -1).to(device); y = y.to(device) c += (model(x).argmax(1) == y).sum().item(); t += x.size(0) return c / t def compute_diagnostics(model, aux_net, Bs, test_loader, device, credit_mode, alpha=0.75, const_net_L=None): """Compute mean Gamma (BP cosine) and mean rho (perturbation correlation).""" model.eval() if aux_net is not None: aux_net.eval() L = model.num_blocks # Get one batch for x, y in test_loader: x = x.view(x.size(0), -1).to(device); y = y.to(device); break batch = x.size(0) # BP pass to get hidden gradients was_frozen = not next(model.parameters()).requires_grad if was_frozen: for p in model.parameters(): p.requires_grad_(True) model.zero_grad() lo, hbp = model(x, return_hidden=True) for l in range(L + 1): hbp[l].retain_grad() F.cross_entropy(lo, y).backward() bp = {l: hbp[l].grad.detach().clone() for l in range(L + 1)} if was_frozen: for p in model.parameters(): p.requires_grad_(False) with torch.no_grad(): lo2, hi = model(x, return_hidden=True) eT = lo2.softmax(-1); eT[torch.arange(batch), y] -= 1; s = eT.detach() gammas, rhos = [], [] for l in range(L): h_l = hi[l].detach() t_l = torch.full((batch,), l / L, device=device) if credit_mode == 'dfa': a_l = (s @ Bs[l].T).detach() elif credit_mode == 'blend' and aux_net is not None: a_dfa = (s @ Bs[l].T).detach() if isinstance(aux_net, ConstantNet): aux_net.set_block(l) a_aux = aux_net(h_l, t_l, s).detach() rd = (a_dfa ** 2).mean(-1, keepdim=True).sqrt() + 1e-6 rv = (a_aux ** 2).mean(-1, keepdim=True).sqrt() + 1e-6 a_l = alpha * a_aux / rv + (1 - alpha) * a_dfa / rd else: a_l = (s @ Bs[l].T).detach() gammas.append(cosine_similarity_batch(a_l, bp[l])) def make_fwd(sl): def f(h): with torch.no_grad(): c = h for i in range(sl, L): c = c + model.blocks[i](c) return F.cross_entropy( model.out_head(model.out_ln(c)), y, reduction='none') return f rhos.append(perturbation_correlation(h_l, a_l, make_fwd(l), epsilon=1e-3, M=16)) return float(np.mean(gammas)), float(np.mean(rhos)) # --------------------------------------------------------------------------- # DFA training + checkpoint # --------------------------------------------------------------------------- def train_dfa_get_checkpoint(model, train_loader, test_loader, device, total_epochs, t0, lr, wd): d = model.d_hidden; L = model.num_blocks Bs = [torch.randn(d, 10, device=device) / np.sqrt(10) for _ in range(L)] block_opts = [optim.AdamW(b.parameters(), lr=lr, weight_decay=wd) for b in model.blocks] embed_opt = optim.AdamW(model.embed.parameters(), lr=lr, weight_decay=wd) head_opt = optim.AdamW( list(model.out_head.parameters()) + list(model.out_ln.parameters()), lr=lr, weight_decay=wd) scheds = ([optim.lr_scheduler.CosineAnnealingLR(o, T_max=total_epochs) for o in block_opts] + [optim.lr_scheduler.CosineAnnealingLR(embed_opt, T_max=total_epochs), optim.lr_scheduler.CosineAnnealingLR(head_opt, T_max=total_epochs)]) ckpt = None for epoch in range(1, total_epochs + 1): model.train(); tl, c, t = 0, 0, 0 for x, y in train_loader: x = x.view(x.size(0), -1).to(device); y = y.to(device); b = x.size(0) with torch.no_grad(): lo, hi = model(x, return_hidden=True); lv = F.cross_entropy(lo, y) eT = lo.softmax(-1); eT[torch.arange(b), y] -= 1 hL = hi[-1].detach() lo2 = F.cross_entropy(model.out_head(model.out_ln(hL)), y) head_opt.zero_grad(); lo2.backward(); head_opt.step() for l in range(L): a = (eT @ Bs[l].T).detach() rm = (a ** 2).mean(-1, keepdim=True).sqrt() + 1e-6 f = model.blocks[l](hi[l].detach()) ll = (f * (a / rm)).sum(-1).mean() block_opts[l].zero_grad(); ll.backward() torch.nn.utils.clip_grad_norm_(model.blocks[l].parameters(), 1.0) block_opts[l].step() a0 = (eT @ Bs[0].T).detach() r0 = (a0 ** 2).mean(-1, keepdim=True).sqrt() + 1e-6 el = (model.embed(x) * (a0 / r0)).sum(-1).mean() embed_opt.zero_grad(); el.backward(); embed_opt.step() tl += lv.item() * b; c += (lo.argmax(1) == y).sum().item(); t += b for s in scheds: s.step() if epoch == t0: acc = evaluate(model, test_loader, device) ckpt = {'model': copy.deepcopy(model.state_dict()), 'Bs': [B.clone() for B in Bs], 'acc': acc} print(f" [DFA] Checkpoint at epoch {t0}: acc={acc:.4f}") if epoch % 10 == 0: print(f" [DFA] Epoch {epoch}: acc={evaluate(model, test_loader, device):.4f}") return Bs, ckpt # --------------------------------------------------------------------------- # Offline Vec prefit # --------------------------------------------------------------------------- def offline_fit_vec(vec_net, model, Bs, train_loader, device, epochs, lr_fb, M, eps_pert=1e-3): """Pre-train VectorCreditNet using perturbation targets (no forward net training).""" L = model.num_blocks vec_opt = optim.Adam(vec_net.parameters(), lr=lr_fb) model.eval() print(f" [prefit] Starting Vec prefit for {epochs} epochs...") for epoch in range(1, epochs + 1): vec_net.train(); total_loss = 0.0; nb = 0 for x, y in train_loader: x = x.view(x.size(0), -1).to(device); y = y.to(device); batch = x.size(0) with torch.no_grad(): lo, hi = model(x, return_hidden=True) eT = lo.softmax(-1); eT[torch.arange(batch), y] -= 1; s = eT.detach() hL = hi[-1].detach() # Terminal anchor t_L = torch.ones(batch, device=device) a_term = vec_net(hL, t_L, s) hL_req = hL.clone().requires_grad_(True) ce = F.cross_entropy(model.out_head(model.out_ln(hL_req)), y, reduction='sum') dL = torch.autograd.grad(ce, hL_req)[0].detach() loss_term = ((a_term - dL) ** 2).sum(-1).mean() # Perturbation loss at a random layer lt = np.random.randint(0, L) h_l = hi[lt].detach() t_l = torch.full((batch,), lt / L, device=device) a_l = vec_net(h_l, t_l, s) lp2 = torch.tensor(0.0, device=device) for _ in range(M): v = torch.randn_like(h_l) v = v / (v.norm(-1, keepdim=True) + 1e-8) with torch.no_grad(): lp = F.cross_entropy( model.forward_from_layer(h_l + eps_pert * v, lt), y, reduction='none') lm = F.cross_entropy( model.forward_from_layer(h_l - eps_pert * v, lt), y, reduction='none') gj = (lp - lm) / (2 * eps_pert) lp2 = lp2 + (((a_l * v).sum(-1) - gj.detach()) ** 2).mean() lp2 /= M vl = loss_term + lp2 vec_opt.zero_grad(); vl.backward() torch.nn.utils.clip_grad_norm_(vec_net.parameters(), 1.0) vec_opt.step() total_loss += vl.item(); nb += 1 if epoch % 10 == 0 or epoch == epochs: print(f" [prefit] Epoch {epoch}/{epochs}: loss={total_loss/nb:.4f}") vec_net.eval() return vec_net # --------------------------------------------------------------------------- # Estimate shuffled-target RMS (for fresh_random sigma matching) # --------------------------------------------------------------------------- def estimate_shuffled_rms(model, Bs, train_loader, device, M=4, eps_pert=1e-3, n_batches=5): """Estimate RMS of shuffled perturbation targets to match for fresh_random branch.""" model.eval(); L = model.num_blocks rms_vals = [] for i, (x, y) in enumerate(train_loader): if i >= n_batches: break x = x.view(x.size(0), -1).to(device); y = y.to(device); batch = x.size(0) with torch.no_grad(): lo, hi = model(x, return_hidden=True) lt = np.random.randint(0, L) h_l = hi[lt].detach() for _ in range(M): v = torch.randn_like(h_l) v = v / (v.norm(-1, keepdim=True) + 1e-8) with torch.no_grad(): lp = F.cross_entropy( model.forward_from_layer(h_l + eps_pert * v, lt), y, reduction='none') lm = F.cross_entropy( model.forward_from_layer(h_l - eps_pert * v, lt), y, reduction='none') gj = (lp - lm) / (2 * eps_pert) # Shuffled gj_sh = gj[torch.randperm(batch, device=device)] rms_vals.append(gj_sh.pow(2).mean().sqrt().item()) return float(np.mean(rms_vals)) if rms_vals else 0.01 # --------------------------------------------------------------------------- # Branch runner # --------------------------------------------------------------------------- def run_branch(model, aux_net, Bs, train_loader, test_loader, device, t0, total_epochs, branch_type, alpha, lr, lr_fb, wd, M, branch_name='', fresh_sigma=0.01): """ Run a training branch from a loaded checkpoint. branch_type options: 'dfa' — pure DFA 'blend_trainable' — blend with Vec trained online (perturbation targets) 'blend_shuffled' — blend with Vec trained on shuffled targets 'blend_zero_target' — blend with Vec trained toward zero (||a_aux||^2) 'blend_fresh_random' — blend with Vec trained on fresh random targets each step 'blend_time_only' — blend with TimeOnlyNet trained online 'blend_constant' — blend with ConstantNet trained online 'blend_frozen' — blend with pre-fit Vec, frozen 'blend_prefit_trainable'— blend with pre-fit Vec, continue training """ d = model.d_hidden; L = model.num_blocks; eps_pert = 1e-3 # Determine which aux nets need training trainable_types = {'blend_trainable', 'blend_shuffled', 'blend_zero_target', 'blend_fresh_random', 'blend_time_only', 'blend_constant', 'blend_prefit_trainable'} aux_trained = (branch_type in trainable_types) and (aux_net is not None) block_opts = [optim.AdamW(b.parameters(), lr=lr, weight_decay=wd) for b in model.blocks] embed_opt = optim.AdamW(model.embed.parameters(), lr=lr, weight_decay=wd) head_opt = optim.AdamW( list(model.out_head.parameters()) + list(model.out_ln.parameters()), lr=lr, weight_decay=wd) aux_opt = optim.Adam(aux_net.parameters(), lr=lr_fb) if aux_trained else None scheds = ([optim.lr_scheduler.CosineAnnealingLR(o, T_max=total_epochs) for o in block_opts] + [optim.lr_scheduler.CosineAnnealingLR(embed_opt, T_max=total_epochs), optim.lr_scheduler.CosineAnnealingLR(head_opt, T_max=total_epochs)]) # Advance schedulers to match checkpoint epoch for _ in range(t0): for s in scheds: s.step() log = {'test_acc': [], 'train_loss': [], 'gamma': [], 'rho': [], 'alpha_eff': []} diag_epochs = set( list(range(t0 + 1, min(t0 + 6, total_epochs + 1))) + [t0 + 8, t0 + 10, t0 + 15, t0 + 20] + list(range(t0 + 10, total_epochs + 1, 10)) + [total_epochs]) for epoch in range(t0 + 1, total_epochs + 1): model.train() if aux_net is not None and aux_trained: aux_net.train() elif aux_net is not None: aux_net.eval() tl, c, t = 0, 0, 0 epoch_aux_norms, epoch_dfa_norms = [], [] for x, y in train_loader: x = x.view(x.size(0), -1).to(device); y = y.to(device); batch = x.size(0) with torch.no_grad(): lo, hi = model(x, return_hidden=True); lv = F.cross_entropy(lo, y) eT = lo.softmax(-1); eT[torch.arange(batch), y] -= 1; s = eT.detach() hL = hi[-1].detach() # ---------------------------------------------------------------- # Train auxiliary network (if applicable) # ---------------------------------------------------------------- if aux_opt is not None: if branch_type in ('blend_trainable', 'blend_prefit_trainable', 'blend_time_only'): # Standard perturbation targets t_L = torch.ones(batch, device=device) a_term = aux_net(hL, t_L, s) hL_req = hL.clone().requires_grad_(True) ce = F.cross_entropy( model.out_head(model.out_ln(hL_req)), y, reduction='sum') dL = torch.autograd.grad(ce, hL_req)[0].detach() loss_term = ((a_term - dL) ** 2).sum(-1).mean() lt = np.random.randint(0, L) h_l = hi[lt].detach() t_l = torch.full((batch,), lt / L, device=device) a_l = aux_net(h_l, t_l, s) lp2 = torch.tensor(0.0, device=device) for _ in range(M): v = torch.randn_like(h_l) v = v / (v.norm(-1, keepdim=True) + 1e-8) with torch.no_grad(): lp = F.cross_entropy( model.forward_from_layer(h_l + eps_pert * v, lt), y, reduction='none') lm = F.cross_entropy( model.forward_from_layer(h_l - eps_pert * v, lt), y, reduction='none') gj = (lp - lm) / (2 * eps_pert) lp2 = lp2 + (((a_l * v).sum(-1) - gj.detach()) ** 2).mean() lp2 /= M vl = loss_term + lp2 elif branch_type == 'blend_shuffled': # Perturbation targets shuffled within batch t_L = torch.ones(batch, device=device) a_term = aux_net(hL, t_L, s) hL_req = hL.clone().requires_grad_(True) ce = F.cross_entropy( model.out_head(model.out_ln(hL_req)), y, reduction='sum') dL = torch.autograd.grad(ce, hL_req)[0].detach() dL_sh = dL[torch.randperm(batch, device=device)] loss_term = ((a_term - dL_sh) ** 2).sum(-1).mean() lt = np.random.randint(0, L) h_l = hi[lt].detach() t_l = torch.full((batch,), lt / L, device=device) a_l = aux_net(h_l, t_l, s) lp2 = torch.tensor(0.0, device=device) for _ in range(M): v = torch.randn_like(h_l) v = v / (v.norm(-1, keepdim=True) + 1e-8) with torch.no_grad(): lp = F.cross_entropy( model.forward_from_layer(h_l + eps_pert * v, lt), y, reduction='none') lm = F.cross_entropy( model.forward_from_layer(h_l - eps_pert * v, lt), y, reduction='none') gj = (lp - lm) / (2 * eps_pert) # Shuffle targets within batch to break semantic correlation gj_sh = gj[torch.randperm(batch, device=device)] lp2 = lp2 + (((a_l * v).sum(-1) - gj_sh.detach()) ** 2).mean() lp2 /= M vl = loss_term + lp2 elif branch_type == 'blend_zero_target': # Minimize ||a_aux||^2 — teaches the network to output zero lt = np.random.randint(0, L) h_l = hi[lt].detach() t_l = torch.full((batch,), lt / L, device=device) if isinstance(aux_net, ConstantNet): aux_net.set_block(lt) a_l = aux_net(h_l, t_l, s) vl = (a_l ** 2).sum(-1).mean() elif branch_type == 'blend_fresh_random': # Fresh i.i.d. random targets sampled every step lt = np.random.randint(0, L) h_l = hi[lt].detach() t_l = torch.full((batch,), lt / L, device=device) a_l = aux_net(h_l, t_l, s) # Sample random target with RMS matched to shuffled scale rand_target = torch.randn(batch, device=device) * fresh_sigma lp2 = torch.tensor(0.0, device=device) for _ in range(M): v = torch.randn_like(h_l) v = v / (v.norm(-1, keepdim=True) + 1e-8) # Fresh random target per direction per step gj_rand = torch.randn(batch, device=device) * fresh_sigma lp2 = lp2 + (((a_l * v).sum(-1) - gj_rand) ** 2).mean() lp2 /= M vl = lp2 elif branch_type == 'blend_constant': # ConstantNet: train per-block templates # Train all blocks this step (cheap since it's just parameters) vl = torch.tensor(0.0, device=device) lt = np.random.randint(0, L) h_l = hi[lt].detach() t_l = torch.full((batch,), lt / L, device=device) aux_net.set_block(lt) a_l = aux_net(h_l, t_l, s) lp2_inner = torch.tensor(0.0, device=device) for _ in range(M): v = torch.randn_like(h_l) v = v / (v.norm(-1, keepdim=True) + 1e-8) with torch.no_grad(): lp = F.cross_entropy( model.forward_from_layer(h_l + eps_pert * v, lt), y, reduction='none') lm = F.cross_entropy( model.forward_from_layer(h_l - eps_pert * v, lt), y, reduction='none') gj = (lp - lm) / (2 * eps_pert) lp2_inner = lp2_inner + ( ((a_l * v).sum(-1) - gj.detach()) ** 2).mean() vl = lp2_inner / M else: vl = None if vl is not None: aux_opt.zero_grad(); vl.backward() torch.nn.utils.clip_grad_norm_(aux_net.parameters(), 1.0) aux_opt.step() # ---------------------------------------------------------------- # Compute credits for each block # ---------------------------------------------------------------- dfa_credits = [(eT @ Bs[l].T).detach() for l in range(L)] credits = [] for l in range(L): a_dfa = dfa_credits[l] rms_d = (a_dfa ** 2).mean(-1, keepdim=True).sqrt() + 1e-6 if branch_type == 'dfa': credits.append(a_dfa / rms_d) else: # All blend branches h_l = hi[l].detach() t_l = torch.full((batch,), l / L, device=device) with torch.no_grad(): if isinstance(aux_net, ConstantNet): aux_net.set_block(l) a_aux = aux_net(h_l, t_l, s).detach() rms_v = (a_aux ** 2).mean(-1, keepdim=True).sqrt() + 1e-6 a_blend = alpha * a_aux / rms_v + (1 - alpha) * a_dfa / rms_d credits.append(a_blend) # Track norms for alpha_eff a_c = credits[-1] if branch_type == 'dfa': epoch_aux_norms.append(0.0) epoch_dfa_norms.append(a_c.norm().item()) else: a_dfa_n = a_dfa / rms_d rms_v2 = (a_aux ** 2).mean(-1, keepdim=True).sqrt() + 1e-6 epoch_aux_norms.append((alpha * a_aux / rms_v2).norm().item()) epoch_dfa_norms.append(((1 - alpha) * a_dfa_n).norm().item()) # ---------------------------------------------------------------- # Update output head (local exact gradient — allowed) # ---------------------------------------------------------------- lo2 = F.cross_entropy(model.out_head(model.out_ln(hL)), y) head_opt.zero_grad(); lo2.backward(); head_opt.step() # ---------------------------------------------------------------- # Update blocks with local surrogate # a_total is already blended; normalize before inner product # ---------------------------------------------------------------- for l in range(L): a = credits[l] rm = (a ** 2).mean(-1, keepdim=True).sqrt() + 1e-6 f = model.blocks[l](hi[l].detach()) ll = (f * (a / rm)).sum(-1).mean() block_opts[l].zero_grad(); ll.backward() torch.nn.utils.clip_grad_norm_(model.blocks[l].parameters(), 1.0) block_opts[l].step() # Update embedding with block-0 credit a0 = credits[0] r0 = (a0 ** 2).mean(-1, keepdim=True).sqrt() + 1e-6 el = (model.embed(x) * (a0 / r0)).sum(-1).mean() embed_opt.zero_grad(); el.backward(); embed_opt.step() tl += lv.item() * batch; c += (lo.argmax(1) == y).sum().item(); t += batch for sch in scheds: sch.step() ta = evaluate(model, test_loader, device) log['test_acc'].append(ta); log['train_loss'].append(tl / t) mean_aux = np.mean(epoch_aux_norms) if epoch_aux_norms else 0.0 mean_dfa = np.mean(epoch_dfa_norms) if epoch_dfa_norms else 1.0 aeff = mean_aux / (mean_aux + mean_dfa + 1e-12) log['alpha_eff'].append((epoch, aeff)) if epoch in diag_epochs: cm = 'blend' if branch_type != 'dfa' else 'dfa' gamma, rho = compute_diagnostics( model, aux_net if branch_type != 'dfa' else None, Bs, test_loader, device, cm, alpha) log['gamma'].append((epoch, gamma)); log['rho'].append((epoch, rho)) if epoch <= t0 + 15 or epoch % 20 == 0 or epoch == total_epochs: print(f" [{branch_name}] Ep {epoch}: acc={ta:.4f}, " f"G={gamma:.4f}, r={rho:.4f}, aeff={aeff:.3f}") elif epoch % 10 == 0 or epoch == total_epochs: print(f" [{branch_name}] Ep {epoch}: acc={ta:.4f}") return log # --------------------------------------------------------------------------- # Main experiment # --------------------------------------------------------------------------- def run_experiment(args): device = torch.device(f'cuda:{args.gpu}' if torch.cuda.is_available() else 'cpu') print(f"Using device: {device}") os.makedirs(args.output_dir, exist_ok=True) torch.manual_seed(args.seed); np.random.seed(args.seed) torch.cuda.manual_seed_all(args.seed) train_loader, test_loader = get_cifar10(args.batch_size) input_dim = 32 * 32 * 3; L = args.num_blocks; d = args.d_hidden # ---------------------------------------------------------------- # Step 1: Train DFA and capture checkpoint at t0 # ---------------------------------------------------------------- print(f"\n{'='*60}\nTraining DFA baseline (checkpoint at t0={args.t0})\n{'='*60}") model_dfa = ResidualMLP(input_dim, d, 10, L).to(device) Bs, ckpt = train_dfa_get_checkpoint( model_dfa, train_loader, test_loader, device, args.epochs, args.t0, args.lr, args.wd) print(f" Checkpoint acc at t0={args.t0}: {ckpt['acc']:.4f}") # ---------------------------------------------------------------- # Step 2: Offline prefit of Vec (for prefit60 branches) # ---------------------------------------------------------------- print(f"\n{'='*60}\nOffline prefit of Vec (60 epochs)\n{'='*60}") torch.manual_seed(args.seed + 7777) vec_prefit = VectorCreditNet(d_hidden=d, s_dim=10).to(device) # Load checkpoint model for prefit (weights at t0, forward net frozen) model_prefit = ResidualMLP(input_dim, d, 10, L).to(device) model_prefit.load_state_dict(ckpt['model']) model_prefit.eval() for p in model_prefit.parameters(): p.requires_grad_(False) vec_prefit = offline_fit_vec( vec_prefit, model_prefit, ckpt['Bs'], train_loader, device, epochs=60, lr_fb=args.lr_fb, M=args.M) del model_prefit # ---------------------------------------------------------------- # Step 3: Estimate sigma for fresh_random (match shuffled-target RMS) # ---------------------------------------------------------------- model_ref = ResidualMLP(input_dim, d, 10, L).to(device) model_ref.load_state_dict(ckpt['model']); model_ref.eval() for p in model_ref.parameters(): p.requires_grad_(False) fresh_sigma = estimate_shuffled_rms(model_ref, ckpt['Bs'], train_loader, device) print(f" fresh_random sigma (matched to shuffled RMS): {fresh_sigma:.5f}") del model_ref # ---------------------------------------------------------------- # Step 4: Define and run all 9 branches # ---------------------------------------------------------------- # Each entry: (branch_name, branch_type, aux_factory_fn) # aux_factory_fn returns a fresh aux_net (or None for dfa) VEC_SEED = args.seed + 7777 def make_vec(): torch.manual_seed(VEC_SEED) return VectorCreditNet(d_hidden=d, s_dim=10).to(device) def make_time_only(): torch.manual_seed(VEC_SEED) return TimeOnlyNet(d_hidden=d).to(device) def make_constant(): torch.manual_seed(VEC_SEED) return ConstantNet(d_hidden=d, num_blocks=L).to(device) def make_prefit_frozen(): net = copy.deepcopy(vec_prefit) for p in net.parameters(): p.requires_grad_(False) net.eval() return net def make_prefit_trainable(): return copy.deepcopy(vec_prefit) branches = [ # (name, branch_type, aux_factory) ('continue_DFA', 'dfa', lambda: None), ('blend_random_trainable', 'blend_trainable', make_vec), ('blend_shuffled_trainable', 'blend_shuffled', make_vec), ('blend_zero_target_trainable', 'blend_zero_target', make_vec), ('blend_fresh_random_target', 'blend_fresh_random', make_vec), ('blend_time_only_trainable', 'blend_time_only', make_time_only), ('blend_constant_input', 'blend_constant', make_constant), ('blend_prefit60_frozen', 'blend_frozen', make_prefit_frozen), ('blend_prefit60_trainable', 'blend_prefit_trainable', make_prefit_trainable), ] all_results = {} for bname, btype, aux_factory in branches: print(f"\n{'='*60}\n{bname}\n{'='*60}") # Fresh model from checkpoint model_b = ResidualMLP(input_dim, d, 10, L).to(device) model_b.load_state_dict(ckpt['model']) aux_net_b = aux_factory() log = run_branch( model_b, aux_net_b, ckpt['Bs'], train_loader, test_loader, device, args.t0, args.epochs, btype, args.alpha, args.lr, args.lr_fb, args.wd, args.M, branch_name=bname, fresh_sigma=fresh_sigma) all_results[bname] = log print(f" {bname} final acc: {log['test_acc'][-1]:.4f}") # ---------------------------------------------------------------- # Step 5: Summary table # ---------------------------------------------------------------- dfa_final = all_results['continue_DFA']['test_acc'][-1] print(f"\n{'='*95}") print("SUMMARY — Phase 10A.6: Structured vs Semantic Auxiliary") print(f"{'='*95}") print(f"{'Branch':<34} {'@20':>6} {'final':>7} {'diff':>7} " f"{'mG_5:15':>9} {'mr_5:15':>9} {'aeff':>7}") print("-" * 83) for bname, log in all_results.items(): accs = log['test_acc'] idx20 = max(0, 20 - args.t0 - 1) acc20 = accs[idx20] if len(accs) > idx20 else accs[-1] final = accs[-1] diff = final - dfa_final gammas_e = [g for e, g in log['gamma'] if args.t0 < e <= args.t0 + 15] rhos_e = [r for e, r in log['rho'] if args.t0 < e <= args.t0 + 15] aeffs_e = [a for e, a in log['alpha_eff'] if args.t0 < e <= args.t0 + 15] mg = float(np.mean(gammas_e)) if gammas_e else float('nan') mr = float(np.mean(rhos_e)) if rhos_e else float('nan') mae = float(np.mean(aeffs_e)) if aeffs_e else float('nan') print(f"{bname:<34} {acc20:>6.4f} {final:>7.4f} {diff:>+7.4f} " f"{mg:>9.4f} {mr:>9.4f} {mae:>7.3f}") # ---------------------------------------------------------------- # Step 6: Save results # ---------------------------------------------------------------- save_data = { 'args': vars(args), 'dfa_ckpt_acc': float(ckpt['acc']), 'fresh_sigma': float(fresh_sigma), } for bname, log in all_results.items(): save_data[bname] = { 'test_acc': log['test_acc'], 'train_loss': log['train_loss'], 'gamma': log['gamma'], 'rho': log['rho'], 'alpha_eff': log['alpha_eff'], } out_path = os.path.join(args.output_dir, f'structured_aux_t{args.t0}_s{args.seed}.json') with open(out_path, 'w') as f: json.dump(save_data, f, indent=2, default=float) print(f"\nSaved to {out_path}") # ---------------------------------------------------------------- # Step 7: Judgment # ---------------------------------------------------------------- print(f"\n{'='*60}\nJUDGMENT\n{'='*60}") r = {bname: log['test_acc'][-1] for bname, log in all_results.items()} dfa = r['continue_DFA'] rt = r.get('blend_random_trainable', float('nan')) sh = r.get('blend_shuffled_trainable', float('nan')) zt = r.get('blend_zero_target_trainable', float('nan')) fr = r.get('blend_fresh_random_target', float('nan')) to = r.get('blend_time_only_trainable', float('nan')) cn = r.get('blend_constant_input', float('nan')) pf = r.get('blend_prefit60_frozen', float('nan')) pt = r.get('blend_prefit60_trainable', float('nan')) print(f" DFA={dfa:.4f} rt={rt:.4f} sh={sh:.4f} zt={zt:.4f} " f"fr={fr:.4f} to={to:.4f} cn={cn:.4f} pf={pf:.4f} pt={pt:.4f}") # Key comparisons thr = 0.003 if abs(rt - sh) < thr and abs(rt - fr) < thr: print(" -> random_trainable ≈ shuffled ≈ fresh_random: " "gain is NOT from semantic content, likely norm/regularization") elif rt > sh + thr and rt > fr + thr: print(" -> random_trainable > shuffled AND fresh_random: " "some semantic structure in Vec helps") if abs(rt - zt) < thr: print(" -> zero_target ≈ random_trainable: " "the implicit regularization is from the blend ratio, not signal direction") elif zt < dfa - thr: print(" -> zero_target HURTS vs DFA: driving aux toward zero is not just neutral") if abs(rt - to) < thr: print(" -> time_only ≈ random_trainable: " "depth schedule alone captures most of the benefit") if abs(rt - cn) < thr: print(" -> constant_input ≈ random_trainable: " "fixed per-block template sufficient (no input processing needed)") if pt > pf + thr: print(" -> prefit60_trainable > prefit60_frozen: " "continued training after prefit improves further") elif abs(pt - pf) < thr: print(" -> prefit60_trainable ≈ prefit60_frozen: " "prefit quality saturates, additional online training neutral") if pt > rt + thr: print(" -> prefit60_trainable > random_trainable: " "warm-start from prefit helps over training from scratch") def main(): parser = argparse.ArgumentParser( description='Phase 10A.6: Structured vs Semantic Auxiliary Dissection') parser.add_argument('--num_blocks', type=int, default=4) parser.add_argument('--d_hidden', type=int, default=256) parser.add_argument('--batch_size', type=int, default=128) parser.add_argument('--epochs', type=int, default=100) parser.add_argument('--t0', type=int, default=5) parser.add_argument('--alpha', type=float, default=0.75) parser.add_argument('--lr', type=float, default=1e-3) parser.add_argument('--lr_fb', type=float, default=1e-3) parser.add_argument('--wd', type=float, default=0.01) parser.add_argument('--M', type=int, default=4) parser.add_argument('--seed', type=int, default=42) parser.add_argument('--gpu', type=int, default=2) parser.add_argument('--output_dir', type=str, default='results/structured_aux') args = parser.parse_args() run_experiment(args) if __name__ == '__main__': main()