From 1118b7457c261de36ead6103503c00c321c75f9b Mon Sep 17 00:00:00 2001 From: YurenHao0426 Date: Sun, 14 Jun 2026 20:32:31 -0500 Subject: Depth-utility ladder: trainable-block sweep (BP/FA/DFA) on ResMLP CIFAR-10 Appendix experiment triangulating the depth-utility diagnostic (D3) by varying the number of trainable residual blocks k (last-k trainable, first L-k frozen at init; embed/LN/head always trained). - d=256 L=4 and d=512 L=2, 3 seeds, recipe identical to the main audit. - BP climbs monotonically (+22-23pp); DFA peaks at the frozen baseline (k=0) and declines once any deep block is trained; FA shows partial/no net depth utility. - Cross-checks reproduce existing anchors (BP 0.617, DFA 0.301, FA 0.402, frozen 0.349). - frozen_init_identity_check quantifies frozen stack as a near-norm-preserving random feature map (per-block ||f||/||h||~0.10, stack cos 0.981), explaining the above-chance k=0 rung. Co-Authored-By: Claude Opus 4.8 (1M context) --- experiments/depth_utility_ladder.py | 317 +++ experiments/frozen_init_identity_check.py | 82 + experiments/plot_depth_ladder.py | 63 + logs/depth_ladder.log | 1103 +++++++++++ report_explore/MEMO_depth_utility_ladder.md | 119 ++ results/depth_ladder/depth_ladder.png | Bin 0 -> 89453 bytes results/depth_ladder/frozen_init_identity.json | 57 + results/depth_ladder/ladder_d256_L4_cifar10.json | 2274 ++++++++++++++++++++++ results/depth_ladder/ladder_d512_L2_cifar10.json | 1378 +++++++++++++ run_depth_ladder.sh | 14 + 10 files changed, 5407 insertions(+) create mode 100644 experiments/depth_utility_ladder.py create mode 100644 experiments/frozen_init_identity_check.py create mode 100644 experiments/plot_depth_ladder.py create mode 100644 logs/depth_ladder.log create mode 100644 report_explore/MEMO_depth_utility_ladder.md create mode 100644 results/depth_ladder/depth_ladder.png create mode 100644 results/depth_ladder/frozen_init_identity.json create mode 100644 results/depth_ladder/ladder_d256_L4_cifar10.json create mode 
100644 results/depth_ladder/ladder_d512_L2_cifar10.json create mode 100755 run_depth_ladder.sh diff --git a/experiments/depth_utility_ladder.py b/experiments/depth_utility_ladder.py new file mode 100644 index 0000000..c9de9e9 --- /dev/null +++ b/experiments/depth_utility_ladder.py @@ -0,0 +1,317 @@ +""" +Depth-utility ladder (appendix experiment for the FA-evaluation E&D paper). + +Turns the binary frozen-vs-trained block comparison into a CURVE: vary the number +of trainable residual blocks k, training the LAST k blocks (output side) and +freezing the first L-k at random init. Embedding / out_ln / out_head are ALWAYS +trained. Credit still propagates through frozen blocks (forward + FA feedback +matrices unchanged); only their weights stay at init. + +Question. As more blocks are made trainable, does test accuracy rise? + - BP (positive control): should climb monotonically with k. + - FA (Lillicrap vanilla): modest climb where depth is usable, flat where not. + - DFA (direct FA): flat at / below the frozen baseline (deep credit + is non-functional -> the D3 failure at every k). + +Output-side-first is deliberate: the deepest block receives the most direct +credit (FA's last block sees the exact output gradient), so it is the BEST case +for the method. If even these blocks add nothing, depth is unused. + +Recipe is identical to the main CIFAR audit (cifar_resmlp.py): AdamW, lr 1e-3, +wd 0.01, cosine, batch 128, 100 epochs, per-block independent optimizers and +rms-normalized local surrogate losses. + +k=0 reproduces the frozen-blocks baseline; k=L reproduces the full audit. 
+ +Usage: + CUDA_VISIBLE_DEVICES=2 python experiments/depth_utility_ladder.py \ + --d_hidden 256 --num_blocks 4 --dataset cifar10 \ + --methods bp fa dfa --k_values 0 1 2 3 4 --seeds 42 123 456 \ + --epochs 100 --output_dir results/depth_ladder +""" +import os +import sys +import json +import argparse +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.utils.data import DataLoader +import torchvision +import torchvision.transforms as transforms + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from models.residual_mlp import ResidualMLP + + +# --------------------------------------------------------------------------- +# Data / eval +# --------------------------------------------------------------------------- +def get_data(dataset, batch_size=128): + if dataset == 'cifar100': + mean, std = (0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761) + DatasetClass, num_classes, input_dim = torchvision.datasets.CIFAR100, 100, 32 * 32 * 3 + else: + mean, std = (0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616) + DatasetClass, num_classes, input_dim = torchvision.datasets.CIFAR10, 10, 32 * 32 * 3 + tf_train = transforms.Compose([ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean, std), + ]) + tf_test = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean, std)]) + tr = DatasetClass('./data', True, download=True, transform=tf_train) + te = DatasetClass('./data', False, download=True, transform=tf_test) + return ( + DataLoader(tr, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True), + DataLoader(te, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True), + input_dim, num_classes, + ) + + +def evaluate(model, loader, dev): + model.eval() + c = n = 0 + with torch.no_grad(): + for x, y in loader: + x = x.view(x.size(0), -1).to(dev); y = 
y.to(dev) + c += (model(x).argmax(-1) == y).sum().item() + n += x.size(0) + return c / n + + +def freeze_first(model, k): + """Freeze the first L-k blocks (indices 0 .. L-k-1); leave the last k trainable. + Returns the set of trainable block indices.""" + L = model.num_blocks + n_frozen = L - k + trainable = set(range(n_frozen, L)) + for l, block in enumerate(model.blocks): + req = l in trainable + for p in block.parameters(): + p.requires_grad_(req) + return trainable + + +# --------------------------------------------------------------------------- +# Trainers (freeze-aware ports of cifar_resmlp.py) +# --------------------------------------------------------------------------- +def train_bp(model, train_loader, test_loader, dev, args, trainable): + """End-to-end BP; optimizer filters to requires_grad params (frozen blocks excluded). + Gradients still flow THROUGH frozen blocks to reach trainable blocks / embed.""" + opt = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), + lr=args.lr, weight_decay=args.wd) + sch = optim.lr_scheduler.CosineAnnealingLR(opt, T_max=args.epochs) + curve = [] + for ep in range(1, args.epochs + 1): + model.train() + for x, y in train_loader: + x = x.view(x.size(0), -1).to(dev); y = y.to(dev) + loss = F.cross_entropy(model(x), y) + opt.zero_grad(); loss.backward(); opt.step() + sch.step() + if ep % 10 == 0 or ep == 1 or ep == args.epochs: + acc = evaluate(model, test_loader, dev) + curve.append((ep, acc)) + print(f" [BP k] ep {ep}: test={acc:.4f}", flush=True) + return curve + + +def train_dfa(model, train_loader, test_loader, dev, args, trainable): + """DFA: each block reads output error directly via B_l (no sequential propagation). 
+ Only TRAINABLE blocks are updated; embed / out_ln / out_head always trained.""" + d, C, L = model.d_hidden, args.num_classes, model.num_blocks + Bs = [torch.randn(d, C, device=dev) / np.sqrt(C) for _ in range(L)] + + block_opts = {l: optim.AdamW(model.blocks[l].parameters(), lr=args.lr, weight_decay=args.wd) + for l in sorted(trainable)} + embed_opt = optim.AdamW(model.embed.parameters(), lr=args.lr, weight_decay=args.wd) + head_opt = optim.AdamW(list(model.out_head.parameters()) + list(model.out_ln.parameters()), + lr=args.lr, weight_decay=args.wd) + scheds = [optim.lr_scheduler.CosineAnnealingLR(o, T_max=args.epochs) + for o in list(block_opts.values()) + [embed_opt, head_opt]] + + curve = [] + for ep in range(1, args.epochs + 1): + model.train() + for x, y in train_loader: + x = x.view(x.size(0), -1).to(dev); y = y.to(dev) + batch = x.size(0) + with torch.no_grad(): + logits, hiddens = model(x, return_hidden=True) + e_T = logits.softmax(-1); e_T[torch.arange(batch), y] -= 1 + + # head: exact CE, h_L detached + hL = hiddens[-1].detach() + head_opt.zero_grad() + F.cross_entropy(model.out_head(model.out_ln(hL)), y).backward() + head_opt.step() + + # trainable blocks: DFA local surrogate + for l in sorted(trainable): + a = (e_T @ Bs[l].T).detach() + a = a / ((a ** 2).mean(-1, keepdim=True).sqrt() + 1e-6) + f_l = model.blocks[l](hiddens[l].detach()) + local = (f_l * a).sum(-1).mean() + block_opts[l].zero_grad(); local.backward(); block_opts[l].step() + + # embed: DFA credit at h_0 + a0 = (e_T @ Bs[0].T).detach() + a0 = a0 / ((a0 ** 2).mean(-1, keepdim=True).sqrt() + 1e-6) + embed_loss = (model.embed(x) * a0).sum(-1).mean() + embed_opt.zero_grad(); embed_loss.backward(); embed_opt.step() + + for s in scheds: + s.step() + if ep % 10 == 0 or ep == 1 or ep == args.epochs: + acc = evaluate(model, test_loader, dev) + curve.append((ep, acc)) + print(f" [DFA k] ep {ep}: test={acc:.4f}", flush=True) + return curve + + +def train_fa(model, train_loader, test_loader, dev, 
args, trainable): + """Vanilla FA: credit propagates sequentially backward via fixed d×d B_l. + Frozen blocks STILL propagate credit (a_credit = a_credit @ B_l) so trainable + blocks / embed downstream receive it; only their weight update is skipped.""" + d, C, L = model.d_hidden, args.num_classes, model.num_blocks + Bs = [torch.randn(d, d, device=dev) / np.sqrt(d) for _ in range(L)] + + block_opts = {l: optim.AdamW(model.blocks[l].parameters(), lr=args.lr, weight_decay=args.wd) + for l in sorted(trainable)} + embed_opt = optim.AdamW(model.embed.parameters(), lr=args.lr, weight_decay=args.wd) + head_opt = optim.AdamW(list(model.out_head.parameters()) + list(model.out_ln.parameters()), + lr=args.lr, weight_decay=args.wd) + scheds = [optim.lr_scheduler.CosineAnnealingLR(o, T_max=args.epochs) + for o in list(block_opts.values()) + [embed_opt, head_opt]] + + curve = [] + for ep in range(1, args.epochs + 1): + model.train() + for x, y in train_loader: + x = x.view(x.size(0), -1).to(dev); y = y.to(dev) + batch = x.size(0) + with torch.no_grad(): + logits, hiddens = model(x, return_hidden=True) + + # head: exact CE; a_credit = exact gradient at h_L (FA's starting credit) + hL = hiddens[-1].detach().requires_grad_(True) + head_opt.zero_grad() + F.cross_entropy(model.out_head(model.out_ln(hL)), y).backward() + head_opt.step() + a_credit = hL.grad.detach() + + # blocks backward: update only trainable; ALWAYS propagate credit + for l in range(L - 1, -1, -1): + if l in trainable: + a = a_credit / ((a_credit ** 2).mean(-1, keepdim=True).sqrt() + 1e-6) + f_l = model.blocks[l](hiddens[l].detach()) + local = (f_l * a).sum(-1).mean() + block_opts[l].zero_grad(); local.backward(); block_opts[l].step() + a_credit = (a_credit @ Bs[l]).detach() + + # embed: FA credit at h_0 + a0 = a_credit / ((a_credit ** 2).mean(-1, keepdim=True).sqrt() + 1e-6) + embed_loss = (model.embed(x) * a0).sum(-1).mean() + embed_opt.zero_grad(); embed_loss.backward(); embed_opt.step() + + for s in scheds: + 
s.step() + if ep % 10 == 0 or ep == 1 or ep == args.epochs: + acc = evaluate(model, test_loader, dev) + curve.append((ep, acc)) + print(f" [FA k] ep {ep}: test={acc:.4f}", flush=True) + return curve + + +TRAINERS = {'bp': train_bp, 'dfa': train_dfa, 'fa': train_fa} + + +# --------------------------------------------------------------------------- +# Driver +# --------------------------------------------------------------------------- +def main(): + p = argparse.ArgumentParser() + p.add_argument('--d_hidden', type=int, default=256) + p.add_argument('--num_blocks', type=int, default=4) + p.add_argument('--dataset', type=str, default='cifar10') + p.add_argument('--methods', type=str, nargs='+', default=['bp', 'fa', 'dfa']) + p.add_argument('--k_values', type=int, nargs='+', default=[0, 1, 2, 3, 4]) + p.add_argument('--seeds', type=int, nargs='+', default=[42, 123, 456]) + p.add_argument('--epochs', type=int, default=100) + p.add_argument('--lr', type=float, default=1e-3) + p.add_argument('--wd', type=float, default=0.01) + p.add_argument('--batch_size', type=int, default=128) + p.add_argument('--gpu', type=int, default=0) + p.add_argument('--output_dir', type=str, default='results/depth_ladder') + args = p.parse_args() + + dev = torch.device(f'cuda:{args.gpu}' if torch.cuda.is_available() else 'cpu') + os.makedirs(args.output_dir, exist_ok=True) + L = args.num_blocks + tag = f"ladder_d{args.d_hidden}_L{L}_{args.dataset}" + out_path = os.path.join(args.output_dir, f"{tag}.json") + print(f"Device={dev} {tag} methods={args.methods} k={args.k_values} seeds={args.seeds} " + f"epochs={args.epochs}", flush=True) + + # incremental results: results[method][k][seed] = {final_acc, curve} + results = {} + if os.path.exists(out_path): + with open(out_path) as f: + results = json.load(f).get('results', {}) + print(f"Resuming; existing keys: " + f"{[(m, list(results[m].keys())) for m in results]}", flush=True) + + def save(): + with open(out_path, 'w') as f: + json.dump({'config': 
vars(args), 'results': results}, f, indent=2) + + for method in args.methods: + results.setdefault(method, {}) + for k in args.k_values: + if k > L: + continue + results[method].setdefault(str(k), {}) + for seed in args.seeds: + if str(seed) in results[method][str(k)]: + print(f" skip {method} k={k} seed={seed} (done)", flush=True) + continue + print(f"\n=== {method.upper()} k={k} (last {k} of {L} trainable) " + f"seed={seed} ===", flush=True) + torch.manual_seed(seed); np.random.seed(seed); torch.cuda.manual_seed_all(seed) + train_loader, test_loader, input_dim, num_classes = get_data(args.dataset, args.batch_size) + args.num_classes = num_classes + + model = ResidualMLP(input_dim, args.d_hidden, num_classes, L).to(dev) + trainable = freeze_first(model, k) + n_train = sum(pp.numel() for pp in model.parameters() if pp.requires_grad) + print(f" trainable blocks: {sorted(trainable)} " + f"trainable params: {n_train:,}", flush=True) + + curve = TRAINERS[method](model, train_loader, test_loader, dev, args, trainable) + final_acc = evaluate(model, test_loader, dev) + results[method][str(k)][str(seed)] = {'final_acc': final_acc, 'curve': curve} + print(f" FINAL {method} k={k} seed={seed}: {final_acc:.4f}", flush=True) + save() + + # summary table + print(f"\n{'='*60}\nSUMMARY {tag} (mean ± ddof-1 std over seeds)\n{'='*60}", flush=True) + for method in args.methods: + row = [] + for k in args.k_values: + if k > L: + continue + accs = [v['final_acc'] for v in results[method][str(k)].values()] + if accs: + m = float(np.mean(accs)); s = float(np.std(accs, ddof=1)) if len(accs) > 1 else 0.0 + row.append(f"k={k}: {m:.4f}±{s:.4f}") + print(f" {method.upper():4s} " + " ".join(row), flush=True) + save() + print(f"\nSaved -> {out_path}", flush=True) + + +if __name__ == '__main__': + main() diff --git a/experiments/frozen_init_identity_check.py b/experiments/frozen_init_identity_check.py new file mode 100644 index 0000000..3f58d7d --- /dev/null +++ 
b/experiments/frozen_init_identity_check.py @@ -0,0 +1,82 @@ +""" +Frozen-init identity check (supporting measurement for the depth-utility ladder). + +Quantifies how close a randomly-initialized, frozen ResidualMLP block stack is to +the identity map. This grounds the footnote explaining why the k=0 rung of the +ladder (all blocks frozen at init) already sits well above chance: the trained +embedding + readout are composed with a fixed, near-norm-preserving random feature +map, i.e. effectively a trained (near-)linear classifier on pixels. + +Reports, at random init, on a CIFAR-10 test batch (mean over seeds): + - per-block residual ratio ||f_l(h_l)|| / ||h_l|| (median over batch) + - whole-stack deviation ||h_L - h_0|| / ||h_0|| (median over batch) + - whole-stack direction cos(h_L, h_0) (median over batch) + +Usage: + CUDA_VISIBLE_DEVICES=2 python experiments/frozen_init_identity_check.py +""" +import os, sys, json +import numpy as np +import torch +import torch.nn.functional as F +import torchvision +import torchvision.transforms as transforms + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from models.residual_mlp import ResidualMLP + + +def main(): + d_hidden, L, C, n = 256, 4, 10, 256 + seeds = [42, 123, 456] + tf = transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), + (0.2470, 0.2435, 0.2616))]) + ds = torchvision.datasets.CIFAR10('./data', train=False, download=True, transform=tf) + x = torch.stack([ds[i][0] for i in range(n)]).view(n, -1) + + per_block, rel_dev, cos_dev = [], [], [] + seed_rows = {} + for seed in seeds: + torch.manual_seed(seed); np.random.seed(seed) + m = ResidualMLP(32 * 32 * 3, d_hidden, C, L).eval() + with torch.no_grad(): + h0 = m.embed(x); h = h0; ratios = [] + for blk in m.blocks: + f = blk(h) + ratios.append(float((f.norm(dim=-1) / h.norm(dim=-1)).median())) + h = h + f + rel = float(((h - h0).norm(dim=-1) / h0.norm(dim=-1)).median()) + cos = 
float(F.cosine_similarity(h, h0, dim=-1).median()) + per_block.append(ratios); rel_dev.append(rel); cos_dev.append(cos) + seed_rows[str(seed)] = {'per_block_ratio': ratios, 'rel_dev': rel, 'cos': cos} + print(f"seed {seed}: per-block ||f||/||h|| = " + f"{['%.4f' % r for r in ratios]} " + f"||h_L-h_0||/||h_0|| = {rel:.3f} cos(h_L,h_0) = {cos:.4f}", flush=True) + + pb = np.array(per_block) + summary = { + 'config': {'d_hidden': d_hidden, 'L': L, 'num_classes': C, 'batch': n, + 'dataset': 'cifar10-test', 'seeds': seeds}, + 'per_seed': seed_rows, + 'per_block_ratio_mean': pb.mean(0).tolist(), + 'per_block_ratio_grand_mean': float(pb.mean()), + 'rel_dev_mean': float(np.mean(rel_dev)), + 'rel_dev_std': float(np.std(rel_dev, ddof=1)), + 'cos_mean': float(np.mean(cos_dev)), + 'cos_std': float(np.std(cos_dev, ddof=1)), + } + print(f"\nMEAN over {len(seeds)} seeds: " + f"per-block ratio ≈ {summary['per_block_ratio_grand_mean']:.3f}, " + f"||h_L-h_0||/||h_0|| = {summary['rel_dev_mean']:.3f} ± {summary['rel_dev_std']:.3f}, " + f"cos = {summary['cos_mean']:.4f} ± {summary['cos_std']:.4f}", flush=True) + + out = 'results/depth_ladder/frozen_init_identity.json' + os.makedirs(os.path.dirname(out), exist_ok=True) + with open(out, 'w') as f: + json.dump(summary, f, indent=2) + print(f"Saved -> {out}", flush=True) + + +if __name__ == '__main__': + main() diff --git a/experiments/plot_depth_ladder.py b/experiments/plot_depth_ladder.py new file mode 100644 index 0000000..a5709bf --- /dev/null +++ b/experiments/plot_depth_ladder.py @@ -0,0 +1,63 @@ +""" +Plot the depth-utility ladder: test accuracy vs number of trainable blocks k, +one curve per method (BP / FA / DFA), one panel per architecture. 
+ +Usage: + python experiments/plot_depth_ladder.py +""" +import os, sys, json +import numpy as np +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt + +CONFIGS = [ + ('results/depth_ladder/ladder_d256_L4_cifar10.json', 'ResMLP d=256, L=4', 4), + ('results/depth_ladder/ladder_d512_L2_cifar10.json', 'ResMLP d=512, L=2', 2), +] +METHODS = [('bp', 'BP', 'tab:green', 'o'), + ('fa', 'FA', 'tab:orange', 's'), + ('dfa', 'DFA', 'tab:red', '^')] + + +def agg(path, L): + d = json.load(open(path))['results'] + out = {} + for m, _, _, _ in METHODS: + ks, mu, sd = [], [], [] + for k in range(L + 1): + a = [v['final_acc'] for v in d[m][str(k)].values()] + ks.append(k); mu.append(np.mean(a)) + sd.append(np.std(a, ddof=1) if len(a) > 1 else 0.0) + out[m] = (np.array(ks), np.array(mu), np.array(sd)) + return out + + +def main(): + fig, axes = plt.subplots(1, len(CONFIGS), figsize=(11, 4.2)) + if len(CONFIGS) == 1: + axes = [axes] + for ax, (path, title, L) in zip(axes, CONFIGS): + data = agg(path, L) + for m, label, color, mk in METHODS: + ks, mu, sd = data[m] + ax.errorbar(ks, mu, yerr=sd, marker=mk, color=color, label=label, + capsize=3, lw=2, ms=7) + # frozen baseline reference (k=0, averaged across methods is ~chance-of-readout) + ax.axhline(0.10, ls=':', color='gray', lw=1) + ax.text(0.02, 0.105, 'chance', color='gray', fontsize=8, transform=ax.get_yaxis_transform()) + ax.set_xlabel('trainable blocks $k$ (last $k$ of $L$)') + ax.set_ylabel('CIFAR-10 test accuracy') + ax.set_title(title) + ax.set_xticks(range(L + 1)) + ax.grid(alpha=0.3) + ax.legend(loc='center right') + fig.suptitle('Depth-utility ladder: does training deeper blocks raise accuracy?', y=1.02) + fig.tight_layout() + out = 'results/depth_ladder/depth_ladder.png' + fig.savefig(out, dpi=150, bbox_inches='tight') + print(f"Saved -> {out}") + + +if __name__ == '__main__': + main() diff --git a/logs/depth_ladder.log b/logs/depth_ladder.log new file mode 100644 index 0000000..20af1ab --- 
/dev/null +++ b/logs/depth_ladder.log @@ -0,0 +1,1103 @@ +[Sun Jun 14 11:29:47 AM CDT 2026] START primary d=256 L=4 ladder +Device=cuda:0 ladder_d256_L4_cifar10 methods=['bp', 'fa', 'dfa'] k=[0, 1, 2, 3, 4] seeds=[42, 123, 456] epochs=100 + +=== BP k=0 (last 0 of 4 trainable) seed=42 === + trainable blocks: [] trainable params: 789,770 + [BP k] ep 1: test=0.3543 + [BP k] ep 10: test=0.3673 + [BP k] ep 20: test=0.3483 + [BP k] ep 30: test=0.3498 + [BP k] ep 40: test=0.3608 + [BP k] ep 50: test=0.3627 + [BP k] ep 60: test=0.3697 + [BP k] ep 70: test=0.3803 + [BP k] ep 80: test=0.3821 + [BP k] ep 90: test=0.3870 + [BP k] ep 100: test=0.3882 + FINAL bp k=0 seed=42: 0.3882 + +=== BP k=0 (last 0 of 4 trainable) seed=123 === + trainable blocks: [] trainable params: 789,770 + [BP k] ep 1: test=0.3535 + [BP k] ep 10: test=0.3654 + [BP k] ep 20: test=0.3612 + [BP k] ep 30: test=0.3586 + [BP k] ep 40: test=0.3633 + [BP k] ep 50: test=0.3608 + [BP k] ep 60: test=0.3772 + [BP k] ep 70: test=0.3791 + [BP k] ep 80: test=0.3897 + [BP k] ep 90: test=0.3884 + [BP k] ep 100: test=0.3899 + FINAL bp k=0 seed=123: 0.3899 + +=== BP k=0 (last 0 of 4 trainable) seed=456 === + trainable blocks: [] trainable params: 789,770 + [BP k] ep 1: test=0.3551 + [BP k] ep 10: test=0.3680 + [BP k] ep 20: test=0.3509 + [BP k] ep 30: test=0.3655 + [BP k] ep 40: test=0.3573 + [BP k] ep 50: test=0.3543 + [BP k] ep 60: test=0.3716 + [BP k] ep 70: test=0.3824 + [BP k] ep 80: test=0.3852 + [BP k] ep 90: test=0.3891 + [BP k] ep 100: test=0.3878 + FINAL bp k=0 seed=456: 0.3878 + +=== BP k=1 (last 1 of 4 trainable) seed=42 === + trainable blocks: [3] trainable params: 921,866 + [BP k] ep 1: test=0.3736 + [BP k] ep 10: test=0.4890 + [BP k] ep 20: test=0.5089 + [BP k] ep 30: test=0.5260 + [BP k] ep 40: test=0.5365 + [BP k] ep 50: test=0.5486 + [BP k] ep 60: test=0.5524 + [BP k] ep 70: test=0.5638 + [BP k] ep 80: test=0.5666 + [BP k] ep 90: test=0.5678 + [BP k] ep 100: test=0.5683 + FINAL bp k=1 seed=42: 0.5683 + 
+=== BP k=1 (last 1 of 4 trainable) seed=123 === + trainable blocks: [3] trainable params: 921,866 + [BP k] ep 1: test=0.3878 + [BP k] ep 10: test=0.4797 + [BP k] ep 20: test=0.5096 + [BP k] ep 30: test=0.5209 + [BP k] ep 40: test=0.5280 + [BP k] ep 50: test=0.5486 + [BP k] ep 60: test=0.5530 + [BP k] ep 70: test=0.5564 + [BP k] ep 80: test=0.5609 + [BP k] ep 90: test=0.5611 + [BP k] ep 100: test=0.5623 + FINAL bp k=1 seed=123: 0.5623 + +=== BP k=1 (last 1 of 4 trainable) seed=456 === + trainable blocks: [3] trainable params: 921,866 + [BP k] ep 1: test=0.3772 + [BP k] ep 10: test=0.4853 + [BP k] ep 20: test=0.5098 + [BP k] ep 30: test=0.5238 + [BP k] ep 40: test=0.5387 + [BP k] ep 50: test=0.5488 + [BP k] ep 60: test=0.5547 + [BP k] ep 70: test=0.5588 + [BP k] ep 80: test=0.5636 + [BP k] ep 90: test=0.5637 + [BP k] ep 100: test=0.5643 + FINAL bp k=1 seed=456: 0.5643 + +=== BP k=2 (last 2 of 4 trainable) seed=42 === + trainable blocks: [2, 3] trainable params: 1,053,962 + [BP k] ep 1: test=0.3874 + [BP k] ep 10: test=0.5157 + [BP k] ep 20: test=0.5361 + [BP k] ep 30: test=0.5600 + [BP k] ep 40: test=0.5753 + [BP k] ep 50: test=0.5802 + [BP k] ep 60: test=0.5843 + [BP k] ep 70: test=0.5965 + [BP k] ep 80: test=0.5970 + [BP k] ep 90: test=0.5979 + [BP k] ep 100: test=0.5994 + FINAL bp k=2 seed=42: 0.5994 + +=== BP k=2 (last 2 of 4 trainable) seed=123 === + trainable blocks: [2, 3] trainable params: 1,053,962 + [BP k] ep 1: test=0.3925 + [BP k] ep 10: test=0.5148 + [BP k] ep 20: test=0.5376 + [BP k] ep 30: test=0.5638 + [BP k] ep 40: test=0.5693 + [BP k] ep 50: test=0.5784 + [BP k] ep 60: test=0.5927 + [BP k] ep 70: test=0.5911 + [BP k] ep 80: test=0.5973 + [BP k] ep 90: test=0.5986 + [BP k] ep 100: test=0.6000 + FINAL bp k=2 seed=123: 0.6000 + +=== BP k=2 (last 2 of 4 trainable) seed=456 === + trainable blocks: [2, 3] trainable params: 1,053,962 + [BP k] ep 1: test=0.3868 + [BP k] ep 10: test=0.5103 + [BP k] ep 20: test=0.5420 + [BP k] ep 30: test=0.5610 + [BP k] ep 
40: test=0.5699 + [BP k] ep 50: test=0.5789 + [BP k] ep 60: test=0.5809 + [BP k] ep 70: test=0.5844 + [BP k] ep 80: test=0.5919 + [BP k] ep 90: test=0.5919 + [BP k] ep 100: test=0.5939 + FINAL bp k=2 seed=456: 0.5939 + +=== BP k=3 (last 3 of 4 trainable) seed=42 === + trainable blocks: [1, 2, 3] trainable params: 1,186,058 + [BP k] ep 1: test=0.3904 + [BP k] ep 10: test=0.5218 + [BP k] ep 20: test=0.5469 + [BP k] ep 30: test=0.5749 + [BP k] ep 40: test=0.5935 + [BP k] ep 50: test=0.5950 + [BP k] ep 60: test=0.5983 + [BP k] ep 70: test=0.6015 + [BP k] ep 80: test=0.6070 + [BP k] ep 90: test=0.6057 + [BP k] ep 100: test=0.6079 + FINAL bp k=3 seed=42: 0.6079 + +=== BP k=3 (last 3 of 4 trainable) seed=123 === + trainable blocks: [1, 2, 3] trainable params: 1,186,058 + [BP k] ep 1: test=0.3965 + [BP k] ep 10: test=0.5240 + [BP k] ep 20: test=0.5517 + [BP k] ep 30: test=0.5747 + [BP k] ep 40: test=0.5774 + [BP k] ep 50: test=0.5927 + [BP k] ep 60: test=0.6035 + [BP k] ep 70: test=0.6030 + [BP k] ep 80: test=0.6057 + [BP k] ep 90: test=0.6073 + [BP k] ep 100: test=0.6069 + FINAL bp k=3 seed=123: 0.6069 + +=== BP k=3 (last 3 of 4 trainable) seed=456 === + trainable blocks: [1, 2, 3] trainable params: 1,186,058 + [BP k] ep 1: test=0.3947 + [BP k] ep 10: test=0.5148 + [BP k] ep 20: test=0.5536 + [BP k] ep 30: test=0.5723 + [BP k] ep 40: test=0.5873 + [BP k] ep 50: test=0.5861 + [BP k] ep 60: test=0.5991 + [BP k] ep 70: test=0.5989 + [BP k] ep 80: test=0.6062 + [BP k] ep 90: test=0.6093 + [BP k] ep 100: test=0.6080 + FINAL bp k=3 seed=456: 0.6080 + +=== BP k=4 (last 4 of 4 trainable) seed=42 === + trainable blocks: [0, 1, 2, 3] trainable params: 1,318,154 + [BP k] ep 1: test=0.3936 + [BP k] ep 10: test=0.5235 + [BP k] ep 20: test=0.5606 + [BP k] ep 30: test=0.5794 + [BP k] ep 40: test=0.5992 + [BP k] ep 50: test=0.6044 + [BP k] ep 60: test=0.5979 + [BP k] ep 70: test=0.6115 + [BP k] ep 80: test=0.6153 + [BP k] ep 90: test=0.6177 + [BP k] ep 100: test=0.6173 + FINAL bp k=4 
seed=42: 0.6173 + +=== BP k=4 (last 4 of 4 trainable) seed=123 === + trainable blocks: [0, 1, 2, 3] trainable params: 1,318,154 + [BP k] ep 1: test=0.3981 + [BP k] ep 10: test=0.5257 + [BP k] ep 20: test=0.5580 + [BP k] ep 30: test=0.5779 + [BP k] ep 40: test=0.5896 + [BP k] ep 50: test=0.6023 + [BP k] ep 60: test=0.6053 + [BP k] ep 70: test=0.6081 + [BP k] ep 80: test=0.6185 + [BP k] ep 90: test=0.6174 + [BP k] ep 100: test=0.6182 + FINAL bp k=4 seed=123: 0.6182 + +=== BP k=4 (last 4 of 4 trainable) seed=456 === + trainable blocks: [0, 1, 2, 3] trainable params: 1,318,154 + [BP k] ep 1: test=0.3967 + [BP k] ep 10: test=0.5255 + [BP k] ep 20: test=0.5632 + [BP k] ep 30: test=0.5747 + [BP k] ep 40: test=0.5948 + [BP k] ep 50: test=0.5954 + [BP k] ep 60: test=0.6092 + [BP k] ep 70: test=0.6140 + [BP k] ep 80: test=0.6125 + [BP k] ep 90: test=0.6145 + [BP k] ep 100: test=0.6145 + FINAL bp k=4 seed=456: 0.6145 + +=== FA k=0 (last 0 of 4 trainable) seed=42 === + trainable blocks: [] trainable params: 789,770 + [FA k] ep 1: test=0.3112 + [FA k] ep 10: test=0.3389 + [FA k] ep 20: test=0.3325 + [FA k] ep 30: test=0.3495 + [FA k] ep 40: test=0.3467 + [FA k] ep 50: test=0.3465 + [FA k] ep 60: test=0.3573 + [FA k] ep 70: test=0.3542 + [FA k] ep 80: test=0.3567 + [FA k] ep 90: test=0.3554 + [FA k] ep 100: test=0.3555 + FINAL fa k=0 seed=42: 0.3555 + +=== FA k=0 (last 0 of 4 trainable) seed=123 === + trainable blocks: [] trainable params: 789,770 + [FA k] ep 1: test=0.3257 + [FA k] ep 10: test=0.3409 + [FA k] ep 20: test=0.3514 + [FA k] ep 30: test=0.3357 + [FA k] ep 40: test=0.3299 + [FA k] ep 50: test=0.3495 + [FA k] ep 60: test=0.3468 + [FA k] ep 70: test=0.3548 + [FA k] ep 80: test=0.3509 + [FA k] ep 90: test=0.3536 + [FA k] ep 100: test=0.3520 + FINAL fa k=0 seed=123: 0.3520 + +=== FA k=0 (last 0 of 4 trainable) seed=456 === + trainable blocks: [] trainable params: 789,770 + [FA k] ep 1: test=0.3172 + [FA k] ep 10: test=0.3374 + [FA k] ep 20: test=0.3452 + [FA k] ep 30: 
test=0.3431 + [FA k] ep 40: test=0.3468 + [FA k] ep 50: test=0.3563 + [FA k] ep 60: test=0.3523 + [FA k] ep 70: test=0.3578 + [FA k] ep 80: test=0.3568 + [FA k] ep 90: test=0.3576 + [FA k] ep 100: test=0.3578 + FINAL fa k=0 seed=456: 0.3578 + +=== FA k=1 (last 1 of 4 trainable) seed=42 === + trainable blocks: [3] trainable params: 921,866 + [FA k] ep 1: test=0.2886 + [FA k] ep 10: test=0.3301 + [FA k] ep 20: test=0.3604 + [FA k] ep 30: test=0.3595 + [FA k] ep 40: test=0.3678 + [FA k] ep 50: test=0.3779 + [FA k] ep 60: test=0.3727 + [FA k] ep 70: test=0.3810 + [FA k] ep 80: test=0.3810 + [FA k] ep 90: test=0.3821 + [FA k] ep 100: test=0.3819 + FINAL fa k=1 seed=42: 0.3819 + +=== FA k=1 (last 1 of 4 trainable) seed=123 === + trainable blocks: [3] trainable params: 921,866 + [FA k] ep 1: test=0.3105 + [FA k] ep 10: test=0.3472 + [FA k] ep 20: test=0.3444 + [FA k] ep 30: test=0.3604 + [FA k] ep 40: test=0.3615 + [FA k] ep 50: test=0.3568 + [FA k] ep 60: test=0.3708 + [FA k] ep 70: test=0.3723 + [FA k] ep 80: test=0.3749 + [FA k] ep 90: test=0.3736 + [FA k] ep 100: test=0.3742 + FINAL fa k=1 seed=123: 0.3742 + +=== FA k=1 (last 1 of 4 trainable) seed=456 === + trainable blocks: [3] trainable params: 921,866 + [FA k] ep 1: test=0.2975 + [FA k] ep 10: test=0.3481 + [FA k] ep 20: test=0.3454 + [FA k] ep 30: test=0.3683 + [FA k] ep 40: test=0.3618 + [FA k] ep 50: test=0.3675 + [FA k] ep 60: test=0.3826 + [FA k] ep 70: test=0.3867 + [FA k] ep 80: test=0.3863 + [FA k] ep 90: test=0.3899 + [FA k] ep 100: test=0.3898 + FINAL fa k=1 seed=456: 0.3898 + +=== FA k=2 (last 2 of 4 trainable) seed=42 === + trainable blocks: [2, 3] trainable params: 1,053,962 + [FA k] ep 1: test=0.2657 + [FA k] ep 10: test=0.3431 + [FA k] ep 20: test=0.3494 + [FA k] ep 30: test=0.3436 + [FA k] ep 40: test=0.3574 + [FA k] ep 50: test=0.3388 + [FA k] ep 60: test=0.3426 + [FA k] ep 70: test=0.3341 + [FA k] ep 80: test=0.3303 + [FA k] ep 90: test=0.3310 + [FA k] ep 100: test=0.3305 + FINAL fa k=2 seed=42: 
0.3305 + +=== FA k=2 (last 2 of 4 trainable) seed=123 === + trainable blocks: [2, 3] trainable params: 1,053,962 + [FA k] ep 1: test=0.2982 + [FA k] ep 10: test=0.3524 + [FA k] ep 20: test=0.3694 + [FA k] ep 30: test=0.3691 + [FA k] ep 40: test=0.3703 + [FA k] ep 50: test=0.3605 + [FA k] ep 60: test=0.3546 + [FA k] ep 70: test=0.3547 + [FA k] ep 80: test=0.3651 + [FA k] ep 90: test=0.3565 + [FA k] ep 100: test=0.3607 + FINAL fa k=2 seed=123: 0.3607 + +=== FA k=2 (last 2 of 4 trainable) seed=456 === + trainable blocks: [2, 3] trainable params: 1,053,962 + [FA k] ep 1: test=0.2753 + [FA k] ep 10: test=0.3386 + [FA k] ep 20: test=0.3495 + [FA k] ep 30: test=0.3458 + [FA k] ep 40: test=0.3374 + [FA k] ep 50: test=0.3333 + [FA k] ep 60: test=0.3523 + [FA k] ep 70: test=0.3538 + [FA k] ep 80: test=0.3519 + [FA k] ep 90: test=0.3555 + [FA k] ep 100: test=0.3548 + FINAL fa k=2 seed=456: 0.3548 + +=== FA k=3 (last 3 of 4 trainable) seed=42 === + trainable blocks: [1, 2, 3] trainable params: 1,186,058 + [FA k] ep 1: test=0.2770 + [FA k] ep 10: test=0.3554 + [FA k] ep 20: test=0.3681 + [FA k] ep 30: test=0.3841 + [FA k] ep 40: test=0.3829 + [FA k] ep 50: test=0.3847 + [FA k] ep 60: test=0.3885 + [FA k] ep 70: test=0.3956 + [FA k] ep 80: test=0.3947 + [FA k] ep 90: test=0.3916 + [FA k] ep 100: test=0.3930 + FINAL fa k=3 seed=42: 0.3930 + +=== FA k=3 (last 3 of 4 trainable) seed=123 === + trainable blocks: [1, 2, 3] trainable params: 1,186,058 + [FA k] ep 1: test=0.2905 + [FA k] ep 10: test=0.3495 + [FA k] ep 20: test=0.3804 + [FA k] ep 30: test=0.3820 + [FA k] ep 40: test=0.3885 + [FA k] ep 50: test=0.3950 + [FA k] ep 60: test=0.3971 + [FA k] ep 70: test=0.4049 + [FA k] ep 80: test=0.4047 + [FA k] ep 90: test=0.4075 + [FA k] ep 100: test=0.4074 + FINAL fa k=3 seed=123: 0.4074 + +=== FA k=3 (last 3 of 4 trainable) seed=456 === + trainable blocks: [1, 2, 3] trainable params: 1,186,058 + [FA k] ep 1: test=0.2708 + [FA k] ep 10: test=0.3511 + [FA k] ep 20: test=0.3662 + [FA k] ep 
30: test=0.3755 + [FA k] ep 40: test=0.3818 + [FA k] ep 50: test=0.3828 + [FA k] ep 60: test=0.3966 + [FA k] ep 70: test=0.3939 + [FA k] ep 80: test=0.3928 + [FA k] ep 90: test=0.3933 + [FA k] ep 100: test=0.3946 + FINAL fa k=3 seed=456: 0.3946 + +=== FA k=4 (last 4 of 4 trainable) seed=42 === + trainable blocks: [0, 1, 2, 3] trainable params: 1,318,154 + [FA k] ep 1: test=0.2789 + [FA k] ep 10: test=0.3498 + [FA k] ep 20: test=0.3601 + [FA k] ep 30: test=0.3710 + [FA k] ep 40: test=0.3834 + [FA k] ep 50: test=0.3923 + [FA k] ep 60: test=0.3912 + [FA k] ep 70: test=0.3945 + [FA k] ep 80: test=0.3957 + [FA k] ep 90: test=0.3944 + [FA k] ep 100: test=0.3959 + FINAL fa k=4 seed=42: 0.3959 + +=== FA k=4 (last 4 of 4 trainable) seed=123 === + trainable blocks: [0, 1, 2, 3] trainable params: 1,318,154 + [FA k] ep 1: test=0.2905 + [FA k] ep 10: test=0.3596 + [FA k] ep 20: test=0.3803 + [FA k] ep 30: test=0.3792 + [FA k] ep 40: test=0.3955 + [FA k] ep 50: test=0.3980 + [FA k] ep 60: test=0.4071 + [FA k] ep 70: test=0.4034 + [FA k] ep 80: test=0.4076 + [FA k] ep 90: test=0.4115 + [FA k] ep 100: test=0.4122 + FINAL fa k=4 seed=123: 0.4122 + +=== FA k=4 (last 4 of 4 trainable) seed=456 === + trainable blocks: [0, 1, 2, 3] trainable params: 1,318,154 + [FA k] ep 1: test=0.2713 + [FA k] ep 10: test=0.3544 + [FA k] ep 20: test=0.3702 + [FA k] ep 30: test=0.3799 + [FA k] ep 40: test=0.3845 + [FA k] ep 50: test=0.3923 + [FA k] ep 60: test=0.3992 + [FA k] ep 70: test=0.3974 + [FA k] ep 80: test=0.3990 + [FA k] ep 90: test=0.4000 + [FA k] ep 100: test=0.3987 + FINAL fa k=4 seed=456: 0.3987 + +=== DFA k=0 (last 0 of 4 trainable) seed=42 === + trainable blocks: [] trainable params: 789,770 + [DFA k] ep 1: test=0.3185 + [DFA k] ep 10: test=0.3370 + [DFA k] ep 20: test=0.3458 + [DFA k] ep 30: test=0.3425 + [DFA k] ep 40: test=0.3419 + [DFA k] ep 50: test=0.3425 + [DFA k] ep 60: test=0.3420 + [DFA k] ep 70: test=0.3466 + [DFA k] ep 80: test=0.3458 + [DFA k] ep 90: test=0.3470 + [DFA k] 
ep 100: test=0.3454 + FINAL dfa k=0 seed=42: 0.3454 + +=== DFA k=0 (last 0 of 4 trainable) seed=123 === + trainable blocks: [] trainable params: 789,770 + [DFA k] ep 1: test=0.3219 + [DFA k] ep 10: test=0.3339 + [DFA k] ep 20: test=0.3453 + [DFA k] ep 30: test=0.3352 + [DFA k] ep 40: test=0.3322 + [DFA k] ep 50: test=0.3291 + [DFA k] ep 60: test=0.3428 + [DFA k] ep 70: test=0.3447 + [DFA k] ep 80: test=0.3465 + [DFA k] ep 90: test=0.3464 + [DFA k] ep 100: test=0.3498 + FINAL dfa k=0 seed=123: 0.3498 + +=== DFA k=0 (last 0 of 4 trainable) seed=456 === + trainable blocks: [] trainable params: 789,770 + [DFA k] ep 1: test=0.3241 + [DFA k] ep 10: test=0.3486 + [DFA k] ep 20: test=0.3396 + [DFA k] ep 30: test=0.3396 + [DFA k] ep 40: test=0.3387 + [DFA k] ep 50: test=0.3456 + [DFA k] ep 60: test=0.3508 + [DFA k] ep 70: test=0.3527 + [DFA k] ep 80: test=0.3498 + [DFA k] ep 90: test=0.3508 + [DFA k] ep 100: test=0.3516 + FINAL dfa k=0 seed=456: 0.3516 + +=== DFA k=1 (last 1 of 4 trainable) seed=42 === + trainable blocks: [3] trainable params: 921,866 + [DFA k] ep 1: test=0.2563 + [DFA k] ep 10: test=0.2580 + [DFA k] ep 20: test=0.2445 + [DFA k] ep 30: test=0.2197 + [DFA k] ep 40: test=0.2229 + [DFA k] ep 50: test=0.1952 + [DFA k] ep 60: test=0.2306 + [DFA k] ep 70: test=0.2290 + [DFA k] ep 80: test=0.2211 + [DFA k] ep 90: test=0.2215 + [DFA k] ep 100: test=0.2267 + FINAL dfa k=1 seed=42: 0.2267 + +=== DFA k=1 (last 1 of 4 trainable) seed=123 === + trainable blocks: [3] trainable params: 921,866 + [DFA k] ep 1: test=0.2549 + [DFA k] ep 10: test=0.2505 + [DFA k] ep 20: test=0.2453 + [DFA k] ep 30: test=0.2358 + [DFA k] ep 40: test=0.2499 + [DFA k] ep 50: test=0.2506 + [DFA k] ep 60: test=0.2467 + [DFA k] ep 70: test=0.2513 + [DFA k] ep 80: test=0.2597 + [DFA k] ep 90: test=0.2586 + [DFA k] ep 100: test=0.2563 + FINAL dfa k=1 seed=123: 0.2563 + +=== DFA k=1 (last 1 of 4 trainable) seed=456 === + trainable blocks: [3] trainable params: 921,866 + [DFA k] ep 1: test=0.2112 + 
[DFA k] ep 10: test=0.2227 + [DFA k] ep 20: test=0.2397 + [DFA k] ep 30: test=0.2326 + [DFA k] ep 40: test=0.2285 + [DFA k] ep 50: test=0.2176 + [DFA k] ep 60: test=0.2431 + [DFA k] ep 70: test=0.2476 + [DFA k] ep 80: test=0.2493 + [DFA k] ep 90: test=0.2477 + [DFA k] ep 100: test=0.2476 + FINAL dfa k=1 seed=456: 0.2476 + +=== DFA k=2 (last 2 of 4 trainable) seed=42 === + trainable blocks: [2, 3] trainable params: 1,053,962 + [DFA k] ep 1: test=0.2792 + [DFA k] ep 10: test=0.2893 + [DFA k] ep 20: test=0.2978 + [DFA k] ep 30: test=0.2960 + [DFA k] ep 40: test=0.3010 + [DFA k] ep 50: test=0.3014 + [DFA k] ep 60: test=0.3005 + [DFA k] ep 70: test=0.3036 + [DFA k] ep 80: test=0.2997 + [DFA k] ep 90: test=0.3005 + [DFA k] ep 100: test=0.3005 + FINAL dfa k=2 seed=42: 0.3005 + +=== DFA k=2 (last 2 of 4 trainable) seed=123 === + trainable blocks: [2, 3] trainable params: 1,053,962 + [DFA k] ep 1: test=0.2671 + [DFA k] ep 10: test=0.2947 + [DFA k] ep 20: test=0.2841 + [DFA k] ep 30: test=0.2801 + [DFA k] ep 40: test=0.2819 + [DFA k] ep 50: test=0.2772 + [DFA k] ep 60: test=0.2834 + [DFA k] ep 70: test=0.2876 + [DFA k] ep 80: test=0.2757 + [DFA k] ep 90: test=0.2806 + [DFA k] ep 100: test=0.2819 + FINAL dfa k=2 seed=123: 0.2819 + +=== DFA k=2 (last 2 of 4 trainable) seed=456 === + trainable blocks: [2, 3] trainable params: 1,053,962 + [DFA k] ep 1: test=0.2604 + [DFA k] ep 10: test=0.2821 + [DFA k] ep 20: test=0.2784 + [DFA k] ep 30: test=0.2826 + [DFA k] ep 40: test=0.2805 + [DFA k] ep 50: test=0.2675 + [DFA k] ep 60: test=0.2735 + [DFA k] ep 70: test=0.2765 + [DFA k] ep 80: test=0.2735 + [DFA k] ep 90: test=0.2759 + [DFA k] ep 100: test=0.2751 + FINAL dfa k=2 seed=456: 0.2751 + +=== DFA k=3 (last 3 of 4 trainable) seed=42 === + trainable blocks: [1, 2, 3] trainable params: 1,186,058 + [DFA k] ep 1: test=0.2821 + [DFA k] ep 10: test=0.2882 + [DFA k] ep 20: test=0.2921 + [DFA k] ep 30: test=0.3064 + [DFA k] ep 40: test=0.3009 + [DFA k] ep 50: test=0.3044 + [DFA k] ep 60: 
test=0.3041 + [DFA k] ep 70: test=0.3075 + [DFA k] ep 80: test=0.3064 + [DFA k] ep 90: test=0.3021 + [DFA k] ep 100: test=0.3047 + FINAL dfa k=3 seed=42: 0.3047 + +=== DFA k=3 (last 3 of 4 trainable) seed=123 === + trainable blocks: [1, 2, 3] trainable params: 1,186,058 + [DFA k] ep 1: test=0.2630 + [DFA k] ep 10: test=0.2910 + [DFA k] ep 20: test=0.2845 + [DFA k] ep 30: test=0.2821 + [DFA k] ep 40: test=0.2900 + [DFA k] ep 50: test=0.2811 + [DFA k] ep 60: test=0.2860 + [DFA k] ep 70: test=0.2910 + [DFA k] ep 80: test=0.2879 + [DFA k] ep 90: test=0.2910 + [DFA k] ep 100: test=0.2906 + FINAL dfa k=3 seed=123: 0.2906 + +=== DFA k=3 (last 3 of 4 trainable) seed=456 === + trainable blocks: [1, 2, 3] trainable params: 1,186,058 + [DFA k] ep 1: test=0.2544 + [DFA k] ep 10: test=0.2841 + [DFA k] ep 20: test=0.2892 + [DFA k] ep 30: test=0.2998 + [DFA k] ep 40: test=0.2891 + [DFA k] ep 50: test=0.2844 + [DFA k] ep 60: test=0.2938 + [DFA k] ep 70: test=0.2928 + [DFA k] ep 80: test=0.2901 + [DFA k] ep 90: test=0.2932 + [DFA k] ep 100: test=0.2919 + FINAL dfa k=3 seed=456: 0.2919 + +=== DFA k=4 (last 4 of 4 trainable) seed=42 === + trainable blocks: [0, 1, 2, 3] trainable params: 1,318,154 + [DFA k] ep 1: test=0.2899 + [DFA k] ep 10: test=0.2873 + [DFA k] ep 20: test=0.3016 + [DFA k] ep 30: test=0.3053 + [DFA k] ep 40: test=0.3120 + [DFA k] ep 50: test=0.3045 + [DFA k] ep 60: test=0.3071 + [DFA k] ep 70: test=0.3102 + [DFA k] ep 80: test=0.3080 + [DFA k] ep 90: test=0.3066 + [DFA k] ep 100: test=0.3068 + FINAL dfa k=4 seed=42: 0.3068 + +=== DFA k=4 (last 4 of 4 trainable) seed=123 === + trainable blocks: [0, 1, 2, 3] trainable params: 1,318,154 + [DFA k] ep 1: test=0.2683 + [DFA k] ep 10: test=0.2926 + [DFA k] ep 20: test=0.2861 + [DFA k] ep 30: test=0.2875 + [DFA k] ep 40: test=0.2978 + [DFA k] ep 50: test=0.2910 + [DFA k] ep 60: test=0.2972 + [DFA k] ep 70: test=0.3011 + [DFA k] ep 80: test=0.2974 + [DFA k] ep 90: test=0.3015 + [DFA k] ep 100: test=0.3023 + FINAL dfa k=4 
seed=123: 0.3023 + +=== DFA k=4 (last 4 of 4 trainable) seed=456 === + trainable blocks: [0, 1, 2, 3] trainable params: 1,318,154 + [DFA k] ep 1: test=0.2591 + [DFA k] ep 10: test=0.2883 + [DFA k] ep 20: test=0.2948 + [DFA k] ep 30: test=0.2995 + [DFA k] ep 40: test=0.2921 + [DFA k] ep 50: test=0.2956 + [DFA k] ep 60: test=0.2960 + [DFA k] ep 70: test=0.2943 + [DFA k] ep 80: test=0.2910 + [DFA k] ep 90: test=0.2955 + [DFA k] ep 100: test=0.2949 + FINAL dfa k=4 seed=456: 0.2949 + +============================================================ +SUMMARY ladder_d256_L4_cifar10 (mean ± ddof-1 std over seeds) +============================================================ + BP k=0: 0.3886±0.0011 k=1: 0.5650±0.0031 k=2: 0.5978±0.0034 k=3: 0.6076±0.0006 k=4: 0.6167±0.0019 + FA k=0: 0.3551±0.0029 k=1: 0.3820±0.0078 k=2: 0.3487±0.0160 k=3: 0.3983±0.0079 k=4: 0.4023±0.0087 + DFA k=0: 0.3489±0.0032 k=1: 0.2435±0.0152 k=2: 0.2858±0.0131 k=3: 0.2957±0.0078 k=4: 0.3013±0.0060 + +Saved -> results/depth_ladder/ladder_d256_L4_cifar10.json +[Sun Jun 14 03:26:20 PM CDT 2026] START secondary d=512 L=2 FA-failure ladder +Device=cuda:0 ladder_d512_L2_cifar10 methods=['bp', 'fa', 'dfa'] k=[0, 1, 2] seeds=[42, 123, 456] epochs=100 + +=== BP k=0 (last 0 of 2 trainable) seed=42 === + trainable blocks: [] trainable params: 1,579,530 + [BP k] ep 1: test=0.3462 + [BP k] ep 10: test=0.3633 + [BP k] ep 20: test=0.3635 + [BP k] ep 30: test=0.3543 + [BP k] ep 40: test=0.3673 + [BP k] ep 50: test=0.3633 + [BP k] ep 60: test=0.3695 + [BP k] ep 70: test=0.3753 + [BP k] ep 80: test=0.3858 + [BP k] ep 90: test=0.3887 + [BP k] ep 100: test=0.3891 + FINAL bp k=0 seed=42: 0.3891 + +=== BP k=0 (last 0 of 2 trainable) seed=123 === + trainable blocks: [] trainable params: 1,579,530 + [BP k] ep 1: test=0.3497 + [BP k] ep 10: test=0.3704 + [BP k] ep 20: test=0.3698 + [BP k] ep 30: test=0.3540 + [BP k] ep 40: test=0.3505 + [BP k] ep 50: test=0.3634 + [BP k] ep 60: test=0.3675 + [BP k] ep 70: test=0.3739 + [BP k] ep 
80: test=0.3823 + [BP k] ep 90: test=0.3845 + [BP k] ep 100: test=0.3846 + FINAL bp k=0 seed=123: 0.3846 + +=== BP k=0 (last 0 of 2 trainable) seed=456 === + trainable blocks: [] trainable params: 1,579,530 + [BP k] ep 1: test=0.3409 + [BP k] ep 10: test=0.3578 + [BP k] ep 20: test=0.3767 + [BP k] ep 30: test=0.3607 + [BP k] ep 40: test=0.3551 + [BP k] ep 50: test=0.3632 + [BP k] ep 60: test=0.3722 + [BP k] ep 70: test=0.3704 + [BP k] ep 80: test=0.3784 + [BP k] ep 90: test=0.3834 + [BP k] ep 100: test=0.3838 + FINAL bp k=0 seed=456: 0.3838 + +=== BP k=1 (last 1 of 2 trainable) seed=42 === + trainable blocks: [1] trainable params: 2,105,866 + [BP k] ep 1: test=0.3667 + [BP k] ep 10: test=0.4836 + [BP k] ep 20: test=0.5197 + [BP k] ep 30: test=0.5367 + [BP k] ep 40: test=0.5444 + [BP k] ep 50: test=0.5629 + [BP k] ep 60: test=0.5691 + [BP k] ep 70: test=0.5779 + [BP k] ep 80: test=0.5808 + [BP k] ep 90: test=0.5849 + [BP k] ep 100: test=0.5856 + FINAL bp k=1 seed=42: 0.5856 + +=== BP k=1 (last 1 of 2 trainable) seed=123 === + trainable blocks: [1] trainable params: 2,105,866 + [BP k] ep 1: test=0.3632 + [BP k] ep 10: test=0.4865 + [BP k] ep 20: test=0.5175 + [BP k] ep 30: test=0.5360 + [BP k] ep 40: test=0.5466 + [BP k] ep 50: test=0.5606 + [BP k] ep 60: test=0.5716 + [BP k] ep 70: test=0.5749 + [BP k] ep 80: test=0.5806 + [BP k] ep 90: test=0.5817 + [BP k] ep 100: test=0.5819 + FINAL bp k=1 seed=123: 0.5819 + +=== BP k=1 (last 1 of 2 trainable) seed=456 === + trainable blocks: [1] trainable params: 2,105,866 + [BP k] ep 1: test=0.3696 + [BP k] ep 10: test=0.4737 + [BP k] ep 20: test=0.5199 + [BP k] ep 30: test=0.5317 + [BP k] ep 40: test=0.5498 + [BP k] ep 50: test=0.5610 + [BP k] ep 60: test=0.5675 + [BP k] ep 70: test=0.5767 + [BP k] ep 80: test=0.5785 + [BP k] ep 90: test=0.5802 + [BP k] ep 100: test=0.5809 + FINAL bp k=1 seed=456: 0.5809 + +=== BP k=2 (last 2 of 2 trainable) seed=42 === + trainable blocks: [0, 1] trainable params: 2,632,202 + [BP k] ep 1: 
test=0.3790 + [BP k] ep 10: test=0.5174 + [BP k] ep 20: test=0.5471 + [BP k] ep 30: test=0.5712 + [BP k] ep 40: test=0.5906 + [BP k] ep 50: test=0.5969 + [BP k] ep 60: test=0.5977 + [BP k] ep 70: test=0.5992 + [BP k] ep 80: test=0.6072 + [BP k] ep 90: test=0.6037 + [BP k] ep 100: test=0.6039 + FINAL bp k=2 seed=42: 0.6039 + +=== BP k=2 (last 2 of 2 trainable) seed=123 === + trainable blocks: [0, 1] trainable params: 2,632,202 + [BP k] ep 1: test=0.3732 + [BP k] ep 10: test=0.5161 + [BP k] ep 20: test=0.5554 + [BP k] ep 30: test=0.5756 + [BP k] ep 40: test=0.5811 + [BP k] ep 50: test=0.5928 + [BP k] ep 60: test=0.5965 + [BP k] ep 70: test=0.6016 + [BP k] ep 80: test=0.6027 + [BP k] ep 90: test=0.6007 + [BP k] ep 100: test=0.6020 + FINAL bp k=2 seed=123: 0.6020 + +=== BP k=2 (last 2 of 2 trainable) seed=456 === + trainable blocks: [0, 1] trainable params: 2,632,202 + [BP k] ep 1: test=0.3768 + [BP k] ep 10: test=0.5097 + [BP k] ep 20: test=0.5499 + [BP k] ep 30: test=0.5773 + [BP k] ep 40: test=0.5858 + [BP k] ep 50: test=0.5845 + [BP k] ep 60: test=0.5934 + [BP k] ep 70: test=0.5985 + [BP k] ep 80: test=0.6011 + [BP k] ep 90: test=0.6020 + [BP k] ep 100: test=0.6045 + FINAL bp k=2 seed=456: 0.6045 + +=== FA k=0 (last 0 of 2 trainable) seed=42 === + trainable blocks: [] trainable params: 1,579,530 + [FA k] ep 1: test=0.3288 + [FA k] ep 10: test=0.3359 + [FA k] ep 20: test=0.3336 + [FA k] ep 30: test=0.3328 + [FA k] ep 40: test=0.3418 + [FA k] ep 50: test=0.3504 + [FA k] ep 60: test=0.3564 + [FA k] ep 70: test=0.3567 + [FA k] ep 80: test=0.3543 + [FA k] ep 90: test=0.3574 + [FA k] ep 100: test=0.3585 + FINAL fa k=0 seed=42: 0.3585 + +=== FA k=0 (last 0 of 2 trainable) seed=123 === + trainable blocks: [] trainable params: 1,579,530 + [FA k] ep 1: test=0.3125 + [FA k] ep 10: test=0.3374 + [FA k] ep 20: test=0.3364 + [FA k] ep 30: test=0.3453 + [FA k] ep 40: test=0.3437 + [FA k] ep 50: test=0.3522 + [FA k] ep 60: test=0.3587 + [FA k] ep 70: test=0.3550 + [FA k] ep 80: 
test=0.3551 + [FA k] ep 90: test=0.3558 + [FA k] ep 100: test=0.3584 + FINAL fa k=0 seed=123: 0.3584 + +=== FA k=0 (last 0 of 2 trainable) seed=456 === + trainable blocks: [] trainable params: 1,579,530 + [FA k] ep 1: test=0.3180 + [FA k] ep 10: test=0.3311 + [FA k] ep 20: test=0.3344 + [FA k] ep 30: test=0.3533 + [FA k] ep 40: test=0.3476 + [FA k] ep 50: test=0.3523 + [FA k] ep 60: test=0.3455 + [FA k] ep 70: test=0.3569 + [FA k] ep 80: test=0.3562 + [FA k] ep 90: test=0.3583 + [FA k] ep 100: test=0.3590 + FINAL fa k=0 seed=456: 0.3590 + +=== FA k=1 (last 1 of 2 trainable) seed=42 === + trainable blocks: [1] trainable params: 2,105,866 + [FA k] ep 1: test=0.3235 + [FA k] ep 10: test=0.3730 + [FA k] ep 20: test=0.3734 + [FA k] ep 30: test=0.3829 + [FA k] ep 40: test=0.3916 + [FA k] ep 50: test=0.4008 + [FA k] ep 60: test=0.4012 + [FA k] ep 70: test=0.4015 + [FA k] ep 80: test=0.4042 + [FA k] ep 90: test=0.4082 + [FA k] ep 100: test=0.4083 + FINAL fa k=1 seed=42: 0.4083 + +=== FA k=1 (last 1 of 2 trainable) seed=123 === + trainable blocks: [1] trainable params: 2,105,866 + [FA k] ep 1: test=0.2930 + [FA k] ep 10: test=0.3662 + [FA k] ep 20: test=0.3905 + [FA k] ep 30: test=0.4027 + [FA k] ep 40: test=0.3948 + [FA k] ep 50: test=0.4048 + [FA k] ep 60: test=0.4067 + [FA k] ep 70: test=0.4094 + [FA k] ep 80: test=0.4115 + [FA k] ep 90: test=0.4103 + [FA k] ep 100: test=0.4134 + FINAL fa k=1 seed=123: 0.4134 + +=== FA k=1 (last 1 of 2 trainable) seed=456 === + trainable blocks: [1] trainable params: 2,105,866 + [FA k] ep 1: test=0.3098 + [FA k] ep 10: test=0.3561 + [FA k] ep 20: test=0.3860 + [FA k] ep 30: test=0.3957 + [FA k] ep 40: test=0.3907 + [FA k] ep 50: test=0.4032 + [FA k] ep 60: test=0.4017 + [FA k] ep 70: test=0.4125 + [FA k] ep 80: test=0.4123 + [FA k] ep 90: test=0.4164 + [FA k] ep 100: test=0.4155 + FINAL fa k=1 seed=456: 0.4155 + +=== FA k=2 (last 2 of 2 trainable) seed=42 === + trainable blocks: [0, 1] trainable params: 2,632,202 + [FA k] ep 1: 
test=0.3028 + [FA k] ep 10: test=0.3585 + [FA k] ep 20: test=0.3523 + [FA k] ep 30: test=0.3315 + [FA k] ep 40: test=0.3191 + [FA k] ep 50: test=0.3397 + [FA k] ep 60: test=0.3566 + [FA k] ep 70: test=0.3527 + [FA k] ep 80: test=0.3554 + [FA k] ep 90: test=0.3593 + [FA k] ep 100: test=0.3582 + FINAL fa k=2 seed=42: 0.3582 + +=== FA k=2 (last 2 of 2 trainable) seed=123 === + trainable blocks: [0, 1] trainable params: 2,632,202 + [FA k] ep 1: test=0.2794 + [FA k] ep 10: test=0.3627 + [FA k] ep 20: test=0.3600 + [FA k] ep 30: test=0.3750 + [FA k] ep 40: test=0.3482 + [FA k] ep 50: test=0.3679 + [FA k] ep 60: test=0.3630 + [FA k] ep 70: test=0.3643 + [FA k] ep 80: test=0.3636 + [FA k] ep 90: test=0.3618 + [FA k] ep 100: test=0.3621 + FINAL fa k=2 seed=123: 0.3621 + +=== FA k=2 (last 2 of 2 trainable) seed=456 === + trainable blocks: [0, 1] trainable params: 2,632,202 + [FA k] ep 1: test=0.3005 + [FA k] ep 10: test=0.3573 + [FA k] ep 20: test=0.3624 + [FA k] ep 30: test=0.3706 + [FA k] ep 40: test=0.3529 + [FA k] ep 50: test=0.3648 + [FA k] ep 60: test=0.3581 + [FA k] ep 70: test=0.3645 + [FA k] ep 80: test=0.3652 + [FA k] ep 90: test=0.3632 + [FA k] ep 100: test=0.3642 + FINAL fa k=2 seed=456: 0.3642 + +=== DFA k=0 (last 0 of 2 trainable) seed=42 === + trainable blocks: [] trainable params: 1,579,530 + [DFA k] ep 1: test=0.3196 + [DFA k] ep 10: test=0.3187 + [DFA k] ep 20: test=0.3369 + [DFA k] ep 30: test=0.3221 + [DFA k] ep 40: test=0.3386 + [DFA k] ep 50: test=0.3401 + [DFA k] ep 60: test=0.3473 + [DFA k] ep 70: test=0.3472 + [DFA k] ep 80: test=0.3426 + [DFA k] ep 90: test=0.3445 + [DFA k] ep 100: test=0.3432 + FINAL dfa k=0 seed=42: 0.3432 + +=== DFA k=0 (last 0 of 2 trainable) seed=123 === + trainable blocks: [] trainable params: 1,579,530 + [DFA k] ep 1: test=0.3089 + [DFA k] ep 10: test=0.3180 + [DFA k] ep 20: test=0.3301 + [DFA k] ep 30: test=0.3434 + [DFA k] ep 40: test=0.3386 + [DFA k] ep 50: test=0.3343 + [DFA k] ep 60: test=0.3489 + [DFA k] ep 70: 
test=0.3458 + [DFA k] ep 80: test=0.3499 + [DFA k] ep 90: test=0.3508 + [DFA k] ep 100: test=0.3508 + FINAL dfa k=0 seed=123: 0.3508 + +=== DFA k=0 (last 0 of 2 trainable) seed=456 === + trainable blocks: [] trainable params: 1,579,530 + [DFA k] ep 1: test=0.3238 + [DFA k] ep 10: test=0.3327 + [DFA k] ep 20: test=0.3395 + [DFA k] ep 30: test=0.3457 + [DFA k] ep 40: test=0.3367 + [DFA k] ep 50: test=0.3496 + [DFA k] ep 60: test=0.3453 + [DFA k] ep 70: test=0.3487 + [DFA k] ep 80: test=0.3491 + [DFA k] ep 90: test=0.3498 + [DFA k] ep 100: test=0.3521 + FINAL dfa k=0 seed=456: 0.3521 + +=== DFA k=1 (last 1 of 2 trainable) seed=42 === + trainable blocks: [1] trainable params: 2,105,866 + [DFA k] ep 1: test=0.2687 + [DFA k] ep 10: test=0.2106 + [DFA k] ep 20: test=0.2293 + [DFA k] ep 30: test=0.2297 + [DFA k] ep 40: test=0.2241 + [DFA k] ep 50: test=0.2318 + [DFA k] ep 60: test=0.2417 + [DFA k] ep 70: test=0.2458 + [DFA k] ep 80: test=0.2463 + [DFA k] ep 90: test=0.2438 + [DFA k] ep 100: test=0.2384 + FINAL dfa k=1 seed=42: 0.2384 + +=== DFA k=1 (last 1 of 2 trainable) seed=123 === + trainable blocks: [1] trainable params: 2,105,866 + [DFA k] ep 1: test=0.1958 + [DFA k] ep 10: test=0.1777 + [DFA k] ep 20: test=0.2220 + [DFA k] ep 30: test=0.1852 + [DFA k] ep 40: test=0.2165 + [DFA k] ep 50: test=0.2095 + [DFA k] ep 60: test=0.1995 + [DFA k] ep 70: test=0.2038 + [DFA k] ep 80: test=0.2068 + [DFA k] ep 90: test=0.2173 + [DFA k] ep 100: test=0.2097 + FINAL dfa k=1 seed=123: 0.2097 + +=== DFA k=1 (last 1 of 2 trainable) seed=456 === + trainable blocks: [1] trainable params: 2,105,866 + [DFA k] ep 1: test=0.2118 + [DFA k] ep 10: test=0.2074 + [DFA k] ep 20: test=0.1777 + [DFA k] ep 30: test=0.2043 + [DFA k] ep 40: test=0.2010 + [DFA k] ep 50: test=0.2087 + [DFA k] ep 60: test=0.2073 + [DFA k] ep 70: test=0.2126 + [DFA k] ep 80: test=0.2202 + [DFA k] ep 90: test=0.2355 + [DFA k] ep 100: test=0.2295 + FINAL dfa k=1 seed=456: 0.2295 + +=== DFA k=2 (last 2 of 2 trainable) 
seed=42 === + trainable blocks: [0, 1] trainable params: 2,632,202 + [DFA k] ep 1: test=0.2769 + [DFA k] ep 10: test=0.2705 + [DFA k] ep 20: test=0.3000 + [DFA k] ep 30: test=0.2988 + [DFA k] ep 40: test=0.3080 + [DFA k] ep 50: test=0.2941 + [DFA k] ep 60: test=0.3025 + [DFA k] ep 70: test=0.3075 + [DFA k] ep 80: test=0.3070 + [DFA k] ep 90: test=0.3063 + [DFA k] ep 100: test=0.3069 + FINAL dfa k=2 seed=42: 0.3069 + +=== DFA k=2 (last 2 of 2 trainable) seed=123 === + trainable blocks: [0, 1] trainable params: 2,632,202 + [DFA k] ep 1: test=0.2582 + [DFA k] ep 10: test=0.2772 + [DFA k] ep 20: test=0.2904 + [DFA k] ep 30: test=0.3072 + [DFA k] ep 40: test=0.2898 + [DFA k] ep 50: test=0.2938 + [DFA k] ep 60: test=0.2892 + [DFA k] ep 70: test=0.2974 + [DFA k] ep 80: test=0.2970 + [DFA k] ep 90: test=0.3035 + [DFA k] ep 100: test=0.3025 + FINAL dfa k=2 seed=123: 0.3025 + +=== DFA k=2 (last 2 of 2 trainable) seed=456 === + trainable blocks: [0, 1] trainable params: 2,632,202 + [DFA k] ep 1: test=0.2794 + [DFA k] ep 10: test=0.2888 + [DFA k] ep 20: test=0.2884 + [DFA k] ep 30: test=0.2901 + [DFA k] ep 40: test=0.2784 + [DFA k] ep 50: test=0.2817 + [DFA k] ep 60: test=0.2983 + [DFA k] ep 70: test=0.2920 + [DFA k] ep 80: test=0.2904 + [DFA k] ep 90: test=0.2999 + [DFA k] ep 100: test=0.2963 + FINAL dfa k=2 seed=456: 0.2963 + +============================================================ +SUMMARY ladder_d512_L2_cifar10 (mean ± ddof-1 std over seeds) +============================================================ + BP k=0: 0.3858±0.0029 k=1: 0.5828±0.0025 k=2: 0.6035±0.0013 + FA k=0: 0.3586±0.0003 k=1: 0.4124±0.0037 k=2: 0.3615±0.0030 + DFA k=0: 0.3487±0.0048 k=1: 0.2259±0.0147 k=2: 0.3019±0.0053 + +Saved -> results/depth_ladder/ladder_d512_L2_cifar10.json +[Sun Jun 14 05:49:05 PM CDT 2026] ALL DONE diff --git a/report_explore/MEMO_depth_utility_ladder.md b/report_explore/MEMO_depth_utility_ladder.md new file mode 100644 index 0000000..d43a983 --- /dev/null +++ 
b/report_explore/MEMO_depth_utility_ladder.md @@ -0,0 +1,119 @@ +# MEMO — Depth-utility ladder (appendix experiment) + +**Date:** 2026-06-14 +**Purpose:** Reviewer asked to triangulate the depth-utility diagnostic (D3) more finely +— turn the binary *frozen-vs-fully-trained* block comparison into a **curve**. We vary the +number of trainable residual blocks `k`, training the **last `k`** blocks (output side) and +freezing the first `L−k` at random init; embedding / out_ln / out_head are **always** trained. + +**Question.** As more blocks are made trainable, does test accuracy rise? Under a method that +genuinely trains depth (BP) it should climb; under a method whose deep credit is non-functional +(DFA) it should stay flat at — or below — the frozen baseline. + +**Why output-side-first.** The deepest block receives the most direct credit (FA's last block +sees the exact output gradient), so the last `k` blocks are the **best case** for the method. +If even these don't help, depth is unused. + +--- + +## Setup + +- Arch / task: ResMLP (CIFAR-10). Two configs: **d=256 L=4** (primary audit) and **d=512 L=2** + (FA-failure case — vanilla FA is known to be ≈ frozen here). +- Methods: **BP** (positive control), **FA** (Lillicrap vanilla feedback alignment), **DFA**. +- `k ∈ {0,…,L}`; `k=0` = frozen-blocks baseline, `k=L` = full audit. +- Seeds {42,123,456}; mean ± ddof-1 std. +- Recipe identical to the main audit: AdamW, lr 1e-3, wd 0.01, cosine, batch 128, 100 epochs, + per-block independent optimizers, rms-normalized local surrogate losses. +- The **full** ladder (all `k`, incl. 0 and L) was run in **one** script for internal + consistency — `k=0` / `k=L` reproduce the external anchors (see cross-checks). + +Harness: `experiments/depth_utility_ladder.py`. +Raw results: `results/depth_ladder/ladder_d256_L4_cifar10.json`, `ladder_d512_L2_cifar10.json`. +Figure: `results/depth_ladder/depth_ladder.png` (`experiments/plot_depth_ladder.py`). 
+ +--- + +## Results (CIFAR-10 test acc, mean ± ddof-1 std, n=3) + +**Primary — ResMLP d=256, L=4** + +| k (last-k trainable) | BP | FA | DFA | +|---|---|---|---| +| 0 (frozen) | 0.389 ± 0.001 | 0.355 ± 0.003 | 0.349 ± 0.003 | +| 1 | 0.565 ± 0.003 | 0.382 ± 0.008 | 0.244 ± 0.015 | +| 2 | 0.598 ± 0.003 | 0.349 ± 0.016 | 0.286 ± 0.013 | +| 3 | 0.608 ± 0.001 | 0.398 ± 0.008 | 0.296 ± 0.008 | +| 4 (full) | 0.617 ± 0.002 | 0.402 ± 0.009 | 0.301 ± 0.006 | + +**Secondary — ResMLP d=512, L=2 (FA-failure)** + +| k | BP | FA | DFA | +|---|---|---|---| +| 0 (frozen) | 0.386 ± 0.003 | 0.359 ± 0.000 | 0.349 ± 0.005 | +| 1 | 0.583 ± 0.002 | 0.412 ± 0.004 | 0.226 ± 0.015 | +| 2 (full) | 0.603 ± 0.001 | 0.361 ± 0.003 | 0.302 ± 0.005 | + +--- + +## Interpretation + +- **BP — monotone climb.** d=256: 0.389 → 0.617 (**+23 pp**); d=512: 0.386 → 0.603 (**+22 pp**). + Each block made trainable adds accuracy → depth is genuinely usable, so the D3 precondition + (BP benefits from depth) holds. +- **DFA — flat-to-negative.** The frozen rung `k=0` (≈0.349) is DFA's **maximum** in both configs. + Every trained-block configuration lands **below** it, including the full audit (`k=L`): d=256 + full = 0.301 (−4.8 pp vs frozen), d=512 full = 0.302 (−4.7 pp). Training deep DFA blocks does + not just fail to help — it actively costs ~5 pp at full depth (and 10–12 pp at `k=1`). **The D3 failure now holds at every + granularity**, not just the two extremes. +- **FA — partial / no net depth utility.** d=256 ends at 0.402 (+4.7 pp over frozen) but + non-monotonically; d=512 ends at 0.361 ≈ frozen 0.359 (**no net gain** — the FA-failure case + reproduces). FA is the intermediate case: it retains some depth utility in the easier config, while in the + harder one the `k=1` gain (0.412) is lost again at full depth. The non-monotonic dips (d=256 k=2; d=512 k=2) are consistent with FA's mis-scaled + sequential credit occasionally hurting. 
+ +**One-line takeaway for §6.2:** *A trainable-depth ladder shows BP's accuracy climbs monotonically +with the number of trainable blocks (+22–23 pp) while DFA peaks at the frozen baseline and +stays below it once any block is trained; FA shows partial-to-no depth utility. Depth is usable +(BP), but DFA's deep credit is not.* + +## Cross-checks (internal rerun reproduces external anchors) + +- BP `k=4` = 0.617 ≈ existing full-audit BP 0.615. +- DFA `k=4` = 0.301 ≈ existing full-audit DFA 0.301 / 0.306. +- FA `k=4` = 0.402 ≈ existing FA 0.401. +- Frozen `k=0`: DFA 0.349 and FA 0.355 ≈ existing frozen-blocks baseline 0.349; BP `k=0` = 0.389 is ~4 pp higher. + +## Footnote — why `k=0` is already well above chance + +`k=0` is **not** an untrained network: embed / out_ln / out_head are trained; only the blocks are +frozen at random init. At init the residual branches are **small but non-negligible**: +per block `‖f_l(h_l)‖/‖h_l‖ ≈ 0.10`, and the full frozen 4-block stack deviates from the identity +by `‖h_L−h_0‖/‖h_0‖ = 0.196 ± 0.003` with `cos(h_L,h_0) = 0.981 ± 0.001` (3 seeds, CIFAR-10 +batch). The frozen stack is therefore a fixed, **near-norm-preserving random feature map**, not a +strict identity. So `k=0` (≈0.35 under FA/DFA, ≈0.39 under BP) is the accuracy of a trained embedding+readout composed with +this fixed map — effectively a trained (near-)linear classifier on pixels, well above the 10% +chance level. Measurement: `experiments/frozen_init_identity_check.py` → +`results/depth_ladder/frozen_init_identity.json`. 
+ +## Reproduce + +```bash +# ladders (GPU2, ~7 h for both, 72 runs, incremental/resumable JSON) +CUDA_VISIBLE_DEVICES=2 python experiments/depth_utility_ladder.py \ + --d_hidden 256 --num_blocks 4 --methods bp fa dfa --k_values 0 1 2 3 4 \ + --seeds 42 123 456 --epochs 100 --gpu 0 --output_dir results/depth_ladder +CUDA_VISIBLE_DEVICES=2 python experiments/depth_utility_ladder.py \ + --d_hidden 512 --num_blocks 2 --methods bp fa dfa --k_values 0 1 2 \ + --seeds 42 123 456 --epochs 100 --gpu 0 --output_dir results/depth_ladder +# figure + identity check +python experiments/plot_depth_ladder.py +CUDA_VISIBLE_DEVICES=2 python experiments/frozen_init_identity_check.py +``` + +## Caveats / open items + +- Parameter-matched shallow baseline (rule out "it's capacity not depth") not yet run — lower + priority; given deep-BP beats frozen by +22–23 pp, the D3 precondition is already safe. +- FA non-monotonicity (k=1 > k=2 in both configs) is noted but not separately investigated; it + does not affect the headline (FA full ≈ or slightly above frozen, ≪ BP). 
diff --git a/results/depth_ladder/depth_ladder.png b/results/depth_ladder/depth_ladder.png new file mode 100644 index 0000000..5fd1f81 Binary files /dev/null and b/results/depth_ladder/depth_ladder.png differ diff --git a/results/depth_ladder/frozen_init_identity.json b/results/depth_ladder/frozen_init_identity.json new file mode 100644 index 0000000..1c7048b --- /dev/null +++ b/results/depth_ladder/frozen_init_identity.json @@ -0,0 +1,57 @@ +{ + "config": { + "d_hidden": 256, + "L": 4, + "num_classes": 10, + "batch": 256, + "dataset": "cifar10-test", + "seeds": [ + 42, + 123, + 456 + ] + }, + "per_seed": { + "42": { + "per_block_ratio": [ + 0.09595257043838501, + 0.0955488458275795, + 0.09637212753295898, + 0.09818045794963837 + ], + "rel_dev": 0.1959637552499771, + "cos": 0.9811521172523499 + }, + "123": { + "per_block_ratio": [ + 0.09584859013557434, + 0.09690074622631073, + 0.10017187148332596, + 0.09818752110004425 + ], + "rel_dev": 0.19837374985218048, + "cos": 0.9805399179458618 + }, + "456": { + "per_block_ratio": [ + 0.09482444077730179, + 0.09799206256866455, + 0.09791108965873718, + 0.09693857282400131 + ], + "rel_dev": 0.19332122802734375, + "cos": 0.9819751381874084 + } + }, + "per_block_ratio_mean": [ + 0.09554186711708705, + 0.09681388487418492, + 0.09815169622500737, + 0.09776885062456131 + ], + "per_block_ratio_grand_mean": 0.09706907471021016, + "rel_dev_mean": 0.19588624437650046, + "rel_dev_std": 0.0025271525773572136, + "cos_mean": 0.98122239112854, + "cos_std": 0.0007201861555822825 +} \ No newline at end of file diff --git a/results/depth_ladder/ladder_d256_L4_cifar10.json b/results/depth_ladder/ladder_d256_L4_cifar10.json new file mode 100644 index 0000000..cfbd363 --- /dev/null +++ b/results/depth_ladder/ladder_d256_L4_cifar10.json @@ -0,0 +1,2274 @@ +{ + "config": { + "d_hidden": 256, + "num_blocks": 4, + "dataset": "cifar10", + "methods": [ + "bp", + "fa", + "dfa" + ], + "k_values": [ + 0, + 1, + 2, + 3, + 4 + ], + "seeds": [ + 42, + 123, 
+ 456 + ], + "epochs": 100, + "lr": 0.001, + "wd": 0.01, + "batch_size": 128, + "gpu": 0, + "output_dir": "results/depth_ladder", + "num_classes": 10 + }, + "results": { + "bp": { + "0": { + "42": { + "final_acc": 0.3882, + "curve": [ + [ + 1, + 0.3543 + ], + [ + 10, + 0.3673 + ], + [ + 20, + 0.3483 + ], + [ + 30, + 0.3498 + ], + [ + 40, + 0.3608 + ], + [ + 50, + 0.3627 + ], + [ + 60, + 0.3697 + ], + [ + 70, + 0.3803 + ], + [ + 80, + 0.3821 + ], + [ + 90, + 0.387 + ], + [ + 100, + 0.3882 + ] + ] + }, + "123": { + "final_acc": 0.3899, + "curve": [ + [ + 1, + 0.3535 + ], + [ + 10, + 0.3654 + ], + [ + 20, + 0.3612 + ], + [ + 30, + 0.3586 + ], + [ + 40, + 0.3633 + ], + [ + 50, + 0.3608 + ], + [ + 60, + 0.3772 + ], + [ + 70, + 0.3791 + ], + [ + 80, + 0.3897 + ], + [ + 90, + 0.3884 + ], + [ + 100, + 0.3899 + ] + ] + }, + "456": { + "final_acc": 0.3878, + "curve": [ + [ + 1, + 0.3551 + ], + [ + 10, + 0.368 + ], + [ + 20, + 0.3509 + ], + [ + 30, + 0.3655 + ], + [ + 40, + 0.3573 + ], + [ + 50, + 0.3543 + ], + [ + 60, + 0.3716 + ], + [ + 70, + 0.3824 + ], + [ + 80, + 0.3852 + ], + [ + 90, + 0.3891 + ], + [ + 100, + 0.3878 + ] + ] + } + }, + "1": { + "42": { + "final_acc": 0.5683, + "curve": [ + [ + 1, + 0.3736 + ], + [ + 10, + 0.489 + ], + [ + 20, + 0.5089 + ], + [ + 30, + 0.526 + ], + [ + 40, + 0.5365 + ], + [ + 50, + 0.5486 + ], + [ + 60, + 0.5524 + ], + [ + 70, + 0.5638 + ], + [ + 80, + 0.5666 + ], + [ + 90, + 0.5678 + ], + [ + 100, + 0.5683 + ] + ] + }, + "123": { + "final_acc": 0.5623, + "curve": [ + [ + 1, + 0.3878 + ], + [ + 10, + 0.4797 + ], + [ + 20, + 0.5096 + ], + [ + 30, + 0.5209 + ], + [ + 40, + 0.528 + ], + [ + 50, + 0.5486 + ], + [ + 60, + 0.553 + ], + [ + 70, + 0.5564 + ], + [ + 80, + 0.5609 + ], + [ + 90, + 0.5611 + ], + [ + 100, + 0.5623 + ] + ] + }, + "456": { + "final_acc": 0.5643, + "curve": [ + [ + 1, + 0.3772 + ], + [ + 10, + 0.4853 + ], + [ + 20, + 0.5098 + ], + [ + 30, + 0.5238 + ], + [ + 40, + 0.5387 + ], + [ + 50, + 0.5488 + ], + [ + 60, + 0.5547 + 
], + [ + 70, + 0.5588 + ], + [ + 80, + 0.5636 + ], + [ + 90, + 0.5637 + ], + [ + 100, + 0.5643 + ] + ] + } + }, + "2": { + "42": { + "final_acc": 0.5994, + "curve": [ + [ + 1, + 0.3874 + ], + [ + 10, + 0.5157 + ], + [ + 20, + 0.5361 + ], + [ + 30, + 0.56 + ], + [ + 40, + 0.5753 + ], + [ + 50, + 0.5802 + ], + [ + 60, + 0.5843 + ], + [ + 70, + 0.5965 + ], + [ + 80, + 0.597 + ], + [ + 90, + 0.5979 + ], + [ + 100, + 0.5994 + ] + ] + }, + "123": { + "final_acc": 0.6, + "curve": [ + [ + 1, + 0.3925 + ], + [ + 10, + 0.5148 + ], + [ + 20, + 0.5376 + ], + [ + 30, + 0.5638 + ], + [ + 40, + 0.5693 + ], + [ + 50, + 0.5784 + ], + [ + 60, + 0.5927 + ], + [ + 70, + 0.5911 + ], + [ + 80, + 0.5973 + ], + [ + 90, + 0.5986 + ], + [ + 100, + 0.6 + ] + ] + }, + "456": { + "final_acc": 0.5939, + "curve": [ + [ + 1, + 0.3868 + ], + [ + 10, + 0.5103 + ], + [ + 20, + 0.542 + ], + [ + 30, + 0.561 + ], + [ + 40, + 0.5699 + ], + [ + 50, + 0.5789 + ], + [ + 60, + 0.5809 + ], + [ + 70, + 0.5844 + ], + [ + 80, + 0.5919 + ], + [ + 90, + 0.5919 + ], + [ + 100, + 0.5939 + ] + ] + } + }, + "3": { + "42": { + "final_acc": 0.6079, + "curve": [ + [ + 1, + 0.3904 + ], + [ + 10, + 0.5218 + ], + [ + 20, + 0.5469 + ], + [ + 30, + 0.5749 + ], + [ + 40, + 0.5935 + ], + [ + 50, + 0.595 + ], + [ + 60, + 0.5983 + ], + [ + 70, + 0.6015 + ], + [ + 80, + 0.607 + ], + [ + 90, + 0.6057 + ], + [ + 100, + 0.6079 + ] + ] + }, + "123": { + "final_acc": 0.6069, + "curve": [ + [ + 1, + 0.3965 + ], + [ + 10, + 0.524 + ], + [ + 20, + 0.5517 + ], + [ + 30, + 0.5747 + ], + [ + 40, + 0.5774 + ], + [ + 50, + 0.5927 + ], + [ + 60, + 0.6035 + ], + [ + 70, + 0.603 + ], + [ + 80, + 0.6057 + ], + [ + 90, + 0.6073 + ], + [ + 100, + 0.6069 + ] + ] + }, + "456": { + "final_acc": 0.608, + "curve": [ + [ + 1, + 0.3947 + ], + [ + 10, + 0.5148 + ], + [ + 20, + 0.5536 + ], + [ + 30, + 0.5723 + ], + [ + 40, + 0.5873 + ], + [ + 50, + 0.5861 + ], + [ + 60, + 0.5991 + ], + [ + 70, + 0.5989 + ], + [ + 80, + 0.6062 + ], + [ + 90, + 0.6093 + ], + 
[ + 100, + 0.608 + ] + ] + } + }, + "4": { + "42": { + "final_acc": 0.6173, + "curve": [ + [ + 1, + 0.3936 + ], + [ + 10, + 0.5235 + ], + [ + 20, + 0.5606 + ], + [ + 30, + 0.5794 + ], + [ + 40, + 0.5992 + ], + [ + 50, + 0.6044 + ], + [ + 60, + 0.5979 + ], + [ + 70, + 0.6115 + ], + [ + 80, + 0.6153 + ], + [ + 90, + 0.6177 + ], + [ + 100, + 0.6173 + ] + ] + }, + "123": { + "final_acc": 0.6182, + "curve": [ + [ + 1, + 0.3981 + ], + [ + 10, + 0.5257 + ], + [ + 20, + 0.558 + ], + [ + 30, + 0.5779 + ], + [ + 40, + 0.5896 + ], + [ + 50, + 0.6023 + ], + [ + 60, + 0.6053 + ], + [ + 70, + 0.6081 + ], + [ + 80, + 0.6185 + ], + [ + 90, + 0.6174 + ], + [ + 100, + 0.6182 + ] + ] + }, + "456": { + "final_acc": 0.6145, + "curve": [ + [ + 1, + 0.3967 + ], + [ + 10, + 0.5255 + ], + [ + 20, + 0.5632 + ], + [ + 30, + 0.5747 + ], + [ + 40, + 0.5948 + ], + [ + 50, + 0.5954 + ], + [ + 60, + 0.6092 + ], + [ + 70, + 0.614 + ], + [ + 80, + 0.6125 + ], + [ + 90, + 0.6145 + ], + [ + 100, + 0.6145 + ] + ] + } + } + }, + "fa": { + "0": { + "42": { + "final_acc": 0.3555, + "curve": [ + [ + 1, + 0.3112 + ], + [ + 10, + 0.3389 + ], + [ + 20, + 0.3325 + ], + [ + 30, + 0.3495 + ], + [ + 40, + 0.3467 + ], + [ + 50, + 0.3465 + ], + [ + 60, + 0.3573 + ], + [ + 70, + 0.3542 + ], + [ + 80, + 0.3567 + ], + [ + 90, + 0.3554 + ], + [ + 100, + 0.3555 + ] + ] + }, + "123": { + "final_acc": 0.352, + "curve": [ + [ + 1, + 0.3257 + ], + [ + 10, + 0.3409 + ], + [ + 20, + 0.3514 + ], + [ + 30, + 0.3357 + ], + [ + 40, + 0.3299 + ], + [ + 50, + 0.3495 + ], + [ + 60, + 0.3468 + ], + [ + 70, + 0.3548 + ], + [ + 80, + 0.3509 + ], + [ + 90, + 0.3536 + ], + [ + 100, + 0.352 + ] + ] + }, + "456": { + "final_acc": 0.3578, + "curve": [ + [ + 1, + 0.3172 + ], + [ + 10, + 0.3374 + ], + [ + 20, + 0.3452 + ], + [ + 30, + 0.3431 + ], + [ + 40, + 0.3468 + ], + [ + 50, + 0.3563 + ], + [ + 60, + 0.3523 + ], + [ + 70, + 0.3578 + ], + [ + 80, + 0.3568 + ], + [ + 90, + 0.3576 + ], + [ + 100, + 0.3578 + ] + ] + } + }, + "1": { + "42": 
{ + "final_acc": 0.3819, + "curve": [ + [ + 1, + 0.2886 + ], + [ + 10, + 0.3301 + ], + [ + 20, + 0.3604 + ], + [ + 30, + 0.3595 + ], + [ + 40, + 0.3678 + ], + [ + 50, + 0.3779 + ], + [ + 60, + 0.3727 + ], + [ + 70, + 0.381 + ], + [ + 80, + 0.381 + ], + [ + 90, + 0.3821 + ], + [ + 100, + 0.3819 + ] + ] + }, + "123": { + "final_acc": 0.3742, + "curve": [ + [ + 1, + 0.3105 + ], + [ + 10, + 0.3472 + ], + [ + 20, + 0.3444 + ], + [ + 30, + 0.3604 + ], + [ + 40, + 0.3615 + ], + [ + 50, + 0.3568 + ], + [ + 60, + 0.3708 + ], + [ + 70, + 0.3723 + ], + [ + 80, + 0.3749 + ], + [ + 90, + 0.3736 + ], + [ + 100, + 0.3742 + ] + ] + }, + "456": { + "final_acc": 0.3898, + "curve": [ + [ + 1, + 0.2975 + ], + [ + 10, + 0.3481 + ], + [ + 20, + 0.3454 + ], + [ + 30, + 0.3683 + ], + [ + 40, + 0.3618 + ], + [ + 50, + 0.3675 + ], + [ + 60, + 0.3826 + ], + [ + 70, + 0.3867 + ], + [ + 80, + 0.3863 + ], + [ + 90, + 0.3899 + ], + [ + 100, + 0.3898 + ] + ] + } + }, + "2": { + "42": { + "final_acc": 0.3305, + "curve": [ + [ + 1, + 0.2657 + ], + [ + 10, + 0.3431 + ], + [ + 20, + 0.3494 + ], + [ + 30, + 0.3436 + ], + [ + 40, + 0.3574 + ], + [ + 50, + 0.3388 + ], + [ + 60, + 0.3426 + ], + [ + 70, + 0.3341 + ], + [ + 80, + 0.3303 + ], + [ + 90, + 0.331 + ], + [ + 100, + 0.3305 + ] + ] + }, + "123": { + "final_acc": 0.3607, + "curve": [ + [ + 1, + 0.2982 + ], + [ + 10, + 0.3524 + ], + [ + 20, + 0.3694 + ], + [ + 30, + 0.3691 + ], + [ + 40, + 0.3703 + ], + [ + 50, + 0.3605 + ], + [ + 60, + 0.3546 + ], + [ + 70, + 0.3547 + ], + [ + 80, + 0.3651 + ], + [ + 90, + 0.3565 + ], + [ + 100, + 0.3607 + ] + ] + }, + "456": { + "final_acc": 0.3548, + "curve": [ + [ + 1, + 0.2753 + ], + [ + 10, + 0.3386 + ], + [ + 20, + 0.3495 + ], + [ + 30, + 0.3458 + ], + [ + 40, + 0.3374 + ], + [ + 50, + 0.3333 + ], + [ + 60, + 0.3523 + ], + [ + 70, + 0.3538 + ], + [ + 80, + 0.3519 + ], + [ + 90, + 0.3555 + ], + [ + 100, + 0.3548 + ] + ] + } + }, + "3": { + "42": { + "final_acc": 0.393, + "curve": [ + [ + 1, + 0.277 + ], + [ + 
10, + 0.3554 + ], + [ + 20, + 0.3681 + ], + [ + 30, + 0.3841 + ], + [ + 40, + 0.3829 + ], + [ + 50, + 0.3847 + ], + [ + 60, + 0.3885 + ], + [ + 70, + 0.3956 + ], + [ + 80, + 0.3947 + ], + [ + 90, + 0.3916 + ], + [ + 100, + 0.393 + ] + ] + }, + "123": { + "final_acc": 0.4074, + "curve": [ + [ + 1, + 0.2905 + ], + [ + 10, + 0.3495 + ], + [ + 20, + 0.3804 + ], + [ + 30, + 0.382 + ], + [ + 40, + 0.3885 + ], + [ + 50, + 0.395 + ], + [ + 60, + 0.3971 + ], + [ + 70, + 0.4049 + ], + [ + 80, + 0.4047 + ], + [ + 90, + 0.4075 + ], + [ + 100, + 0.4074 + ] + ] + }, + "456": { + "final_acc": 0.3946, + "curve": [ + [ + 1, + 0.2708 + ], + [ + 10, + 0.3511 + ], + [ + 20, + 0.3662 + ], + [ + 30, + 0.3755 + ], + [ + 40, + 0.3818 + ], + [ + 50, + 0.3828 + ], + [ + 60, + 0.3966 + ], + [ + 70, + 0.3939 + ], + [ + 80, + 0.3928 + ], + [ + 90, + 0.3933 + ], + [ + 100, + 0.3946 + ] + ] + } + }, + "4": { + "42": { + "final_acc": 0.3959, + "curve": [ + [ + 1, + 0.2789 + ], + [ + 10, + 0.3498 + ], + [ + 20, + 0.3601 + ], + [ + 30, + 0.371 + ], + [ + 40, + 0.3834 + ], + [ + 50, + 0.3923 + ], + [ + 60, + 0.3912 + ], + [ + 70, + 0.3945 + ], + [ + 80, + 0.3957 + ], + [ + 90, + 0.3944 + ], + [ + 100, + 0.3959 + ] + ] + }, + "123": { + "final_acc": 0.4122, + "curve": [ + [ + 1, + 0.2905 + ], + [ + 10, + 0.3596 + ], + [ + 20, + 0.3803 + ], + [ + 30, + 0.3792 + ], + [ + 40, + 0.3955 + ], + [ + 50, + 0.398 + ], + [ + 60, + 0.4071 + ], + [ + 70, + 0.4034 + ], + [ + 80, + 0.4076 + ], + [ + 90, + 0.4115 + ], + [ + 100, + 0.4122 + ] + ] + }, + "456": { + "final_acc": 0.3987, + "curve": [ + [ + 1, + 0.2713 + ], + [ + 10, + 0.3544 + ], + [ + 20, + 0.3702 + ], + [ + 30, + 0.3799 + ], + [ + 40, + 0.3845 + ], + [ + 50, + 0.3923 + ], + [ + 60, + 0.3992 + ], + [ + 70, + 0.3974 + ], + [ + 80, + 0.399 + ], + [ + 90, + 0.4 + ], + [ + 100, + 0.3987 + ] + ] + } + } + }, + "dfa": { + "0": { + "42": { + "final_acc": 0.3454, + "curve": [ + [ + 1, + 0.3185 + ], + [ + 10, + 0.337 + ], + [ + 20, + 0.3458 + ], + [ + 30, + 
0.3425 + ], + [ + 40, + 0.3419 + ], + [ + 50, + 0.3425 + ], + [ + 60, + 0.342 + ], + [ + 70, + 0.3466 + ], + [ + 80, + 0.3458 + ], + [ + 90, + 0.347 + ], + [ + 100, + 0.3454 + ] + ] + }, + "123": { + "final_acc": 0.3498, + "curve": [ + [ + 1, + 0.3219 + ], + [ + 10, + 0.3339 + ], + [ + 20, + 0.3453 + ], + [ + 30, + 0.3352 + ], + [ + 40, + 0.3322 + ], + [ + 50, + 0.3291 + ], + [ + 60, + 0.3428 + ], + [ + 70, + 0.3447 + ], + [ + 80, + 0.3465 + ], + [ + 90, + 0.3464 + ], + [ + 100, + 0.3498 + ] + ] + }, + "456": { + "final_acc": 0.3516, + "curve": [ + [ + 1, + 0.3241 + ], + [ + 10, + 0.3486 + ], + [ + 20, + 0.3396 + ], + [ + 30, + 0.3396 + ], + [ + 40, + 0.3387 + ], + [ + 50, + 0.3456 + ], + [ + 60, + 0.3508 + ], + [ + 70, + 0.3527 + ], + [ + 80, + 0.3498 + ], + [ + 90, + 0.3508 + ], + [ + 100, + 0.3516 + ] + ] + } + }, + "1": { + "42": { + "final_acc": 0.2267, + "curve": [ + [ + 1, + 0.2563 + ], + [ + 10, + 0.258 + ], + [ + 20, + 0.2445 + ], + [ + 30, + 0.2197 + ], + [ + 40, + 0.2229 + ], + [ + 50, + 0.1952 + ], + [ + 60, + 0.2306 + ], + [ + 70, + 0.229 + ], + [ + 80, + 0.2211 + ], + [ + 90, + 0.2215 + ], + [ + 100, + 0.2267 + ] + ] + }, + "123": { + "final_acc": 0.2563, + "curve": [ + [ + 1, + 0.2549 + ], + [ + 10, + 0.2505 + ], + [ + 20, + 0.2453 + ], + [ + 30, + 0.2358 + ], + [ + 40, + 0.2499 + ], + [ + 50, + 0.2506 + ], + [ + 60, + 0.2467 + ], + [ + 70, + 0.2513 + ], + [ + 80, + 0.2597 + ], + [ + 90, + 0.2586 + ], + [ + 100, + 0.2563 + ] + ] + }, + "456": { + "final_acc": 0.2476, + "curve": [ + [ + 1, + 0.2112 + ], + [ + 10, + 0.2227 + ], + [ + 20, + 0.2397 + ], + [ + 30, + 0.2326 + ], + [ + 40, + 0.2285 + ], + [ + 50, + 0.2176 + ], + [ + 60, + 0.2431 + ], + [ + 70, + 0.2476 + ], + [ + 80, + 0.2493 + ], + [ + 90, + 0.2477 + ], + [ + 100, + 0.2476 + ] + ] + } + }, + "2": { + "42": { + "final_acc": 0.3005, + "curve": [ + [ + 1, + 0.2792 + ], + [ + 10, + 0.2893 + ], + [ + 20, + 0.2978 + ], + [ + 30, + 0.296 + ], + [ + 40, + 0.301 + ], + [ + 50, + 0.3014 + ], + [ + 
60, + 0.3005 + ], + [ + 70, + 0.3036 + ], + [ + 80, + 0.2997 + ], + [ + 90, + 0.3005 + ], + [ + 100, + 0.3005 + ] + ] + }, + "123": { + "final_acc": 0.2819, + "curve": [ + [ + 1, + 0.2671 + ], + [ + 10, + 0.2947 + ], + [ + 20, + 0.2841 + ], + [ + 30, + 0.2801 + ], + [ + 40, + 0.2819 + ], + [ + 50, + 0.2772 + ], + [ + 60, + 0.2834 + ], + [ + 70, + 0.2876 + ], + [ + 80, + 0.2757 + ], + [ + 90, + 0.2806 + ], + [ + 100, + 0.2819 + ] + ] + }, + "456": { + "final_acc": 0.2751, + "curve": [ + [ + 1, + 0.2604 + ], + [ + 10, + 0.2821 + ], + [ + 20, + 0.2784 + ], + [ + 30, + 0.2826 + ], + [ + 40, + 0.2805 + ], + [ + 50, + 0.2675 + ], + [ + 60, + 0.2735 + ], + [ + 70, + 0.2765 + ], + [ + 80, + 0.2735 + ], + [ + 90, + 0.2759 + ], + [ + 100, + 0.2751 + ] + ] + } + }, + "3": { + "42": { + "final_acc": 0.3047, + "curve": [ + [ + 1, + 0.2821 + ], + [ + 10, + 0.2882 + ], + [ + 20, + 0.2921 + ], + [ + 30, + 0.3064 + ], + [ + 40, + 0.3009 + ], + [ + 50, + 0.3044 + ], + [ + 60, + 0.3041 + ], + [ + 70, + 0.3075 + ], + [ + 80, + 0.3064 + ], + [ + 90, + 0.3021 + ], + [ + 100, + 0.3047 + ] + ] + }, + "123": { + "final_acc": 0.2906, + "curve": [ + [ + 1, + 0.263 + ], + [ + 10, + 0.291 + ], + [ + 20, + 0.2845 + ], + [ + 30, + 0.2821 + ], + [ + 40, + 0.29 + ], + [ + 50, + 0.2811 + ], + [ + 60, + 0.286 + ], + [ + 70, + 0.291 + ], + [ + 80, + 0.2879 + ], + [ + 90, + 0.291 + ], + [ + 100, + 0.2906 + ] + ] + }, + "456": { + "final_acc": 0.2919, + "curve": [ + [ + 1, + 0.2544 + ], + [ + 10, + 0.2841 + ], + [ + 20, + 0.2892 + ], + [ + 30, + 0.2998 + ], + [ + 40, + 0.2891 + ], + [ + 50, + 0.2844 + ], + [ + 60, + 0.2938 + ], + [ + 70, + 0.2928 + ], + [ + 80, + 0.2901 + ], + [ + 90, + 0.2932 + ], + [ + 100, + 0.2919 + ] + ] + } + }, + "4": { + "42": { + "final_acc": 0.3068, + "curve": [ + [ + 1, + 0.2899 + ], + [ + 10, + 0.2873 + ], + [ + 20, + 0.3016 + ], + [ + 30, + 0.3053 + ], + [ + 40, + 0.312 + ], + [ + 50, + 0.3045 + ], + [ + 60, + 0.3071 + ], + [ + 70, + 0.3102 + ], + [ + 80, + 0.308 + ], + [ 
+ 90, + 0.3066 + ], + [ + 100, + 0.3068 + ] + ] + }, + "123": { + "final_acc": 0.3023, + "curve": [ + [ + 1, + 0.2683 + ], + [ + 10, + 0.2926 + ], + [ + 20, + 0.2861 + ], + [ + 30, + 0.2875 + ], + [ + 40, + 0.2978 + ], + [ + 50, + 0.291 + ], + [ + 60, + 0.2972 + ], + [ + 70, + 0.3011 + ], + [ + 80, + 0.2974 + ], + [ + 90, + 0.3015 + ], + [ + 100, + 0.3023 + ] + ] + }, + "456": { + "final_acc": 0.2949, + "curve": [ + [ + 1, + 0.2591 + ], + [ + 10, + 0.2883 + ], + [ + 20, + 0.2948 + ], + [ + 30, + 0.2995 + ], + [ + 40, + 0.2921 + ], + [ + 50, + 0.2956 + ], + [ + 60, + 0.296 + ], + [ + 70, + 0.2943 + ], + [ + 80, + 0.291 + ], + [ + 90, + 0.2955 + ], + [ + 100, + 0.2949 + ] + ] + } + } + } + } +} \ No newline at end of file diff --git a/results/depth_ladder/ladder_d512_L2_cifar10.json b/results/depth_ladder/ladder_d512_L2_cifar10.json new file mode 100644 index 0000000..4a3feff --- /dev/null +++ b/results/depth_ladder/ladder_d512_L2_cifar10.json @@ -0,0 +1,1378 @@ +{ + "config": { + "d_hidden": 512, + "num_blocks": 2, + "dataset": "cifar10", + "methods": [ + "bp", + "fa", + "dfa" + ], + "k_values": [ + 0, + 1, + 2 + ], + "seeds": [ + 42, + 123, + 456 + ], + "epochs": 100, + "lr": 0.001, + "wd": 0.01, + "batch_size": 128, + "gpu": 0, + "output_dir": "results/depth_ladder", + "num_classes": 10 + }, + "results": { + "bp": { + "0": { + "42": { + "final_acc": 0.3891, + "curve": [ + [ + 1, + 0.3462 + ], + [ + 10, + 0.3633 + ], + [ + 20, + 0.3635 + ], + [ + 30, + 0.3543 + ], + [ + 40, + 0.3673 + ], + [ + 50, + 0.3633 + ], + [ + 60, + 0.3695 + ], + [ + 70, + 0.3753 + ], + [ + 80, + 0.3858 + ], + [ + 90, + 0.3887 + ], + [ + 100, + 0.3891 + ] + ] + }, + "123": { + "final_acc": 0.3846, + "curve": [ + [ + 1, + 0.3497 + ], + [ + 10, + 0.3704 + ], + [ + 20, + 0.3698 + ], + [ + 30, + 0.354 + ], + [ + 40, + 0.3505 + ], + [ + 50, + 0.3634 + ], + [ + 60, + 0.3675 + ], + [ + 70, + 0.3739 + ], + [ + 80, + 0.3823 + ], + [ + 90, + 0.3845 + ], + [ + 100, + 0.3846 + ] + ] + }, + "456": { + 
"final_acc": 0.3838, + "curve": [ + [ + 1, + 0.3409 + ], + [ + 10, + 0.3578 + ], + [ + 20, + 0.3767 + ], + [ + 30, + 0.3607 + ], + [ + 40, + 0.3551 + ], + [ + 50, + 0.3632 + ], + [ + 60, + 0.3722 + ], + [ + 70, + 0.3704 + ], + [ + 80, + 0.3784 + ], + [ + 90, + 0.3834 + ], + [ + 100, + 0.3838 + ] + ] + } + }, + "1": { + "42": { + "final_acc": 0.5856, + "curve": [ + [ + 1, + 0.3667 + ], + [ + 10, + 0.4836 + ], + [ + 20, + 0.5197 + ], + [ + 30, + 0.5367 + ], + [ + 40, + 0.5444 + ], + [ + 50, + 0.5629 + ], + [ + 60, + 0.5691 + ], + [ + 70, + 0.5779 + ], + [ + 80, + 0.5808 + ], + [ + 90, + 0.5849 + ], + [ + 100, + 0.5856 + ] + ] + }, + "123": { + "final_acc": 0.5819, + "curve": [ + [ + 1, + 0.3632 + ], + [ + 10, + 0.4865 + ], + [ + 20, + 0.5175 + ], + [ + 30, + 0.536 + ], + [ + 40, + 0.5466 + ], + [ + 50, + 0.5606 + ], + [ + 60, + 0.5716 + ], + [ + 70, + 0.5749 + ], + [ + 80, + 0.5806 + ], + [ + 90, + 0.5817 + ], + [ + 100, + 0.5819 + ] + ] + }, + "456": { + "final_acc": 0.5809, + "curve": [ + [ + 1, + 0.3696 + ], + [ + 10, + 0.4737 + ], + [ + 20, + 0.5199 + ], + [ + 30, + 0.5317 + ], + [ + 40, + 0.5498 + ], + [ + 50, + 0.561 + ], + [ + 60, + 0.5675 + ], + [ + 70, + 0.5767 + ], + [ + 80, + 0.5785 + ], + [ + 90, + 0.5802 + ], + [ + 100, + 0.5809 + ] + ] + } + }, + "2": { + "42": { + "final_acc": 0.6039, + "curve": [ + [ + 1, + 0.379 + ], + [ + 10, + 0.5174 + ], + [ + 20, + 0.5471 + ], + [ + 30, + 0.5712 + ], + [ + 40, + 0.5906 + ], + [ + 50, + 0.5969 + ], + [ + 60, + 0.5977 + ], + [ + 70, + 0.5992 + ], + [ + 80, + 0.6072 + ], + [ + 90, + 0.6037 + ], + [ + 100, + 0.6039 + ] + ] + }, + "123": { + "final_acc": 0.602, + "curve": [ + [ + 1, + 0.3732 + ], + [ + 10, + 0.5161 + ], + [ + 20, + 0.5554 + ], + [ + 30, + 0.5756 + ], + [ + 40, + 0.5811 + ], + [ + 50, + 0.5928 + ], + [ + 60, + 0.5965 + ], + [ + 70, + 0.6016 + ], + [ + 80, + 0.6027 + ], + [ + 90, + 0.6007 + ], + [ + 100, + 0.602 + ] + ] + }, + "456": { + "final_acc": 0.6045, + "curve": [ + [ + 1, + 0.3768 + ], + [ + 10, 
+ 0.5097 + ], + [ + 20, + 0.5499 + ], + [ + 30, + 0.5773 + ], + [ + 40, + 0.5858 + ], + [ + 50, + 0.5845 + ], + [ + 60, + 0.5934 + ], + [ + 70, + 0.5985 + ], + [ + 80, + 0.6011 + ], + [ + 90, + 0.602 + ], + [ + 100, + 0.6045 + ] + ] + } + } + }, + "fa": { + "0": { + "42": { + "final_acc": 0.3585, + "curve": [ + [ + 1, + 0.3288 + ], + [ + 10, + 0.3359 + ], + [ + 20, + 0.3336 + ], + [ + 30, + 0.3328 + ], + [ + 40, + 0.3418 + ], + [ + 50, + 0.3504 + ], + [ + 60, + 0.3564 + ], + [ + 70, + 0.3567 + ], + [ + 80, + 0.3543 + ], + [ + 90, + 0.3574 + ], + [ + 100, + 0.3585 + ] + ] + }, + "123": { + "final_acc": 0.3584, + "curve": [ + [ + 1, + 0.3125 + ], + [ + 10, + 0.3374 + ], + [ + 20, + 0.3364 + ], + [ + 30, + 0.3453 + ], + [ + 40, + 0.3437 + ], + [ + 50, + 0.3522 + ], + [ + 60, + 0.3587 + ], + [ + 70, + 0.355 + ], + [ + 80, + 0.3551 + ], + [ + 90, + 0.3558 + ], + [ + 100, + 0.3584 + ] + ] + }, + "456": { + "final_acc": 0.359, + "curve": [ + [ + 1, + 0.318 + ], + [ + 10, + 0.3311 + ], + [ + 20, + 0.3344 + ], + [ + 30, + 0.3533 + ], + [ + 40, + 0.3476 + ], + [ + 50, + 0.3523 + ], + [ + 60, + 0.3455 + ], + [ + 70, + 0.3569 + ], + [ + 80, + 0.3562 + ], + [ + 90, + 0.3583 + ], + [ + 100, + 0.359 + ] + ] + } + }, + "1": { + "42": { + "final_acc": 0.4083, + "curve": [ + [ + 1, + 0.3235 + ], + [ + 10, + 0.373 + ], + [ + 20, + 0.3734 + ], + [ + 30, + 0.3829 + ], + [ + 40, + 0.3916 + ], + [ + 50, + 0.4008 + ], + [ + 60, + 0.4012 + ], + [ + 70, + 0.4015 + ], + [ + 80, + 0.4042 + ], + [ + 90, + 0.4082 + ], + [ + 100, + 0.4083 + ] + ] + }, + "123": { + "final_acc": 0.4134, + "curve": [ + [ + 1, + 0.293 + ], + [ + 10, + 0.3662 + ], + [ + 20, + 0.3905 + ], + [ + 30, + 0.4027 + ], + [ + 40, + 0.3948 + ], + [ + 50, + 0.4048 + ], + [ + 60, + 0.4067 + ], + [ + 70, + 0.4094 + ], + [ + 80, + 0.4115 + ], + [ + 90, + 0.4103 + ], + [ + 100, + 0.4134 + ] + ] + }, + "456": { + "final_acc": 0.4155, + "curve": [ + [ + 1, + 0.3098 + ], + [ + 10, + 0.3561 + ], + [ + 20, + 0.386 + ], + [ + 30, + 
0.3957 + ], + [ + 40, + 0.3907 + ], + [ + 50, + 0.4032 + ], + [ + 60, + 0.4017 + ], + [ + 70, + 0.4125 + ], + [ + 80, + 0.4123 + ], + [ + 90, + 0.4164 + ], + [ + 100, + 0.4155 + ] + ] + } + }, + "2": { + "42": { + "final_acc": 0.3582, + "curve": [ + [ + 1, + 0.3028 + ], + [ + 10, + 0.3585 + ], + [ + 20, + 0.3523 + ], + [ + 30, + 0.3315 + ], + [ + 40, + 0.3191 + ], + [ + 50, + 0.3397 + ], + [ + 60, + 0.3566 + ], + [ + 70, + 0.3527 + ], + [ + 80, + 0.3554 + ], + [ + 90, + 0.3593 + ], + [ + 100, + 0.3582 + ] + ] + }, + "123": { + "final_acc": 0.3621, + "curve": [ + [ + 1, + 0.2794 + ], + [ + 10, + 0.3627 + ], + [ + 20, + 0.36 + ], + [ + 30, + 0.375 + ], + [ + 40, + 0.3482 + ], + [ + 50, + 0.3679 + ], + [ + 60, + 0.363 + ], + [ + 70, + 0.3643 + ], + [ + 80, + 0.3636 + ], + [ + 90, + 0.3618 + ], + [ + 100, + 0.3621 + ] + ] + }, + "456": { + "final_acc": 0.3642, + "curve": [ + [ + 1, + 0.3005 + ], + [ + 10, + 0.3573 + ], + [ + 20, + 0.3624 + ], + [ + 30, + 0.3706 + ], + [ + 40, + 0.3529 + ], + [ + 50, + 0.3648 + ], + [ + 60, + 0.3581 + ], + [ + 70, + 0.3645 + ], + [ + 80, + 0.3652 + ], + [ + 90, + 0.3632 + ], + [ + 100, + 0.3642 + ] + ] + } + } + }, + "dfa": { + "0": { + "42": { + "final_acc": 0.3432, + "curve": [ + [ + 1, + 0.3196 + ], + [ + 10, + 0.3187 + ], + [ + 20, + 0.3369 + ], + [ + 30, + 0.3221 + ], + [ + 40, + 0.3386 + ], + [ + 50, + 0.3401 + ], + [ + 60, + 0.3473 + ], + [ + 70, + 0.3472 + ], + [ + 80, + 0.3426 + ], + [ + 90, + 0.3445 + ], + [ + 100, + 0.3432 + ] + ] + }, + "123": { + "final_acc": 0.3508, + "curve": [ + [ + 1, + 0.3089 + ], + [ + 10, + 0.318 + ], + [ + 20, + 0.3301 + ], + [ + 30, + 0.3434 + ], + [ + 40, + 0.3386 + ], + [ + 50, + 0.3343 + ], + [ + 60, + 0.3489 + ], + [ + 70, + 0.3458 + ], + [ + 80, + 0.3499 + ], + [ + 90, + 0.3508 + ], + [ + 100, + 0.3508 + ] + ] + }, + "456": { + "final_acc": 0.3521, + "curve": [ + [ + 1, + 0.3238 + ], + [ + 10, + 0.3327 + ], + [ + 20, + 0.3395 + ], + [ + 30, + 0.3457 + ], + [ + 40, + 0.3367 + ], + [ + 50, + 
0.3496 + ], + [ + 60, + 0.3453 + ], + [ + 70, + 0.3487 + ], + [ + 80, + 0.3491 + ], + [ + 90, + 0.3498 + ], + [ + 100, + 0.3521 + ] + ] + } + }, + "1": { + "42": { + "final_acc": 0.2384, + "curve": [ + [ + 1, + 0.2687 + ], + [ + 10, + 0.2106 + ], + [ + 20, + 0.2293 + ], + [ + 30, + 0.2297 + ], + [ + 40, + 0.2241 + ], + [ + 50, + 0.2318 + ], + [ + 60, + 0.2417 + ], + [ + 70, + 0.2458 + ], + [ + 80, + 0.2463 + ], + [ + 90, + 0.2438 + ], + [ + 100, + 0.2384 + ] + ] + }, + "123": { + "final_acc": 0.2097, + "curve": [ + [ + 1, + 0.1958 + ], + [ + 10, + 0.1777 + ], + [ + 20, + 0.222 + ], + [ + 30, + 0.1852 + ], + [ + 40, + 0.2165 + ], + [ + 50, + 0.2095 + ], + [ + 60, + 0.1995 + ], + [ + 70, + 0.2038 + ], + [ + 80, + 0.2068 + ], + [ + 90, + 0.2173 + ], + [ + 100, + 0.2097 + ] + ] + }, + "456": { + "final_acc": 0.2295, + "curve": [ + [ + 1, + 0.2118 + ], + [ + 10, + 0.2074 + ], + [ + 20, + 0.1777 + ], + [ + 30, + 0.2043 + ], + [ + 40, + 0.201 + ], + [ + 50, + 0.2087 + ], + [ + 60, + 0.2073 + ], + [ + 70, + 0.2126 + ], + [ + 80, + 0.2202 + ], + [ + 90, + 0.2355 + ], + [ + 100, + 0.2295 + ] + ] + } + }, + "2": { + "42": { + "final_acc": 0.3069, + "curve": [ + [ + 1, + 0.2769 + ], + [ + 10, + 0.2705 + ], + [ + 20, + 0.3 + ], + [ + 30, + 0.2988 + ], + [ + 40, + 0.308 + ], + [ + 50, + 0.2941 + ], + [ + 60, + 0.3025 + ], + [ + 70, + 0.3075 + ], + [ + 80, + 0.307 + ], + [ + 90, + 0.3063 + ], + [ + 100, + 0.3069 + ] + ] + }, + "123": { + "final_acc": 0.3025, + "curve": [ + [ + 1, + 0.2582 + ], + [ + 10, + 0.2772 + ], + [ + 20, + 0.2904 + ], + [ + 30, + 0.3072 + ], + [ + 40, + 0.2898 + ], + [ + 50, + 0.2938 + ], + [ + 60, + 0.2892 + ], + [ + 70, + 0.2974 + ], + [ + 80, + 0.297 + ], + [ + 90, + 0.3035 + ], + [ + 100, + 0.3025 + ] + ] + }, + "456": { + "final_acc": 0.2963, + "curve": [ + [ + 1, + 0.2794 + ], + [ + 10, + 0.2888 + ], + [ + 20, + 0.2884 + ], + [ + 30, + 0.2901 + ], + [ + 40, + 0.2784 + ], + [ + 50, + 0.2817 + ], + [ + 60, + 0.2983 + ], + [ + 70, + 0.292 + ], + [ + 80, 
+ 0.2904 + ], + [ + 90, + 0.2999 + ], + [ + 100, + 0.2963 + ] + ] + } + } + } + } +} \ No newline at end of file diff --git a/run_depth_ladder.sh b/run_depth_ladder.sh new file mode 100755 index 0000000..e450b47 --- /dev/null +++ b/run_depth_ladder.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -e +cd /home/yurenh2/fa +echo "[$(date)] START primary d=256 L=4 ladder" +CUDA_VISIBLE_DEVICES=2 python3 experiments/depth_utility_ladder.py \ + --d_hidden 256 --num_blocks 4 --dataset cifar10 \ + --methods bp fa dfa --k_values 0 1 2 3 4 --seeds 42 123 456 \ + --epochs 100 --gpu 0 --output_dir results/depth_ladder +echo "[$(date)] START secondary d=512 L=2 FA-failure ladder" +CUDA_VISIBLE_DEVICES=2 python3 experiments/depth_utility_ladder.py \ + --d_hidden 512 --num_blocks 2 --dataset cifar10 \ + --methods bp fa dfa --k_values 0 1 2 --seeds 42 123 456 \ + --epochs 100 --gpu 0 --output_dir results/depth_ladder +echo "[$(date)] ALL DONE" -- cgit v1.2.3