diff options
Diffstat (limited to 'results/resmlp_frozen_blocks_s456.log')
| -rw-r--r-- | results/resmlp_frozen_blocks_s456.log | 73 |
1 files changed, 73 insertions, 0 deletions
diff --git a/results/resmlp_frozen_blocks_s456.log b/results/resmlp_frozen_blocks_s456.log new file mode 100644 index 0000000..5bd1b66 --- /dev/null +++ b/results/resmlp_frozen_blocks_s456.log @@ -0,0 +1,73 @@ +Device: cuda:0, seed=456, epochs=100 + +=== BP shallow (ResMLP num_blocks=0), seed=456 === + n_params: 789770 (789770 trainable) + [BP-shallow] ep 1: test_acc=0.3545 + [BP-shallow] ep 10: test_acc=0.3636 + [BP-shallow] ep 20: test_acc=0.3572 + [BP-shallow] ep 30: test_acc=0.3514 + [BP-shallow] ep 40: test_acc=0.3629 + [BP-shallow] ep 50: test_acc=0.3623 + [BP-shallow] ep 60: test_acc=0.3711 + [BP-shallow] ep 70: test_acc=0.3766 + [BP-shallow] ep 80: test_acc=0.3875 + [BP-shallow] ep 90: test_acc=0.3875 + [BP-shallow] ep 100: test_acc=0.3876 +FINAL BP-shallow: 0.3876 + +=== BP frozen-blocks (ResMLP num_blocks=4, blocks frozen), seed=456 === + n_params: 1318154 (789770 trainable) + [BP-frozen] ep 1: test_acc=0.3593 + [BP-frozen] ep 10: test_acc=0.3696 + [BP-frozen] ep 20: test_acc=0.3515 + [BP-frozen] ep 30: test_acc=0.3541 + [BP-frozen] ep 40: test_acc=0.3574 + [BP-frozen] ep 50: test_acc=0.3567 + [BP-frozen] ep 60: test_acc=0.3724 + [BP-frozen] ep 70: test_acc=0.3777 + [BP-frozen] ep 80: test_acc=0.3861 + [BP-frozen] ep 90: test_acc=0.3894 + [BP-frozen] ep 100: test_acc=0.3881 +FINAL BP-frozen-blocks: 0.3881 + +=== DFA shallow (ResMLP num_blocks=0), seed=456 === + n_params: 789770 (789770 trainable) + [DFA-shallow] ep 1: test_acc=0.3246 + [DFA-shallow] ep 10: test_acc=0.3453 + [DFA-shallow] ep 20: test_acc=0.3426 + [DFA-shallow] ep 30: test_acc=0.3498 + [DFA-shallow] ep 40: test_acc=0.3431 + [DFA-shallow] ep 50: test_acc=0.3549 + [DFA-shallow] ep 60: test_acc=0.3494 + [DFA-shallow] ep 70: test_acc=0.3534 + [DFA-shallow] ep 80: test_acc=0.3494 + [DFA-shallow] ep 90: test_acc=0.3507 + [DFA-shallow] ep 100: test_acc=0.3519 +FINAL DFA-shallow: 0.3519 + +=== DFA frozen-blocks (ResMLP num_blocks=4, blocks frozen), seed=456 === + n_params: 1318154 (789770 trainable) + [DFA-frozen] ep 1: test_acc=0.3283 + [DFA-frozen] ep 10: test_acc=0.3427 + [DFA-frozen] ep 20: test_acc=0.3425 + [DFA-frozen] ep 30: test_acc=0.3481 + [DFA-frozen] ep 40: test_acc=0.3329 + [DFA-frozen] ep 50: test_acc=0.3425 + [DFA-frozen] ep 60: test_acc=0.3519 + [DFA-frozen] ep 70: test_acc=0.3556 + [DFA-frozen] ep 80: test_acc=0.3507 + [DFA-frozen] ep 90: test_acc=0.3508 + [DFA-frozen] ep 100: test_acc=0.3510 +FINAL DFA-frozen-blocks: 0.3510 + +=== ResMLP frozen/shallow baseline summary, seed=456 === + BP-shallow: 0.3876 + BP-frozen: 0.3881 + DFA-shallow: 0.3519 + DFA-frozen: 0.3510 + +Compare to trainable 4-block ResMLP (3-seed mean): BP=0.609, DFA=0.308 + +Interpretation: + If DFA-frozen ≈ DFA-trainable (0.308): blocks are passengers, walk-back parallels ViT + If DFA-frozen << DFA-trainable: ResMLP DFA actually trains the blocks (interesting contrast with ViT) |
