diff options
Diffstat (limited to 'protocol/examples')
| -rw-r--r-- | protocol/examples/threshold_d_sensitivity.py | 94 |
1 files changed, 94 insertions, 0 deletions
# Source: protocol/examples/threshold_d_sensitivity.py
# (new file, mode 100644, blob d3f2c58; reconstructed from a collapsed diff)
"""
Sensitivity of diagnostic (d) frozen-blocks margin threshold.

Codex round 18 specifically called out the +1.4 pp margin on penalized
DFA as fragile under the choice of threshold. This script sweeps the
margin threshold from 0.5 pp to 10 pp and reports the verdict on each
condition listed in ``main`` (BP, shallow DFA, vanilla DFA, penalized DFA
at 2 lambda values, and the frozen-random baseline itself).

Everything is computed from accuracies hard-coded in ``main``; nothing is
read from disk and no model is run.

Run:
    python -m protocol.examples.threshold_d_sensitivity
"""
import os
import sys
import json

import numpy as np

# Three directory levels above this file.
REPO_ROOT = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)


def _margin_pp(acc, baseline_acc):
    """Return ``acc - baseline_acc`` expressed in percentage points.

    The result is rounded to 6 decimals: subtracting decimal accuracies in
    binary floating point is inexact, and without rounding a margin that
    sits exactly on a threshold would be classified by rounding noise.
    """
    return round((acc - baseline_acc) * 100, 6)


def _verdict(margin_pp, threshold_pp):
    """Return "FIRE" when the margin is strictly below the threshold, else "ok"."""
    return "FIRE" if margin_pp < threshold_pp else "ok"


def main():
    """Print the margin table, the per-threshold verdict table, and notes.

    Takes no arguments and returns ``None``; all output goes to stdout.
    """
    # (name, accuracy, std) on 4-block d=256 ResMLP CIFAR-10.
    # std is None for a single-seed entry (printed as "(n=1)").
    conditions = [
        ("BP-trainable", 0.609, 0.004),
        ("DFA-shallow", 0.349, 0.002),
        ("DFA-vanilla", 0.308, 0.014),
        ("DFA-pen lam=1e-3", 0.372, None),  # 1 seed
        ("DFA-pen lam=1e-2", 0.363, 0.0007),
        ("DFA-frozen-rand", 0.349, 0.002),
    ]
    # Reference accuracy that every margin is measured against. It is
    # printed below as the DFA-frozen-random baseline and has the same
    # value as the "DFA-shallow" and "DFA-frozen-rand" rows above.
    shallow_acc = 0.349

    # Compute each margin once; both tables below reuse it.
    margins = [
        (name, acc, std, _margin_pp(acc, shallow_acc))
        for name, acc, std in conditions
    ]

    print("=" * 80)
    print("Diagnostic (d) frozen-baseline margin threshold sensitivity")
    print("=" * 80)
    print(f" Reference baseline (DFA-frozen-random): {shallow_acc:.4f}")
    print()
    print(f" {'condition':<22}{'acc':>10}{'std':>10}{'margin (pp)':>14}")
    print(" " + "-" * 56)
    for name, acc, std, margin_pp in margins:
        std_str = f"±{std:.4f}" if std is not None else "(n=1)"
        print(f" {name:<22}{acc:>10.4f}{std_str:>10}{margin_pp:>14.2f}")
    print()

    # Sweep: one column per threshold, one row per condition.
    thresholds = [0.5, 1.0, 1.5, 2.0, 3.0, 5.0, 10.0]
    print("Walk-back verdict at each threshold:")
    header_cells = "".join(f"{'>' + str(t) + 'pp':>10}" for t in thresholds)
    print(f" {'condition':<22}{header_cells}")
    print(" " + "-" * (22 + 10 * len(thresholds)))
    for name, _acc, _std, margin_pp in margins:
        verdict_cells = "".join(
            f"{_verdict(margin_pp, t):>10}" for t in thresholds
        )
        print(f" {name:<22}{verdict_cells}")

    print()
    print("=" * 80)
    print("INTERPRETATION")
    print("=" * 80)
    print()
    print(" - DFA-vanilla margin = -4 pp: FIRES at ALL reasonable thresholds")
    print(" (it's actively below the shallow baseline, not just close to it)")
    print()
    print(" - DFA-pen lam=1e-2 margin = +1.4 pp: knife-edge")
    print(" fires at threshold ≥ 1.5 pp")
    print(" passes at threshold ≤ 1.0 pp")
    print()
    print(" The default 2.0 pp gives a walk-back, but a reviewer setting")
    print(" 1.0 pp would say the penalized DFA passes (d). The conclusion")
    print(" is sensitive to this choice.")
    print()
    print(" - DFA-pen lam=1e-3 margin = +2.3 pp: passes (d) at 2.0 pp threshold")
    # NOTE(review): lam=1e-3 is numerically smaller than lam=1e-2, so
    # "stronger penalty" looks inverted unless lambda scales the penalty
    # inversely -- confirm before quoting this line.
    print(" (slightly stronger penalty, slightly better acc — would NOT walk back)")
    print()
    print(" Round 18 lesson: the +1.4 pp finding is real but the binary verdict")
    print(" depends on a knife-edge threshold choice. The honest paper claim")
    print(" should be: 'after the penalty correction, the depth contribution is")
    print(" at most 1.4 pp above the random-blocks baseline — much smaller than")
    print(" BP's +26 pp gap over shallow', not 'the deep blocks are passive'.")
    print()
    print(" Compare to (a) 63x and (b) 24338x separation gaps from")
    print(" threshold_sensitivity.py — those diagnostics are robust; (d) is not.")


if __name__ == "__main__":
    main()
