From 66e0d8b9fd4d0f7a2231d689c055e26fdf1cf04a Mon Sep 17 00:00:00 2001 From: YurenHao0426 Date: Sat, 13 Jun 2026 12:35:36 -0500 Subject: rrm workspace: TRM/HRM/SRM code, Maze dataset, dynamical-analysis pipeline Curated export for clone-and-run Maze training (2x A6000) + diagnostics. trm/hrm pretrain.py carry trajectory-augmentation code (backward-compatible). Heavy artifacts (checkpoints/wandb/npz) gitignored; see PROVENANCE.md. Co-Authored-By: Claude Fable 5 --- .../analysis_2x2/offline_followups/followups.md | 88 ++++++++++++++++++++++ .../analysis_2x2/offline_followups/phase1_e1.md | 27 +++++++ 2 files changed, 115 insertions(+) create mode 100644 research/flossing/analysis_2x2/offline_followups/followups.md create mode 100644 research/flossing/analysis_2x2/offline_followups/phase1_e1.md (limited to 'research/flossing/analysis_2x2/offline_followups') diff --git a/research/flossing/analysis_2x2/offline_followups/followups.md b/research/flossing/analysis_2x2/offline_followups/followups.md new file mode 100644 index 0000000..be5f519 --- /dev/null +++ b/research/flossing/analysis_2x2/offline_followups/followups.md @@ -0,0 +1,88 @@ +# Offline follow-ups (no GPU) — 2026-06-11 + +Strict in-band thresholds: HRM uses the 45th percentile (pct45) of pooled log10 late-drift; TRM uses the 60th percentile (pct60; band edge; B=0 regardless). +All numbers are observational; only within-dataset comparisons are meaningful. 
+ +## HRM @26040 (n=8192), strict tau(log10)=-0.0129 +| cell | n | lam1 med | lam8 med | token_acc med | halted_at med | q_halt_final med | givens med | +|---|---|---|---|---|---|---|---| +| A | 3665 | -0.8670 | -0.9787 | 1.000 | 4 | +7.47 | 26 | +| B | 21 | -0.8421 | -0.9495 | 0.617 | 6 | +7.47 | 25 | +| C | 633 | -0.7796 | -0.8815 | 1.000 | 10 | +7.47 | 25 | +| D | 3873 | -0.5991 | -0.7140 | 0.630 | 0 | -9.62 | 25 | + +### hrm26040_n8192_strict: end-of-window drift slope (log10 steps13-16 vs 9-12; <0 = still descending) +- A: n=3665, slope median -0.0023, IQR [-0.0073, +0.0012], frac still descending (<-0.01): 0.20 +- B: n=21, slope median -0.0063, IQR [-0.4005, -0.0009], frac still descending (<-0.01): 0.48 +- C: n=633, slope median -0.0006, IQR [-1.4084, +0.0088], frac still descending (<-0.01): 0.44 +- D: n=3873, slope median -0.0031, IQR [-0.0556, +0.0459], frac still descending (<-0.01): 0.46 + +### HRM unsettled stratum: AUC(-lam1 -> correct) per log-drift decile +| decile | drift range (log10) | n | n_correct | AUC | +|---|---|---|---|---| +| 1 | [-0.01, 0.66] | 451 | 422 | 0.966 | +| 2 | [0.66, 1.42] | 450 | 50 | 0.972 | +| 3 | [1.42, 1.52] | 451 | 4 | 0.988 | +| 4 | [1.52, 1.56] | 450 | 6 | 0.964 | +| 5 | [1.56, 1.60] | 451 | 1 | 0.984 | +| 6 | [1.60, 1.62] | 450 | 7 | 0.949 | +| 7 | [1.62, 1.65] | 451 | 5 | 0.837 | +| 8 | [1.65, 1.68] | 450 | 5 | 0.851 | +| 9 | [1.68, 1.71] | 451 | 16 | 0.804 | +| 10 | [1.71, 1.93] | 451 | 117 | 0.685 | +- weighted mean within-decile AUC = 0.879 (vs unconditioned within-unsettled AUC 0.933) +- AUC(-end_slope -> correct | unsettled) = 0.605 (C still-descending fraction vs D, see slope table above) + +## HRM strict-band settled-but-wrong examples (n=21) +| idx | givens | token_acc | lam1 | drift_final | halted_at | q_halt_final | +|---|---|---|---|---|---|---| +| 342267 | 17 | 0.407 | -0.867 | 0.976 | 5 | +7.41 | +| 212705 | 17 | 0.469 | -0.838 | 0.964 | 8 | +7.44 | +| 329832 | 17 | 0.481 | -0.703 | 0.970 | 8 | +7.41 | +| 
20075 | 27 | 0.519 | -0.812 | 0.966 | 5 | +7.50 | +| 198242 | 25 | 0.568 | -0.843 | 0.980 | 7 | +7.47 | +| 223591 | 24 | 0.580 | -0.939 | 0.951 | 4 | +7.47 | +| 238704 | 27 | 0.593 | -0.931 | 0.953 | 5 | +7.47 | +| 364431 | 25 | 0.593 | -0.806 | 0.956 | 6 | +7.44 | +| 274637 | 26 | 0.593 | -0.859 | 0.979 | 6 | +7.47 | +| 182424 | 24 | 0.605 | -0.985 | 0.949 | 6 | +7.47 | +| 351919 | 25 | 0.617 | -0.742 | 0.965 | 5 | +7.47 | +| 123022 | 27 | 0.617 | -0.826 | 0.951 | 7 | +7.50 | +| 150426 | 25 | 0.630 | -0.767 | 0.963 | 9 | +7.47 | +| 175427 | 26 | 0.630 | -0.843 | 0.946 | 8 | +7.50 | +| 422185 | 26 | 0.642 | -0.841 | 0.946 | 7 | +7.47 | +| 344032 | 24 | 0.654 | -0.903 | 0.965 | 4 | +7.53 | +| 30703 | 25 | 0.691 | -0.732 | 0.972 | 6 | +7.53 | +| 386549 | 23 | 0.691 | -0.842 | 0.966 | 4 | +7.47 | +| 3370 | 26 | 0.716 | -0.861 | 0.955 | 6 | +7.47 | +| 243909 | 24 | 0.753 | -0.786 | 0.969 | 8 | +7.50 | +| 258307 | 25 | 0.877 | -0.918 | 0.952 | 5 | +7.47 | + +## HRM difficulty control (#givens, input tokens != 1) +- givens: min 17, median 25, max 36 +- Spearman(lam1, givens): overall -0.350; correct-only -0.155; wrong-only -0.180 +- Spearman(correct, givens) = +0.276 + +| givens bin | n | acc | AUC(-lam1 -> correct) | +|---|---|---|---| +| [17, 24] | 1152 | 0.321 | 0.976 | +| [24, 25] | 1795 | 0.373 | 0.980 | +| [25, 26] | 1764 | 0.503 | 0.987 | +| [26, 36] | 3481 | 0.681 | 0.983 | +- weighted mean within-bin AUC = 0.982 (overall 0.984) + +## TRM official @58590 (n=512), strict tau(log10)=1.0240 +| cell | n | lam1 med | token_acc med | q_halt_final med | givens med | +|---|---|---|---|---|---| +| A | 307 | +0.0105 | 1.000 | +7.78 | 26 | +| B | 0 | | | | | +| C | 141 | +0.0174 | 1.000 | +7.81 | 25 | +| D | 64 | +0.1034 | 0.630 | -11.12 | 25 | + +### trm_official58590_n512_strict: end-of-window drift slope (log10 steps13-16 vs 9-12; <0 = still descending) +- A: n=307, slope median -0.1471, IQR [-0.2267, -0.0641], frac still descending (<-0.01): 0.90 +- B: n=0 +- C: n=141, 
slope median -0.0080, IQR [-0.2603, +0.0808], frac still descending (<-0.01): 0.49 +- D: n=64, slope median -0.0125, IQR [-0.0525, +0.0276], frac still descending (<-0.01): 0.53 +- Spearman(lam1, givens): overall -0.240; wrong-only -0.238 +- Spearman(correct, givens) = +0.148 \ No newline at end of file diff --git a/research/flossing/analysis_2x2/offline_followups/phase1_e1.md b/research/flossing/analysis_2x2/offline_followups/phase1_e1.md new file mode 100644 index 0000000..b3e3f40 --- /dev/null +++ b/research/flossing/analysis_2x2/offline_followups/phase1_e1.md @@ -0,0 +1,27 @@ +# E1 offline batch — bootstrap CIs, settling robustness, TRM multi4 pair + +## Bootstrap / exact CIs (TRM official @58590, n=2048) +- settled-wrong fraction: observed 0/254; exact one-sided 95% upper bound (Clopper–Pearson) 0.0117, i.e. at most 1.17% of failures +- AUC(-lam1->correct) = 0.9935, bootstrap 95% CI (0.9908, 0.9957) +- lam1(wrong) median 95% CI (0.1010, 0.1056) +- lam1(correct) median 95% CI (0.0112, 0.0117) + +## Bootstrap CIs (HRM @26040, n=8192, strict band) +- strict settled-wrong fraction of failures: observed 0.0054, bootstrap 95% CI (0.0033, 0.0078) +- AUC(-lam1->correct) = 0.9841, bootstrap 95% CI (0.9815, 0.9865) + +## Settling-criterion robustness (B-cell counts under alternative drift definitions) +- TRM official n=2048 | zH: B=0/A=1724 (tau=1.36) | zL: B=0/A=1728 (tau=1.42) | combined: B=0/A=1727 (tau=1.54) +- HRM n=8192 | zH: B=63/A=4103 (tau=0.77) | zL: B=59/A=4083 (tau=1.01) | combined: B=60/A=4087 (tau=1.07) + +## TRM official-pipeline multi4 vs baseline (matched objective, n=512 each) +- baseline @58590: acc=0.875; A/B/C/D=434/0/14/64; fD=0.125; lam1(D)=+0.1034; lam1(A)=+0.0111 +- multi4 @35805 (best): acc=0.900; A/B/C/D=452/0/9/51; fD=0.100; lam1(D)=+0.1019; lam1(A)=+0.0039 +- multi4 @65100 (final): acc=0.824; A/B/C/D=408/1/14/89; fD=0.174; lam1(D)=+0.0946; lam1(A)=+0.0133 
+ +## hrm_multi4 provenance (E6a) +- diag_hrm_multi4_step_{20832,23436,26040}_512.npz: the step grid matches the HRM pretrain numbering; + multi4_eval_compare/logs should contain the eval invocations — still to be confirmed manually (no check is recorded in this file). +- ACTION: if the hrm_multi4 run used the pretrain pipeline (ACT-streaming + perturbation), then the + May-28 multi4 vs righteous baseline comparison IS matched-pipeline and the caveat in Sec 3.4 is + narrower than written; either way, the queued step9 E-vs-F pair covers the fixed-unroll objective. \ No newline at end of file -- cgit v1.2.3