From 152821462023690df5d2bf90812e1cb5b1ca7274 Mon Sep 17 00:00:00 2001
From: YurenHao0426 <Blackhao0426@gmail.com>
Date: Sat, 23 May 2026 04:56:47 -0500
Subject: Add SRM training pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- config/arch/srm_v1.yaml: arch config for pretrain.py integration
- scripts/train_srm.py: standalone from-scratch trainer based on step4
  (HRM training infra adapted for SRM joint operator)

The arch.yaml exposes κ, η, α, n_iters, n_aol_layers as Hydra params.

train_srm.py adds joint Lyapunov diagnostic via JVP on srm_block to verify
λ_1 ≤ log((1-α)+α·κ) per micro-step. Smoke tested with hidden=128, n_iters=4
on Sudoku 1k: empirical Lip 0.28 << bound 0.90.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 config/arch/srm_v1.yaml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 config/arch/srm_v1.yaml

(limited to 'config')

diff --git a/config/arch/srm_v1.yaml b/config/arch/srm_v1.yaml
new file mode 100644
index 0000000..1b5ece5
--- /dev/null
+++ b/config/arch/srm_v1.yaml
@@ -0,0 +1,21 @@
+name: srm.srm_aol_v1@StableRecursionModel_ACTV1
+loss:
+  name: losses@ACTLossHead
+  loss_type: stablemax_cross_entropy
+
+halt_exploration_prob: 0.1
+halt_max_steps: 16
+
+# SRM-specific
+n_iters: 12           # joint micro-steps per ACT step (counterpart of HRM's H_cycles·L_cycles + H_cycles = 6; 12 is a deeper schedule)
+n_aol_layers: 2       # depth of AOL ψ block (channel + token mix per layer)
+kappa: 0.9            # contraction factor: per-step Lip_P ≤ (1-α)+α·κ (= κ when α = 1)
+eta: 1.0              # weighting of L block in P-norm (1.0 = symmetric)
+alpha: 1.0            # damping (1.0 = full step)
+
+hidden_size: 512
+puzzle_emb_ndim: ${.hidden_size}
+
+# Unused HRM fields: none need to be listed here to keep pretrain.py's
+# __pydantic_extra__ handling from breaking. create_model() passes some fields
+# HRM expects; Pydantic's default extra='ignore' drops them silently.
-- 
cgit v1.2.3