From bcec9560cf5c9b113e9381a52d1a941daa8865f2 Mon Sep 17 00:00:00 2001 From: Yuren Hao Date: Fri, 3 Jul 2026 07:57:22 -0500 Subject: omega/norm-family refuted as stability signal; fingerprint story retracted; eigreg v2 = true map-eigenvalue (spec_penalty) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - eig_control: fix plain-PI bug (shifted PI for lambda_max of indefinite Sym); add lead_rho + spec_penalty (soft one-sided cap on |lam|(I+eps*J_F), 2-D Rayleigh-Ritz, matvec-only) — aep 'spectral' ported. eig_penalty demoted to diagnostic. - eig_recheck.py (Lanczos audit): omega=+5..+13 on ALL operators incl the stablest (s2000 +12.8 while true alpha=-0.02); gap omega-alpha~10; old 'warm -10.14 vs scratch +1.11' numbers were PI-mixture artifacts. RETRACTED. - eig_v2_smoke/depth: v2 mechanics validated vs ARPACK; z_T1 readings >1 are unconverged-state contamination (150: 1.009 -> 400/800: 0.997-0.999, mu=-0.02..-0.006 matching eig_probe); fixed-point top = BAND of slow modes. - lt_ep_train: --eigreg now spec_penalty (--eig_margin 0.995 = rho target); --fingerprint reports rho/Re_mu instead of num_abscissa. - ONBOARDING §4-7 + FINDINGS 2026-07-03: retraction + verdict (fundamental quantity = finite-horizon path LE / resreg axis; de-cliff via floss-ept; spec_penalty = measure-mode scalpel for a detaching Hopf pair). Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_014FAPDWQ49M5Ye3NpTndTpn --- ep_run/diag_cos.py | 16 ++++--- ep_run/eig_control.py | 121 +++++++++++++++++++++++++++++++++++++++++++----- ep_run/eig_recheck.log | 34 ++++++++++++++ ep_run/eig_recheck.py | 109 +++++++++++++++++++++++++++++++++++++++++++ ep_run/eig_v2_depth.log | 6 +++ ep_run/eig_v2_depth.py | 39 ++++++++++++++++ ep_run/eig_v2_smoke.log | 13 ++++++ ep_run/eig_v2_smoke.py | 56 ++++++++++++++++++++++ ep_run/lt_ep_train.py | 13 +++--- 9 files changed, 382 insertions(+), 25 deletions(-) create mode 100644 ep_run/eig_recheck.log create mode 100644 ep_run/eig_recheck.py create mode 100644 ep_run/eig_v2_depth.log create mode 100644 ep_run/eig_v2_depth.py create mode 100644 ep_run/eig_v2_smoke.log create mode 100644 ep_run/eig_v2_smoke.py (limited to 'ep_run') diff --git a/ep_run/diag_cos.py b/ep_run/diag_cos.py index 37e7257..fb8a68f 100644 --- a/ep_run/diag_cos.py +++ b/ep_run/diag_cos.py @@ -9,7 +9,7 @@ can be laid side by side, and fingerprints any checkpoint (res, cos, numerical a see WHAT distinguishes s2000 from other 2000-step checkpoints (conditioning? alignment? abscissa?).""" import math, torch from lt_ep_train import ep_step, bptt_step, relax, evaluate, get_batch -from eig_control import num_abscissa +from eig_control import lead_rho def _cos(ge, gb, params): # cosine over the shared block params (where EP != BPTT) @@ -32,14 +32,16 @@ def cos_ep_bptt(blk, idx, y, T1, T2, eps, beta, holo=0, hr=0.02, t1max=0, res_es def fingerprint(blk, T1, T2, eps, beta, holo=0, hr=0.02, t1max=0, res_est=1e-4, t2sel=0, nb=4, B=6): - """Median (res, cos-to-BPTT, numerical abscissa) over nb small batches + val CE — the operator's 4-D fingerprint. - B kept small: the exact-BPTT reference gradient unrolls T1 steps and is memory-heavy at C512.""" - cache = {}; res_l, cos_l, om_l = [], [], [] + """Median (res, cos-to-BPTT, TRUE leading map-eigenvalue rho & Re mu) over nb small batches + val CE. + B kept small: the exact-BPTT reference gradient unrolls T1 steps and is memory-heavy at C512. + (The numerical-abscissa column was retracted 2026-07-03 — omega is ~10 above the true abscissa on + this non-normal operator and anti-correlates with stability; see eig_control docstring / eig_recheck.)""" + cache = {}; res_l, cos_l, rho_l, mu_l = [], [], [], [] for _ in range(nb): idx, y = get_batch('train', B, blk.T) c, r = cos_ep_bptt(blk, idx, y, T1, T2, eps, beta, holo, hr, t1max, res_est, t2sel) xin = blk.embed(idx).detach(); zs = relax(blk, xin.clone(), xin, T1, eps) - _, om = num_abscissa(blk, zs, cache) - res_l.append(r); cos_l.append(c); om_l.append(om) + _, rho, mu = lead_rho(blk, zs, eps, blk.c, cache, iters=25) # one-shot: more iters than the warm-started training path + res_l.append(r); cos_l.append(c); rho_l.append(rho); mu_l.append(mu) md = lambda a: sorted(a)[len(a) // 2] - return dict(res=md(res_l), cos=md(cos_l), num_abscissa=md(om_l), val=evaluate(blk, T1, eps)) + return dict(res=md(res_l), cos=md(cos_l), rho=md(rho_l), mu_re=md(mu_l), val=evaluate(blk, T1, eps)) diff --git a/ep_run/eig_control.py b/ep_run/eig_control.py index 6e3f598..0e62068 100644 --- a/ep_run/eig_control.py +++ b/ep_run/eig_control.py @@ -13,30 +13,127 @@ Power iteration on the SYMMETRIC PART -> matvec-only (jvp+vjp of nc_force, the s and the AEP correction already call), so it scales and is analog-compatible (no eigendecomposition). One-sided ReLU penalty = a LEADING signal: acts only as the abscissa nears the margin, so unlike jacreg it does not over-contract / hijack when the operator is already safe. + +=== VERDICT (eig_recheck.py, 2026-07-03): the omega premise above is REFUTED for this operator. === +Gold-standard Lanczos on Sym(J_nc) at z*: omega = +5..+13 on EVERY operator — including the stablest +warm source s2000 (omega=+12.8 while its TRUE spectral abscissa, eig_probe FD-JVP, is ~-0.02). The +non-normality gap omega - alpha ~ 10: the numerical abscissa is hopelessly conservative here and even +ANTI-correlates with actual stability (rand-init +5.0 -> s2000 +12.8). An omega-margin penalty would +fire permanently and over-constrain worse than jacreg. KEEP num_abscissa/eig_penalty as DIAGNOSTICS +ONLY. (The pre-fix plain power iteration also returned garbage mixtures on the indefinite Sym — the old +'warm -10.14 vs scratch +1.11' fingerprint story is retracted.) + +The CONTROL is lead_rho/spec_penalty below: the TRUE leading eigenvalue of the forward map +M = I + eps*J_F (a real 2-D Rayleigh-Ritz block captures the complex Hopf pair that symmetric power +iteration cannot see). rho = |lam_lead(M)| is the exact discrete stability criterion (rho > 1 = the +relaxation spirals out — covers both the continuous Hopf and the Euler/Neimark-Sacker route), matches +eig_probe's measurement, and is the aep-dynamics 'spectral' steering ported to C512 — applied SOFTLY +(small coefficient, one-sided), per the aep finding that hard projection over-constrains. """ import torch from torch.autograd.functional import jvp, vjp def num_abscissa(blk, zs, cache, iters=3): - """Power-iterate Sym(J_nc)=(J_nc+J_nc^T)/2 at zs for the leading eigenpair. Returns (v_detached, lambda_float). - lambda = v^T Sym(J_nc) v = v^T J_nc v (Rayleigh quotient at the leading eigenvector) = numerical abscissa.""" + """SHIFTED power iteration for the TOP (algebraic-max) eigenpair of Sym(J_nc)=(J_nc+J_nc^T)/2 at zs. + Returns (v_detached, lambda_max_float) — lambda_max(Sym) = the numerical abscissa. + + Why shifted: plain power iteration converges to the largest-|lambda| eigenpair. Sym(J_nc) is + indefinite and its NEGATIVE end can dominate in magnitude — plain PI then silently returns + lambda_min, which reads 'deeply stable' exactly when lambda_max is over the margin, disarming the + penalty. Phase 1 estimates the dominant magnitude s; phase 2 iterates (Sym + s·I), whose top + eigenvalue lambda_max + s is non-negative and dominant, so the Rayleigh quotient converges to + lambda_max. Both phases warm-start from the cache, so accuracy accrues across training steps.""" z = zs.detach() - v = cache.get('v') - if v is None or v.shape != z.shape or v.dtype != z.dtype or v.device != z.device: - v = torch.randn_like(z) - v = v / (v.norm() + 1e-12) + + def smv(u): # Sym(J_nc) u + return 0.5 * (jvp(blk.nc_force, z, u)[1] + vjp(blk.nc_force, z, u)[1]) + + def unit(key): + u = cache.get(key) + if u is None or u.shape != z.shape or u.dtype != z.dtype or u.device != z.device: + u = torch.randn_like(z) + return u / (u.norm() + 1e-12) + with torch.no_grad(): - for _ in range(iters): - Sv = 0.5 * (jvp(blk.nc_force, z, v)[1] + vjp(blk.nc_force, z, v)[1]) # Sym(J_nc) v - v = Sv / (Sv.norm() + 1e-12) - lam = float((v * jvp(blk.nc_force, z, v)[1]).sum() / (v * v).sum()) # v^T J_nc v - cache['v'] = v + u, s = unit('u'), 0.0 + for _ in range(iters): # phase 1: dominant |lambda| -> the shift + Su = smv(u); s = float(Su.norm()); u = Su / (s + 1e-12) + s = 1.1 * s + 1e-3 # safe overestimate (need s >= |lambda_min|) + v = unit('v') + for _ in range(iters): # phase 2: top eigenpair of Sym + s·I + w = smv(v) + s * v + v = w / (w.norm() + 1e-12) + lam = float((v * jvp(blk.nc_force, z, v)[1]).sum() / (v * v).sum()) # v^T J v = v^T Sym(J) v + cache['u'] = u; cache['v'] = v return v, lam +def lead_rho(blk, zs, eps, c, cache, iters=6): + """TRUE leading eigenvalue of the forward map M = I + eps*J_F, J_F = J_nc - (1+c)I. + Warm-started 2-D subspace (Rayleigh-Ritz) iteration: a real 2-D block captures the complex Hopf + pair that plain power iteration cannot represent. Matvec-only (jvp of nc_force). + Returns (V=[v0,v1] detached orthonormal, rho, mu_re): rho = |lam_lead(M)| (discrete stability: + rho > 1 = relaxation spirals out), mu = (lam-1)/eps the continuous eigenvalue (Hopf: Re mu > 0).""" + z = zs.detach() + k = 1.0 - eps * (1.0 + c) + + def mv(u): # M u = u + eps*(J_nc - (1+c)I) u + return k * u + eps * jvp(blk.nc_force, z, u)[1] + + V = cache.get('V') + if V is None or V[0].shape != z.shape or V[0].dtype != z.dtype or V[0].device != z.device: + V = [torch.randn_like(z), torch.randn_like(z)] + with torch.no_grad(): + for _ in range(iters): + W0, W1 = mv(V[0]), mv(V[1]) + W0 = W0 / (W0.norm() + 1e-12) # Gram-Schmidt: orthonormal 2-D block + W1 = W1 - (W1 * W0).sum() * W0 + W1 = W1 / (W1.norm() + 1e-12) + V = [W0, W1] + M0, M1 = mv(V[0]), mv(V[1]) + h00, h01 = float((V[0] * M0).sum()), float((V[0] * M1).sum()) # H = V^T M V (Rayleigh-Ritz) + h10, h11 = float((V[1] * M0).sum()), float((V[1] * M1).sum()) + tr, det = h00 + h11, h00 * h11 - h01 * h10 + disc = tr * tr / 4.0 - det + if disc >= 0: # two real Ritz values: take the larger |.| + l1, l2 = tr / 2.0 + disc ** 0.5, tr / 2.0 - disc ** 0.5 + lam_re, rho = (l1, abs(l1)) if abs(l1) >= abs(l2) else (l2, abs(l2)) + else: # complex pair a+-bi: |lam|=sqrt(det), Re=tr/2 + rho, lam_re = det ** 0.5, tr / 2.0 + cache['V'] = V + return V, rho, (lam_re - 1.0) / eps + + +def spec_penalty(blk, zs, eps, c, specreg, rho_target, cache, iters=6): + """Grads of the one-sided TRUE-leading-eigenvalue penalty R = specreg * relu(rho - rho_target)^2 + with rho = |lam_lead(I + eps*J_F)| via Rayleigh-Ritz on the converged (detached) 2-D subspace. + This is the aep-dynamics 'spectral' steering at C512, applied softly (one-sided, small specreg). + Returns ({id(p): grad}, rho, mu_re) — empty grads when rho below target.""" + V, rho0, mu_re = lead_rho(blk, zs, eps, c, cache, iters) + if rho0 <= rho_target: # contraction margin held: signal off + return {}, rho0, mu_re + z = zs.detach() + k = 1.0 - eps * (1.0 + c) + with torch.enable_grad(): # H(theta), V fixed (envelope/Rayleigh-Ritz) + M0 = k * V[0] + eps * jvp(blk.nc_force, z, V[0], create_graph=True)[1] + M1 = k * V[1] + eps * jvp(blk.nc_force, z, V[1], create_graph=True)[1] + h00, h01 = (V[0] * M0).sum(), (V[0] * M1).sum() + h10, h11 = (V[1] * M0).sum(), (V[1] * M1).sum() + tr, det = h00 + h11, h00 * h11 - h01 * h10 + disc = tr * tr / 4.0 - det + rho = torch.where(disc >= 0, + torch.maximum((tr / 2 + disc.clamp(min=0).sqrt()).abs(), + (tr / 2 - disc.clamp(min=0).sqrt()).abs()), + det.clamp(min=1e-12).sqrt()) + R = specreg * torch.relu(rho - rho_target) ** 2 + gr = torch.autograd.grad(R, blk.block, allow_unused=True) + return {id(p): g for p, g in zip(blk.block, gr) if g is not None}, rho0, mu_re + + def eig_penalty(blk, zs, eigreg, margin, cache, iters=3): - """Grads of the one-sided leading-abscissa penalty R = eigreg * relu(omega(J_nc) - margin)^2. + """[DIAGNOSTIC-ONLY — omega control refuted, see module docstring; use spec_penalty instead.] + Grads of the one-sided leading-abscissa penalty R = eigreg * relu(omega(J_nc) - margin)^2. Returns ({id(p): grad}, omega) — omega logged as the controller signal. Empty grads when below margin.""" v, lam0 = num_abscissa(blk, zs, cache, iters) if lam0 <= margin: # below the stability margin: leading signal off diff --git a/ep_run/eig_recheck.log b/ep_run/eig_recheck.log new file mode 100644 index 0000000..49cd2a9 --- /dev/null +++ b/ep_run/eig_recheck.log @@ -0,0 +1,34 @@ +/home/yurenh2/miniconda3/lib/python3.13/site-packages/torch/autograd/graph.py:865: UserWarning: Attempting to run cuBLAS, but there was no current CUDA context! Attempting to set the primary context... (Triggered internally at /pytorch/aten/src/ATen/cuda/CublasHandlePool.cpp:330.) + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass + [rand-init b0] lam_max= +5.089 lam_min= -5.024 oldPI= -0.182 newPI= +4.818 res=1.62e+01 + [rand-init b1] lam_max= +4.958 lam_min= -4.928 oldPI= -0.143 newPI= +4.819 res=1.11e+01 +[rand-init] (random init) lam_max=+5.023 lam_min=-4.976 oldPI=-0.163 <-- oldPI tracked lam_MIN (bug live) newPI=+4.819 res=1.36e+01 val=8.4893 + + [s2000-warmsrc b0] lam_max= +12.349 lam_min= -15.432 oldPI= -6.062 newPI= +12.121 res=2.58e+00 + [s2000-warmsrc b1] lam_max= +13.334 lam_min= -15.816 oldPI= -6.800 newPI= +13.303 res=2.72e+00 +[s2000-warmsrc] (step 1900 best 3.1321) lam_max=+12.841 lam_min=-15.624 oldPI=-6.431 <-- oldPI tracked lam_MIN (bug live) newPI=+12.712 res=2.65e+00 val=3.1028 + + [resreg-scratch b0] lam_max= +10.712 lam_min= -9.901 oldPI= +2.080 newPI= +10.408 res=6.64e+00 + [resreg-scratch b1] lam_max= +10.576 lam_min= -9.794 oldPI= +0.714 newPI= +10.566 res=5.84e+00 +[resreg-scratch] (step 9200 best 2.2156) lam_max=+10.644 lam_min=-9.847 oldPI=+1.397 (oldPI ~ lam_max, bug latent) newPI=+10.487 res=6.24e+00 val=2.2380 + + [fast-adaptive b0] lam_max= +7.474 lam_min= -7.285 oldPI= +2.351 newPI= +7.470 res=1.56e+00 + [fast-adaptive b1] lam_max= +8.313 lam_min= -7.637 oldPI= +2.472 newPI= +8.289 res=1.82e+00 +[fast-adaptive] (step 24400 best 2.1023) lam_max=+7.893 lam_min=-7.461 oldPI=+2.412 (oldPI ~ lam_max, bug latent) newPI=+7.879 res=1.69e+00 val=2.1817 + + [warm-fast-live b0] lam_max= +8.360 lam_min= -8.382 oldPI= -0.852 newPI= +8.216 res=3.32e+00 + [warm-fast-live b1] lam_max= +8.580 lam_min= -8.044 oldPI= +0.688 newPI= +8.327 res=2.10e+00 +[warm-fast-live] (step 4900 best 2.3064) lam_max=+8.470 lam_min=-8.213 oldPI=-0.082 <-- oldPI tracked lam_MIN (bug live) newPI=+8.272 res=2.71e+00 val=2.3378 + + [self-restart-live b0] lam_max= +11.042 lam_min= -10.908 oldPI= -0.257 newPI= +10.983 res=6.85e-01 + [self-restart-live b1] lam_max= +10.305 lam_min= -10.332 oldPI= +0.463 newPI= +10.079 res=1.33e+00 +[self-restart-live] (step 8600 best 2.3809) lam_max=+10.673 lam_min=-10.620 oldPI=+0.103 (oldPI ~ lam_max, bug latent) newPI=+10.531 res=1.01e+00 val=2.3913 + +operator lam_max lam_min oldPI newPI res val +rand-init +5.023 -4.976 -0.163 +4.819 1.36e+01 8.4893 # random init +s2000-warmsrc +12.841 -15.624 -6.431 +12.712 2.65e+00 3.1028 # step 1900 best 3.1321 +resreg-scratch +10.644 -9.847 +1.397 +10.487 6.24e+00 2.2380 # step 9200 best 2.2156 +fast-adaptive +7.893 -7.461 +2.412 +7.879 1.69e+00 2.1817 # step 24400 best 2.1023 +warm-fast-live +8.470 -8.213 -0.082 +8.272 2.71e+00 2.3378 # step 4900 best 2.3064 +self-restart-live +10.673 -10.620 +0.103 +10.531 1.01e+00 2.3913 # step 8600 best 2.3809 +EIG_RECHECK_DONE diff --git a/ep_run/eig_recheck.py b/ep_run/eig_recheck.py new file mode 100644 index 0000000..317a40d --- /dev/null +++ b/ep_run/eig_recheck.py @@ -0,0 +1,109 @@ +"""eig_recheck.py — gold-standard recheck of the warm/scratch abscissa story after the num_abscissa +shift fix. Plain power iteration (the pre-fix estimator) converges to the largest-|lambda| end of the +indefinite Sym(J_nc) — so the reported 's2000 = -10.14 vs scratch = +1.11' may have compared lambda_min +against lambda_max. For each operator this measures, at z* on the SAME seeded batches: + lam_max / lam_min — both ENDS of Sym(J_nc), Lanczos (scipy eigsh LA/SA, matvec-only, gold standard) + oldPI — what the pre-fix plain power iteration returned (bug reproduction) + newPI — the fixed shifted-PI estimate (validates the training-time estimator vs Lanczos) + res — T1-residual eps*||F(z*)|| (the lt_ep_train line-455 convention) + val — val CE (identifies the checkpoint on the training curve) +Verdict logic: oldPI ~= lam_min (when |lam_min| > lam_max) ==> the bug was live and the -10 story is +about the WRONG end; the warm/scratch contrast must be re-read off the lam_max column. +""" +import numpy as np, torch, scipy.sparse.linalg as sla +from pathlib import Path +from torch.autograd.functional import jvp, vjp +import lt_ep_train as L +from eig_control import num_abscissa + +T1, EPS, B, NB = 150, 0.1, 6, 2 + +CKPTS = [ # (label, path or None=random init) + ('rand-init', None), + ('s2000-warmsrc', 'runs/redx_traj/s2000.pt'), + ('resreg-scratch', 'runs/ep_resreg_scratch.pt'), + ('fast-adaptive', 'runs/ep_fast_adaptive.pt'), + ('warm-fast-live', 'runs/ep_warm_fast.pt'), + ('self-restart-live', 'runs/ep_self_restart.pt'), +] + + +def load_op(path): + torch.manual_seed(0) + blk = L.EQBlock(512, 16, 256, 256, c=1.0, attn_mode='thick') + blk.qknorm = True # canonical C512 recipe flag (not stored in ckpt) + if path is None: + return blk, 'random init' + for attempt in range(2): # live ckpts: retry once in case of a mid-save read + try: + ck = torch.load(path, map_location=L.dev); break + except Exception: + if attempt: raise + import time; time.sleep(5) + with torch.no_grad(): + for p, w in zip(blk.allp, ck['allp']): + p.copy_(w.to(L.dev)) + return blk, f"step {ck.get('step')} best {ck.get('best', float('nan')):.4f}" + + +def sym_ends(blk, z): # both ends of Sym(J_nc) via Lanczos + sh, n = z.shape, z.numel() + + def mv(x): + v = torch.from_numpy(np.asarray(x, dtype=np.float32)).to(L.dev).view(sh) + with torch.no_grad(): + Sv = 0.5 * (jvp(blk.nc_force, z, v)[1] + vjp(blk.nc_force, z, v)[1]) + return Sv.reshape(-1).double().cpu().numpy() + + A = sla.LinearOperator((n, n), matvec=mv, dtype=np.float64) + kw = dict(k=1, tol=1e-3, return_eigenvectors=False, maxiter=600) + return float(sla.eigsh(A, which='LA', **kw)[0]), float(sla.eigsh(A, which='SA', **kw)[0]) + + +def old_pi(blk, z, iters=3): # the PRE-FIX estimator, reproduced verbatim + torch.manual_seed(1) + v = torch.randn_like(z); v = v / (v.norm() + 1e-12) + with torch.no_grad(): + for _ in range(iters): + Sv = 0.5 * (jvp(blk.nc_force, z, v)[1] + vjp(blk.nc_force, z, v)[1]) + v = Sv / (Sv.norm() + 1e-12) + return float((v * jvp(blk.nc_force, z, v)[1]).sum() / (v * v).sum()) + + +def main(): + rows = [] + for name, path in CKPTS: + if path and not Path(path).exists(): + print(f"[skip] {name}: {path} missing", flush=True); continue + blk, info = load_op(path) + torch.manual_seed(42) # SAME batches for every operator + acc = {k: [] for k in ('la', 'sa', 'op', 'np', 'res')} + for b in range(NB): + idx, _ = L.get_batch('train', B, 256) + xin = blk.embed(idx).detach() + zs = L.relax(blk, xin.clone(), xin, T1, EPS) + res = (L.relax(blk, zs, xin, 1, EPS) - zs).norm().item() + la, sa = sym_ends(blk, zs) + opi = old_pi(blk, zs) + _, npi = num_abscissa(blk, zs, {}, iters=40) + for k, x in zip(('la', 'sa', 'op', 'np', 'res'), (la, sa, opi, npi, res)): + acc[k].append(x) + print(f" [{name} b{b}] lam_max={la:+8.3f} lam_min={sa:+8.3f} oldPI={opi:+8.3f} " + f"newPI={npi:+8.3f} res={res:.2e}", flush=True) + val = L.evaluate(blk, T1, EPS) + m = {k: sum(v) / len(v) for k, v in acc.items()} + rows.append((name, info, m, val)) + wrong_end = abs(m['op'] - m['sa']) < abs(m['op'] - m['la']) + print(f"[{name}] ({info}) lam_max={m['la']:+.3f} lam_min={m['sa']:+.3f} oldPI={m['op']:+.3f}" + f"{' <-- oldPI tracked lam_MIN (bug live)' if wrong_end else ' (oldPI ~ lam_max, bug latent)'}" + f" newPI={m['np']:+.3f} res={m['res']:.2e} val={val:.4f}\n", flush=True) + + print(f"{'operator':<20}{'lam_max':>9}{'lam_min':>9}{'oldPI':>9}{'newPI':>9}{'res':>10}{'val':>8}") + for name, info, m, val in rows: + print(f"{name:<20}{m['la']:>+9.3f}{m['sa']:>+9.3f}{m['op']:>+9.3f}{m['np']:>+9.3f}" + f"{m['res']:>10.2e}{val:>8.4f} # {info}") + print("EIG_RECHECK_DONE", flush=True) + + +if __name__ == '__main__': + main() diff --git a/ep_run/eig_v2_depth.log b/ep_run/eig_v2_depth.log new file mode 100644 index 0000000..3fd80fc --- /dev/null +++ b/ep_run/eig_v2_depth.log @@ -0,0 +1,6 @@ +/home/yurenh2/miniconda3/lib/python3.13/site-packages/torch/autograd/graph.py:865: UserWarning: Attempting to run cuBLAS, but there was no current CUDA context! Attempting to set the primary context... (Triggered internally at /pytorch/aten/src/ATen/cuda/CublasHandlePool.cpp:330.) + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[s2000 @ 150 steps] res=2.584e+00 |l|=1.00880(+1.0088+0.0000j) |l|=1.00615(+1.0061+0.0000j) |l|=1.00509(+1.0051+0.0000j) +[s2000 @ 400 steps] res=4.638e-02 |l|=0.99798(+0.9980+0.0000j) |l|=0.99788(+0.9979+0.0000j) |l|=0.99753(+0.9975+0.0000j) +[s2000 @ 800 steps] res=3.327e-03 |l|=0.99942(+0.9994+0.0000j) |l|=0.99804(+0.9980+0.0000j) |l|=0.99698(+0.9970+0.0000j) +EIG_V2_DEPTH_DONE diff --git a/ep_run/eig_v2_depth.py b/ep_run/eig_v2_depth.py new file mode 100644 index 0000000..ee597f1 --- /dev/null +++ b/ep_run/eig_v2_depth.py @@ -0,0 +1,39 @@ +"""Path-effect check: is the '|lam|~1.008 at s2000' reading an unconverged-state artifact? +Measure ARPACK leading map-eigenvalues at z after 150 vs 400 vs 800 relax steps (same batch). +If |lam| drops below 1 with depth -> the >1 reading is transient-state contamination and the +fixed-point operator is stable (consistent with eig_probe's Re mu = -0.02 at 250 steps).""" +import numpy as np, torch, scipy.sparse.linalg as sla +from torch.autograd.functional import jvp +import lt_ep_train as L + +EPS, B, C = 0.1, 6, 1.0 +torch.manual_seed(0) +blk = L.EQBlock(512, 16, 256, 256, c=C, attn_mode='thick'); blk.qknorm = True +ck = torch.load('runs/redx_traj/s2000.pt', map_location=L.dev) +with torch.no_grad(): + for p, w in zip(blk.allp, ck['allp']): + p.copy_(w.to(L.dev)) +torch.manual_seed(42) +idx, _ = L.get_batch('train', B, 256) +xin = blk.embed(idx).detach() +kk = 1.0 - EPS * (1.0 + C) + +z = xin.clone() +done = 0 +for steps in (150, 400, 800): + z = L.relax(blk, z, xin, steps - done, EPS); done = steps + res = (L.relax(blk, z, xin, 1, EPS) - z).norm().item() + sh, n = z.shape, z.numel() + + def mv(x, z=z): + v = torch.from_numpy(np.asarray(x, dtype=np.float32)).to(L.dev).view(sh) + with torch.no_grad(): + Mv = kk * v + EPS * jvp(blk.nc_force, z, v)[1] + return Mv.reshape(-1).double().cpu().numpy() + + A = sla.LinearOperator((n, n), matvec=mv, dtype=np.float64) + vals = sorted(sla.eigs(A, k=4, which='LM', return_eigenvectors=False, maxiter=2000, tol=1e-4), + key=lambda x: -abs(x)) + print(f"[s2000 @ {steps:4d} steps] res={res:.3e} " + + " ".join(f"|l|={abs(l):.5f}({l.real:+.4f}{l.imag:+.4f}j)" for l in vals[:3]), flush=True) +print("EIG_V2_DEPTH_DONE", flush=True) diff --git a/ep_run/eig_v2_smoke.log b/ep_run/eig_v2_smoke.log new file mode 100644 index 0000000..c5bee1b --- /dev/null +++ b/ep_run/eig_v2_smoke.log @@ -0,0 +1,13 @@ +/home/yurenh2/miniconda3/lib/python3.13/site-packages/torch/autograd/graph.py:865: UserWarning: Attempting to run cuBLAS, but there was no current CUDA context! Attempting to set the primary context... (Triggered internally at /pytorch/aten/src/ATen/cuda/CublasHandlePool.cpp:330.) + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +[s2000] lead_rho: rho=0.97381 Re_mu=-0.2619 +[s2000] ARPACK : |l|=1.00880(+1.0088+0.0000j) |l|=1.00615(+1.0061+0.0000j) |l|=1.00509(+1.0051+0.0000j) +[s2000] agree : d_rho=0.0350 d_Remu=0.3499 +[s2000] penalty : rho=0.97280 fired=False n_grads=0 finite=True + +[rand-init] lead_rho: rho=1.01879 Re_mu=+0.1871 +[rand-init] ARPACK : |l|=1.00767(+1.0077+0.0000j) |l|=1.00585(+1.0051+0.0379j) |l|=1.00585(+1.0051-0.0379j) +[rand-init] agree : d_rho=0.0111 d_Remu=0.1103 +[rand-init] penalty : rho=1.02209 fired=True n_grads=11 finite=True + +EIG_V2_SMOKE_DONE diff --git a/ep_run/eig_v2_smoke.py b/ep_run/eig_v2_smoke.py new file mode 100644 index 0000000..241be7d --- /dev/null +++ b/ep_run/eig_v2_smoke.py @@ -0,0 +1,56 @@ +"""Validate lead_rho/spec_penalty (eigreg v2, TRUE map-eigenvalue) against ARPACK on the same operator. +Checks, per operator (s2000 warm source + rand-init): + 1. lead_rho's (rho, Re mu) vs scipy eigs LM top eigenvalues of M = I + eps*J_F (gold standard) + 2. cross-check vs eig_probe's known s2000 number (Re mu ~ -0.02) + 3. spec_penalty fires (non-empty grads, finite) when rho > target, stays off when below +""" +import numpy as np, torch, scipy.sparse.linalg as sla +from torch.autograd.functional import jvp +import lt_ep_train as L +from eig_control import lead_rho, spec_penalty + +T1, EPS, B, C = 150, 0.1, 6, 1.0 + + +def load_op(path): + torch.manual_seed(0) + blk = L.EQBlock(512, 16, 256, 256, c=C, attn_mode='thick'); blk.qknorm = True + if path: + ck = torch.load(path, map_location=L.dev) + with torch.no_grad(): + for p, w in zip(blk.allp, ck['allp']): + p.copy_(w.to(L.dev)) + return blk + + +def arpack_map(blk, z, k=4): + sh, n = z.shape, z.numel() + kk = 1.0 - EPS * (1.0 + C) + + def mv(x): + v = torch.from_numpy(np.asarray(x, dtype=np.float32)).to(L.dev).view(sh) + with torch.no_grad(): + Mv = kk * v + EPS * jvp(blk.nc_force, z, v)[1] + return Mv.reshape(-1).double().cpu().numpy() + + A = sla.LinearOperator((n, n), matvec=mv, dtype=np.float64) + vals = sla.eigs(A, k=k, which='LM', return_eigenvectors=False, maxiter=2000, tol=1e-4) + return sorted(vals, key=lambda x: -abs(x)) + + +for name, path in [('s2000', 'runs/redx_traj/s2000.pt'), ('rand-init', None)]: + blk = load_op(path) + torch.manual_seed(42) + idx, _ = L.get_batch('train', B, 256) + xin = blk.embed(idx).detach() + zs = L.relax(blk, xin.clone(), xin, T1, EPS) + _, rho, mu = lead_rho(blk, zs, EPS, C, {}, iters=40) + top = arpack_map(blk, zs) + lam0 = top[0] + print(f"[{name}] lead_rho: rho={rho:.5f} Re_mu={mu:+.4f}", flush=True) + print(f"[{name}] ARPACK : " + " ".join(f"|l|={abs(l):.5f}({l.real:+.4f}{l.imag:+.4f}j)" for l in top[:3])) + print(f"[{name}] agree : d_rho={abs(rho - abs(lam0)):.4f} d_Remu={abs(mu - (lam0.real - 1) / EPS):.4f}") + g, r0, m0 = spec_penalty(blk, zs, EPS, C, 0.1, 0.995, {}, iters=40) + fin = all(torch.isfinite(v).all().item() for v in g.values()) if g else True + print(f"[{name}] penalty : rho={r0:.5f} fired={bool(g)} n_grads={len(g)} finite={fin}\n", flush=True) +print("EIG_V2_SMOKE_DONE", flush=True) diff --git a/ep_run/lt_ep_train.py b/ep_run/lt_ep_train.py index 9974bd8..4e7b8b1 100644 --- a/ep_run/lt_ep_train.py +++ b/ep_run/lt_ep_train.py @@ -229,9 +229,10 @@ def ep_step(blk, idx, y, T1, T2, eps, beta, jacreg=0.0, holo=0, hr=0.02, t1max=0 for p, g in zip(blk.block, grr): if g is not None: grads[id(p)] = g * lam if grads.get(id(p)) is None else grads[id(p)] + lam * g - if eigreg > 0: # #2: leading-abscissa control (surgical, one-sided; alt to jacreg) - from eig_control import eig_penalty - ge, _om = eig_penalty(blk, zs, eigreg, eig_margin, blk.__dict__.setdefault('_eigcache', {})) + if eigreg > 0: # #2 v2: TRUE leading map-eigenvalue control (aep 'spectral', soft one-sided) + from eig_control import spec_penalty # (omega/numerical-abscissa version refuted 2026-07-03, eig_recheck) + ge, _rho, _mu = spec_penalty(blk, zs, eps, blk.c, eigreg, eig_margin, + blk.__dict__.setdefault('_eigcache', {})) for pid, g in ge.items(): grads[pid] = g if grads.get(pid) is None else grads[pid] + g return grads, res @@ -375,8 +376,8 @@ def main(): ap.add_argument('--res_gate', type=float, default=0.0) # validity gate: skip task grads above this res ap.add_argument('--wsd', type=float, default=0.0) # WSD: hold peak lr, cosine-decay only the last wsd fraction ap.add_argument('--resreg', type=float, default=0.0) # T1-residual penalty: defend z_T1 (cap ratio vs task grad); run res_gate=0 - ap.add_argument('--eigreg', type=float, default=0.0) # #2: leading-abscissa (numerical-abscissa) control — surgical alt to jacreg - ap.add_argument('--eig_margin', type=float, default=1.0) # penalize omega(J_nc) above this (free-phase Hopf boundary ~ 1+c) + ap.add_argument('--eigreg', type=float, default=0.0) # #2 v2: soft penalty on TRUE |lam|_lead(I+eps*J_F) — aep 'spectral' at C512 + ap.add_argument('--eig_margin', type=float, default=0.995) # rho target: penalize |lam|_lead above this (<1 = contracting relaxation map) ap.add_argument('--diag_cos', type=int, default=0) # #1: every N steps, log cos(EP grad, exact BPTT grad) + res ap.add_argument('--fingerprint', action='store_true') # load --init_ckpt, print (res,cos,abscissa,val) fingerprint, exit ap.add_argument('--opt', choices=['adamw', 'lion', 'lionlars', 'sgdm', 'sgdsai'], default='adamw') @@ -522,7 +523,7 @@ def main(): from diag_cos import fingerprint fp = fingerprint(blk, cfg.T1, cfg.T2, cfg.eps, cfg.beta, cfg.holo, cfg.hr, cfg.t1max, cfg.res_est, cfg.t2sel) print(f"[fingerprint] ckpt={cfg.init_ckpt or 'scratch'} | res={fp['res']:.2e} cos(EP,BPTT)={fp['cos']:.4f} " - f"num_abscissa={fp['num_abscissa']:+.4f} val={fp['val']:.4f}", flush=True) + f"rho={fp['rho']:.5f} Re_mu={fp['mu_re']:+.4f} val={fp['val']:.4f}", flush=True) return for step in range(start_step, cfg.steps + 1): idx, y = get_batch('train', cfg.B, cfg.T) -- cgit v1.2.3