summaryrefslogtreecommitdiff
path: root/ep_run/lt_ep_compare.py
diff options
context:
space:
mode:
authorYuren Hao <yurenh2@illinois.edu>2026-07-03 05:56:50 -0500
committerYuren Hao <yurenh2@illinois.edu>2026-07-03 05:56:50 -0500
commitb83947778e2c776f757a07d4719b7ce961d7ed55 (patch)
treeb9cc01d7adda691d9156d9d04f4fb2f644674e96 /ep_run/lt_ep_compare.py
Initial commit: ept — backprop-free equilibrium transformer (EP)
Code (ep_run/), organized docs (docs/{method,campaign,hardware,outreach,paper}), analysis scripts (scripts/), ONBOARDING.md entry point. Large data/checkpoints git-ignored (share separately). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_014FAPDWQ49M5Ye3NpTndTpn
Diffstat (limited to 'ep_run/lt_ep_compare.py')
-rw-r--r--ep_run/lt_ep_compare.py69
1 files changed, 69 insertions, 0 deletions
diff --git a/ep_run/lt_ep_compare.py b/ep_run/lt_ep_compare.py
new file mode 100644
index 0000000..0159855
--- /dev/null
+++ b/ep_run/lt_ep_compare.py
@@ -0,0 +1,69 @@
+"""option 2: compare local attention-gradient quality vs true BP on the LM's attention.
+Uses the project's OWN LocalCausalSelfAttention: FA (feedback alignment) and fuse_attn_local
+(the hand-derived SoftmaxValueMixLocalFn local backward). Reports attention-param grad cosine
+vs BP, to set against the AEP result (0.993) from the equilibrium reformulation."""
+import math, pickle, numpy as np, torch, torch.nn.functional as F
+from pathlib import Path
+from model_local import LocalCausalSelfAttention, LocalGPTConfig
+
+dev = 'cuda' if torch.cuda.is_available() else 'cpu'
+torch.manual_seed(0)
+DD = Path('/tmp/lt_ep/data/shakespeare_char')
+vocab = pickle.load(open(DD / 'meta.pkl', 'rb'))['vocab_size']
+B, T, C, H = 16, 64, 128, 4
+
+
+def get_batch():
+ data = np.memmap(DD / 'train.bin', dtype=np.uint16, mode='r')
+ ix = torch.randint(len(data) - T - 1, (B,))
+ x = torch.stack([torch.from_numpy(data[i:i + T].astype(np.int64)) for i in ix])
+ y = torch.stack([torch.from_numpy(data[i + 1:i + 1 + T].astype(np.int64)) for i in ix])
+ return x.to(dev), y.to(dev)
+
+
+idx, y = get_batch()
+tok = torch.randn(vocab, C, device=dev) * 0.02
+pos = torch.randn(T, C, device=dev) * 0.02
+EMB = (tok[idx] + pos[None]).detach()
+Whead = torch.randn(C, vocab, device=dev) / math.sqrt(C)
+
+
+def make(method, fuse):
+ cfg = LocalGPTConfig(block_size=T, vocab_size=vocab, n_head=H, n_embd=C,
+ attn_mode='softmax', method=method, fuse_attn_local=fuse, dropout=0.0, bias=False)
+ return LocalCausalSelfAttention(cfg).to(dev)
+
+
+bp = make('bp', False)
+fa = make('fa', False)
+fuse = make('bp', True)
+# identical weights across the three so gradients are comparable
+for m in (fa, fuse):
+ for p in ('q_proj', 'k_proj', 'v_proj', 'o_proj'):
+ getattr(m, p).weight.data.copy_(getattr(bp, p).weight.data)
+
+
+def grads(model):
+ for p in model.parameters():
+ if p.grad is not None:
+ p.grad = None
+ o = model(EMB)
+ F.cross_entropy((o @ Whead).reshape(-1, vocab), y.reshape(-1)).backward()
+ return [getattr(model, p).weight.grad for p in ('q_proj', 'k_proj', 'v_proj', 'o_proj')]
+
+
+def cos(g, gb):
+ cs = [F.cosine_similarity(a.flatten(), b.flatten(), dim=0).item() for a, b in zip(g, gb)]
+ return sum(cs) / len(cs), cs
+
+
+gb = grads(bp)
+gfa = grads(fa)
+gfuse = grads(fuse)
+names = ('WQ', 'WK', 'WV', 'WO')
+mfa, cfa = cos(gfa, gb)
+mfu, cfu = cos(gfuse, gb)
+print("FEEDFORWARD attention (project's own code), grad cosine vs BP on Shakespeare:")
+print(f" FA (feedback align) : mean {mfa:+.3f} " + " ".join(f"{n}={c:+.2f}" for n, c in zip(names, cfa)))
+print(f" fuse_attn_local (SoftmaxValueMixLocalFn): mean {mfu:+.3f} " + " ".join(f"{n}={c:+.2f}" for n, c in zip(names, cfu)))
+print("\n(compare to AEP on equilibrium attention: mean +0.993)")