summaryrefslogtreecommitdiff
path: root/scripts/build_item_space.py
diff options
context:
space:
mode:
authorYurenHao0426 <blackhao0426@gmail.com>2025-12-17 04:29:37 -0600
committerYurenHao0426 <blackhao0426@gmail.com>2025-12-17 04:29:37 -0600
commite43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 (patch)
tree6ce8a00d2f8b9ebd83c894a27ea01ac50cfb2ff5 /scripts/build_item_space.py
Initial commit (clean history)HEADmain
Diffstat (limited to 'scripts/build_item_space.py')
-rw-r--r--scripts/build_item_space.py52
1 files changed, 52 insertions, 0 deletions
diff --git a/scripts/build_item_space.py b/scripts/build_item_space.py
new file mode 100644
index 0000000..c98238c
--- /dev/null
+++ b/scripts/build_item_space.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+"""
+Script to build Item Space (PCA Projection) from Memory Embeddings.
+Inputs:
+- data/corpora/memory_embeddings.npy (M x 4096)
+Outputs:
+- data/corpora/item_projection.npz (P, mean, V)
+"""
+
+import sys
+import os
+import numpy as np
+
+# Add src to sys.path
+sys.path.append(os.path.join(os.path.dirname(__file__), "../src"))
+
+from personalization.user_model.features import ItemProjection
+
def main(
    emb_path: str = "data/corpora/memory_embeddings.npy",
    out_path: str = "data/corpora/item_projection.npz",
    k: int = 256,
) -> None:
    """Fit the item-space projection and save it with the projected vectors.

    Loads the embedding matrix stored at ``emb_path``, builds a projection
    with ``ItemProjection.from_pca``, applies it to every embedding via
    ``transform_embeddings``, and writes the arrays ``P``, ``mean`` and ``V``
    to ``out_path`` with ``np.savez``.

    Calling ``main()`` with no arguments uses the same paths and dimension
    the script has always used.

    Args:
        emb_path: ``.npy`` file holding the memory embeddings. Relative paths
            are resolved against the current working directory.
        out_path: Destination archive. ``np.savez`` appends ``.npz`` when the
            name does not already end with it.
        k: Target dimension handed to ``ItemProjection.from_pca``.

    Raises:
        SystemExit: With status 1 when ``emb_path`` does not exist.
    """
    if not os.path.exists(emb_path):
        # Diagnostics go to stderr so they are not mixed into piped stdout.
        print(
            f"Error: {emb_path} not found. Run migrate_preferences.py first.",
            file=sys.stderr,
        )
        sys.exit(1)

    print(f"Loading embeddings from {emb_path}...")
    embeddings = np.load(emb_path)
    print(f"Loaded shape: {embeddings.shape}")

    print(f"Fitting PCA with k={k}...")
    # NOTE(review): ItemProjection is defined outside this file; presumably
    # from_pca expects a 2-D (items x features) array with at least k usable
    # components -- confirm against personalization.user_model.features.
    proj = ItemProjection.from_pca(embeddings, k=k)

    print("Transforming all embeddings to item space...")
    item_vectors = proj.transform_embeddings(embeddings)
    print(f"Item vectors shape: {item_vectors.shape}")

    # Create the target directory up front so a missing folder cannot throw
    # away the result of the fit at the very last step.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    print(f"Saving projection to {out_path}...")
    np.savez(
        out_path,
        P=proj.P,
        mean=proj.mean,
        V=item_vectors,
    )
    print("Done.")


if __name__ == "__main__":
    main()
+