diff options
| author | YurenHao0426 <blackhao0426@gmail.com> | 2025-12-17 04:29:37 -0600 |
|---|---|---|
| committer | YurenHao0426 <blackhao0426@gmail.com> | 2025-12-17 04:29:37 -0600 |
| commit | e43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 (patch) | |
| tree | 6ce8a00d2f8b9ebd83c894a27ea01ac50cfb2ff5 /scripts/build_item_space.py | |
Diffstat (limited to 'scripts/build_item_space.py')
| -rw-r--r-- | scripts/build_item_space.py | 52 |
1 files changed, 52 insertions, 0 deletions
#!/usr/bin/env python3
"""
Script to build Item Space (PCA Projection) from Memory Embeddings.

Inputs:
- data/corpora/memory_embeddings.npy (M x 4096)
Outputs:
- data/corpora/item_projection.npz (P, mean, V)

The default paths are relative to the current working directory, so either
run the script from the repository root or pass explicit paths, e.g.:

    python scripts/build_item_space.py --emb-path E.npy --out-path proj.npz -k 128
"""

import argparse
import os
import sys

import numpy as np

# Add src to sys.path so the `personalization` package can be imported when
# the script is executed directly from a checkout.
sys.path.append(os.path.join(os.path.dirname(__file__), "../src"))

DEFAULT_EMB_PATH = "data/corpora/memory_embeddings.npy"
DEFAULT_OUT_PATH = "data/corpora/item_projection.npz"
DEFAULT_K = 256


def _fail(message: str) -> None:
    """Print *message* to stderr and terminate the script with exit status 1."""
    print(f"Error: {message}", file=sys.stderr)
    sys.exit(1)


def main(
    emb_path: str = DEFAULT_EMB_PATH,
    out_path: str = DEFAULT_OUT_PATH,
    k: int = DEFAULT_K,
) -> None:
    """Fit a PCA item projection on the stored embeddings and save it.

    Args:
        emb_path: Path of the ``.npy`` file holding the embedding matrix
            (one embedding per row).
        out_path: Path of the ``.npz`` archive to write. It receives the
            arrays ``P`` and ``mean`` taken from the fitted projection and
            ``V``, the embeddings transformed into item space.
        k: Target dimension passed to ``ItemProjection.from_pca``.

    Raises:
        SystemExit: With status 1 when ``emb_path`` does not exist, ``k`` is
            not positive, or the loaded array is not 2-dimensional.
    """
    if not os.path.exists(emb_path):
        _fail(f"{emb_path} not found. Run migrate_preferences.py first.")
    if k <= 0:
        _fail(f"k must be a positive integer, got {k}.")

    print(f"Loading embeddings from {emb_path}...")
    E = np.load(emb_path)
    print(f"Loaded shape: {E.shape}")

    # Reject malformed input here instead of failing somewhere inside the
    # PCA code with a less helpful message.
    if E.ndim != 2:
        _fail(f"expected a 2-D embedding matrix, got shape {E.shape}.")
    if k > min(E.shape):
        # Not fatal: how from_pca copes with this is not visible from this
        # script, so only warn. PCA cannot yield more than min(rows, cols)
        # non-trivial components.
        print(
            f"Warning: k={k} exceeds min(E.shape)={min(E.shape)}; "
            "the projection may have fewer than k meaningful components.",
            file=sys.stderr,
        )

    # Imported lazily: the package is only needed once the inputs have been
    # validated, so input errors are reported even before it is loaded.
    from personalization.user_model.features import ItemProjection

    print(f"Fitting PCA with k={k}...")
    proj = ItemProjection.from_pca(E, k=k)

    print("Transforming all embeddings to item space...")
    V = proj.transform_embeddings(E)
    print(f"Item vectors shape: {V.shape}")

    print(f"Saving projection to {out_path}...")
    # Make sure the destination directory exists when a custom path is used.
    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
    np.savez(
        out_path,
        P=proj.P,
        mean=proj.mean,
        V=V,
    )
    print("Done.")


def _parse_args(argv=None) -> argparse.Namespace:
    """Parse command-line options; every option defaults to the original constants."""
    parser = argparse.ArgumentParser(
        description="Build the item space (PCA projection) from memory embeddings."
    )
    parser.add_argument(
        "--emb-path",
        default=DEFAULT_EMB_PATH,
        help="input .npy embedding matrix (default: %(default)s)",
    )
    parser.add_argument(
        "--out-path",
        default=DEFAULT_OUT_PATH,
        help="output .npz archive (default: %(default)s)",
    )
    parser.add_argument(
        "-k",
        type=int,
        default=DEFAULT_K,
        help="target PCA dimension (default: %(default)s)",
    )
    return parser.parse_args(argv)


if __name__ == "__main__":
    _args = _parse_args()
    main(emb_path=_args.emb_path, out_path=_args.out_path, k=_args.k)
