path: root/src/personalization/user_model/features.py
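"""
PCA projection of item embeddings down to a fixed k dimensions.

ItemProjection.from_pca fits PCA on an [M, d] embedding matrix; the
resulting projection maps d-dimensional vectors to exactly k dimensions,
zero-padding the component matrix when the data cannot support k
components.
"""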
from dataclasses import dataclass

import numpy as np
from sklearn.decomposition import PCA

@dataclass
class ItemProjection:
    """Linear projection learned via PCA: x -> P @ (x - mean)."""

    P: np.ndarray     # principal-component matrix, [k, d]
    mean: np.ndarray  # mean of the training embeddings, [d]

    @classmethod
    def from_pca(cls, embeddings: np.ndarray, k: int) -> "ItemProjection":
        """
        Fit a k-component projection from an [M, d] embedding matrix.

        If the data supports fewer than k components, the component
        matrix is zero-padded so the output dimensionality stays k.
        """
        mean = embeddings.mean(axis=0)
        centered = embeddings - mean
        
        # PCA requires n_components <= min(n_samples, n_features), so clamp k.
        n_samples, n_features = embeddings.shape
        actual_k = min(k, n_samples, n_features)

        # PCA re-centers internally, which is harmless here: the pre-centered
        # matrix already has (near-)zero mean.
        pca = PCA(n_components=actual_k)
        pca.fit(centered)

        # pca.components_: [actual_k, d]; each row is a principal axis.
        P = pca.components_
        
        # The rest of the system expects a fixed k, so when the data supports
        # fewer components (actual_k < k) we pad P with zero rows: the extra
        # output dimensions are always zero rather than missing.
        if actual_k < k:
            padding = np.zeros((k - actual_k, n_features), dtype=P.dtype)
            P = np.vstack([P, padding])
            
        return cls(P=P, mean=mean)

    def transform_embeddings(self, E: np.ndarray) -> np.ndarray:
        """
        Project a batch of embeddings: [N, d] -> [N, k].

        Computed as (E - mean) @ P.T: center each row, then project it
        onto the principal axes.
        """
        return (E - self.mean) @ self.P.T

    def transform_vector(self, e: np.ndarray) -> np.ndarray:
        """
        Project a single embedding: [d] -> [k].

        Equivalent to transform_embeddings(e[None, :])[0].
        """
        return self.P @ (e - self.mean)
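
# Minimal usage sketch (not part of the original module): the data is
# synthetic and the shapes below (M=100, d=16, k=4) are illustrative
# assumptions, chosen only to exercise the fit / transform paths and the
# zero-padding fallback.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    E = rng.normal(size=(100, 16))          # [M, d] item embeddings

    proj = ItemProjection.from_pca(E, k=4)  # P: [4, 16], mean: [16]

    reduced = proj.transform_embeddings(E)  # [100, 4]
    assert reduced.shape == (100, 4)

    # The single-vector path agrees with row 0 of the batched path.
    assert np.allclose(proj.transform_vector(E[0]), reduced[0])

    # Degenerate case: only 3 samples, so PCA can fit at most 3 components;
    # P is zero-padded back up to k=4 rows to keep the output width fixed.
    small = rng.normal(size=(3, 16))
    proj_small = ItemProjection.from_pca(small, k=4)
    assert proj_small.P.shape == (4, 16)
    assert proj_small.transform_vector(small[0]).shape == (4,)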