Diffstat (limited to 'scripts/analyze_memory_coverage.py')
-rw-r--r--  scripts/analyze_memory_coverage.py  103
1 file changed, 103 insertions(+), 0 deletions(-)
diff --git a/scripts/analyze_memory_coverage.py b/scripts/analyze_memory_coverage.py
new file mode 100644
index 0000000..e7ec498
--- /dev/null
+++ b/scripts/analyze_memory_coverage.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""
+Script to analyze Memory Card coverage statistics.
+"""
+import sys
+import os
+import json
+import numpy as np
+from collections import defaultdict
+
+# Add src to sys.path
+sys.path.append(os.path.join(os.path.dirname(__file__), "../src"))
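+# (Assumes this script lives in scripts/ with the package sources under ../src.)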
+
+from personalization.retrieval.preference_store.schemas import MemoryCard
+
+def main():
+ cards_path = "data/personamem/memory_cards.jsonl"
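+    # Each line is assumed to be a standalone JSON object with at least a
+    # "user_id" field, e.g. {"user_id": "persona_001", "text": "..."}.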
+
+    if not os.path.exists(cards_path):
+        print(f"Error: {cards_path} not found.", file=sys.stderr)
+        sys.exit(1)
+
+ print(f"Loading memory cards from {cards_path}...")
+
+ cards_by_user = defaultdict(int)
+ total_cards = 0
+
+ with open(cards_path, "r") as f:
+ for line in f:
+ try:
+ card = json.loads(line)
+ uid = card.get("user_id")
+ if uid:
+ cards_by_user[uid] += 1
+ total_cards += 1
+            except json.JSONDecodeError:
+                continue  # Skip malformed lines
+
+    # The denominator should be the TOTAL number of personas, including those
+    # with zero memory cards. Infer it from the shared-contexts file when it
+    # exists; otherwise fall back to counting only users that have cards.
+
+ ctx_path = "data/raw_datasets/personamem/shared_contexts_32k.jsonl"
+ total_personas = 0
+ if os.path.exists(ctx_path):
+ with open(ctx_path, "r") as f:
+ for line in f:
+                data = json.loads(line)
+                # personamem_loader treats each line as a dict {cid: msgs},
+                # so the number of personas is the total number of keys.
+                total_personas += len(data)
+ else:
+ print("Warning: Context file not found, can't calculate 0-memory users accurately.")
+ total_personas = len(cards_by_user) # Fallback
+
+ users_with_memory = len(cards_by_user)
+ users_without_memory = total_personas - users_with_memory
+
+ counts = list(cards_by_user.values())
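+    # Pad with explicit zeros so the distribution stats also cover users without cards.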
+ if users_without_memory > 0:
+ counts.extend([0] * users_without_memory)
+
+ print("\n" + "="*40)
+ print("Memory Coverage Statistics")
+ print("="*40)
+ print(f"Total Personas (Est): {total_personas}")
+ print(f"Total Memory Cards: {total_cards}")
+ print(f"Users with Memory: {users_with_memory} ({users_with_memory/total_personas*100:.2f}%)")
+ print(f"Users w/o Memory: {users_without_memory} ({users_without_memory/total_personas*100:.2f}%)")
+ print("-" * 40)
+
+ if counts:
+ avg_cards = np.mean(counts)
+ median_cards = np.median(counts)
+ max_cards = np.max(counts)
+
+ print(f"Avg Cards/User: {avg_cards:.2f}")
+ print(f"Median Cards/User: {median_cards:.2f}")
+ print(f"Max Cards/User: {max_cards}")
+
+ # Percentiles
+ p25, p75 = np.percentile(counts, [25, 75])
+ print(f"25th Percentile: {p25:.2f}")
+ print(f"75th Percentile: {p75:.2f}")
+
+ print("\nDistribution:")
+
+    # Report users with exactly zero cards separately from the binned histogram.
+    zero_count = counts.count(0)
+
+ print(f" 0 : {zero_count}")
+ # Custom bins for >0
+    non_zero_counts = [c for c in counts if c > 0]
+    if non_zero_counts:
+        # Fixed bins below 50, then an open-ended bucket so very large counts are not dropped.
+        bins = [1, 5, 10, 20, 50, max(max(non_zero_counts) + 1, 51)]
+        hist_nz, edges = np.histogram(non_zero_counts, bins=bins)
+        for i in range(len(hist_nz)):
+            is_last = i == len(hist_nz) - 1
+            range_str = f"{int(edges[i])}+" if is_last else f"{int(edges[i])}-{int(edges[i+1]) - 1}"
+            print(f"  {range_str:<8}: {hist_nz[i]}")
+
+if __name__ == "__main__":
+ main()
+