#!/usr/bin/env python3
"""
Script to analyze Memory Card coverage statistics.

Reads the memory-card JSONL file, counts cards per ``user_id``, estimates the
total number of personas from the shared-contexts file (when it exists), and
prints summary statistics plus a bucketed distribution of cards per user.
"""

import sys
import os
import json
import numpy as np
from collections import defaultdict

# Add src to sys.path
sys.path.append(os.path.join(os.path.dirname(__file__), "../src"))

# Not referenced below; kept as in the original script.
from personalization.retrieval.preference_store.schemas import MemoryCard  # noqa: E402,F401

DEFAULT_CARDS_PATH = "data/personamem/memory_cards.jsonl"
DEFAULT_CTX_PATH = "data/raw_datasets/personamem/shared_contexts_32k.jsonl"

# Edges of the non-zero histogram buckets: 1-4, 5-9, 10-19, 20-49, 50-999.
# The last edge is raised at runtime if a user has 1000 or more cards.
_HISTOGRAM_EDGES = (1, 5, 10, 20, 50, 1000)


def _load_card_counts(cards_path):
    """Count memory cards per user in a JSONL file.

    Each non-blank line is parsed as JSON and its ``user_id`` is tallied.
    Lines that cannot be parsed, that are not JSON objects, or whose
    ``user_id`` cannot be used as a dict key are skipped rather than aborting
    the whole run.

    Returns:
        A tuple ``(cards_by_user, total_cards, skipped)`` where
        ``cards_by_user`` maps user id -> card count, ``total_cards`` is the
        number of lines accepted (including cards without a ``user_id``), and
        ``skipped`` is the number of malformed lines.
    """
    cards_by_user = defaultdict(int)
    total_cards = 0
    skipped = 0
    with open(cards_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                card = json.loads(line)
                uid = card.get("user_id")
                if uid:
                    cards_by_user[uid] += 1
            except (ValueError, AttributeError, TypeError):
                # ValueError: invalid JSON; AttributeError: not an object;
                # TypeError: unhashable user_id.
                skipped += 1
                continue
            total_cards += 1
    return cards_by_user, total_cards, skipped


def _count_context_entries(ctx_path):
    """Sum the number of top-level entries over every line of the context file.

    NOTE(review): each line is presumed to be a dict ``{context_id: msgs}``
    and each key is presumed to correspond to one persona -- confirm against
    the personamem loader.

    Returns:
        A tuple ``(total_entries, skipped)``; ``skipped`` counts lines that
        were not valid JSON or had no length.
    """
    total_entries = 0
    skipped = 0
    with open(ctx_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                total_entries += len(json.loads(line))
            except (ValueError, TypeError):
                skipped += 1
    return total_entries, skipped


def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
    return part / whole * 100 if whole else 0.0


def _print_distribution(counts):
    """Print summary statistics and a bucketed histogram of cards per user."""
    avg_cards = np.mean(counts)
    median_cards = np.median(counts)
    max_cards = np.max(counts)

    print(f"Avg Cards/User: {avg_cards:.2f}")
    print(f"Median Cards/User: {median_cards:.2f}")
    print(f"Max Cards/User: {max_cards}")

    # Percentiles
    p25, p75 = np.percentile(counts, [25, 75])
    print(f"25th Percentile: {p25:.2f}")
    print(f"75th Percentile: {p75:.2f}")

    print("\nDistribution:")
    # Zero is reported on its own line, outside the histogram buckets.
    zero_count = counts.count(0)
    print(f" 0 : {zero_count}")

    non_zero_counts = [c for c in counts if c > 0]
    if not non_zero_counts:
        return

    # np.histogram ignores values beyond the last edge, so stretch the final
    # bucket far enough to hold the largest observed count.
    bins = list(_HISTOGRAM_EDGES)
    bins[-1] = max(bins[-1], max(non_zero_counts) + 1)
    hist_nz, edges = np.histogram(non_zero_counts, bins=bins)
    for low, high, bucket_size in zip(edges[:-1], edges[1:], hist_nz):
        range_str = f"{int(low)}-{int(high - 1)}"
        print(f" {range_str:<8}: {bucket_size}")


def main(cards_path=DEFAULT_CARDS_PATH, ctx_path=DEFAULT_CTX_PATH):
    """Print memory-card coverage statistics.

    Args:
        cards_path: JSONL file with one memory card per line.
        ctx_path: JSONL contexts file used to estimate the total number of
            personas. When it is missing, only users that own at least one
            card are counted.
    """
    if not os.path.exists(cards_path):
        print(f"Error: {cards_path} not found.")
        return

    print(f"Loading memory cards from {cards_path}...")
    cards_by_user, total_cards, skipped_cards = _load_card_counts(cards_path)
    if skipped_cards:
        print(f"Warning: skipped {skipped_cards} malformed line(s) in {cards_path}.")

    users_with_memory = len(cards_by_user)

    # The denominator should include personas with 0 cards, so prefer the
    # contexts file when it is available.
    if os.path.exists(ctx_path):
        total_personas, skipped_ctx = _count_context_entries(ctx_path)
        if skipped_ctx:
            print(f"Warning: skipped {skipped_ctx} malformed line(s) in {ctx_path}.")
    else:
        print("Warning: Context file not found, can't calculate 0-memory users accurately.")
        total_personas = users_with_memory  # Fallback

    if total_personas < users_with_memory:
        # A smaller denominator would give a negative "without memory" count
        # and percentages above 100.
        print(
            f"Warning: context file yields {total_personas} personas but "
            f"{users_with_memory} users have memory cards; using the latter."
        )
        total_personas = users_with_memory

    users_without_memory = total_personas - users_with_memory

    counts = list(cards_by_user.values())
    counts.extend([0] * users_without_memory)

    print("\n" + "=" * 40)
    print("Memory Coverage Statistics")
    print("=" * 40)
    print(f"Total Personas (Est): {total_personas}")
    print(f"Total Memory Cards: {total_cards}")
    print(f"Users with Memory: {users_with_memory} ({_percent(users_with_memory, total_personas):.2f}%)")
    print(f"Users w/o Memory: {users_without_memory} ({_percent(users_without_memory, total_personas):.2f}%)")
    print("-" * 40)

    if counts:
        _print_distribution(counts)


if __name__ == "__main__":
    main()