diff options
| author | YurenHao0426 <blackhao0426@gmail.com> | 2025-12-17 04:29:37 -0600 |
|---|---|---|
| committer | YurenHao0426 <blackhao0426@gmail.com> | 2025-12-17 04:29:37 -0600 |
| commit | e43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 (patch) | |
| tree | 6ce8a00d2f8b9ebd83c894a27ea01ac50cfb2ff5 /scripts/analyze_memory_coverage.py | |
Diffstat (limited to 'scripts/analyze_memory_coverage.py')
| -rw-r--r-- | scripts/analyze_memory_coverage.py | 103 |
1 files changed, 103 insertions, 0 deletions
#!/usr/bin/env python3
"""
Script to analyze Memory Card coverage statistics.

Reads a JSONL file of memory cards, counts cards per ``user_id``, estimates
the total number of personas from a JSONL contexts file (when present) and
prints summary statistics plus a coarse distribution of cards per user.
"""
import json
import os
import sys
from collections import defaultdict

import numpy as np

# Add src to sys.path
sys.path.append(os.path.join(os.path.dirname(__file__), "../src"))

try:
    # Not referenced anywhere in this script; tolerate its absence so the
    # statistics can still be computed where the package is not importable.
    from personalization.retrieval.preference_store.schemas import MemoryCard  # noqa: F401
except ImportError:
    MemoryCard = None

DEFAULT_CARDS_PATH = "data/personamem/memory_cards.jsonl"
DEFAULT_CTX_PATH = "data/raw_datasets/personamem/shared_contexts_32k.jsonl"

# Lower edges of the non-zero histogram bins; the last bin is open-ended.
_HIST_LOWER_EDGES = (1, 5, 10, 20, 50)


def _load_card_counts(cards_path):
    """Count memory cards per user in a JSONL file.

    Returns a tuple ``(cards_by_user, total_cards, skipped)`` where
    ``cards_by_user`` maps each truthy ``user_id`` to its number of cards,
    ``total_cards`` is the number of cards that carried a truthy ``user_id``
    and ``skipped`` is the number of non-blank lines that could not be used.
    """
    cards_by_user = defaultdict(int)
    total_cards = 0
    skipped = 0
    with open(cards_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                card = json.loads(line)
                uid = card.get("user_id")
                if uid:
                    cards_by_user[uid] += 1
                    total_cards += 1
            except (ValueError, AttributeError, TypeError):
                # Malformed JSON, a non-object line, or an unhashable user_id:
                # stay best-effort, but keep track of what was dropped.
                skipped += 1
    return cards_by_user, total_cards, skipped


def _count_context_entries(ctx_path):
    """Sum ``len()`` of every JSON value in a JSONL contexts file.

    Returns ``(total, skipped)``. NOTE(review): the original author's comment
    says personamem_loader treats each line as ``{cid: msgs}``, so ``total``
    is a count of context ids used as an estimate of personas -- confirm.
    """
    total = 0
    skipped = 0
    with open(ctx_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                total += len(json.loads(line))
            except (ValueError, TypeError):
                skipped += 1
    return total, skipped


def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole`` (0.0 when ``whole`` is 0)."""
    return part / whole * 100 if whole else 0.0


def _histogram_rows(non_zero_counts):
    """Bucket positive counts into ``(label, count)`` rows.

    The upper edge of the last bin is derived from the data so that no user
    is dropped, however many cards they have.
    """
    upper = max(max(non_zero_counts), _HIST_LOWER_EDGES[-1]) + 1
    edges = list(_HIST_LOWER_EDGES) + [upper]
    hist, _ = np.histogram(non_zero_counts, bins=edges)
    labels = [
        f"{lo}-{hi - 1}"
        for lo, hi in zip(_HIST_LOWER_EDGES, _HIST_LOWER_EDGES[1:])
    ]
    labels.append(f"{_HIST_LOWER_EDGES[-1]}+")
    return [(label, int(n)) for label, n in zip(labels, hist)]


def main(cards_path=DEFAULT_CARDS_PATH, ctx_path=DEFAULT_CTX_PATH):
    """Print memory-card coverage statistics to stdout.

    Args:
        cards_path: JSONL file with one memory card per line; each card is
            expected to be an object with a ``user_id`` key.
        ctx_path: JSONL contexts file used to estimate the total number of
            personas. When it does not exist, the number of users that have
            at least one card is used instead.

    Returns:
        None. If ``cards_path`` does not exist an error is printed and the
        function returns without further output.
    """
    if not os.path.exists(cards_path):
        print(f"Error: {cards_path} not found.")
        return

    print(f"Loading memory cards from {cards_path}...")
    cards_by_user, total_cards, skipped_cards = _load_card_counts(cards_path)
    if skipped_cards:
        print(f"Warning: skipped {skipped_cards} malformed line(s) in {cards_path}.")

    users_with_memory = len(cards_by_user)

    # The denominator must include personas with 0 cards, so prefer the
    # contexts file over the card file when it is available.
    if os.path.exists(ctx_path):
        total_personas, skipped_ctx = _count_context_entries(ctx_path)
        if skipped_ctx:
            print(f"Warning: skipped {skipped_ctx} malformed line(s) in {ctx_path}.")
    else:
        print("Warning: Context file not found, can't calculate 0-memory users accurately.")
        total_personas = users_with_memory  # Fallback

    if total_personas < users_with_memory:
        # A smaller denominator would yield a negative "without memory" count.
        print("Warning: Context file lists fewer entries than users with memory; using the user count.")
        total_personas = users_with_memory

    users_without_memory = total_personas - users_with_memory

    counts = list(cards_by_user.values())
    counts.extend([0] * users_without_memory)

    print("\n" + "=" * 40)
    print("Memory Coverage Statistics")
    print("=" * 40)
    print(f"Total Personas (Est): {total_personas}")
    print(f"Total Memory Cards: {total_cards}")
    print(f"Users with Memory: {users_with_memory} ({_percent(users_with_memory, total_personas):.2f}%)")
    print(f"Users w/o Memory: {users_without_memory} ({_percent(users_without_memory, total_personas):.2f}%)")
    print("-" * 40)

    if not counts:
        return

    print(f"Avg Cards/User: {np.mean(counts):.2f}")
    print(f"Median Cards/User: {np.median(counts):.2f}")
    print(f"Max Cards/User: {max(counts)}")

    # Percentiles
    p25, p75 = np.percentile(counts, [25, 75])
    print(f"25th Percentile: {p25:.2f}")
    print(f"75th Percentile: {p75:.2f}")

    print("\nDistribution:")

    # Users with exactly 0 cards get their own row.
    print(f" 0 : {counts.count(0)}")

    # Custom bins for >0
    non_zero_counts = [c for c in counts if c > 0]
    if non_zero_counts:
        for label, n_users in _histogram_rows(non_zero_counts):
            print(f" {label:<8}: {n_users}")


if __name__ == "__main__":
    main()
