"""Print summary statistics for the JSONL memory-card file."""

import json
import os
from collections import Counter, defaultdict

MEMORY_FILE = "data/corpora/memory_cards.jsonl"

# Report layout constants.
_RULE_WIDTH = 40
_TOP_N = 10
_PREVIEW_CHARS = 50


def _parse_card(line):
    """Return ``(user_id, note_text)`` for one JSONL line, or ``None`` if unusable.

    A line is unusable when it is not valid JSON, is not a JSON object, has a
    non-string ``note_text`` (other than ``null``), or has a ``user_id`` that
    cannot be used as a dictionary key (a JSON array or object).
    """
    try:
        card = json.loads(line)
    except json.JSONDecodeError:
        return None
    if not isinstance(card, dict):
        return None

    uid = card.get("user_id", "unknown")
    if isinstance(uid, (list, dict)):
        return None

    text = card.get("note_text", "")
    if text is None:
        # An explicit JSON null is treated like a missing note.
        text = ""
    if not isinstance(text, str):
        return None
    return uid, text.strip()


def _preview(text):
    """Shorten *text* to ``_PREVIEW_CHARS`` characters, marking truncation."""
    if len(text) > _PREVIEW_CHARS:
        return text[:_PREVIEW_CHARS] + '...'
    return text


def analyze_memory(memory_file=None) -> None:
    """Print card totals, per-user counts and duplicate-content rankings.

    Args:
        memory_file: Path to a JSONL file with one card object per line.
            Defaults to the module-level ``MEMORY_FILE``.

    Blank lines are ignored. Lines that cannot be interpreted as a card are
    skipped and reported in the summary instead of being silently dropped;
    only valid cards contribute to "Total Cards" and the rankings.
    """
    path = MEMORY_FILE if memory_file is None else memory_file
    if not os.path.exists(path):
        print(f"Error: {path} not found.")
        return

    print(f"Analyzing {path}...")

    total_cards = 0
    skipped_lines = 0
    user_counts = Counter()
    content_counts = Counter()
    user_content_counts = Counter()

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            parsed = _parse_card(line)
            if parsed is None:
                skipped_lines += 1
                continue
            uid, text = parsed
            total_cards += 1
            user_counts[uid] += 1
            content_counts[text] += 1
            user_content_counts[(uid, text)] += 1

    print("\n" + "=" * _RULE_WIDTH)
    print("MEMORY STORE ANALYSIS")
    print("=" * _RULE_WIDTH)
    print(f"Total Cards: {total_cards}")
    print(f"Unique Users: {len(user_counts)}")
    if skipped_lines:
        # Only shown when something was rejected, so clean files report as before.
        print(f"Skipped Lines: {skipped_lines}")
    print("-" * _RULE_WIDTH)

    print("\nTop 10 Users by Card Count:")
    for uid, count in user_counts.most_common(_TOP_N):
        print(f"  {uid}: {count}")

    print("\nTop 10 Most Frequent Contents (Global):")
    for text, count in content_counts.most_common(_TOP_N):
        print(f"  [{count}] {_preview(text)}")

    print("\nTop 10 Most Frequent (User, Content) Duplicates:")
    for (uid, text), count in user_content_counts.most_common(_TOP_N):
        print(f"  [{count}] User: {uid} | {_preview(text)}")

    print("=" * _RULE_WIDTH)


if __name__ == "__main__":
    analyze_memory()