summary | refs | log | tree | commit | diff
path: root/scripts/analyze_memory.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/analyze_memory.py')
-rw-r--r-- scripts/analyze_memory.py 61
1 files changed, 61 insertions, 0 deletions
diff --git a/scripts/analyze_memory.py b/scripts/analyze_memory.py
new file mode 100644
index 0000000..4c439cf
--- /dev/null
+++ b/scripts/analyze_memory.py
@@ -0,0 +1,61 @@
+import json
+import os
+from collections import Counter, defaultdict
+
+# JSON Lines file holding the memory cards; analyze_memory() parses each
+# non-blank line as one JSON value. The path is relative, so it is resolved
+# against the current working directory.
+MEMORY_FILE = "data/corpora/memory_cards.jsonl"
+
+def analyze_memory():
+    """Print a summary report of the memory-card store at ``MEMORY_FILE``.
+
+    The file is read as JSON Lines: one JSON object per non-blank line.
+    For each card the ``user_id`` (default ``"unknown"``) and the stripped
+    ``note_text`` (default ``""``) are tallied. The report shows the total
+    card count, the number of distinct users, the ten users with the most
+    cards, the ten most repeated texts overall, and the ten most repeated
+    ``(user_id, text)`` pairs.
+
+    Lines that cannot be used (invalid JSON, a top-level value that is not
+    an object, or an unhashable ``user_id``) are skipped and excluded from
+    every count; the number skipped is reported when it is non-zero.
+
+    Returns:
+        None. All output goes to stdout. If ``MEMORY_FILE`` does not exist,
+        an error line is printed and the function returns early.
+    """
+    if not os.path.exists(MEMORY_FILE):
+        print(f"Error: {MEMORY_FILE} not found.")
+        return
+
+    print(f"Analyzing {MEMORY_FILE}...")
+
+    total_cards = 0
+    skipped_lines = 0
+    user_counts = Counter()
+    content_counts = Counter()
+    user_content_counts = Counter()
+
+    with open(MEMORY_FILE, "r", encoding="utf-8") as f:
+        for line in f:
+            if not line.strip():
+                continue
+
+            # Keep the try body minimal: only the parse itself may fail here.
+            try:
+                card = json.loads(line)
+            except json.JSONDecodeError:
+                skipped_lines += 1
+                continue
+
+            # Valid JSON that is not an object (list, number, ...) has no
+            # fields to read, so it cannot be a card.
+            if not isinstance(card, dict):
+                skipped_lines += 1
+                continue
+
+            uid = card.get("user_id", "unknown")
+            raw_text = card.get("note_text", "")
+            # A JSON null or non-string note_text must not abort the card:
+            # null is treated as empty text, anything else is stringified.
+            text = "" if raw_text is None else str(raw_text).strip()
+
+            # An unhashable user_id (list/object) cannot be used as a key.
+            # Touch the first counter before bumping the total so a rejected
+            # card never leaves the tallies out of sync with each other.
+            try:
+                user_counts[uid] += 1
+            except TypeError:
+                skipped_lines += 1
+                continue
+
+            total_cards += 1
+            content_counts[text] += 1
+            user_content_counts[(uid, text)] += 1
+
+    print("\n" + "=" * 40)
+    print("MEMORY STORE ANALYSIS")
+    print("=" * 40)
+    print(f"Total Cards: {total_cards}")
+    print(f"Unique Users: {len(user_counts)}")
+    if skipped_lines:
+        print(f"Skipped Lines: {skipped_lines}")
+    print("-" * 40)
+
+    print("\nTop 10 Users by Card Count:")
+    for uid, count in user_counts.most_common(10):
+        print(f" {uid}: {count}")
+
+    print("\nTop 10 Most Frequent Contents (Global):")
+    for text, count in content_counts.most_common(10):
+        print(f" [{count}] {_truncate(text)}")
+
+    print("\nTop 10 Most Frequent (User, Content) Duplicates:")
+    for (uid, text), count in user_content_counts.most_common(10):
+        print(f" [{count}] User: {uid} | {_truncate(text)}")
+
+    print("=" * 40)
+
+
+def _truncate(text, limit=50):
+    """Return *text* cut to *limit* characters plus '...' when it is longer."""
+    return (text[:limit] + '...') if len(text) > limit else text
+
+# Script entry point: run the analysis only when this file is executed
+# directly, not when it is imported.
+if __name__ == "__main__":
+ analyze_memory()
+