diff options
| author | YurenHao0426 <blackhao0426@gmail.com> | 2025-12-17 04:29:37 -0600 |
|---|---|---|
| committer | YurenHao0426 <blackhao0426@gmail.com> | 2025-12-17 04:29:37 -0600 |
| commit | e43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 (patch) | |
| tree | 6ce8a00d2f8b9ebd83c894a27ea01ac50cfb2ff5 /scripts/analyze_memory.py | |
Diffstat (limited to 'scripts/analyze_memory.py')
| -rw-r--r-- | scripts/analyze_memory.py | 61 |
1 files changed, 61 insertions, 0 deletions
"""Summarise a JSONL memory-card store: totals, busiest users, duplicates."""

import json
import os
from collections import Counter, defaultdict

MEMORY_FILE = "data/corpora/memory_cards.jsonl"

# How many rows each "Top N" section prints.
_TOP_N = 10
# Note text longer than this is truncated (with "...") in the report.
_PREVIEW_CHARS = 50


def _parse_card(line):
    """Return ``(user_id, note_text)`` for one JSONL line, or ``None``.

    ``None`` means the line is unusable: it is not valid JSON, is not a JSON
    object, has a non-string ``note_text``, or has a ``user_id`` that cannot
    be used as a dictionary key. The returned text is whitespace-stripped.
    """
    try:
        card = json.loads(line)
    except json.JSONDecodeError:
        return None
    if not isinstance(card, dict):
        return None

    uid = card.get("user_id", "unknown")
    text = card.get("note_text", "")
    # JSON arrays/objects decode to unhashable list/dict and cannot be keys.
    if isinstance(uid, (list, dict)) or not isinstance(text, str):
        return None
    return uid, text.strip()


def _preview(text):
    """Return *text* shortened to ``_PREVIEW_CHARS`` characters for display."""
    if len(text) > _PREVIEW_CHARS:
        return text[:_PREVIEW_CHARS] + '...'
    return text


def analyze_memory(path=None):
    """Print a summary report of the memory-card JSONL file.

    Args:
        path: File to analyse. Defaults to ``MEMORY_FILE`` when omitted.

    The report goes to stdout and contains the number of cards, the number of
    distinct users, the users with the most cards, the most frequent note
    texts overall, and the most frequent (user, note text) pairs. Blank lines
    are ignored. Lines that cannot be parsed into a card are skipped and, if
    there are any, their count is reported; they never contribute to any
    total. If the file does not exist an error line is printed and the
    function returns without raising.
    """
    if path is None:
        path = MEMORY_FILE

    if not os.path.exists(path):
        print(f"Error: {path} not found.")
        return

    print(f"Analyzing {path}...")

    total_cards = 0
    skipped_lines = 0
    user_counts = Counter()
    content_counts = Counter()
    user_content_counts = Counter()

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            parsed = _parse_card(line)
            if parsed is None:
                skipped_lines += 1
                continue
            # Count only after validation so every total describes the same
            # set of cards.
            uid, text = parsed
            total_cards += 1
            user_counts[uid] += 1
            content_counts[text] += 1
            user_content_counts[(uid, text)] += 1

    print("\n" + "="*40)
    print("MEMORY STORE ANALYSIS")
    print("="*40)
    print(f"Total Cards: {total_cards}")
    print(f"Unique Users: {len(user_counts)}")
    if skipped_lines:
        print(f"Skipped Lines: {skipped_lines}")
    print("-" * 40)

    print("\nTop 10 Users by Card Count:")
    for uid, count in user_counts.most_common(_TOP_N):
        print(f"  {uid}: {count}")

    print("\nTop 10 Most Frequent Contents (Global):")
    for text, count in content_counts.most_common(_TOP_N):
        print(f"  [{count}] {_preview(text)}")

    print("\nTop 10 Most Frequent (User, Content) Duplicates:")
    for (uid, text), count in user_content_counts.most_common(_TOP_N):
        print(f"  [{count}] User: {uid} | {_preview(text)}")

    print("="*40)


if __name__ == "__main__":
    analyze_memory()
