summary | refs | log | tree | commit | diff
path: root/scripts/clean_memory_store.py
diff options
context:
space:
mode:
author YurenHao0426 <blackhao0426@gmail.com> 2025-12-17 04:29:37 -0600
committer YurenHao0426 <blackhao0426@gmail.com> 2025-12-17 04:29:37 -0600
commit e43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 (patch)
tree 6ce8a00d2f8b9ebd83c894a27ea01ac50cfb2ff5 /scripts/clean_memory_store.py
Initial commit (clean history) HEAD main
Diffstat (limited to 'scripts/clean_memory_store.py')
-rw-r--r-- scripts/clean_memory_store.py 52
1 files changed, 52 insertions, 0 deletions
diff --git a/scripts/clean_memory_store.py b/scripts/clean_memory_store.py
new file mode 100644
index 0000000..3b07a95
--- /dev/null
+++ b/scripts/clean_memory_store.py
@@ -0,0 +1,52 @@
+import json
+import os
+import shutil
+
+# Path of the JSONL store that clean_memory_store() reads and rewrites
+# (one JSON record per line). Relative, so it resolves against the current
+# working directory.
+MEMORY_FILE = "data/corpora/memory_cards.jsonl"
+# Destination of the copy taken before the store is rewritten.
+BACKUP_FILE = "data/corpora/memory_cards.jsonl.bak"
+
+def clean_memory_store():
+    """Deduplicate the memory-card JSONL store in place.
+
+    Steps:
+      1. Copy ``MEMORY_FILE`` to ``BACKUP_FILE`` (any existing backup is
+         overwritten, so re-running replaces the previous backup).
+      2. Read the store line by line and keep the first record seen for each
+         ``(user_id, stripped note_text)`` pair; later records with the same
+         pair are treated as duplicates and dropped.
+      3. Write the surviving lines to a temporary file next to the store and
+         atomically swap it in, so an interrupted run never leaves a
+         truncated store behind.
+
+    Blank lines are ignored. Lines that are not valid JSON, or whose JSON
+    value is not an object, are left out of the rewritten store; each one is
+    reported with its line number and remains available in the backup.
+
+    Returns:
+        None. Progress and a summary are printed to stdout. If
+        ``MEMORY_FILE`` does not exist, an error is printed and nothing is
+        modified.
+
+    Raises:
+        OSError: if the backup, the read, or the write-back fails.
+    """
+    if not os.path.exists(MEMORY_FILE):
+        print(f"Error: {MEMORY_FILE} not found.")
+        return
+
+    # 1. Backup
+    print(f"Backing up to {BACKUP_FILE}...")
+    shutil.copy2(MEMORY_FILE, BACKUP_FILE)
+
+    unique_keys = set()
+    cleaned_records = []
+    total_read = 0  # well-formed records only, so the duplicate count is exact
+    skipped = 0  # malformed lines left out of the rewritten store
+
+    # 2. Read and Dedup
+    print("Scanning and deduplicating...")
+    with open(MEMORY_FILE, "r", encoding="utf-8") as f:
+        for line_no, line in enumerate(f, start=1):
+            record = line.strip()
+            if not record:
+                continue
+
+            # Only the parse is guarded: any other failure is a real bug and
+            # must surface instead of silently deleting records.
+            try:
+                card = json.loads(record)
+            except json.JSONDecodeError as exc:
+                skipped += 1
+                print(f"Warning: skipping malformed JSON on line {line_no}: {exc}")
+                continue
+
+            if not isinstance(card, dict):
+                skipped += 1
+                print(f"Warning: skipping non-object record on line {line_no}.")
+                continue
+
+            total_read += 1
+
+            uid = card.get("user_id")
+            if isinstance(uid, (list, dict)):
+                # Lists/dicts are unhashable; use a canonical JSON string so
+                # such ids can still take part in the dedup key.
+                uid = json.dumps(uid, sort_keys=True)
+
+            # A missing or null note_text counts as empty text; other
+            # non-string values are compared by their string form. Neither
+            # case may cause the record to be dropped.
+            raw_text = card.get("note_text")
+            text = "" if raw_text is None else str(raw_text).strip()
+
+            # Key: (User, Content)
+            key = (uid, text)
+
+            if key not in unique_keys:
+                unique_keys.add(key)
+                cleaned_records.append(record)
+
+    # 3. Write Back
+    print(f"Writing {len(cleaned_records)} records back (Removed {total_read - len(cleaned_records)} duplicates)...")
+    if skipped:
+        print(f"Skipped {skipped} malformed lines (still available in {BACKUP_FILE}).")
+
+    # Write to a sibling temp file and swap it in with os.replace so the
+    # store is either the old content or the complete new content.
+    tmp_path = MEMORY_FILE + ".tmp"
+    try:
+        with open(tmp_path, "w", encoding="utf-8") as f:
+            f.writelines(record + "\n" for record in cleaned_records)
+        # Carry over the original permission bits to the replacement file.
+        shutil.copymode(MEMORY_FILE, tmp_path)
+        os.replace(tmp_path, MEMORY_FILE)
+    except OSError:
+        # Do not leave a stray temp file behind on failure.
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
+        raise
+
+    print("Done!")
+
+# Run the cleanup only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+ clean_memory_store()
+