diff options
Diffstat (limited to 'scripts/clean_memory_store.py')
| -rw-r--r-- | scripts/clean_memory_store.py | 52 |
1 files changed, 52 insertions, 0 deletions
# Recovered from the diff that adds scripts/clean_memory_store.py (new file).
"""Deduplicate the memory-card JSONL store in place, keeping a backup copy.

Each non-blank line of ``MEMORY_FILE`` is expected to hold one JSON object.
Two records are duplicates when they share the same ``user_id`` and the same
whitespace-stripped ``note_text``; only the first occurrence is kept.
"""
import json
import os
import shutil

MEMORY_FILE = "data/corpora/memory_cards.jsonl"
BACKUP_FILE = "data/corpora/memory_cards.jsonl.bak"


def _hashable(value):
    """Return *value* unchanged if it is hashable, else its canonical JSON text."""
    try:
        hash(value)
    except TypeError:
        # Lists/dicts parsed from JSON cannot live in a set; serialise them
        # deterministically so equal values still compare equal.
        return json.dumps(value, sort_keys=True)
    return value


def _dedup_key(card):
    """Build the (user_id, note_text) identity used to detect duplicates.

    A missing or null ``note_text`` is treated as the empty string, and string
    values are stripped so surrounding whitespace does not defeat matching.
    """
    text = card.get("note_text")
    if text is None:
        text = ""
    elif isinstance(text, str):
        text = text.strip()
    else:
        text = _hashable(text)
    return _hashable(card.get("user_id")), text


def clean_memory_store():
    """Remove duplicate cards from ``MEMORY_FILE``, rewriting it in place.

    The file is first copied to ``BACKUP_FILE`` (overwriting any previous
    backup). Blank lines are ignored. Lines that are not valid JSON objects
    cannot be keyed and are dropped from the rewritten file, but they are
    counted and reported separately rather than being silently discarded or
    misreported as duplicates. Records whose key fields have unexpected types
    (null text, list-valued ids, ...) are kept and deduplicated normally.

    Returns:
        None. Progress is reported on stdout. If ``MEMORY_FILE`` does not
        exist an error message is printed and nothing is modified.
    """
    if not os.path.exists(MEMORY_FILE):
        print(f"Error: {MEMORY_FILE} not found.")
        return

    # 1. Backup
    print(f"Backing up to {BACKUP_FILE}...")
    shutil.copy2(MEMORY_FILE, BACKUP_FILE)

    seen_keys = set()
    kept_lines = []
    duplicates = 0
    malformed = 0

    # 2. Read and dedup
    print("Scanning and deduplicating...")
    with open(MEMORY_FILE, "r", encoding="utf-8") as f:
        for raw_line in f:
            record = raw_line.strip()
            if not record:
                continue

            # Keep the try body minimal: only parsing is expected to fail.
            try:
                card = json.loads(record)
            except json.JSONDecodeError:
                malformed += 1
                continue

            if not isinstance(card, dict):
                # Valid JSON but not an object, so it has no fields to key on.
                malformed += 1
                continue

            key = _dedup_key(card)
            if key in seen_keys:
                duplicates += 1
                continue

            seen_keys.add(key)
            kept_lines.append(record)

    # 3. Write back
    print(f"Writing {len(kept_lines)} records back (Removed {duplicates} duplicates)...")
    if malformed:
        print(
            f"Warning: dropped {malformed} malformed line(s); "
            f"originals remain in {BACKUP_FILE}."
        )
    with open(MEMORY_FILE, "w", encoding="utf-8") as f:
        f.writelines(f"{record}\n" for record in kept_lines)

    print("Done!")


if __name__ == "__main__":
    clean_memory_store()
