import json
import os
import shutil

MEMORY_FILE = "data/corpora/memory_cards.jsonl"
BACKUP_FILE = "data/corpora/memory_cards.jsonl.bak"


def _dedup_key(card):
    """Return the hashable ``(user_id, note_text)`` key for *card*, or ``None``.

    ``None`` is returned when no key can be built: *card* is not a JSON
    object, its ``note_text`` is neither a string nor null, or its
    ``user_id`` is an unhashable value (a JSON list or object).
    """
    if not isinstance(card, dict):
        return None

    text = card.get("note_text")
    if text is None:
        # A null note is treated like a missing one (empty text) so the card
        # is kept instead of being discarded by a failed ``.strip()`` call.
        text = ""
    elif not isinstance(text, str):
        return None

    key = (card.get("user_id"), text.strip())
    try:
        hash(key)
    except TypeError:
        return None
    return key


def clean_memory_store(memory_file=None, backup_file=None):
    """Remove duplicate cards from a JSONL memory store, in place.

    Two cards are duplicates when they share the same ``user_id`` and the
    same ``note_text`` after surrounding whitespace is stripped. The first
    occurrence is kept and the original order is preserved. Blank lines are
    ignored. Lines that are not valid JSON, or from which no key can be
    built (see ``_dedup_key``), are left out of the rewritten file and
    reported separately from duplicates; they remain in the backup.

    Args:
        memory_file: Path of the JSONL store to clean. Defaults to
            ``MEMORY_FILE``.
        backup_file: Path the untouched store is copied to before anything
            is rewritten. Defaults to ``BACKUP_FILE``.

    Returns:
        None. Progress is printed to stdout. If ``memory_file`` does not
        exist, an error is printed and nothing is written.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth.
    memory_file = MEMORY_FILE if memory_file is None else memory_file
    backup_file = BACKUP_FILE if backup_file is None else backup_file

    if not os.path.exists(memory_file):
        print(f"Error: {memory_file} not found.")
        return

    # 1. Backup
    print(f"Backing up to {backup_file}...")
    shutil.copy2(memory_file, backup_file)

    seen_keys = set()
    cleaned_records = []
    duplicates = 0
    skipped_lines = []

    # 2. Read and dedup
    print("Scanning and deduplicating...")
    with open(memory_file, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue

            try:
                card = json.loads(line)
            except json.JSONDecodeError:
                skipped_lines.append(line_no)
                continue

            key = _dedup_key(card)
            if key is None:
                skipped_lines.append(line_no)
            elif key in seen_keys:
                duplicates += 1
            else:
                seen_keys.add(key)
                cleaned_records.append(line.strip())

    if skipped_lines:
        preview = ", ".join(str(n) for n in skipped_lines[:10])
        if len(skipped_lines) > 10:
            preview += ", ..."
        print(
            f"Warning: skipped {len(skipped_lines)} unreadable line(s) "
            f"(line numbers: {preview}); they remain in {backup_file}."
        )

    # 3. Write back
    print(
        f"Writing {len(cleaned_records)} records back "
        f"(Removed {duplicates} duplicates)..."
    )
    # Write to a sibling temp file and swap it in, so an interrupted write
    # cannot leave the store truncated.
    tmp_file = f"{memory_file}.tmp"
    try:
        with open(tmp_file, "w", encoding="utf-8") as f:
            f.writelines(f"{record}\n" for record in cleaned_records)
        shutil.copymode(memory_file, tmp_file)
        os.replace(tmp_file, memory_file)
    finally:
        # Only present here if something failed before the swap.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

    print("Done!")


if __name__ == "__main__":
    clean_memory_store()