#!/usr/bin/env python3
"""
Script to migrate pre-labeled queries into MemoryCards and embed them.

It reads from data/corpora/oasst1_labeled.jsonl (one JSON object per line; the
fields read are `original_query`, `user_id`, `session_id`, `turn_id`,
`has_preference` and `extracted_json`) and outputs:
- data/corpora/memory_cards.jsonl
- data/corpora/memory_embeddings.npy
"""

import json
import os
import sys

# Add src to sys.path so we can import personalization
sys.path.append(os.path.join(os.path.dirname(__file__), "../src"))

import uuid
import numpy as np
import torch
from pathlib import Path
from tqdm import tqdm
from typing import List

from personalization.config.settings import load_local_models_config
# from personalization.models.preference_extractor.rule_extractor import QwenRuleExtractor
from personalization.models.preference_extractor.gpt4o_extractor import GPT4OExtractor
from personalization.models.embedding.qwen3_8b import Qwen3Embedding8B
from personalization.retrieval.preference_store.schemas import ChatTurn, MemoryCard, PreferenceList


def ensure_dir(path: str) -> None:
    """Create the parent directory of `path` (and any ancestors) if missing."""
    Path(path).parent.mkdir(parents=True, exist_ok=True)


def _load_memory_cards(input_path: str) -> List[MemoryCard]:
    """Build MemoryCards (with empty embeddings) from the labeled JSONL file.

    A row is turned into a card only when it has a non-empty
    `original_query`, a truthy `has_preference` flag, and an `extracted_json`
    payload that validates as a PreferenceList with at least one preference.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    print("Extracting preferences...")
    cards: List[MemoryCard] = []
    invalid_rows = 0

    # Enumerate every physical line so the fallback ids (user_{idx}, sess_{idx})
    # keep pointing at the row's position in the input file.
    for idx, line in enumerate(tqdm(lines)):
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        if not line.strip():
            continue

        row = json.loads(line)

        # `or ""` also covers rows where the key is present but null.
        query = (row.get("original_query") or "").strip()
        if not query:
            continue

        # Skip if no preference (according to label)
        if not row.get("has_preference", False):
            continue

        try:
            pref_list = PreferenceList.model_validate(row.get("extracted_json", {}))
        except Exception:
            # Best-effort: a malformed label should not abort the migration,
            # but the drop is counted so it is reported below.
            invalid_rows += 1
            continue

        if not pref_list.preferences:
            continue

        # Construct a note text: "condition: action"
        note_summary = "; ".join(
            f"{p.condition}: {p.action}" for p in pref_list.preferences
        )

        cards.append(
            MemoryCard(
                card_id=str(uuid.uuid4()),
                # Use real metadata from dataset, falling back to positional ids
                user_id=row.get("user_id", f"user_{idx}"),
                source_session_id=row.get("session_id", f"sess_{idx}"),
                source_turn_ids=[row.get("turn_id", 0)],
                raw_queries=[query],
                preference_list=pref_list,
                note_text=note_summary,
                embedding_e=[],  # To be filled once embeddings are computed
                kind="pref",
            )
        )

    if invalid_rows:
        print(f"Skipped {invalid_rows} rows whose extracted_json failed validation.")

    return cards


def _embed_texts(embedder, texts: List[str], chunk_size: int = 2000, batch_size: int = 128) -> list:
    """Encode `texts` chunk by chunk and return exactly one embedding per text.

    Raises:
        RuntimeError: if the embedder returns a different number of vectors
            than texts, since cards and embeddings would then be misaligned.
    """
    embeddings: list = []

    # Process in chunks to avoid OOM
    for start in range(0, len(texts), chunk_size):
        end = min(start + chunk_size, len(texts))
        print(f"  Embedding chunk {start} to {end}...")

        # Batch encode with larger batch_size for A40
        chunk_emb = embedder.encode(
            texts[start:end],
            batch_size=batch_size,
            normalize=True,
            return_tensor=False,
        )
        embeddings.extend(chunk_emb)

    if len(embeddings) != len(texts):
        raise RuntimeError(
            f"Embedder returned {len(embeddings)} embeddings for {len(texts)} texts"
        )

    return embeddings


def main() -> None:
    """Run the migration: load labeled rows, embed them, and save both outputs."""
    # 1. Setup paths
    input_path = "data/corpora/oasst1_labeled.jsonl"
    output_cards_path = "data/corpora/memory_cards.jsonl"
    output_emb_path = "data/corpora/memory_embeddings.npy"

    ensure_dir(output_cards_path)
    ensure_dir(output_emb_path)

    print("Loading models configuration...")
    cfg = load_local_models_config()

    # 2. Initialize models
    # NOTE: no preference extractor is instantiated here; preferences are read
    # from the `extracted_json` field already present in the input rows.
    print("Initializing Embedding Model...")
    embedder = Qwen3Embedding8B.from_config(cfg)

    # 3. Process data
    print(f"Reading from {input_path}...")
    memory_cards = _load_memory_cards(input_path)

    print(f"Found {len(memory_cards)} memory cards. Generating embeddings...")

    if not memory_cards:
        print("No preferences found. Exiting.")
        return

    # 4. Generate Embeddings
    # The design doc says: "Qwen3Embedding8B.encode([turn.text])"
    # So we embed the original query that generated the memory (not note_text).
    texts_to_embed = [card.raw_queries[0] for card in memory_cards]

    print(f"Embedding {len(texts_to_embed)} memories...")
    embeddings = _embed_texts(embedder, texts_to_embed)

    # Assign back to cards; row i of the saved matrix belongs to card i.
    for card, emb in zip(memory_cards, embeddings):
        card.embedding_e = emb

    # 5. Save
    print(f"Saving {len(memory_cards)} cards to {output_cards_path}...")
    with open(output_cards_path, "w", encoding="utf-8") as f:
        f.writelines(card.model_dump_json() + "\n" for card in memory_cards)

    print(f"Saving embeddings matrix to {output_emb_path}...")
    np_emb = np.array(embeddings, dtype=np.float32)
    np.save(output_emb_path, np_emb)

    print("Done!")


if __name__ == "__main__":
    main()