From 73c194f304f827b55081b15524479f82a1b7d94c Mon Sep 17 00:00:00 2001 From: maszhongming Date: Tue, 16 Sep 2025 15:15:29 -0500 Subject: Initial commit --- kg_rag/vectorDB/__init__.py | 0 kg_rag/vectorDB/create_vectordb.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 kg_rag/vectorDB/__init__.py create mode 100644 kg_rag/vectorDB/create_vectordb.py (limited to 'kg_rag/vectorDB') diff --git a/kg_rag/vectorDB/__init__.py b/kg_rag/vectorDB/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/kg_rag/vectorDB/create_vectordb.py b/kg_rag/vectorDB/create_vectordb.py new file mode 100644 index 0000000..b02c9ff --- /dev/null +++ b/kg_rag/vectorDB/create_vectordb.py @@ -0,0 +1,35 @@ +import pickle +from kg_rag.utility import RecursiveCharacterTextSplitter, Chroma, SentenceTransformerEmbeddings, config_data, time + + +DATA_PATH = config_data["VECTOR_DB_DISEASE_ENTITY_PATH"] +VECTOR_DB_NAME = config_data["VECTOR_DB_PATH"] +CHUNK_SIZE = int(config_data["VECTOR_DB_CHUNK_SIZE"]) +CHUNK_OVERLAP = int(config_data["VECTOR_DB_CHUNK_OVERLAP"]) +BATCH_SIZE = int(config_data["VECTOR_DB_BATCH_SIZE"]) +SENTENCE_EMBEDDING_MODEL = config_data["VECTOR_DB_SENTENCE_EMBEDDING_MODEL"] + + +def load_data(): + with open(DATA_PATH, "rb") as f: + data = pickle.load(f) + metadata_list = list(map(lambda x:{"source": x + " from SPOKE knowledge graph"}, data)) + return data, metadata_list + +def create_vectordb(): + start_time = time.time() + data, metadata_list = load_data() + text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP) + docs = text_splitter.create_documents(data, metadatas=metadata_list) + batches = [docs[i:i + BATCH_SIZE] for i in range(0, len(docs), BATCH_SIZE)] + vectorstore = Chroma(embedding_function=SentenceTransformerEmbeddings(model_name=SENTENCE_EMBEDDING_MODEL), + persist_directory=VECTOR_DB_NAME) + for batch in batches: + vectorstore.add_documents(documents=batch) + end_time = round((time.time() - start_time)/(60), 2) + print("VectorDB is created in {} mins".format(end_time)) + + +if __name__ == "__main__": + create_vectordb() + -- cgit v1.2.3