diff options
Diffstat (limited to 'kg_rag/rag_based_generation')
13 files changed, 800 insertions, 0 deletions
diff --git a/kg_rag/rag_based_generation/GPT/drug_action.py b/kg_rag/rag_based_generation/GPT/drug_action.py new file mode 100644 index 0000000..60c0acf --- /dev/null +++ b/kg_rag/rag_based_generation/GPT/drug_action.py @@ -0,0 +1,52 @@ +from kg_rag.utility import * +import argparse + + + +parser = argparse.ArgumentParser() +parser.add_argument('-g', type=str, default='gpt-35-turbo', help='GPT model selection') +parser.add_argument('-i', type=bool, default=False, help='Flag for interactive mode') +parser.add_argument('-e', type=bool, default=False, help='Flag for showing evidence of association from the graph') +args = parser.parse_args() + +CHAT_MODEL_ID = args.g +INTERACTIVE = args.i +EDGE_EVIDENCE = bool(args.e) + + +SYSTEM_PROMPT = system_prompts["DRUG_ACTION"] +CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"]) +QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"]) +QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"]) +VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"] +NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"] +SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"] +SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"] +TEMPERATURE = config_data["LLM_TEMPERATURE"] + + +CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID + +vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL) +embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL) +node_context_df = pd.read_csv(NODE_CONTEXT_PATH) + +def main(): + print(" ") + question = input("Enter your question : ") + if not INTERACTIVE: + print("Retrieving context from SPOKE graph...") + context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, 
QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, EDGE_EVIDENCE) + print("Here is the KG-RAG based answer:") + print("") + enriched_prompt = "Context: "+ context + "\n" + "Question: " + question + output = get_GPT_response(enriched_prompt, SYSTEM_PROMPT, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=TEMPERATURE) + stream_out(output) + else: + interactive(question, vectorstore, node_context_df, embedding_function_for_context_retrieval, CHAT_MODEL_ID, EDGE_EVIDENCE, SYSTEM_PROMPT) + + + +if __name__ == "__main__": + main() + diff --git a/kg_rag/rag_based_generation/GPT/drug_repurposing_v2.py b/kg_rag/rag_based_generation/GPT/drug_repurposing_v2.py new file mode 100644 index 0000000..d95053b --- /dev/null +++ b/kg_rag/rag_based_generation/GPT/drug_repurposing_v2.py @@ -0,0 +1,68 @@ +from kg_rag.utility import * +import argparse + + + +parser = argparse.ArgumentParser() +parser.add_argument('-g', type=str, default='gpt-35-turbo', help='GPT model selection') +parser.add_argument('-i', type=bool, default=False, help='Flag for interactive mode') +parser.add_argument('-e', type=bool, default=False, help='Flag for showing evidence of association from the graph') +args = parser.parse_args() + + +CHAT_MODEL_ID = args.g +INTERACTIVE = args.i +EDGE_EVIDENCE = bool(args.e) + +SYSTEM_PROMPT = system_prompts["DRUG_REPURPOSING_V2"] +CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"]) +QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"]) +QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"]) +VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"] +NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"] +SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"] +SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"] +TEMPERATURE 
= config_data["LLM_TEMPERATURE"] + + +CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID + +vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL) +embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL) +node_context_df = pd.read_csv(NODE_CONTEXT_PATH) + +print('') +question = input("Question : ") + +question_template = f''' +To the question asked at the end, answer by referring the context. +See example below +Example 1: + Question: + What drugs can be repurposed for disease X? + Context: + Compound Alizapride DOWNREGULATES Gene APOE and Provenance of this association is XX. Gene APOE ASSOCIATES Disease X and Provenance of this association is YY. Gene TTR encodes Protein Transthyretin (ATTR) and Provenance of this association is ZZ. Compound Acetylcysteine treats Disease X and Provenance of this association is PP. + Answer: + Since Alizapride downregulates gene APOE (Provenance XX) and APOE is associated with Disease X (Provenance YY), Alizapride can be repurposed to treat Disease X. p-value for these associations is XXXX and z-score values for these associations is YYYY. 
+Question: +{question} +''' + +def main(): + if not INTERACTIVE: + print("Retrieving context from SPOKE graph...") + context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, EDGE_EVIDENCE) + print("Here is the KG-RAG based answer:") + print("") + enriched_prompt = "Context: "+ context + "\n" + "Question: " + question + output = get_GPT_response(enriched_prompt, SYSTEM_PROMPT, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=TEMPERATURE) + stream_out(output) + else: + interactive(question, vectorstore, node_context_df, embedding_function_for_context_retrieval, CHAT_MODEL_ID, EDGE_EVIDENCE, SYSTEM_PROMPT) + + + +if __name__ == "__main__": + main() + + diff --git a/kg_rag/rag_based_generation/GPT/run_drug_repurposing.py b/kg_rag/rag_based_generation/GPT/run_drug_repurposing.py new file mode 100644 index 0000000..8a5726d --- /dev/null +++ b/kg_rag/rag_based_generation/GPT/run_drug_repurposing.py @@ -0,0 +1,57 @@ +''' +This script takes the drug repurposing style questions from the csv file and save the result as another csv file. +Before running this script, make sure to configure the filepaths in config.yaml file. 
+Command line argument should be either 'gpt-4' or 'gpt-35-turbo' +''' + +from kg_rag.utility import * +import sys + + + +CHAT_MODEL_ID = sys.argv[1] + +QUESTION_PATH = config_data["DRUG_REPURPOSING_PATH"] +SYSTEM_PROMPT = system_prompts["DRUG_REPURPOSING"] +CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"]) +QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"]) +QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"]) +VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"] +NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"] +SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"] +SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"] +TEMPERATURE = config_data["LLM_TEMPERATURE"] +SAVE_PATH = config_data["SAVE_RESULTS_PATH"] + + +CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID + +save_name = "_".join(CHAT_MODEL_ID.split("-"))+"_drug_repurposing_questions_response.csv" + + +vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL) +embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL) +node_context_df = pd.read_csv(NODE_CONTEXT_PATH) + + +def main(): + start_time = time.time() + question_df = pd.read_csv(QUESTION_PATH) + answer_list = [] + for index, row in question_df.iterrows(): + question = row["text"] + context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY) + enriched_prompt = "Context: " + context + "\n" + "Question: " + question + output = get_GPT_response(enriched_prompt, SYSTEM_PROMPT, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=TEMPERATURE) + answer_list.append((row["disease_in_question"], 
row["refDisease"], row["compoundGroundTruth"], row["text"], output)) + answer_df = pd.DataFrame(answer_list, columns=["disease_in_question", "refDisease", "compoundGroundTruth", "text", "llm_answer"]) + answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True) + print("Completed in {} min".format((time.time()-start_time)/60)) + + + +if __name__ == "__main__": + main() + + + diff --git a/kg_rag/rag_based_generation/GPT/run_mcq_qa.py b/kg_rag/rag_based_generation/GPT/run_mcq_qa.py new file mode 100644 index 0000000..edf0415 --- /dev/null +++ b/kg_rag/rag_based_generation/GPT/run_mcq_qa.py @@ -0,0 +1,91 @@ +''' +This script takes the MCQ style questions from the csv file and save the result as another csv file. +Before running this script, make sure to configure the filepaths in config.yaml file. +Command line argument should be either 'gpt-4' or 'gpt-35-turbo' +''' + +from kg_rag.utility import * +import sys + + +from tqdm import tqdm +CHAT_MODEL_ID = sys.argv[1] + +QUESTION_PATH = config_data["MCQ_PATH"] +SYSTEM_PROMPT = system_prompts["MCQ_QUESTION"] +CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"]) +QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"]) +QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"]) +VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"] +NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"] +SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"] +SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"] +TEMPERATURE = config_data["LLM_TEMPERATURE"] +SAVE_PATH = config_data["SAVE_RESULTS_PATH"] + + +CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID + +save_name = "_".join(CHAT_MODEL_ID.split("-"))+"_kg_rag_based_mcq_{mode}.csv" + + +vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL) 
+embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL) +node_context_df = pd.read_csv(NODE_CONTEXT_PATH) +edge_evidence = False + + +MODE = "0" +### MODE 0: Original KG_RAG ### +### MODE 1: jsonlize the context from KG search ### +### MODE 2: Add the prior domain knowledge ### +### MODE 3: Combine MODE 1 & 2 ### + +def main(): + start_time = time.time() + question_df = pd.read_csv(QUESTION_PATH) + answer_list = [] + + for index, row in tqdm(question_df.iterrows(), total=306): + try: + question = row["text"] + if MODE == "0": + ### MODE 0: Original KG_RAG ### + context = retrieve_context(row["text"], vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence, model_id=CHAT_MODEL_ID) + enriched_prompt = "Context: "+ context + "\n" + "Question: "+ question + output = get_Gemini_response(enriched_prompt, SYSTEM_PROMPT, temperature=TEMPERATURE) + + if MODE == "1": + ### MODE 1: jsonlize the context from KG search ### + ### Please implement the first strategy here ### + output = '...' + + if MODE == "2": + ### MODE 2: Add the prior domain knowledge ### + ### Please implement the second strategy here ### + output = '...' + + if MODE == "3": + ### MODE 3: Combine MODE 1 & 2 ### + ### Please implement the third strategy here ### + output = '...' 
+ + answer_list.append((row["text"], row["correct_node"], output)) + except Exception as e: + print("Error in processing question: ", row["text"]) + print("Error: ", e) + answer_list.append((row["text"], row["correct_node"], "Error")) + + + answer_df = pd.DataFrame(answer_list, columns=["question", "correct_answer", "llm_answer"]) + output_file = os.path.join(SAVE_PATH, f"{save_name}".format(mode=MODE),) + answer_df.to_csv(output_file, index=False, header=True) + print("Save the model outputs in ", output_file) + print("Completed in {} min".format((time.time()-start_time)/60)) + + + +if __name__ == "__main__": + main() + + diff --git a/kg_rag/rag_based_generation/GPT/run_single_disease_entity_hyperparameter_tuning.py b/kg_rag/rag_based_generation/GPT/run_single_disease_entity_hyperparameter_tuning.py new file mode 100644 index 0000000..aaf8071 --- /dev/null +++ b/kg_rag/rag_based_generation/GPT/run_single_disease_entity_hyperparameter_tuning.py @@ -0,0 +1,61 @@ +''' +This script is used for hyperparameter tuning on one-hop graph traversal questions. +Hyperparameters are 'CONTEXT_VOLUME_LIST' and 'SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL_LIST' + +This will run on one-hop graph traveral questions from the csv file and save the result as another csv file. + +Before running this script, make sure to configure the filepaths in config.yaml file. 
+Command line argument should be either 'gpt-4' or 'gpt-35-turbo' +''' + +from kg_rag.utility import * +import sys + +CHAT_MODEL_ID = sys.argv[1] + +CONTEXT_VOLUME_LIST = [10, 50, 100, 150, 200] +SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL_LIST = ["pritamdeka/S-PubMedBert-MS-MARCO", "sentence-transformers/all-MiniLM-L6-v2"] +SAVE_NAME_LIST = ["pubmedBert_based_one_hop_questions_parameter_tuning_round_{}.csv", "miniLM_based_one_hop_questions_parameter_tuning_round_{}.csv"] + +QUESTION_PATH = config_data["SINGLE_DISEASE_ENTITY_FILE"] +SYSTEM_PROMPT = system_prompts["SINGLE_DISEASE_ENTITY_VALIDATION"] +QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"]) +QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"]) +VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"] +NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"] +SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"] +TEMPERATURE = config_data["LLM_TEMPERATURE"] +SAVE_PATH = config_data["SAVE_RESULTS_PATH"] + +CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID + +vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL) +node_context_df = pd.read_csv(NODE_CONTEXT_PATH) +edge_evidence = False + +def main(): + start_time = time.time() + question_df = pd.read_csv(QUESTION_PATH) + for tranformer_index, sentence_embedding_model_for_context_retrieval in enumerate(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL_LIST): + embedding_function_for_context_retrieval = load_sentence_transformer(sentence_embedding_model_for_context_retrieval) + for context_index, context_volume in enumerate(CONTEXT_VOLUME_LIST): + answer_list = [] + for index, row in question_df.iterrows(): + question = row["text"] + context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, context_volume, 
QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence) + enriched_prompt = "Context: "+ context + "\n" + "Question: " + question + output = get_GPT_response(enriched_prompt, SYSTEM_PROMPT, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=TEMPERATURE) + if not output: + time.sleep(5) + answer_list.append((row["disease_1"], row["Compounds"], row["Diseases"], row["text"], output, context_volume)) + answer_df = pd.DataFrame(answer_list, columns=["disease", "compound_groundTruth", "disease_groundTruth", "text", "llm_answer", "context_volume"]) + save_name = "_".join(CHAT_MODEL_ID.split("-"))+SAVE_NAME_LIST[tranformer_index].format(context_index+1) + answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True) + print("Completed in {} min".format((time.time()-start_time)/60)) + + + +if __name__ == "__main__": + main() + + diff --git a/kg_rag/rag_based_generation/GPT/run_true_false_generation.py b/kg_rag/rag_based_generation/GPT/run_true_false_generation.py new file mode 100644 index 0000000..7b8d0e3 --- /dev/null +++ b/kg_rag/rag_based_generation/GPT/run_true_false_generation.py @@ -0,0 +1,52 @@ +''' +This script takes the True/False style questions from the csv file and save the result as another csv file. +Before running this script, make sure to configure the filepaths in config.yaml file. 
+Command line argument should be either 'gpt-4' or 'gpt-35-turbo' +''' + +from kg_rag.utility import * +import sys + +CHAT_MODEL_ID = sys.argv[1] + +QUESTION_PATH = config_data["TRUE_FALSE_PATH"] +SYSTEM_PROMPT = system_prompts["TRUE_FALSE_QUESTION"] +QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"]) +QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"]) +VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"] +NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"] +SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"] +SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"] +TEMPERATURE = config_data["LLM_TEMPERATURE"] +SAVE_PATH = config_data["SAVE_RESULTS_PATH"] +CONTEXT_VOLUME = 100 + + +CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID + +save_name = "_".join(CHAT_MODEL_ID.split("-"))+"_kg_rag_based_true_false_binary_response.csv" + + +vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL) +embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL) +node_context_df = pd.read_csv(NODE_CONTEXT_PATH) +edge_evidence = False + +def main(): + start_time = time.time() + question_df = pd.read_csv(QUESTION_PATH) + answer_list = [] + for index, row in question_df.iterrows(): + question = row["text"] + context = retrieve_context(row["text"], vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence) + enriched_prompt = "Context: "+ context + "\n" + "Question: "+ question + output = get_GPT_response(enriched_prompt, SYSTEM_PROMPT, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=TEMPERATURE) + answer_list.append((row["text"], row["label"], output)) 
+ answer_df = pd.DataFrame(answer_list, columns=["question", "label", "llm_answer"]) + answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True) + print("Completed in {} min".format((time.time()-start_time)/60)) + + +if __name__ == "__main__": + main() + diff --git a/kg_rag/rag_based_generation/GPT/run_two_disease_entity_hyperparameter_tuning.py b/kg_rag/rag_based_generation/GPT/run_two_disease_entity_hyperparameter_tuning.py new file mode 100644 index 0000000..043f39d --- /dev/null +++ b/kg_rag/rag_based_generation/GPT/run_two_disease_entity_hyperparameter_tuning.py @@ -0,0 +1,57 @@ +''' +This script is used for hyperparameter tuning on two-hop graph traversal questions. +Hyperparameters are 'CONTEXT_VOLUME_LIST' and 'SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL_LIST' + +This will run on two-hop graph traveral questions from the csv file and save the result as another csv file. + +Before running this script, make sure to configure the filepaths in config.yaml file. 
+Command line argument should be either 'gpt-4' or 'gpt-35-turbo' +''' + +from kg_rag.utility import * +import sys + + +CHAT_MODEL_ID = sys.argv[1] + +CONTEXT_VOLUME_LIST = [10, 50, 100, 150, 200] +SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL_LIST = ["pritamdeka/S-PubMedBert-MS-MARCO", "sentence-transformers/all-MiniLM-L6-v2"] +SAVE_NAME_LIST = ["pubmedBert_based_two_hop_questions_parameter_tuning_round_{}.csv", "miniLM_based_two_hop_questions_parameter_tuning_round_{}.csv"] + +QUESTION_PATH = config_data["TWO_DISEASE_ENTITY_FILE"] +SYSTEM_PROMPT = system_prompts["TWO_DISEASE_ENTITY_VALIDATION"] +QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"]) +QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"]) +VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"] +NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"] +SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"] +TEMPERATURE = config_data["LLM_TEMPERATURE"] +SAVE_PATH = config_data["SAVE_RESULTS_PATH"] + +vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL) +embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL) +node_context_df = pd.read_csv(NODE_CONTEXT_PATH) +edge_evidence = False + +def main(): + start_time = time.time() + question_df = pd.read_csv(QUESTION_PATH) + for tranformer_index, sentence_embedding_model_for_context_retrieval in enumerate(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL_LIST): + for context_index, context_volume in enumerate(CONTEXT_VOLUME_LIST): + answer_list = [] + for index, row in question_df.iterrows(): + question = row["text"] + context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, context_volume, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, 
QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence) + enriched_prompt = "Context: "+ context + "\n" + "Question: " + question + output = get_GPT_response(enriched_prompt, system_prompt, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=temperature) + if not output: + time.sleep(5) + answer_list.append((row["disease_1"], row["disease_2"], row["central_nodes"], row["text"], output, context_volume)) + answer_df = pd.DataFrame(answer_list, columns=["disease_1", "disease_2", "central_nodes_groundTruth", "text", "llm_answer", "context_volume"]) + save_name = "_".join(CHAT_MODEL_ID.split("-"))+SAVE_NAME_LIST[tranformer_index].format(context_index+1) + answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True) + print("Completed in {} min".format((time.time()-start_time)/60)) + + +if __name__ == "__main__": + main() diff --git a/kg_rag/rag_based_generation/GPT/text_generation.py b/kg_rag/rag_based_generation/GPT/text_generation.py new file mode 100644 index 0000000..f2fcee1 --- /dev/null +++ b/kg_rag/rag_based_generation/GPT/text_generation.py @@ -0,0 +1,61 @@ +''' +This script takes a question from the user in an interactive fashion and returns the KG-RAG based response in real time +Before running this script, make sure to configure config.yaml file. 
+Command line argument should be either 'gpt-4' or 'gpt-35-turbo' +''' + +from kg_rag.utility import * +import argparse + + + +parser = argparse.ArgumentParser() +parser.add_argument('-g', type=str, default='gpt-35-turbo', help='GPT model selection') +parser.add_argument('-i', type=bool, default=False, help='Flag for interactive mode') +parser.add_argument('-e', type=bool, default=False, help='Flag for showing evidence of association from the graph') +args = parser.parse_args() + +CHAT_MODEL_ID = args.g +INTERACTIVE = args.i +EDGE_EVIDENCE = bool(args.e) + + +SYSTEM_PROMPT = system_prompts["KG_RAG_BASED_TEXT_GENERATION"] +CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"]) +QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"]) +QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"]) +VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"] +NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"] +SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"] +SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"] +TEMPERATURE = config_data["LLM_TEMPERATURE"] + + +CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID if openai.api_type == "azure" else None + + +vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL) +embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL) +node_context_df = pd.read_csv(NODE_CONTEXT_PATH) + +def main(): + print(" ") + question = input("Enter your question : ") + if not INTERACTIVE: + print("Retrieving context from SPOKE graph...") + context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, EDGE_EVIDENCE) 
+ print("Here is the KG-RAG based answer:") + print("") + enriched_prompt = "Context: "+ context + "\n" + "Question: " + question + output = get_GPT_response(enriched_prompt, SYSTEM_PROMPT, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=TEMPERATURE) + stream_out(output) + else: + interactive(question, vectorstore, node_context_df, embedding_function_for_context_retrieval, CHAT_MODEL_ID, EDGE_EVIDENCE, SYSTEM_PROMPT) + + + +if __name__ == "__main__": + main() + + + diff --git a/kg_rag/rag_based_generation/Llama/run_drug_repurposing.py b/kg_rag/rag_based_generation/Llama/run_drug_repurposing.py new file mode 100644 index 0000000..0b8d2f0 --- /dev/null +++ b/kg_rag/rag_based_generation/Llama/run_drug_repurposing.py @@ -0,0 +1,60 @@ +''' +This script takes the drug repurposing style questions from the csv file and save the result as another csv file. +This script makes use of Llama model. +Before running this script, make sure to configure the filepaths in config.yaml file. +''' + +from langchain import PromptTemplate, LLMChain +from kg_rag.utility import * +import sys + +QUESTION_PATH = config_data["DRUG_REPURPOSING_PATH"] +SYSTEM_PROMPT = system_prompts["DRUG_REPURPOSING"] +CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"]) +QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"]) +QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"]) +VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"] +NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"] +SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"] +SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"] +SAVE_PATH = config_data["SAVE_RESULTS_PATH"] +MODEL_NAME = config_data["LLAMA_MODEL_NAME"] +BRANCH_NAME = config_data["LLAMA_MODEL_BRANCH"] +CACHE_DIR = config_data["LLM_CACHE_DIR"] + + +save_name = 
"_".join(MODEL_NAME.split("/")[-1].split("-"))+"_drug_repurposing_questions_response.csv" + + +INSTRUCTION = "Context:\n\n{context} \n\nQuestion: {question}" + +vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL) +embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL) +node_context_df = pd.read_csv(NODE_CONTEXT_PATH) + + + +def main(): + start_time = time.time() + llm = llama_model(MODEL_NAME, BRANCH_NAME, CACHE_DIR, max_new_tokens=1024) + template = get_prompt(INSTRUCTION, SYSTEM_PROMPT) + prompt = PromptTemplate(template=template, input_variables=["context", "question"]) + llm_chain = LLMChain(prompt=prompt, llm=llm) + question_df = pd.read_csv(QUESTION_PATH) + answer_list = [] + for index, row in question_df.iterrows(): + question = row["text"] + context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY) + output = llm_chain.run(context=context, question=question) + answer_list.append((row["disease_in_question"], row["refDisease"], row["compoundGroundTruth"], row["text"], output)) + answer_df = pd.DataFrame(answer_list, columns=["disease_in_question", "refDisease", "compoundGroundTruth", "text", "llm_answer"]) + answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True) + print("Completed in {} min".format((time.time()-start_time)/60)) + + + +if __name__ == "__main__": + main() + + + diff --git a/kg_rag/rag_based_generation/Llama/run_mcq_qa.py b/kg_rag/rag_based_generation/Llama/run_mcq_qa.py new file mode 100644 index 0000000..67ae43c --- /dev/null +++ b/kg_rag/rag_based_generation/Llama/run_mcq_qa.py @@ -0,0 +1,61 @@ +''' +This script takes the MCQ style questions from the csv file and save the result as another csv file. +This script makes use of Llama model. 
+Before running this script, make sure to configure the filepaths in config.yaml file. +''' +from tqdm import tqdm +from langchain import PromptTemplate, LLMChain +from kg_rag.utility import * + + +QUESTION_PATH = config_data["MCQ_PATH"] +SYSTEM_PROMPT = system_prompts["MCQ_QUESTION"] +CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"]) +QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"]) +QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"]) +VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"] +NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"] +SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"] +SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"] +SAVE_PATH = config_data["SAVE_RESULTS_PATH"] +MODEL_NAME = config_data["LLAMA_MODEL_NAME"] +BRANCH_NAME = config_data["LLAMA_MODEL_BRANCH"] +CACHE_DIR = config_data["LLM_CACHE_DIR"] + +save_name = "_".join(MODEL_NAME.split("/")[-1].split("-"))+"_kg_rag_based_mcq_from_monarch_and_robokop_response.csv" + + +INSTRUCTION = "Context:\n\n{context} \n\nQuestion: {question}" + +vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL) +embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL) +node_context_df = pd.read_csv(NODE_CONTEXT_PATH) +edge_evidence = False + + + +def main(): + start_time = time.time() + llm = llama_model(MODEL_NAME, BRANCH_NAME, CACHE_DIR) + template = get_prompt(INSTRUCTION, SYSTEM_PROMPT) + prompt = PromptTemplate(template=template, input_variables=["context", "question"]) + llm_chain = LLMChain(prompt=prompt, llm=llm) + question_df = pd.read_csv(QUESTION_PATH) + answer_list = [] + for index, row in tqdm(question_df.iterrows()): + question = row["text"] + context = 
retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence) + output = llm_chain.run(context=context, question=question) + answer_list.append((row["text"], row["correct_node"], output)) + answer_df = pd.DataFrame(answer_list, columns=["question", "correct_answer", "llm_answer"]) + answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True) + print("Completed in {} min".format((time.time()-start_time)/60)) + + + + + +if __name__ == "__main__": + main() + + diff --git a/kg_rag/rag_based_generation/Llama/run_mcq_qa_medgpt.py b/kg_rag/rag_based_generation/Llama/run_mcq_qa_medgpt.py new file mode 100644 index 0000000..813b601 --- /dev/null +++ b/kg_rag/rag_based_generation/Llama/run_mcq_qa_medgpt.py @@ -0,0 +1,61 @@ +''' +This script takes the MCQ style questions from the csv file and save the result as another csv file. +This script makes use of Llama model. +Before running this script, make sure to configure the filepaths in config.yaml file. 
def main(sample_size=50, random_state=40):
    """Run the KG-RAG multiple-choice benchmark on a random subset of questions.

    Reads the questions from the CSV at ``QUESTION_PATH`` and draws a
    reproducible random sample. For every sampled row, the ``text`` column is
    used as the question: context is fetched with ``retrieve_context`` and
    both are passed to the LLM chain. The answers are written to
    ``SAVE_PATH/save_name`` as a CSV with the columns ``question``,
    ``correct_answer`` (taken from the ``correct_node`` column) and
    ``llm_answer``. The elapsed wall-clock time in minutes is printed at the
    end.

    Args:
        sample_size: Maximum number of questions to evaluate. Defaults to 50,
            the value this script originally hard-coded.
        random_state: Seed passed to ``DataFrame.sample`` so the same subset
            is drawn on every run. Defaults to 40, the original value.
    """
    start_time = time.time()
    llm = llama_model(MODEL_NAME, BRANCH_NAME, CACHE_DIR)
    template = get_prompt(INSTRUCTION, SYSTEM_PROMPT)
    prompt = PromptTemplate(template=template, input_variables=["context", "question"])
    llm_chain = LLMChain(prompt=prompt, llm=llm)

    question_df = pd.read_csv(QUESTION_PATH)
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so cap the request at the table size.
    n_questions = min(sample_size, len(question_df))
    question_df = question_df.sample(n_questions, random_state=random_state)

    answer_list = []
    for _, row in question_df.iterrows():
        question = row["text"]
        context = retrieve_context(
            question,
            vectorstore,
            embedding_function_for_context_retrieval,
            node_context_df,
            CONTEXT_VOLUME,
            QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD,
            QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY,
            edge_evidence,
        )
        output = llm_chain.run(context=context, question=question)
        answer_list.append((question, row["correct_node"], output))

    answer_df = pd.DataFrame(answer_list, columns=["question", "correct_answer", "llm_answer"])
    answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True)
    print("Completed in {} min".format((time.time()-start_time)/60))
def main():
    """Answer every True/False question with the KG-RAG Llama chain and save the results.

    The questions come from the CSV at ``QUESTION_PATH``. Each row's ``text``
    column is sent through ``retrieve_context`` and then, together with the
    retrieved context, through the LLM chain. The output CSV at
    ``SAVE_PATH/save_name`` has the columns ``question``, ``label`` and
    ``llm_answer``. The elapsed time in minutes is printed when done.
    """
    started_at = time.time()

    # Keyword arguments are evaluated left to right, so the model is still
    # loaded before the prompt template is built.
    chain = LLMChain(
        llm=llama_model(MODEL_NAME, BRANCH_NAME, CACHE_DIR),
        prompt=PromptTemplate(
            template=get_prompt(INSTRUCTION, SYSTEM_PROMPT),
            input_variables=["context", "question"],
        ),
    )

    def answer_row(entry):
        # Produce one (question, label, llm_answer) record for a CSV row.
        statement = entry["text"]
        kg_context = retrieve_context(
            statement,
            vectorstore,
            embedding_function_for_context_retrieval,
            node_context_df,
            CONTEXT_VOLUME,
            QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD,
            QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY,
            edge_evidence,
        )
        reply = chain.run(context=kg_context, question=statement)
        return statement, entry["label"], reply

    questions = pd.read_csv(QUESTION_PATH)
    records = [answer_row(entry) for _, entry in questions.iterrows()]

    results = pd.DataFrame(records, columns=["question", "label", "llm_answer"])
    destination = os.path.join(SAVE_PATH, save_name)
    results.to_csv(destination, index=False, header=True)

    elapsed_minutes = (time.time() - started_at) / 60
    print("Completed in {} min".format(elapsed_minutes))
'''
Answer a single user question with KG-RAG using a Llama model.

The question is read from standard input. In the default mode the context is
retrieved once and the answer is generated directly; with ``-i True`` the
question is handed to ``interactive`` from ``kg_rag.utility`` instead.
Before running this script, make sure to configure the filepaths in config.yaml file.
'''
from langchain import PromptTemplate, LLMChain
from kg_rag.utility import *
import argparse


def _str_to_bool(value):
    """Convert a command-line token such as ``True``/``False`` into a bool.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, which
    is True for every non-empty value, so ``-i False`` used to enable the
    option. This converter interprets the text instead.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognised boolean
            spelling (argparse turns this into a usage error).
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "on", "1"):
        return True
    # The empty string is kept as False because bool("") was False before.
    if normalized in ("false", "f", "no", "n", "off", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (True/False), got {!r}".format(value)
    )


parser = argparse.ArgumentParser()
parser.add_argument('-i', type=_str_to_bool, default=False, help='Flag for interactive mode')
parser.add_argument('-m', type=str, default='method-1', help='Method to choose for Llama model')
parser.add_argument('-e', type=_str_to_bool, default=False, help='Flag for showing evidence of association from the graph')
args = parser.parse_args()

INTERACTIVE = args.i
METHOD = args.m
EDGE_EVIDENCE = args.e


SYSTEM_PROMPT = system_prompts["KG_RAG_BASED_TEXT_GENERATION"]
CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"])
QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
MODEL_NAME = config_data["LLAMA_MODEL_NAME"]
BRANCH_NAME = config_data["LLAMA_MODEL_BRANCH"]
CACHE_DIR = config_data["LLM_CACHE_DIR"]


INSTRUCTION = "Context:\n\n{context} \n\nQuestion: {question}"

vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
node_context_df = pd.read_csv(NODE_CONTEXT_PATH)


def main():
    """Prompt for one question and answer it, one-shot or interactively.

    When ``INTERACTIVE`` is false, the prompt and the Llama chain are built,
    context is retrieved with ``retrieve_context`` and the chain is run once.
    Otherwise the question is delegated to ``interactive``.
    """
    print(" ")
    question = input("Enter your question : ")
    if not INTERACTIVE:
        template = get_prompt(INSTRUCTION, SYSTEM_PROMPT)
        prompt = PromptTemplate(template=template, input_variables=["context", "question"])
        llm = llama_model(MODEL_NAME, BRANCH_NAME, CACHE_DIR, stream=True, method=METHOD)
        llm_chain = LLMChain(prompt=prompt, llm=llm)
        print("Retrieving context from SPOKE graph...")
        context = retrieve_context(
            question,
            vectorstore,
            embedding_function_for_context_retrieval,
            node_context_df,
            CONTEXT_VOLUME,
            QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD,
            QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY,
            EDGE_EVIDENCE,
        )
        print("Here is the KG-RAG based answer using Llama:")
        print("")
        # The return value is not used; the model is created with stream=True,
        # so the answer is presumably emitted while generating (TODO confirm
        # against llama_model in kg_rag.utility).
        llm_chain.run(context=context, question=question)
    else:
        interactive(question, vectorstore, node_context_df, embedding_function_for_context_retrieval, "llama", EDGE_EVIDENCE, SYSTEM_PROMPT, llama_method=METHOD)


if __name__ == "__main__":
    main()
