summaryrefslogtreecommitdiff
path: root/kg_rag/rag_based_generation
diff options
context:
space:
mode:
authormaszhongming <mingz5@illinois.edu>2025-09-16 15:15:29 -0500
committermaszhongming <mingz5@illinois.edu>2025-09-16 15:15:29 -0500
commit73c194f304f827b55081b15524479f82a1b7d94c (patch)
tree5e8660e421915420892c5eca18f1ad680f80a861 /kg_rag/rag_based_generation
Initial commit
Diffstat (limited to 'kg_rag/rag_based_generation')
-rw-r--r--kg_rag/rag_based_generation/GPT/drug_action.py52
-rw-r--r--kg_rag/rag_based_generation/GPT/drug_repurposing_v2.py68
-rw-r--r--kg_rag/rag_based_generation/GPT/run_drug_repurposing.py57
-rw-r--r--kg_rag/rag_based_generation/GPT/run_mcq_qa.py91
-rw-r--r--kg_rag/rag_based_generation/GPT/run_single_disease_entity_hyperparameter_tuning.py61
-rw-r--r--kg_rag/rag_based_generation/GPT/run_true_false_generation.py52
-rw-r--r--kg_rag/rag_based_generation/GPT/run_two_disease_entity_hyperparameter_tuning.py57
-rw-r--r--kg_rag/rag_based_generation/GPT/text_generation.py61
-rw-r--r--kg_rag/rag_based_generation/Llama/run_drug_repurposing.py60
-rw-r--r--kg_rag/rag_based_generation/Llama/run_mcq_qa.py61
-rw-r--r--kg_rag/rag_based_generation/Llama/run_mcq_qa_medgpt.py61
-rw-r--r--kg_rag/rag_based_generation/Llama/run_true_false_generation.py59
-rw-r--r--kg_rag/rag_based_generation/Llama/text_generation.py60
13 files changed, 800 insertions, 0 deletions
diff --git a/kg_rag/rag_based_generation/GPT/drug_action.py b/kg_rag/rag_based_generation/GPT/drug_action.py
new file mode 100644
index 0000000..60c0acf
--- /dev/null
+++ b/kg_rag/rag_based_generation/GPT/drug_action.py
@@ -0,0 +1,52 @@
+from kg_rag.utility import *
+import argparse
+
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-g', type=str, default='gpt-35-turbo', help='GPT model selection')
+parser.add_argument('-i', type=bool, default=False, help='Flag for interactive mode')
+parser.add_argument('-e', type=bool, default=False, help='Flag for showing evidence of association from the graph')
+args = parser.parse_args()
+
+CHAT_MODEL_ID = args.g
+INTERACTIVE = args.i
+EDGE_EVIDENCE = bool(args.e)
+
+
+SYSTEM_PROMPT = system_prompts["DRUG_ACTION"]
+CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"])
+QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
+QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
+VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
+NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
+SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
+SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
+TEMPERATURE = config_data["LLM_TEMPERATURE"]
+
+
+CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID
+
+vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
+embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
+node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
+
+def main():
+ print(" ")
+ question = input("Enter your question : ")
+ if not INTERACTIVE:
+ print("Retrieving context from SPOKE graph...")
+ context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, EDGE_EVIDENCE)
+ print("Here is the KG-RAG based answer:")
+ print("")
+ enriched_prompt = "Context: "+ context + "\n" + "Question: " + question
+ output = get_GPT_response(enriched_prompt, SYSTEM_PROMPT, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=TEMPERATURE)
+ stream_out(output)
+ else:
+ interactive(question, vectorstore, node_context_df, embedding_function_for_context_retrieval, CHAT_MODEL_ID, EDGE_EVIDENCE, SYSTEM_PROMPT)
+
+
+
+if __name__ == "__main__":
+ main()
+
diff --git a/kg_rag/rag_based_generation/GPT/drug_repurposing_v2.py b/kg_rag/rag_based_generation/GPT/drug_repurposing_v2.py
new file mode 100644
index 0000000..d95053b
--- /dev/null
+++ b/kg_rag/rag_based_generation/GPT/drug_repurposing_v2.py
@@ -0,0 +1,68 @@
+from kg_rag.utility import *
+import argparse
+
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-g', type=str, default='gpt-35-turbo', help='GPT model selection')
+parser.add_argument('-i', type=bool, default=False, help='Flag for interactive mode')
+parser.add_argument('-e', type=bool, default=False, help='Flag for showing evidence of association from the graph')
+args = parser.parse_args()
+
+
+CHAT_MODEL_ID = args.g
+INTERACTIVE = args.i
+EDGE_EVIDENCE = bool(args.e)
+
+SYSTEM_PROMPT = system_prompts["DRUG_REPURPOSING_V2"]
+CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"])
+QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
+QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
+VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
+NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
+SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
+SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
+TEMPERATURE = config_data["LLM_TEMPERATURE"]
+
+
+CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID
+
+vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
+embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
+node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
+
+print('')
+question = input("Question : ")
+
+question_template = f'''
+To the question asked at the end, answer by referring the context.
+See example below
+Example 1:
+ Question:
+ What drugs can be repurposed for disease X?
+ Context:
+ Compound Alizapride DOWNREGULATES Gene APOE and Provenance of this association is XX. Gene APOE ASSOCIATES Disease X and Provenance of this association is YY. Gene TTR encodes Protein Transthyretin (ATTR) and Provenance of this association is ZZ. Compound Acetylcysteine treats Disease X and Provenance of this association is PP.
+ Answer:
+ Since Alizapride downregulates gene APOE (Provenance XX) and APOE is associated with Disease X (Provenance YY), Alizapride can be repurposed to treat Disease X. p-value for these associations is XXXX and z-score values for these associations is YYYY.
+Question:
+{question}
+'''
+
+def main():
+ if not INTERACTIVE:
+ print("Retrieving context from SPOKE graph...")
+ context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, EDGE_EVIDENCE)
+ print("Here is the KG-RAG based answer:")
+ print("")
+ enriched_prompt = "Context: "+ context + "\n" + "Question: " + question
+ output = get_GPT_response(enriched_prompt, SYSTEM_PROMPT, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=TEMPERATURE)
+ stream_out(output)
+ else:
+ interactive(question, vectorstore, node_context_df, embedding_function_for_context_retrieval, CHAT_MODEL_ID, EDGE_EVIDENCE, SYSTEM_PROMPT)
+
+
+
+if __name__ == "__main__":
+ main()
+
+
diff --git a/kg_rag/rag_based_generation/GPT/run_drug_repurposing.py b/kg_rag/rag_based_generation/GPT/run_drug_repurposing.py
new file mode 100644
index 0000000..8a5726d
--- /dev/null
+++ b/kg_rag/rag_based_generation/GPT/run_drug_repurposing.py
@@ -0,0 +1,57 @@
+'''
+This script takes the drug repurposing style questions from the csv file and save the result as another csv file.
+Before running this script, make sure to configure the filepaths in config.yaml file.
+Command line argument should be either 'gpt-4' or 'gpt-35-turbo'
+'''
+
+from kg_rag.utility import *
+import sys
+
+
+
+CHAT_MODEL_ID = sys.argv[1]
+
+QUESTION_PATH = config_data["DRUG_REPURPOSING_PATH"]
+SYSTEM_PROMPT = system_prompts["DRUG_REPURPOSING"]
+CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"])
+QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
+QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
+VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
+NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
+SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
+SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
+TEMPERATURE = config_data["LLM_TEMPERATURE"]
+SAVE_PATH = config_data["SAVE_RESULTS_PATH"]
+
+
+CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID
+
+save_name = "_".join(CHAT_MODEL_ID.split("-"))+"_drug_repurposing_questions_response.csv"
+
+
+vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
+embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
+node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
+
+
+def main():
+ start_time = time.time()
+ question_df = pd.read_csv(QUESTION_PATH)
+ answer_list = []
+ for index, row in question_df.iterrows():
+ question = row["text"]
+ context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY)
+ enriched_prompt = "Context: " + context + "\n" + "Question: " + question
+ output = get_GPT_response(enriched_prompt, SYSTEM_PROMPT, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=TEMPERATURE)
+ answer_list.append((row["disease_in_question"], row["refDisease"], row["compoundGroundTruth"], row["text"], output))
+ answer_df = pd.DataFrame(answer_list, columns=["disease_in_question", "refDisease", "compoundGroundTruth", "text", "llm_answer"])
+ answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True)
+ print("Completed in {} min".format((time.time()-start_time)/60))
+
+
+
+if __name__ == "__main__":
+ main()
+
+
+
diff --git a/kg_rag/rag_based_generation/GPT/run_mcq_qa.py b/kg_rag/rag_based_generation/GPT/run_mcq_qa.py
new file mode 100644
index 0000000..edf0415
--- /dev/null
+++ b/kg_rag/rag_based_generation/GPT/run_mcq_qa.py
@@ -0,0 +1,91 @@
+'''
+This script takes the MCQ style questions from the csv file and save the result as another csv file.
+Before running this script, make sure to configure the filepaths in config.yaml file.
+Command line argument should be either 'gpt-4' or 'gpt-35-turbo'
+'''
+
+from kg_rag.utility import *
+import sys
+
+
+from tqdm import tqdm
+CHAT_MODEL_ID = sys.argv[1]
+
+QUESTION_PATH = config_data["MCQ_PATH"]
+SYSTEM_PROMPT = system_prompts["MCQ_QUESTION"]
+CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"])
+QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
+QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
+VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
+NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
+SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
+SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
+TEMPERATURE = config_data["LLM_TEMPERATURE"]
+SAVE_PATH = config_data["SAVE_RESULTS_PATH"]
+
+
+CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID
+
+save_name = "_".join(CHAT_MODEL_ID.split("-"))+"_kg_rag_based_mcq_{mode}.csv"
+
+
+vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
+embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
+node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
+edge_evidence = False
+
+
+MODE = "0"
+### MODE 0: Original KG_RAG ###
+### MODE 1: jsonlize the context from KG search ###
+### MODE 2: Add the prior domain knowledge ###
+### MODE 3: Combine MODE 1 & 2 ###
+
+def main():
+ start_time = time.time()
+ question_df = pd.read_csv(QUESTION_PATH)
+ answer_list = []
+
+ for index, row in tqdm(question_df.iterrows(), total=306):
+ try:
+ question = row["text"]
+ if MODE == "0":
+ ### MODE 0: Original KG_RAG ###
+ context = retrieve_context(row["text"], vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence, model_id=CHAT_MODEL_ID)
+ enriched_prompt = "Context: "+ context + "\n" + "Question: "+ question
+ output = get_Gemini_response(enriched_prompt, SYSTEM_PROMPT, temperature=TEMPERATURE)
+
+ if MODE == "1":
+ ### MODE 1: jsonlize the context from KG search ###
+ ### Please implement the first strategy here ###
+ output = '...'
+
+ if MODE == "2":
+ ### MODE 2: Add the prior domain knowledge ###
+ ### Please implement the second strategy here ###
+ output = '...'
+
+ if MODE == "3":
+ ### MODE 3: Combine MODE 1 & 2 ###
+ ### Please implement the third strategy here ###
+ output = '...'
+
+ answer_list.append((row["text"], row["correct_node"], output))
+ except Exception as e:
+ print("Error in processing question: ", row["text"])
+ print("Error: ", e)
+ answer_list.append((row["text"], row["correct_node"], "Error"))
+
+
+ answer_df = pd.DataFrame(answer_list, columns=["question", "correct_answer", "llm_answer"])
+ output_file = os.path.join(SAVE_PATH, f"{save_name}".format(mode=MODE),)
+ answer_df.to_csv(output_file, index=False, header=True)
+ print("Save the model outputs in ", output_file)
+ print("Completed in {} min".format((time.time()-start_time)/60))
+
+
+
+if __name__ == "__main__":
+ main()
+
+
diff --git a/kg_rag/rag_based_generation/GPT/run_single_disease_entity_hyperparameter_tuning.py b/kg_rag/rag_based_generation/GPT/run_single_disease_entity_hyperparameter_tuning.py
new file mode 100644
index 0000000..aaf8071
--- /dev/null
+++ b/kg_rag/rag_based_generation/GPT/run_single_disease_entity_hyperparameter_tuning.py
@@ -0,0 +1,61 @@
+'''
+This script is used for hyperparameter tuning on one-hop graph traversal questions.
+Hyperparameters are 'CONTEXT_VOLUME_LIST' and 'SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL_LIST'
+
+This will run on one-hop graph traversal questions from the csv file and save the result as another csv file.
+
+Before running this script, make sure to configure the filepaths in config.yaml file.
+Command line argument should be either 'gpt-4' or 'gpt-35-turbo'
+'''
+
+from kg_rag.utility import *
+import sys
+
+CHAT_MODEL_ID = sys.argv[1]
+
+CONTEXT_VOLUME_LIST = [10, 50, 100, 150, 200]
+SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL_LIST = ["pritamdeka/S-PubMedBert-MS-MARCO", "sentence-transformers/all-MiniLM-L6-v2"]
+SAVE_NAME_LIST = ["pubmedBert_based_one_hop_questions_parameter_tuning_round_{}.csv", "miniLM_based_one_hop_questions_parameter_tuning_round_{}.csv"]
+
+QUESTION_PATH = config_data["SINGLE_DISEASE_ENTITY_FILE"]
+SYSTEM_PROMPT = system_prompts["SINGLE_DISEASE_ENTITY_VALIDATION"]
+QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
+QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
+VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
+NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
+SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
+TEMPERATURE = config_data["LLM_TEMPERATURE"]
+SAVE_PATH = config_data["SAVE_RESULTS_PATH"]
+
+CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID
+
+vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
+node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
+edge_evidence = False
+
+def main():
+ start_time = time.time()
+ question_df = pd.read_csv(QUESTION_PATH)
+ for tranformer_index, sentence_embedding_model_for_context_retrieval in enumerate(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL_LIST):
+ embedding_function_for_context_retrieval = load_sentence_transformer(sentence_embedding_model_for_context_retrieval)
+ for context_index, context_volume in enumerate(CONTEXT_VOLUME_LIST):
+ answer_list = []
+ for index, row in question_df.iterrows():
+ question = row["text"]
+ context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, context_volume, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence)
+ enriched_prompt = "Context: "+ context + "\n" + "Question: " + question
+ output = get_GPT_response(enriched_prompt, SYSTEM_PROMPT, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=TEMPERATURE)
+ if not output:
+ time.sleep(5)
+ answer_list.append((row["disease_1"], row["Compounds"], row["Diseases"], row["text"], output, context_volume))
+ answer_df = pd.DataFrame(answer_list, columns=["disease", "compound_groundTruth", "disease_groundTruth", "text", "llm_answer", "context_volume"])
+ save_name = "_".join(CHAT_MODEL_ID.split("-"))+SAVE_NAME_LIST[tranformer_index].format(context_index+1)
+ answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True)
+ print("Completed in {} min".format((time.time()-start_time)/60))
+
+
+
+if __name__ == "__main__":
+ main()
+
+
diff --git a/kg_rag/rag_based_generation/GPT/run_true_false_generation.py b/kg_rag/rag_based_generation/GPT/run_true_false_generation.py
new file mode 100644
index 0000000..7b8d0e3
--- /dev/null
+++ b/kg_rag/rag_based_generation/GPT/run_true_false_generation.py
@@ -0,0 +1,52 @@
+'''
+This script takes the True/False style questions from the csv file and save the result as another csv file.
+Before running this script, make sure to configure the filepaths in config.yaml file.
+Command line argument should be either 'gpt-4' or 'gpt-35-turbo'
+'''
+
+from kg_rag.utility import *
+import sys
+
+CHAT_MODEL_ID = sys.argv[1]
+
+QUESTION_PATH = config_data["TRUE_FALSE_PATH"]
+SYSTEM_PROMPT = system_prompts["TRUE_FALSE_QUESTION"]
+QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
+QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
+VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
+NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
+SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
+SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
+TEMPERATURE = config_data["LLM_TEMPERATURE"]
+SAVE_PATH = config_data["SAVE_RESULTS_PATH"]
+CONTEXT_VOLUME = 100
+
+
+CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID
+
+save_name = "_".join(CHAT_MODEL_ID.split("-"))+"_kg_rag_based_true_false_binary_response.csv"
+
+
+vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
+embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
+node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
+edge_evidence = False
+
+def main():
+ start_time = time.time()
+ question_df = pd.read_csv(QUESTION_PATH)
+ answer_list = []
+ for index, row in question_df.iterrows():
+ question = row["text"]
+ context = retrieve_context(row["text"], vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence)
+ enriched_prompt = "Context: "+ context + "\n" + "Question: "+ question
+ output = get_GPT_response(enriched_prompt, SYSTEM_PROMPT, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=TEMPERATURE)
+ answer_list.append((row["text"], row["label"], output))
+ answer_df = pd.DataFrame(answer_list, columns=["question", "label", "llm_answer"])
+ answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True)
+ print("Completed in {} min".format((time.time()-start_time)/60))
+
+
+if __name__ == "__main__":
+ main()
+
diff --git a/kg_rag/rag_based_generation/GPT/run_two_disease_entity_hyperparameter_tuning.py b/kg_rag/rag_based_generation/GPT/run_two_disease_entity_hyperparameter_tuning.py
new file mode 100644
index 0000000..043f39d
--- /dev/null
+++ b/kg_rag/rag_based_generation/GPT/run_two_disease_entity_hyperparameter_tuning.py
@@ -0,0 +1,57 @@
+'''
+This script is used for hyperparameter tuning on two-hop graph traversal questions.
+Hyperparameters are 'CONTEXT_VOLUME_LIST' and 'SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL_LIST'
+
+This will run on two-hop graph traversal questions from the csv file and save the result as another csv file.
+
+Before running this script, make sure to configure the filepaths in config.yaml file.
+Command line argument should be either 'gpt-4' or 'gpt-35-turbo'
+'''
+
+from kg_rag.utility import *
+import sys
+
+
+CHAT_MODEL_ID = sys.argv[1]
+
+CONTEXT_VOLUME_LIST = [10, 50, 100, 150, 200]
+SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL_LIST = ["pritamdeka/S-PubMedBert-MS-MARCO", "sentence-transformers/all-MiniLM-L6-v2"]
+SAVE_NAME_LIST = ["pubmedBert_based_two_hop_questions_parameter_tuning_round_{}.csv", "miniLM_based_two_hop_questions_parameter_tuning_round_{}.csv"]
+
+QUESTION_PATH = config_data["TWO_DISEASE_ENTITY_FILE"]
+SYSTEM_PROMPT = system_prompts["TWO_DISEASE_ENTITY_VALIDATION"]
+QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
+QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
+VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
+NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
+SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
+TEMPERATURE = config_data["LLM_TEMPERATURE"]
+SAVE_PATH = config_data["SAVE_RESULTS_PATH"]
+
+vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
+embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
+node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
+edge_evidence = False
+
+def main():
+ start_time = time.time()
+ question_df = pd.read_csv(QUESTION_PATH)
+ for tranformer_index, sentence_embedding_model_for_context_retrieval in enumerate(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL_LIST):
+ for context_index, context_volume in enumerate(CONTEXT_VOLUME_LIST):
+ answer_list = []
+ for index, row in question_df.iterrows():
+ question = row["text"]
+ context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, context_volume, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence)
+ enriched_prompt = "Context: "+ context + "\n" + "Question: " + question
+ output = get_GPT_response(enriched_prompt, system_prompt, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=temperature)
+ if not output:
+ time.sleep(5)
+ answer_list.append((row["disease_1"], row["disease_2"], row["central_nodes"], row["text"], output, context_volume))
+ answer_df = pd.DataFrame(answer_list, columns=["disease_1", "disease_2", "central_nodes_groundTruth", "text", "llm_answer", "context_volume"])
+ save_name = "_".join(CHAT_MODEL_ID.split("-"))+SAVE_NAME_LIST[tranformer_index].format(context_index+1)
+ answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True)
+ print("Completed in {} min".format((time.time()-start_time)/60))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/kg_rag/rag_based_generation/GPT/text_generation.py b/kg_rag/rag_based_generation/GPT/text_generation.py
new file mode 100644
index 0000000..f2fcee1
--- /dev/null
+++ b/kg_rag/rag_based_generation/GPT/text_generation.py
@@ -0,0 +1,61 @@
+'''
+This script takes a question from the user in an interactive fashion and returns the KG-RAG based response in real time
+Before running this script, make sure to configure config.yaml file.
+Select the model with the -g option (e.g. 'gpt-4' or 'gpt-35-turbo'); the -i and -e options toggle interactive mode and graph edge evidence.
+'''
+
+from kg_rag.utility import *
+import argparse
+
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-g', type=str, default='gpt-35-turbo', help='GPT model selection')
+parser.add_argument('-i', type=bool, default=False, help='Flag for interactive mode')
+parser.add_argument('-e', type=bool, default=False, help='Flag for showing evidence of association from the graph')
+args = parser.parse_args()
+
+CHAT_MODEL_ID = args.g
+INTERACTIVE = args.i
+EDGE_EVIDENCE = bool(args.e)
+
+
+SYSTEM_PROMPT = system_prompts["KG_RAG_BASED_TEXT_GENERATION"]
+CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"])
+QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
+QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
+VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
+NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
+SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
+SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
+TEMPERATURE = config_data["LLM_TEMPERATURE"]
+
+
+CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID if openai.api_type == "azure" else None
+
+
+vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
+embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
+node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
+
+def main():
+ print(" ")
+ question = input("Enter your question : ")
+ if not INTERACTIVE:
+ print("Retrieving context from SPOKE graph...")
+ context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, EDGE_EVIDENCE)
+ print("Here is the KG-RAG based answer:")
+ print("")
+ enriched_prompt = "Context: "+ context + "\n" + "Question: " + question
+ output = get_GPT_response(enriched_prompt, SYSTEM_PROMPT, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=TEMPERATURE)
+ stream_out(output)
+ else:
+ interactive(question, vectorstore, node_context_df, embedding_function_for_context_retrieval, CHAT_MODEL_ID, EDGE_EVIDENCE, SYSTEM_PROMPT)
+
+
+
+if __name__ == "__main__":
+ main()
+
+
+
diff --git a/kg_rag/rag_based_generation/Llama/run_drug_repurposing.py b/kg_rag/rag_based_generation/Llama/run_drug_repurposing.py
new file mode 100644
index 0000000..0b8d2f0
--- /dev/null
+++ b/kg_rag/rag_based_generation/Llama/run_drug_repurposing.py
@@ -0,0 +1,60 @@
+'''
+This script takes the drug repurposing style questions from the csv file and save the result as another csv file.
+This script makes use of Llama model.
+Before running this script, make sure to configure the filepaths in config.yaml file.
+'''
+
+from langchain import PromptTemplate, LLMChain
+from kg_rag.utility import *
+import sys
+
+QUESTION_PATH = config_data["DRUG_REPURPOSING_PATH"]
+SYSTEM_PROMPT = system_prompts["DRUG_REPURPOSING"]
+CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"])
+QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
+QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
+VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
+NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
+SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
+SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
+SAVE_PATH = config_data["SAVE_RESULTS_PATH"]
+MODEL_NAME = config_data["LLAMA_MODEL_NAME"]
+BRANCH_NAME = config_data["LLAMA_MODEL_BRANCH"]
+CACHE_DIR = config_data["LLM_CACHE_DIR"]
+
+
+save_name = "_".join(MODEL_NAME.split("/")[-1].split("-"))+"_drug_repurposing_questions_response.csv"
+
+
+INSTRUCTION = "Context:\n\n{context} \n\nQuestion: {question}"
+
+vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
+embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
+node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
+
+
+
+def main():
+ start_time = time.time()
+ llm = llama_model(MODEL_NAME, BRANCH_NAME, CACHE_DIR, max_new_tokens=1024)
+ template = get_prompt(INSTRUCTION, SYSTEM_PROMPT)
+ prompt = PromptTemplate(template=template, input_variables=["context", "question"])
+ llm_chain = LLMChain(prompt=prompt, llm=llm)
+ question_df = pd.read_csv(QUESTION_PATH)
+ answer_list = []
+ for index, row in question_df.iterrows():
+ question = row["text"]
+ context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY)
+ output = llm_chain.run(context=context, question=question)
+ answer_list.append((row["disease_in_question"], row["refDisease"], row["compoundGroundTruth"], row["text"], output))
+ answer_df = pd.DataFrame(answer_list, columns=["disease_in_question", "refDisease", "compoundGroundTruth", "text", "llm_answer"])
+ answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True)
+ print("Completed in {} min".format((time.time()-start_time)/60))
+
+
+
+if __name__ == "__main__":
+ main()
+
+
+
diff --git a/kg_rag/rag_based_generation/Llama/run_mcq_qa.py b/kg_rag/rag_based_generation/Llama/run_mcq_qa.py
new file mode 100644
index 0000000..67ae43c
--- /dev/null
+++ b/kg_rag/rag_based_generation/Llama/run_mcq_qa.py
@@ -0,0 +1,61 @@
+'''
+This script takes the MCQ style questions from the csv file and save the result as another csv file.
+This script makes use of Llama model.
+Before running this script, make sure to configure the filepaths in config.yaml file.
+'''
+from tqdm import tqdm
+from langchain import PromptTemplate, LLMChain
+from kg_rag.utility import *
+
+
+QUESTION_PATH = config_data["MCQ_PATH"]
+SYSTEM_PROMPT = system_prompts["MCQ_QUESTION"]
+CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"])
+QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
+QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
+VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
+NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
+SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
+SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
+SAVE_PATH = config_data["SAVE_RESULTS_PATH"]
+MODEL_NAME = config_data["LLAMA_MODEL_NAME"]
+BRANCH_NAME = config_data["LLAMA_MODEL_BRANCH"]
+CACHE_DIR = config_data["LLM_CACHE_DIR"]
+
+save_name = "_".join(MODEL_NAME.split("/")[-1].split("-"))+"_kg_rag_based_mcq_from_monarch_and_robokop_response.csv"
+
+
+INSTRUCTION = "Context:\n\n{context} \n\nQuestion: {question}"
+
+vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
+embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
+node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
+edge_evidence = False
+
+
+
+def main():
+ start_time = time.time()
+ llm = llama_model(MODEL_NAME, BRANCH_NAME, CACHE_DIR)
+ template = get_prompt(INSTRUCTION, SYSTEM_PROMPT)
+ prompt = PromptTemplate(template=template, input_variables=["context", "question"])
+ llm_chain = LLMChain(prompt=prompt, llm=llm)
+ question_df = pd.read_csv(QUESTION_PATH)
+ answer_list = []
+ for index, row in tqdm(question_df.iterrows()):
+ question = row["text"]
+ context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence)
+ output = llm_chain.run(context=context, question=question)
+ answer_list.append((row["text"], row["correct_node"], output))
+ answer_df = pd.DataFrame(answer_list, columns=["question", "correct_answer", "llm_answer"])
+ answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True)
+ print("Completed in {} min".format((time.time()-start_time)/60))
+
+
+
+
+
+if __name__ == "__main__":
+ main()
+
+
diff --git a/kg_rag/rag_based_generation/Llama/run_mcq_qa_medgpt.py b/kg_rag/rag_based_generation/Llama/run_mcq_qa_medgpt.py
new file mode 100644
index 0000000..813b601
--- /dev/null
+++ b/kg_rag/rag_based_generation/Llama/run_mcq_qa_medgpt.py
@@ -0,0 +1,61 @@
+'''
+This script takes the MCQ style questions from the csv file and saves the results as another csv file.
+This script uses the BioMedGPT-LM-7B model, loaded through the Llama model loader.
+Before running this script, make sure to configure the filepaths in config.yaml file.
+'''
+
+from langchain import PromptTemplate, LLMChain
+from kg_rag.utility import *
+
+
+QUESTION_PATH = config_data["MCQ_PATH"]  # CSV of questions; main() reads it with pd.read_csv
+SYSTEM_PROMPT = system_prompts["MCQ_QUESTION"]
+CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"])
+QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
+QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
+VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
+NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
+SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
+SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
+SAVE_PATH = config_data["SAVE_RESULTS_PATH"]
+MODEL_NAME = 'PharMolix/BioMedGPT-LM-7B'  # hard-coded in this script rather than read from config_data
+BRANCH_NAME = 'main'  # hard-coded as well; passed to llama_model() together with MODEL_NAME
+CACHE_DIR = config_data["LLM_CACHE_DIR"]
+# Output file name: the last "/"-separated part of MODEL_NAME with "-" replaced by "_", plus a fixed suffix.
+save_name = "_".join(MODEL_NAME.split("/")[-1].split("-"))+"_kg_rag_based_mcq_from_monarch_and_robokop_response.csv"
+
+# Prompt body; the {context} and {question} placeholders are filled by llm_chain.run() in main().
+INSTRUCTION = "Context:\n\n{context} \n\nQuestion: {question}"
+# Resources built once at import time and reused for every question in main().
+vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
+embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
+node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
+edge_evidence = False  # forwarded as the last positional argument of retrieve_context()
+
+
+def main():
+    """Answer a fixed random sample of MCQs from QUESTION_PATH with the KG-RAG chain (model MODEL_NAME) and save the responses as a CSV."""
+    start_time = time.time()
+    if SAVE_PATH:  # create the output directory up front so the final to_csv() cannot fail on a missing folder
+        os.makedirs(SAVE_PATH, exist_ok=True)
+    llm = llama_model(MODEL_NAME, BRANCH_NAME, CACHE_DIR)
+    prompt = PromptTemplate(template=get_prompt(INSTRUCTION, SYSTEM_PROMPT), input_variables=["context", "question"])
+    llm_chain = LLMChain(prompt=prompt, llm=llm)
+    question_df = pd.read_csv(QUESTION_PATH)
+    # DataFrame.sample raises ValueError when asked for more rows than exist, so cap the sample size at the row count.
+    question_df = question_df.sample(min(50, len(question_df)), random_state=40)
+    answer_list = []
+    for _, row in question_df.iterrows():
+        question = row["text"]
+        context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence)
+        output = llm_chain.run(context=context, question=question)
+        answer_list.append((question, row["correct_node"], output))
+    answer_df = pd.DataFrame(answer_list, columns=["question", "correct_answer", "llm_answer"])
+    answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True)
+    print("Completed in {} min".format((time.time()-start_time)/60))
+
+
+if __name__ == "__main__":
+    main()
+
+
diff --git a/kg_rag/rag_based_generation/Llama/run_true_false_generation.py b/kg_rag/rag_based_generation/Llama/run_true_false_generation.py
new file mode 100644
index 0000000..fa1a37d
--- /dev/null
+++ b/kg_rag/rag_based_generation/Llama/run_true_false_generation.py
@@ -0,0 +1,59 @@
+'''
+This script takes the True/False style questions from the csv file and saves the results as another csv file.
+This script uses a Llama model.
+Before running this script, make sure to configure the filepaths in config.yaml file.
+'''
+
+from langchain import PromptTemplate, LLMChain
+from kg_rag.utility import *
+import sys
+
+
+QUESTION_PATH = config_data["TRUE_FALSE_PATH"]  # CSV of questions; main() reads it with pd.read_csv
+SYSTEM_PROMPT = system_prompts["TRUE_FALSE_QUESTION"]
+QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
+QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
+VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
+NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
+SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
+SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
+SAVE_PATH = config_data["SAVE_RESULTS_PATH"]
+MODEL_NAME = config_data["LLAMA_MODEL_NAME"]
+BRANCH_NAME = config_data["LLAMA_MODEL_BRANCH"]
+CACHE_DIR = config_data["LLM_CACHE_DIR"]
+CONTEXT_VOLUME = 100  # fixed in this script (not read from config_data)
+edge_evidence = False  # forwarded as the last positional argument of retrieve_context()
+# Output file name: the last "/"-separated part of MODEL_NAME with "-" replaced by "_", plus a fixed suffix.
+save_name = "_".join(MODEL_NAME.split("/")[-1].split("-"))+"_kg_rag_based_true_false_binary_response.csv"
+
+# Prompt body; the {context} and {question} placeholders are filled by llm_chain.run() in main().
+INSTRUCTION = "Context:\n\n{context} \n\nQuestion: {question}"
+# Resources built once at import time and reused for every question in main().
+vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
+embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
+node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
+
+
+def main():
+    """Answer each True/False question in QUESTION_PATH with the KG-RAG Llama chain and save question, label and LLM answer as a CSV."""
+    start_time = time.time()
+    if SAVE_PATH:  # create the output directory up front so the final to_csv() cannot fail on a missing folder
+        os.makedirs(SAVE_PATH, exist_ok=True)
+    llm = llama_model(MODEL_NAME, BRANCH_NAME, CACHE_DIR)
+    prompt = PromptTemplate(template=get_prompt(INSTRUCTION, SYSTEM_PROMPT), input_variables=["context", "question"])
+    llm_chain = LLMChain(prompt=prompt, llm=llm)
+    question_df = pd.read_csv(QUESTION_PATH)
+    answer_list = []
+    for _, row in question_df.iterrows():
+        question = row["text"]
+        context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence)
+        output = llm_chain.run(context=context, question=question)
+        answer_list.append((question, row["label"], output))
+    pd.DataFrame(answer_list, columns=["question", "label", "llm_answer"]).to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True)
+    print("Completed in {} min".format((time.time()-start_time)/60))
+
+
+if __name__ == "__main__":
+    main()
+
+ \ No newline at end of file
diff --git a/kg_rag/rag_based_generation/Llama/text_generation.py b/kg_rag/rag_based_generation/Llama/text_generation.py
new file mode 100644
index 0000000..2824135
--- /dev/null
+++ b/kg_rag/rag_based_generation/Llama/text_generation.py
@@ -0,0 +1,60 @@
+from langchain import PromptTemplate, LLMChain
+from kg_rag.utility import *
+import argparse
+
+
+
+parser = argparse.ArgumentParser()
+# type=bool would turn any non-empty value (even "False") into True, so the flag text is parsed explicitly.
+def _parse_flag(text):
+    return str(text).strip().lower() not in ("", "0", "false", "f", "no", "n", "off")
+parser.add_argument('-i', type=_parse_flag, default=False, help='Flag for interactive mode')
+parser.add_argument('-m', type=str, default='method-1', help='Method to choose for Llama model')
+parser.add_argument('-e', type=_parse_flag, default=False, help='Flag for showing evidence of association from the graph')
+args = parser.parse_args()
+INTERACTIVE, METHOD, EDGE_EVIDENCE = args.i, args.m, args.e
+
+
+SYSTEM_PROMPT = system_prompts["KG_RAG_BASED_TEXT_GENERATION"]
+CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"])
+QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
+QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
+VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
+NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
+SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
+SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
+MODEL_NAME = config_data["LLAMA_MODEL_NAME"]
+BRANCH_NAME = config_data["LLAMA_MODEL_BRANCH"]
+CACHE_DIR = config_data["LLM_CACHE_DIR"]
+
+# Prompt body; the {context} and {question} placeholders are filled by llm_chain.run() in main().
+INSTRUCTION = "Context:\n\n{context} \n\nQuestion: {question}"
+# Resources built once at import time; main() passes them to retrieve_context() or interactive().
+vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
+embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
+node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
+
+def main():
+    """Prompt for a question on stdin and answer it with the KG-RAG Llama pipeline.
+
+    With INTERACTIVE set, the question is handed to interactive(); otherwise context is
+    retrieved once and a single LLM chain run produces the answer.
+    """
+    print(" ")
+    question = input("Enter your question : ")
+    if INTERACTIVE:
+        interactive(question, vectorstore, node_context_df, embedding_function_for_context_retrieval, "llama", EDGE_EVIDENCE, SYSTEM_PROMPT, llama_method=METHOD)
+        return
+    prompt_text = get_prompt(INSTRUCTION, SYSTEM_PROMPT)
+    chain_prompt = PromptTemplate(template=prompt_text, input_variables=["context", "question"])
+    model = llama_model(MODEL_NAME, BRANCH_NAME, CACHE_DIR, stream=True, method=METHOD)
+    llm_chain = LLMChain(prompt=chain_prompt, llm=model)
+    print("Retrieving context from SPOKE graph...")
+    context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, EDGE_EVIDENCE)
+    print("Here is the KG-RAG based answer using Llama:")
+    print("")
+    llm_chain.run(context=context, question=question)  # return value unused; with stream=True the model presumably prints the answer itself (TODO confirm)
+
+
+if __name__ == "__main__":
+    main()