summaryrefslogtreecommitdiff
path: root/kg_rag/rag_based_generation/Llama/run_drug_repurposing.py
diff options
context:
space:
mode:
authormaszhongming <mingz5@illinois.edu>2025-09-16 15:15:29 -0500
committermaszhongming <mingz5@illinois.edu>2025-09-16 15:15:29 -0500
commit73c194f304f827b55081b15524479f82a1b7d94c (patch)
tree5e8660e421915420892c5eca18f1ad680f80a861 /kg_rag/rag_based_generation/Llama/run_drug_repurposing.py
Initial commit
Diffstat (limited to 'kg_rag/rag_based_generation/Llama/run_drug_repurposing.py')
-rw-r--r--kg_rag/rag_based_generation/Llama/run_drug_repurposing.py60
1 files changed, 60 insertions, 0 deletions
diff --git a/kg_rag/rag_based_generation/Llama/run_drug_repurposing.py b/kg_rag/rag_based_generation/Llama/run_drug_repurposing.py
new file mode 100644
index 0000000..0b8d2f0
--- /dev/null
+++ b/kg_rag/rag_based_generation/Llama/run_drug_repurposing.py
@@ -0,0 +1,60 @@
+'''
+This script takes the drug-repurposing style questions from the CSV file and saves the result as another CSV file.
+This script makes use of Llama model.
+Before running this script, make sure to configure the filepaths in config.yaml file.
+'''
+
+from langchain import PromptTemplate, LLMChain
+from kg_rag.utility import *
+import sys
+
# --- Configuration ---------------------------------------------------------
# All values below are read from config.yaml / the system-prompt file loaded
# by kg_rag.utility (star-imported above, which also supplies pd, os, time,
# load_chroma, load_sentence_transformer, etc.).

QUESTION_PATH = config_data["DRUG_REPURPOSING_PATH"]  # input CSV of drug-repurposing questions
SYSTEM_PROMPT = system_prompts["DRUG_REPURPOSING"]  # system prompt injected into the Llama template
CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"])  # max number of context pieces retrieved per question
QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]  # Chroma vector store for disease-node retrieval
NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]  # CSV mapping KG nodes to their textual context
SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
SAVE_PATH = config_data["SAVE_RESULTS_PATH"]  # directory the answer CSV is written to
MODEL_NAME = config_data["LLAMA_MODEL_NAME"]  # HuggingFace model id, e.g. "meta-llama/Llama-2-13b-chat-hf"
BRANCH_NAME = config_data["LLAMA_MODEL_BRANCH"]  # HF repo branch/revision to load
CACHE_DIR = config_data["LLM_CACHE_DIR"]  # local cache dir for model weights


# Output file name derived from the model id, e.g.
# "Llama_2_13b_chat_hf_drug_repurposing_questions_response.csv".
save_name = "_".join(MODEL_NAME.split("/")[-1].split("-"))+"_drug_repurposing_questions_response.csv"


# Per-question instruction template; {context} and {question} are filled by the chain.
INSTRUCTION = "Context:\n\n{context} \n\nQuestion: {question}"

# Heavy resources loaded once at import time so main() can reuse them.
vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
+
+
+
def main():
    """Answer every drug-repurposing question with the Llama model and save a CSV.

    Reads the question CSV from QUESTION_PATH, retrieves KG-based context for
    each question, runs the prompt through the Llama LLM chain, and writes all
    answers to ``SAVE_PATH/save_name``.

    Side effects: loads the Llama model, creates SAVE_PATH if missing, writes
    one CSV file, and prints the total wall-clock time in minutes.
    """
    start_time = time.time()
    # Load the model once up front; 1024 new tokens leaves room for a full answer.
    llm = llama_model(MODEL_NAME, BRANCH_NAME, CACHE_DIR, max_new_tokens=1024)
    template = get_prompt(INSTRUCTION, SYSTEM_PROMPT)
    prompt = PromptTemplate(template=template, input_variables=["context", "question"])
    llm_chain = LLMChain(prompt=prompt, llm=llm)
    question_df = pd.read_csv(QUESTION_PATH)
    answer_list = []
    # The row index is unused, hence "_".
    for _, row in question_df.iterrows():
        question = row["text"]
        # Retrieve the KG context most similar to this question, bounded by the
        # configured volume / similarity thresholds.
        context = retrieve_context(
            question,
            vectorstore,
            embedding_function_for_context_retrieval,
            node_context_df,
            CONTEXT_VOLUME,
            QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD,
            QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY,
        )
        output = llm_chain.run(context=context, question=question)
        answer_list.append((row["disease_in_question"], row["refDisease"], row["compoundGroundTruth"], row["text"], output))
    answer_df = pd.DataFrame(answer_list, columns=["disease_in_question", "refDisease", "compoundGroundTruth", "text", "llm_answer"])
    # Ensure the results directory exists; previously a missing SAVE_PATH made
    # to_csv raise OSError after the (long) generation loop had finished.
    os.makedirs(SAVE_PATH, exist_ok=True)
    answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True)
    print("Completed in {} min".format((time.time()-start_time)/60))
+
+
+
# Run the pipeline only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
+
+
+