summaryrefslogtreecommitdiff
path: root/kg_rag/rag_based_generation/GPT/run_mcq_qa.py
diff options
context:
space:
mode:
Diffstat (limited to 'kg_rag/rag_based_generation/GPT/run_mcq_qa.py')
-rw-r--r--kg_rag/rag_based_generation/GPT/run_mcq_qa.py91
1 files changed, 91 insertions, 0 deletions
diff --git a/kg_rag/rag_based_generation/GPT/run_mcq_qa.py b/kg_rag/rag_based_generation/GPT/run_mcq_qa.py
new file mode 100644
index 0000000..edf0415
--- /dev/null
+++ b/kg_rag/rag_based_generation/GPT/run_mcq_qa.py
@@ -0,0 +1,91 @@
+'''
+This script takes the MCQ-style questions from the csv file and saves the results as another csv file.
+Before running this script, make sure to configure the filepaths in config.yaml file.
+Command line argument should be either 'gpt-4' or 'gpt-35-turbo'
+'''
+
+from kg_rag.utility import *
+import sys
+
+
+from tqdm import tqdm
# Model identifier passed on the command line.
# NOTE(review): the docstring says 'gpt-4' / 'gpt-35-turbo', but main() calls
# get_Gemini_response — confirm which backend this id actually selects.
CHAT_MODEL_ID = sys.argv[1]

# Paths and retrieval hyper-parameters, all read from config.yaml
# (config_data / system_prompts are loaded by kg_rag.utility).
QUESTION_PATH = config_data["MCQ_PATH"]  # input CSV of MCQ questions
SYSTEM_PROMPT = system_prompts["MCQ_QUESTION"]  # system prompt used for every MCQ query
CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"])  # amount of KG context to retrieve per question
QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]  # Chroma vector store location
NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]  # CSV mapping KG nodes to their context text
SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
TEMPERATURE = config_data["LLM_TEMPERATURE"]  # LLM sampling temperature
SAVE_PATH = config_data["SAVE_RESULTS_PATH"]  # directory the results CSV is written to


# Deployment id mirrors the model id (kept as a separate name for utility
# functions that expect a deployment identifier).
CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID

# e.g. 'gpt-4' -> 'gpt_4_kg_rag_based_mcq_{mode}.csv'; {mode} is filled in at save time.
save_name = "_".join(CHAT_MODEL_ID.split("-"))+"_kg_rag_based_mcq_{mode}.csv"


# Retrieval resources shared by main(): vector store for node lookup,
# sentence-transformer for context scoring, and the node->context table.
vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
edge_evidence = False  # do not attach edge-level evidence to the retrieved context


# Experiment switch consumed by main(); stored as a string and compared as such.
MODE = "0"
### MODE 0: Original KG_RAG ###
### MODE 1: jsonlize the context from KG search ###
### MODE 2: Add the prior domain knowledge ###
### MODE 3: Combine MODE 1 & 2 ###
+
def main():
    """Answer every MCQ question in QUESTION_PATH with KG-RAG and save a results CSV.

    Reads the question CSV, retrieves KG context per question according to the
    module-level MODE switch, queries the LLM, and writes a CSV with columns
    (question, correct_answer, llm_answer) under SAVE_PATH. Questions that
    raise are recorded with the answer "Error" instead of aborting the run.
    """
    start_time = time.time()
    question_df = pd.read_csv(QUESTION_PATH)
    answer_list = []

    # total=len(question_df) instead of the hard-coded 306, so the progress
    # bar is correct for any input file.
    for index, row in tqdm(question_df.iterrows(), total=len(question_df)):
        try:
            question = row["text"]
            if MODE == "0":
                ### MODE 0: Original KG_RAG ###
                context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence, model_id=CHAT_MODEL_ID)
                enriched_prompt = "Context: "+ context + "\n" + "Question: "+ question
                output = get_Gemini_response(enriched_prompt, SYSTEM_PROMPT, temperature=TEMPERATURE)
            elif MODE == "1":
                ### MODE 1: jsonlize the context from KG search ###
                ### Please implement the first strategy here ###
                output = '...'
            elif MODE == "2":
                ### MODE 2: Add the prior domain knowledge ###
                ### Please implement the second strategy here ###
                output = '...'
            elif MODE == "3":
                ### MODE 3: Combine MODE 1 & 2 ###
                ### Please implement the third strategy here ###
                output = '...'
            else:
                # An unrecognized MODE previously fell through to an unbound
                # `output` (NameError). Raise explicitly; the handler below
                # still records the question as "Error", as before.
                raise ValueError(f"Unknown MODE: {MODE}")

            answer_list.append((question, row["correct_node"], output))
        except Exception as e:
            # Best-effort loop: log the failure and keep going so one bad
            # question does not lose the whole run.
            print("Error in processing question: ", row["text"])
            print("Error: ", e)
            answer_list.append((row["text"], row["correct_node"], "Error"))

    answer_df = pd.DataFrame(answer_list, columns=["question", "correct_answer", "llm_answer"])
    output_file = os.path.join(SAVE_PATH, f"{save_name}".format(mode=MODE),)
    answer_df.to_csv(output_file, index=False, header=True)
    print("Save the model outputs in ", output_file)
    print("Completed in {} min".format((time.time()-start_time)/60))
+
+
+
# Script entry point: run the full MCQ evaluation when executed directly.
if __name__ == "__main__":
    main()
+
+