'''
This script takes the MCQ style questions from the csv file and save the result as another csv file.
Before running this script, make sure to configure the filepaths in config.yaml file.
Command line argument should be either 'gpt-4' or 'gpt-35-turbo'
'''

from kg_rag.utility import *
import sys

from tqdm import tqdm

# NOTE(review): the docstring says 'gpt-4' / 'gpt-35-turbo', but answers are
# produced via get_Gemini_response() below — confirm which backend is intended.
CHAT_MODEL_ID = sys.argv[1]

# --- Paths, prompts and retrieval hyper-parameters from config.yaml ---------
QUESTION_PATH = config_data["MCQ_PATH"]
SYSTEM_PROMPT = system_prompts["MCQ_QUESTION"]
CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"])
QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
TEMPERATURE = config_data["LLM_TEMPERATURE"]
SAVE_PATH = config_data["SAVE_RESULTS_PATH"]

# Deployment id mirrors the model id (Azure-style naming kept for parity).
CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID

# Output file name template; "{mode}" stays a literal placeholder here and is
# filled in by main() once MODE is known.
save_name = CHAT_MODEL_ID.replace("-", "_") + "_kg_rag_based_mcq_{mode}.csv"

# Vector store used to locate graph nodes relevant to each question.
vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
edge_evidence = False

# Context-construction strategy:
#   "0": original KG_RAG retrieval (implemented)
#   "1": jsonlize the context from KG search (placeholder)
#   "2": add the prior domain knowledge (placeholder)
#   "3": combine MODE 1 & 2 (placeholder)
MODE = "0"


def main():
    """Answer every MCQ in QUESTION_PATH and save the results as a CSV.

    For each question row, builds a context-enriched prompt according to
    MODE, queries the LLM, and appends (question, correct_answer,
    llm_answer) to the output. Per-question failures are logged and
    recorded as "Error" so one bad row cannot abort the whole run.
    The result is written to SAVE_PATH using the save_name template.
    """
    start_time = time.time()
    question_df = pd.read_csv(QUESTION_PATH)
    answer_list = []

    # Fix: total was hard-coded to 306; use the real row count so the
    # progress bar is accurate for any question file.
    for index, row in tqdm(question_df.iterrows(), total=len(question_df)):
        try:
            question = row["text"]
            if MODE == "0":
                # Original KG_RAG: retrieve graph context and prepend it
                # to the question before querying the model.
                context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence, model_id=CHAT_MODEL_ID)
                enriched_prompt = "Context: " + context + "\n" + "Question: " + question
                output = get_Gemini_response(enriched_prompt, SYSTEM_PROMPT, temperature=TEMPERATURE)
            elif MODE == "1":
                # TODO: jsonlize the context from KG search (first strategy).
                output = '...'
            elif MODE == "2":
                # TODO: add the prior domain knowledge (second strategy).
                output = '...'
            elif MODE == "3":
                # TODO: combine MODE 1 & 2 (third strategy).
                output = '...'
            else:
                # Fix: previously an unknown MODE left `output` unbound and the
                # resulting NameError was silently swallowed below; fail loudly.
                raise ValueError(f"Unknown MODE: {MODE}")

            answer_list.append((question, row["correct_node"], output))
        except Exception as e:
            print("Error in processing question: ", row["text"])
            print("Error: ", e)
            answer_list.append((row["text"], row["correct_node"], "Error"))

    answer_df = pd.DataFrame(answer_list, columns=["question", "correct_answer", "llm_answer"])
    # Fix: the f-string wrapper around save_name had no placeholders; plain
    # .format fills the literal "{mode}" slot in the template.
    output_file = os.path.join(SAVE_PATH, save_name.format(mode=MODE))
    answer_df.to_csv(output_file, index=False, header=True)
    print("Save the model outputs in ", output_file)
    print("Completed in {} min".format((time.time() - start_time) / 60))


if __name__ == "__main__":
    main()