Diffstat (limited to 'kg_rag/rag_based_generation/Llama/run_mcq_qa.py')
| -rw-r--r-- | kg_rag/rag_based_generation/Llama/run_mcq_qa.py | 61 |
1 file changed, 61 insertions, 0 deletions
diff --git a/kg_rag/rag_based_generation/Llama/run_mcq_qa.py b/kg_rag/rag_based_generation/Llama/run_mcq_qa.py
new file mode 100644
index 0000000..67ae43c
--- /dev/null
+++ b/kg_rag/rag_based_generation/Llama/run_mcq_qa.py
@@ -0,0 +1,61 @@
+'''
+This script takes the MCQ-style questions from a csv file and saves the results to another csv file.
+This script makes use of a Llama model.
+Before running this script, make sure to configure the filepaths in the config.yaml file.
+'''
+from tqdm import tqdm
+from langchain import PromptTemplate, LLMChain
+from kg_rag.utility import *
+
+
+QUESTION_PATH = config_data["MCQ_PATH"]
+SYSTEM_PROMPT = system_prompts["MCQ_QUESTION"]
+CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"])
+QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
+QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
+VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
+NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
+SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
+SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
+SAVE_PATH = config_data["SAVE_RESULTS_PATH"]
+MODEL_NAME = config_data["LLAMA_MODEL_NAME"]
+BRANCH_NAME = config_data["LLAMA_MODEL_BRANCH"]
+CACHE_DIR = config_data["LLM_CACHE_DIR"]
+
+save_name = "_".join(MODEL_NAME.split("/")[-1].split("-"))+"_kg_rag_based_mcq_from_monarch_and_robokop_response.csv"
+
+
+INSTRUCTION = "Context:\n\n{context} \n\nQuestion: {question}"
+
+vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
+embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
+node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
+edge_evidence = False
+
+
+
+def main():
+    start_time = time.time()
+    llm = llama_model(MODEL_NAME, BRANCH_NAME, CACHE_DIR)
+    template = get_prompt(INSTRUCTION, SYSTEM_PROMPT)
+    prompt = PromptTemplate(template=template, input_variables=["context", "question"])
+    llm_chain = LLMChain(prompt=prompt, llm=llm)
+    question_df = pd.read_csv(QUESTION_PATH)
+    answer_list = []
+    for index, row in tqdm(question_df.iterrows()):
+        question = row["text"]
+        context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence)
+        output = llm_chain.run(context=context, question=question)
+        answer_list.append((row["text"], row["correct_node"], output))
+    answer_df = pd.DataFrame(answer_list, columns=["question", "correct_answer", "llm_answer"])
+    answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True)
+    print("Completed in {} min".format((time.time()-start_time)/60))
+
+
+
+
+
+if __name__ == "__main__":
+    main()
+
+
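Two sketches follow for readers without the rest of the repo checked out. First, the template handed to PromptTemplate is built by get_prompt(INSTRUCTION, SYSTEM_PROMPT) from kg_rag.utility. A minimal sketch of what it plausibly returns, assuming the stock Llama-2 chat wrapping; the tag strings below are an assumption, not copied from the repo:

    # Assumption: get_prompt wraps system prompt + instruction in the
    # standard Llama-2 chat template. Not verified against kg_rag.utility.
    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    def sketch_get_prompt(instruction, system_prompt):
        # The {context} and {question} placeholders survive intact,
        # so langchain's PromptTemplate can fill them per question.
        return B_INST + B_SYS + system_prompt + E_SYS + instruction + E_INST

Second, retrieve_context (also from kg_rag.utility) selects the context block for each question. The sketch below only illustrates the general idea suggested by the script's config names, i.e. a similarity-percentile threshold combined with a minimum-similarity floor over sentence-embedding scores; every function and model name in it is hypothetical, not the repo's implementation:

    # Illustrative sketch only -- NOT kg_rag.utility.retrieve_context.
    import numpy as np
    from sentence_transformers import SentenceTransformer

    def sketch_retrieve_context(question, candidate_sentences,
                                percentile_threshold=75.0,
                                minimum_similarity=0.5,
                                context_volume=150):
        # Hypothetical model choice; the script reads its model name from
        # SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL in config.yaml.
        model = SentenceTransformer("all-MiniLM-L6-v2")
        q_emb = model.encode([question])[0]
        c_embs = model.encode(candidate_sentences)
        # Cosine similarity between the question and each candidate sentence.
        sims = c_embs @ q_emb / (np.linalg.norm(c_embs, axis=1) * np.linalg.norm(q_emb))
        # Keep sentences above both the percentile cutoff and the absolute floor,
        # best matches first, capped at context_volume sentences.
        cutoff = max(np.percentile(sims, percentile_threshold), minimum_similarity)
        ranked = sorted(zip(sims, candidate_sentences), key=lambda pair: -pair[0])
        selected = [s for sim, s in ranked if sim >= cutoff][:context_volume]
        return "\n".join(selected)

In the actual script the candidate context presumably comes from the Chroma vector store (node retrieval) together with node_context_df, with CONTEXT_VOLUME capping how much text is passed on to the Llama chain.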
