'''
This script takes the MCQ-style questions from the CSV file and saves the results as another CSV file.
Before running this script, make sure to configure the filepaths in the config.yaml file.
The command line argument should be either 'gpt-4' or 'gpt-35-turbo'.
'''
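# Example invocation (the filename below is illustrative; substitute this file's actual name):
#   python run_mcq_qa.py gpt-4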
import os
import sys
import time

import pandas as pd
from tqdm import tqdm

from kg_rag.utility import *  # provides config_data, system_prompts, load_chroma, retrieve_context, get_Gemini_response, etc.
if len(sys.argv) < 2:
    sys.exit("Usage: pass the chat model id ('gpt-4' or 'gpt-35-turbo') as the first argument.")
CHAT_MODEL_ID = sys.argv[1]
QUESTION_PATH = config_data["MCQ_PATH"]
SYSTEM_PROMPT = system_prompts["MCQ_QUESTION"]
CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"])
QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
TEMPERATURE = config_data["LLM_TEMPERATURE"]
SAVE_PATH = config_data["SAVE_RESULTS_PATH"]
CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID
save_name = CHAT_MODEL_ID.replace("-", "_") + "_kg_rag_based_mcq_{mode}.csv"
# Load the vector store used for node retrieval, the sentence-embedding model
# used for context retrieval, and the per-node context table.
vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
edge_evidence = False
### MODE 0: Original KG_RAG ###
### MODE 1: jsonlize the context from KG search ###
### MODE 2: Add the prior domain knowledge ###
### MODE 3: Combine MODE 1 & 2 ###
MODE = "0"
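### --- Illustrative sketches (assumptions, not the assigned solutions) --- ###
### `jsonlize_context` shows one way MODE 1 *might* structure the KG context,
### and PRIOR_KNOWLEDGE_PREAMBLE is a hypothetical stand-in for the domain
### knowledge MODE 2 is meant to inject; adapt or replace both as needed.
import json

def jsonlize_context(context):
    """Sketch for MODE 1: represent the retrieved context as a JSON list of
    sentences instead of one free-text blob."""
    sentences = [s.strip() for s in context.split(". ") if s.strip()]
    return json.dumps({"context_sentences": sentences})
# e.g. enriched_prompt = "Context: " + jsonlize_context(context) + "\n" + "Question: " + question

# Hypothetical preamble for MODE 2; the real content should come from the
# course material or a domain expert, not from this sketch.
PRIOR_KNOWLEDGE_PREAMBLE = (
    "You are a biomedical expert. Prefer the disease/gene/drug associations "
    "given in the context, and fall back on established biomedical knowledge "
    "only when the context is insufficient.\n"
)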
def main():
    start_time = time.time()
    question_df = pd.read_csv(QUESTION_PATH)
    answer_list = []
    for _, row in tqdm(question_df.iterrows(), total=len(question_df)):
        try:
            question = row["text"]
            if MODE == "0":
                ### MODE 0: Original KG_RAG ###
                context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence, model_id=CHAT_MODEL_ID)
                enriched_prompt = "Context: " + context + "\n" + "Question: " + question
                output = get_Gemini_response(enriched_prompt, SYSTEM_PROMPT, temperature=TEMPERATURE)
            elif MODE == "1":
                ### MODE 1: jsonlize the context from KG search ###
                ### Please implement the first strategy here ###
                ### (the jsonlize_context sketch above is one possible starting point) ###
                output = '...'
            elif MODE == "2":
                ### MODE 2: Add the prior domain knowledge ###
                ### Please implement the second strategy here ###
                ### (the PRIOR_KNOWLEDGE_PREAMBLE sketch above is one possible starting point) ###
                output = '...'
            elif MODE == "3":
                ### MODE 3: Combine MODE 1 & 2 ###
                ### Please implement the third strategy here ###
                output = '...'
            answer_list.append((question, row["correct_node"], output))
        except Exception as e:
            print("Error in processing question: ", row["text"])
            print("Error: ", e)
            answer_list.append((row["text"], row["correct_node"], "Error"))
    answer_df = pd.DataFrame(answer_list, columns=["question", "correct_answer", "llm_answer"])
    output_file = os.path.join(SAVE_PATH, save_name.format(mode=MODE))
    answer_df.to_csv(output_file, index=False, header=True)
    print("Saved the model outputs in", output_file)
    print("Completed in {:.2f} min".format((time.time() - start_time) / 60))
if __name__ == "__main__":
    main()