From 73c194f304f827b55081b15524479f82a1b7d94c Mon Sep 17 00:00:00 2001
From: maszhongming
Date: Tue, 16 Sep 2025 15:15:29 -0500
Subject: Initial commit

---
 config.yaml | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 config.yaml

(limited to 'config.yaml')

diff --git a/config.yaml b/config.yaml
new file mode 100644
index 0000000..2925d96
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,55 @@
+---
+
+# KG-RAG hyperparameters
+CONTEXT_VOLUME: 150
+QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD: 75
+QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY: 0.5
+SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL: 'sentence-transformers/all-MiniLM-L6-v2'
+SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL: 'pritamdeka/S-PubMedBert-MS-MARCO'
+
+# VectorDB hyperparameters
+VECTOR_DB_DISEASE_ENTITY_PATH: 'data/disease_with_relation_to_genes.pickle'
+VECTOR_DB_PATH: 'data/vectorDB/disease_nodes_db'
+VECTOR_DB_CHUNK_SIZE: 650
+VECTOR_DB_CHUNK_OVERLAP: 200
+VECTOR_DB_BATCH_SIZE: 200
+VECTOR_DB_SENTENCE_EMBEDDING_MODEL: 'sentence-transformers/all-MiniLM-L6-v2'
+
+# Path to the context file from the SPOKE KG
+NODE_CONTEXT_PATH: 'data/context_of_disease_which_has_relation_to_genes.csv'
+
+# Path to your GPT config ('.env') file; change it to wherever that file actually lives. NOTE(review): this was documented as assuming a $HOME location, but the value below is a bare relative filename - confirm how the loader resolves it.
+# The GPT '.env' file should define API_KEY and, optionally, API_VERSION and RESOURCE_ENDPOINT. Those parameters are not included in this YAML file.
+GPT_CONFIG_FILE: 'gpt_config.env'
+# API type: either 'azure' or 'open_ai'.
+GPT_API_TYPE: 'open_ai'
+
+# Llama model name (refer to Hugging Face for the exact name of the model version you want to use, and make sure you have permission to use that model)
+LLAMA_MODEL_NAME: 'meta-llama/Llama-2-13b-chat-hf'
+LLAMA_MODEL_BRANCH: 'main'
+
+# Directory for caching LLM model files (a model downloaded from Hugging Face is saved under this path)
+LLM_CACHE_DIR: 'llm_data/llm_models/huggingface'
+LLM_TEMPERATURE: 0
+
+# Path where results are saved
+SAVE_RESULTS_PATH: 'data/my_results'
+
+# File paths for the test questions
+MCQ_PATH: 'data/benchmark_data/mcq_questions.csv'
+TRUE_FALSE_PATH: 'data/benchmark_data/true_false_questions.csv'
+SINGLE_DISEASE_ENTITY_FILE: 'data/hyperparam_tuning_data/single_disease_entity_prompts.csv'
+TWO_DISEASE_ENTITY_FILE: 'data/hyperparam_tuning_data/two_disease_entity_prompts.csv'
+
+# SPOKE API parameters
+BASE_URI: 'https://spoke.rbvi.ucsf.edu'
+cutoff_Compound_max_phase: 3
+cutoff_Protein_source: ['SwissProt']
+cutoff_DaG_diseases_sources: ['knowledge', 'experiments']
+cutoff_DaG_textmining: 3
+cutoff_CtD_phase: 3
+cutoff_PiP_confidence: 0.7
+cutoff_ACTeG_level: ['Low', 'Medium', 'High']
+depth: 1
+cutoff_DpL_average_prevalence: 0.001
+
-- 
cgit v1.2.3