From 2fb71a4d2cccfe69c58e49940b75f77b7b84a2c7 Mon Sep 17 00:00:00 2001
From: LuyaoZhuang
Date: Sun, 26 Oct 2025 05:11:12 -0400
Subject: commit

---
 readme.md        | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  13 ++++++
 run.py           |  62 ++++++++++++++++++++++++++++++
 3 files changed, 193 insertions(+)
 create mode 100644 readme.md
 create mode 100644 requirements.txt
 create mode 100644 run.py

diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..3b9b7a1
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,118 @@
+# 🌟 **LinearRAG**
+> *LinearRAG: Linear Graph Retrieval Augmented Generation on Large-scale Corpora*
+> A lightweight GraphRAG framework that eliminates LLM token cost during graph construction, making GraphRAG faster and more efficient than ever.
+
+<div align="center">
+
+<a href="https://arxiv.org/abs/2510.10114">arXiv:2510.10114</a> ·
+<a href="https://huggingface.co/datasets/Zly0523/linear-rag">HuggingFace</a> ·
+GitHub
+
+</div>
+
+---
+
+## 🚀 **Highlights**
+- ✅ **Fact-Level Faithfulness**: Eliminates unstable relation extraction, using only lightweight entity recognition to build the graph (see the sketch below).
+- 🔥 **Zero Token Consumption**: Completes graph construction and retrieval without any LLM calls.
+- 📊 **Strong Results**: Outperforms previous RAG methods on widely-used benchmarks.
+
+<div align="center">
+
+*Framework Overview*
+
+</div>
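+
+As a quick illustration of the kind of lightweight entity recognition the graph is built
+on (plain Spacy NER with the models installed in the Usage section below; this is not
+LinearRAG's internal code):
+
+```python
+import spacy
+
+# en_core_web_trf is the general-purpose model from the Quick Start;
+# the medical dataset uses en_core_sci_scibert instead.
+nlp = spacy.load("en_core_web_trf")
+doc = nlp("Radio City is India's first private FM radio station, launched in July 2001.")
+
+# Entity mentions like these seed the graph; no LLM call is involved.
+print([(ent.text, ent.label_) for ent in doc.ents])
+```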
+
+---
+
+## 🛠️ **Usage**
+
+### 1️⃣ Install Dependencies
+
+**Step 1: Install Python packages**
+
+```bash
+pip install -r requirements.txt
+```
+
+**Step 2: Download Spacy language model**
+
+```bash
+python -m spacy download en_core_web_trf
+```
+
+> **Note:** For the `medical` dataset, you need the scientific/biomedical Spacy model instead:
+
+```bash
+pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_scibert-0.5.3.tar.gz
+```
+
+**Step 3: Set up your OpenAI API key**
+
+```bash
+export OPENAI_API_KEY="your-api-key-here"
+export OPENAI_BASE_URL="your-base-url-here"
+```
+
+**Step 4: Download Datasets**
+
+Download the datasets from HuggingFace and place them in the `dataset/` folder; `run.py` expects a `questions.json` and a `chunks.json` under `dataset/<dataset_name>/`:
+
+```bash
+git clone https://huggingface.co/datasets/Zly0523/linear-rag
+cp -r linear-rag/dataset/* dataset/
+```
+
+**Step 5: Prepare Embedding Model**
+
+Make sure the embedding model is available at:
+
+```
+model/all-mpnet-base-v2/
+```
+
+### 2️⃣ Quick Start Example
+
+```bash
+SPACY_MODEL="en_core_web_trf"
+EMBEDDING_MODEL="model/all-mpnet-base-v2"
+DATASET_NAME="2wikimultihop"
+LLM_MODEL="gpt-4o-mini"
+MAX_WORKERS=16
+
+python run.py \
+    --spacy_model ${SPACY_MODEL} \
+    --embedding_model ${EMBEDDING_MODEL} \
+    --dataset_name ${DATASET_NAME} \
+    --llm_model ${LLM_MODEL} \
+    --max_workers ${MAX_WORKERS}
+```
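+
+The same pipeline can also be driven from Python. Below is a minimal sketch that mirrors
+`run.py` with toy inputs; `LinearRAGConfig`, `LinearRAG`, and `LLM_Model` are used exactly
+as in `run.py`, while the `questions.json` record layout shown here is an assumption:
+
+```python
+from sentence_transformers import SentenceTransformer
+from src.config import LinearRAGConfig
+from src.LinearRAG import LinearRAG
+from src.utils import LLM_Model
+
+# Passages follow run.py's "idx:chunk" format; normally they come from chunks.json.
+passages = ["0:Radio City is India's first private FM radio station."]
+questions = [{"question": "What is Radio City?"}]  # assumed record layout
+
+config = LinearRAGConfig(
+    dataset_name="2wikimultihop",
+    embedding_model=SentenceTransformer("model/all-mpnet-base-v2", device="cuda"),  # GPU assumed, as in run.py
+    spacy_model="en_core_web_trf",
+    max_workers=16,
+    llm_model=LLM_Model("gpt-4o-mini"),
+)
+
+rag = LinearRAG(global_config=config)
+rag.index(passages)          # builds the entity graph; no LLM tokens are spent here
+questions = rag.qa(questions)  # returns the question records with predicted answers attached
+```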
+
+## 🎯 **Performance**
+
+<div align="center">
+
+**Main results of end-to-end performance**
+
+</div>
+
+<div align="center">
+
+**Efficiency and performance comparison**
+
+</div>
+
+
+## 📖 Citation
+
+If you find this work helpful, please consider citing us:
+```bibtex
+@article{zhuang2025linearrag,
+  title={LinearRAG: Linear Graph Retrieval Augmented Generation on Large-scale Corpora},
+  author={Zhuang, Luyao and Chen, Shengyuan and Xiao, Yilin and Zhou, Huachi and Zhang, Yujing and Chen, Hao and Zhang, Qinggang and Huang, Xiao},
+  journal={arXiv preprint arXiv:2510.10114},
+  year={2025}
+}
+```
+
+## 📬 Contact
+✉️ Email: zhuangluyao523@gmail.com
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..da4c82d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,13 @@
+httpx[socks]==0.28.1
+numpy==1.21.0
+openai==1.54.5
+pandas==1.3.0
+python-igraph==0.11.8
+scikit-learn==1.3.2
+sentence-transformers==2.2.2
+spacy==3.6.1
+tqdm==4.67.1
+transformers==4.30.2
+huggingface-hub==0.16.4
+pyarrow==12.0.1
+
diff --git a/run.py b/run.py
new file mode 100644
index 0000000..bed9f1a
--- /dev/null
+++ b/run.py
@@ -0,0 +1,62 @@
+import argparse
+import json
+from sentence_transformers import SentenceTransformer
+from src.config import LinearRAGConfig
+from src.LinearRAG import LinearRAG
+import os
+import warnings
+from src.evaluate import Evaluator
+from src.utils import LLM_Model
+from src.utils import setup_logging
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "5"  # select which GPU to use; adjust for your machine
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+warnings.filterwarnings('ignore')

+def parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--spacy_model", type=str, default="en_core_sci_scibert", help="The spacy model to use")
+    parser.add_argument("--embedding_model", type=str, default="model/all-mpnet-base-v2", help="The path of the embedding model to use")
+    parser.add_argument("--dataset_name", type=str, default="medical", help="The dataset to use")
+    parser.add_argument("--llm_model", type=str, default="gpt-4o-mini", help="The LLM model to use")
+    parser.add_argument("--max_workers", type=int, default=16, help="The max number of workers to use")
+    return parser.parse_args()
+
+
+def load_dataset(dataset_name):
+    questions_path = f"dataset/{dataset_name}/questions.json"
+    with open(questions_path, "r", encoding="utf-8") as f:
+        questions = json.load(f)
+    chunks_path = f"dataset/{dataset_name}/chunks.json"
+    with open(chunks_path, "r", encoding="utf-8") as f:
+        chunks = json.load(f)
+    # Prefix each chunk with its index so every passage carries a stable id.
+    passages = [f'{idx}:{chunk}' for idx, chunk in enumerate(chunks)]
+    return questions, passages
+
+def load_embedding_model(embedding_model):
+    embedding_model = SentenceTransformer(embedding_model, device="cuda")
+    return embedding_model
+
+def main():
+    args = parse_arguments()
+    embedding_model = load_embedding_model(args.embedding_model)
+    questions, passages = load_dataset(args.dataset_name)
+    setup_logging(f"results/{args.dataset_name}/log.txt")
+    llm_model = LLM_Model(args.llm_model)
+    config = LinearRAGConfig(
+        dataset_name=args.dataset_name,
+        embedding_model=embedding_model,
+        spacy_model=args.spacy_model,
+        max_workers=args.max_workers,
+        llm_model=llm_model
+    )
+    rag_model = LinearRAG(global_config=config)
+    rag_model.index(passages)
+    questions = rag_model.qa(questions)
+    os.makedirs(f"results/{args.dataset_name}", exist_ok=True)
+    with open(f"results/{args.dataset_name}/predictions.json", "w", encoding="utf-8") as f:
+        json.dump(questions, f, ensure_ascii=False, indent=4)
+    evaluator = Evaluator(llm_model=llm_model, predictions_path=f"results/{args.dataset_name}/predictions.json")
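+    # Grade the saved predictions with the LLM-backed Evaluator from src.evaluate;
+    # max_workers controls how many evaluation calls run in parallel.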
+    evaluator.evaluate(max_workers=args.max_workers)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
-- 
cgit v1.2.3