summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--readme.md118
-rw-r--r--requirements.txt13
-rw-r--r--run.py62
3 files changed, 193 insertions, 0 deletions
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..3b9b7a1
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,118 @@
+# 🌟 **LinearRAG**
+> *LinearRAG: Linear Graph Retrieval Augmented Generation on Large-scale Corpora*
+> A lightweight GraphRAG framework that eliminates LLM token cost during graph construction – making GraphRAG faster and more efficient than ever.
+
+<p align="center">
+ <a href="https://arxiv.org/abs/2510.10114" target="_blank">
+ <img src="https://img.shields.io/badge/Paper-Arxiv-red?logo=arxiv&style=flat-square" alt="arXiv:2506.08938">
+ </a>
+ <a href="https://huggingface.co/datasets/Zly0523/linear-rag/tree/main" target="_blank">
+ <img src="https://img.shields.io/badge/HuggingFace-Model-yellow?logo=huggingface&style=flat-square" alt="HuggingFace">
+ </a>
+ <a href="https://github.com/LuyaoZhuang/linear-rag" target="_blank">
+ <img src="https://img.shields.io/badge/GitHub-Project-181717?logo=github&style=flat-square" alt="GitHub">
+ </a>
+</p>
+
+---
+
+## 🚀 **Highlights**
+- ✅ **Fact-Level Faithfulness**: Eliminates unstable relation extraction, using only lightweight entity recognition to build the graph.
+- 🔥 **Zero Token Consumption**: Complete graph construction and retrieval without any LLM calls.
+- 📊 **Strong Results**: Outperforms previous RAG methods on widely-used benchmarks.
+
+<p align="center">
+ <img src="figure/main_figure.png" width="80%" alt="Framework Overview">
+</p>
+
+---
+
+## 🛠️ **Usage**
+
+### 1️⃣ Install Dependencies
+
+**Step 1: Install Python packages**
+
+```bash
+pip install -r requirements.txt
+```
+
+**Step 2: Download Spacy language model**
+
+```bash
+python -m spacy download en_core_web_trf
+```
+
+> **Note:** For the `medical` dataset, you need to install the scientific/biomedical Spacy model:
+```bash
+pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_scibert-0.5.3.tar.gz
+```
+
+**Step 3: Set up your OpenAI API key**
+
+```bash
+export OPENAI_API_KEY="your-api-key-here"
+export OPENAI_BASE_URL="your-base-url-here"
+```
+
+**Step 4: Download Datasets**
+
+Download the datasets from HuggingFace and place them in the `dataset/` folder:
+
+```bash
+git clone https://huggingface.co/datasets/Zly0523/linear-rag
+cp -r linear-rag/dataset/* dataset/
+```
+
+**Step 5: Prepare Embedding Model**
+
+Make sure the embedding model is available at:
+
+```
+model/all-mpnet-base-v2/
+```
+
+### 2️⃣ Quick Start Example
+
+```bash
+SPACY_MODEL="en_core_web_trf"
+EMBEDDING_MODEL="model/bge-large-en-v1.5"
+DATASET_NAME="2wikimultihop"
+LLM_MODEL="gpt-4o-mini"
+MAX_WORKERS=16
+
+python run.py \
+ --spacy_model ${SPACY_MODEL} \
+ --embedding_model ${EMBEDDING_MODEL} \
+ --dataset_name ${DATASET_NAME} \
+ --llm_model ${LLM_MODEL} \
+ --max_workers ${MAX_WORKERS}
+```
+
+## 🎯 **Performance**
+
+<div align="center">
+<img src="figure/generation_results.png" alt="framework" width="1000">
+
+**Main results of end-to-end performance**
+</div>
+<div align="center">
+<img src="figure/efficiency_result.png" alt="framework" width="1000">
+
+**Efficiency and performance comparison.**
+</div>
+
+
+## 📖 Citation
+
+If you find this work helpful, please consider citing us:
+```bibtex
+@article{zhuang2025linearrag,
+ title={LinearRAG: Linear Graph Retrieval Augmented Generation on Large-scale Corpora},
+ author={Zhuang, Luyao and Chen, Shengyuan and Xiao, Yilin and Zhou, Huachi and Zhang, Yujing and Chen, Hao and Zhang, Qinggang and Huang, Xiao},
+ journal={arXiv preprint arXiv:2510.10114},
+ year={2025}
+}
+```
+## 📬 Contact
+✉️ Email: zhuangluyao523@gmail.com \ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..da4c82d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,13 @@
+httpx[socks]==0.28.1
+numpy==1.21.0
+openai==1.54.5
+pandas==1.3.0
+python-igraph==0.11.8
+scikit-learn==1.3.2
+sentence-transformers==2.2.2
+spacy==3.6.1
+tqdm==4.67.1
+transformers==4.30.2
+huggingface-hub==0.16.4
+pyarrow==12.0.1
+
diff --git a/run.py b/run.py
new file mode 100644
index 0000000..bed9f1a
--- /dev/null
+++ b/run.py
@@ -0,0 +1,62 @@
import argparse
import json
# NOTE(review): AutoTokenizer/AutoModel appear unused anywhere in this file —
# candidates for removal once confirmed against the rest of the project.
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from src.config import LinearRAGConfig
from src.LinearRAG import LinearRAG
import os
import warnings
from src.evaluate import Evaluator
from src.utils import LLM_Model
from src.utils import setup_logging
# Pin this process to physical GPU 5.
# NOTE(review): hard-coding a device index is machine-specific; consider
# os.environ.setdefault(...) so an externally set CUDA_VISIBLE_DEVICES wins.
os.environ["CUDA_VISIBLE_DEVICES"] = "5"
# Prevent HuggingFace tokenizer fork warnings/deadlocks when workers are used.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Suppress library warnings to keep run logs readable.
warnings.filterwarnings('ignore')
+
def parse_arguments():
    """Parse command-line options for model choices, dataset, and parallelism."""
    parser = argparse.ArgumentParser()
    # Table-driven registration: (flag, type, default, help).
    option_specs = [
        ("--spacy_model", str, "en_core_sci_scibert", "The spacy model to use"),
        ("--embedding_model", str, "model/all-mpnet-base-v2", "The path of embedding model to use"),
        ("--dataset_name", str, "medical", "The dataset to use"),
        ("--llm_model", str, "gpt-4o-mini", "The LLM model to use"),
        ("--max_workers", int, 16, "The max number of workers to use"),
    ]
    for flag, value_type, default, help_text in option_specs:
        parser.add_argument(flag, type=value_type, default=default, help=help_text)
    return parser.parse_args()
+
+
def load_dataset(dataset_name, tokenizer=None):
    """Load the questions and passage chunks for a dataset.

    Args:
        dataset_name: Name of a folder under ``dataset/`` that contains
            ``questions.json`` and ``chunks.json``.
        tokenizer: Unused. Made optional (default ``None``) because the
            caller in ``main`` invokes this with a single argument, which
            raised a TypeError under the original required signature.

    Returns:
        Tuple ``(questions, passages)``; each passage is the chunk text
        prefixed with its index as ``"{idx}:{chunk}"``.
    """
    questions_path = f"dataset/{dataset_name}/questions.json"
    with open(questions_path, "r", encoding="utf-8") as f:
        questions = json.load(f)
    chunks_path = f"dataset/{dataset_name}/chunks.json"
    with open(chunks_path, "r", encoding="utf-8") as f:
        chunks = json.load(f)
    passages = [f'{idx}:{chunk}' for idx, chunk in enumerate(chunks)]
    return questions, passages
+
def load_embedding_model(embedding_model, device="cuda"):
    """Load a SentenceTransformer embedding model.

    Args:
        embedding_model: Path or name of the sentence-transformers model.
        device: Torch device string. Defaults to ``"cuda"`` (the original
            hard-coded value) but can now be overridden — e.g. ``"cpu"`` —
            so the script no longer crashes on machines without a GPU.

    Returns:
        The loaded ``SentenceTransformer`` instance.
    """
    # Use a distinct local name instead of shadowing the parameter.
    model = SentenceTransformer(embedding_model, device=device)
    return model
+
def main():
    """End-to-end LinearRAG pipeline: index the corpus, answer questions, evaluate.

    Side effects: creates ``results/<dataset>/`` and writes ``log.txt`` and
    ``predictions.json`` into it.
    """
    args = parse_arguments()
    embedding_model = load_embedding_model(args.embedding_model)
    # BUG FIX: load_dataset declares a (unused) required `tokenizer` parameter;
    # the original call omitted it and raised TypeError. Pass None explicitly.
    questions, passages = load_dataset(args.dataset_name, None)
    results_dir = f"results/{args.dataset_name}"
    # BUG FIX: create the results directory *before* logging writes into it;
    # the original only created it right before dumping predictions.
    os.makedirs(results_dir, exist_ok=True)
    setup_logging(f"results/{args.dataset_name}/log.txt")
    llm_model = LLM_Model(args.llm_model)
    config = LinearRAGConfig(
        dataset_name=args.dataset_name,
        embedding_model=embedding_model,
        spacy_model=args.spacy_model,
        max_workers=args.max_workers,
        llm_model=llm_model,
    )
    rag_model = LinearRAG(global_config=config)
    rag_model.index(passages)
    questions = rag_model.qa(questions)
    predictions_path = f"results/{args.dataset_name}/predictions.json"
    with open(predictions_path, "w", encoding="utf-8") as f:
        json.dump(questions, f, ensure_ascii=False, indent=4)
    evaluator = Evaluator(llm_model=llm_model, predictions_path=predictions_path)
    evaluator.evaluate(max_workers=args.max_workers)


if __name__ == "__main__":
    main()