summaryrefslogtreecommitdiff
path: root/scripts/prepare_corpus.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/prepare_corpus.py')
-rw-r--r--scripts/prepare_corpus.py127
1 files changed, 127 insertions, 0 deletions
diff --git a/scripts/prepare_corpus.py b/scripts/prepare_corpus.py
new file mode 100644
index 0000000..93fc0ce
--- /dev/null
+++ b/scripts/prepare_corpus.py
@@ -0,0 +1,127 @@
+"""Convert linear-rag chunks.json to JSONL corpus for build_memory_bank.py.
+
+The linear-rag dataset stores chunks as a list of strings with format "idx:text...".
+This script strips the index prefix and outputs one {"text": "..."} per line.
+
+Usage:
+ python scripts/prepare_corpus.py --dataset hotpotqa
+ python scripts/prepare_corpus.py --dataset hotpotqa --dataset musique --dataset 2wikimultihop
+"""
+
+import argparse
+import json
+import logging
+from pathlib import Path
+
# Configure root logging once so the per-dataset progress lines are visible
# when the script is run directly.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Dataset subdirectories expected under the linear-rag clone (see --data-root).
DATASETS = ["hotpotqa", "musique", "2wikimultihop", "medical"]
+
+
def convert_chunks(dataset: str, data_root: Path, output_dir: Path) -> Path:
    """Convert a single dataset's chunks.json to corpus JSONL.

    Each entry in chunks.json is a string of the form "idx:text"; the index
    prefix is stripped and one {"text": ...} JSON object is written per line.
    Chunks that are empty after stripping are skipped.

    Args:
        dataset: dataset name (e.g., "hotpotqa")
        data_root: path to linear-rag clone
        output_dir: directory to write output JSONL (created if missing)

    Returns:
        Path to the output JSONL file.

    Raises:
        FileNotFoundError: if <data_root>/<dataset>/chunks.json does not exist.
    """
    chunks_path = data_root / dataset / "chunks.json"
    if not chunks_path.exists():
        raise FileNotFoundError(f"Not found: {chunks_path}")

    # Explicit UTF-8: corpus text may contain non-ASCII characters, and the
    # platform default encoding (e.g. cp1252 on Windows) would corrupt or
    # reject them.
    with open(chunks_path, encoding="utf-8") as f:
        chunks = json.load(f)

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{dataset}_corpus.jsonl"

    count = 0
    with open(output_path, "w", encoding="utf-8") as out:
        for chunk in chunks:
            # Strip the leading "idx:" prefix; split at most once so colons
            # inside the text survive. Chunks without a colon pass through
            # unchanged (defensive against unprefixed entries).
            text = chunk.split(":", 1)[1] if ":" in chunk else chunk
            text = text.strip()
            if text:
                out.write(json.dumps({"text": text}) + "\n")
                count += 1

    logger.info("%s: %d chunks -> %s", dataset, count, output_path)
    return output_path
+
+
def convert_questions(dataset: str, data_root: Path, output_dir: Path) -> Path:
    """Convert questions.json to a standardized JSONL format.

    Each question becomes one JSON object per line with the keys
    "id", "question", "answer", and "question_type"; "id" and
    "question_type" default to "" when absent from the source record.

    Args:
        dataset: dataset name
        data_root: path to linear-rag clone
        output_dir: directory to write output JSONL (created if missing)

    Returns:
        Path to the output JSONL file.

    Raises:
        FileNotFoundError: if <data_root>/<dataset>/questions.json does not exist.
        KeyError: if a record lacks the required "question" or "answer" field.
    """
    questions_path = data_root / dataset / "questions.json"
    if not questions_path.exists():
        raise FileNotFoundError(f"Not found: {questions_path}")

    # Explicit UTF-8: question/answer text may contain non-ASCII characters,
    # and the platform default encoding (e.g. cp1252 on Windows) would
    # corrupt or reject them.
    with open(questions_path, encoding="utf-8") as f:
        questions = json.load(f)

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{dataset}_questions.jsonl"

    count = 0
    with open(output_path, "w", encoding="utf-8") as out:
        for q in questions:
            record = {
                "id": q.get("id", ""),
                # "question" and "answer" are required; a KeyError here means
                # the source data is malformed and should fail loudly.
                "question": q["question"],
                "answer": q["answer"],
                "question_type": q.get("question_type", ""),
            }
            out.write(json.dumps(record) + "\n")
            count += 1

    logger.info("%s: %d questions -> %s", dataset, count, output_path)
    return output_path
+
+
def main() -> None:
    """CLI entry point: convert chunks and questions for selected datasets.

    With no --dataset flags, all datasets in DATASETS are processed.
    """
    parser = argparse.ArgumentParser(description="Prepare linear-rag data for HAG")
    parser.add_argument(
        "--dataset",
        type=str,
        action="append",
        # Enforce the advertised choices at parse time; previously a typo'd
        # name only surfaced later as a FileNotFoundError.
        choices=DATASETS,
        default=None,
        help=f"Dataset(s) to process. Choices: {DATASETS}. Can specify multiple times.",
    )
    parser.add_argument(
        "--data-root",
        type=str,
        default="data/linear-rag",
        help="Path to linear-rag clone",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/processed",
        help="Output directory for processed files",
    )
    args = parser.parse_args()

    # action="append" leaves the default as None when the flag is never
    # given; fall back to processing every known dataset.
    datasets = args.dataset if args.dataset else DATASETS
    data_root = Path(args.data_root)
    output_dir = Path(args.output_dir)

    for ds in datasets:
        convert_chunks(ds, data_root, output_dir)
        convert_questions(ds, data_root, output_dir)
+
+
+if __name__ == "__main__":
+ main()