summaryrefslogtreecommitdiff
path: root/scripts/prepare_corpus.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/prepare_corpus.py')
-rw-r--r--scripts/prepare_corpus.py127
1 files changed, 127 insertions, 0 deletions
diff --git a/scripts/prepare_corpus.py b/scripts/prepare_corpus.py
new file mode 100644
index 0000000..93fc0ce
--- /dev/null
+++ b/scripts/prepare_corpus.py
@@ -0,0 +1,127 @@
+"""Convert linear-rag chunks.json to JSONL corpus for build_memory_bank.py.
+
+The linear-rag dataset stores chunks as a list of strings with format "idx:text...".
+This script strips the index prefix and outputs one {"text": "..."} per line.
+
+Usage:
+ python scripts/prepare_corpus.py --dataset hotpotqa
+ python scripts/prepare_corpus.py --dataset hotpotqa --dataset musique --dataset 2wikimultihop
+"""
+
+import argparse
+import json
+import logging
+from pathlib import Path
+
# Configure root logging once so the per-dataset progress lines are visible
# when the script is run directly.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Dataset subdirectories expected under the linear-rag clone (see --data-root).
DATASETS = ["hotpotqa", "musique", "2wikimultihop", "medical"]
+
+
def convert_chunks(dataset: str, data_root: Path, output_dir: Path) -> Path:
    """Convert a single dataset's chunks.json to corpus JSONL.

    Each entry in chunks.json is a string of the form "idx:text"; the index
    prefix is stripped and one {"text": ...} JSON object is written per line.
    Chunks that are empty after stripping are skipped.

    Args:
        dataset: dataset name (e.g., "hotpotqa")
        data_root: path to linear-rag clone
        output_dir: directory to write output JSONL (created if missing)

    Returns:
        Path to the output JSONL file.

    Raises:
        FileNotFoundError: if <data_root>/<dataset>/chunks.json does not exist.
    """
    chunks_path = data_root / dataset / "chunks.json"
    if not chunks_path.exists():
        raise FileNotFoundError(f"Not found: {chunks_path}")

    # Explicit UTF-8: corpus text may contain non-ASCII characters, and the
    # platform default encoding (e.g. cp1252 on Windows) would corrupt or
    # reject them.
    with open(chunks_path, encoding="utf-8") as f:
        chunks = json.load(f)

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{dataset}_corpus.jsonl"

    count = 0
    with open(output_path, "w", encoding="utf-8") as out:
        for chunk in chunks:
            # Strip the leading "idx:" prefix; split at most once so colons
            # inside the text survive. Chunks without a colon pass through
            # unchanged (defensive against unprefixed entries).
            text = chunk.split(":", 1)[1] if ":" in chunk else chunk
            text = text.strip()
            if text:
                out.write(json.dumps({"text": text}) + "\n")
                count += 1

    logger.info("%s: %d chunks -> %s", dataset, count, output_path)
    return output_path
+
+
def convert_questions(dataset: str, data_root: Path, output_dir: Path) -> Path:
    """Convert questions.json to a standardized JSONL format.

    Each question becomes one JSON object per line with the keys
    "id", "question", "answer", and "question_type"; "id" and
    "question_type" default to "" when absent from the source record.

    Args:
        dataset: dataset name
        data_root: path to linear-rag clone
        output_dir: directory to write output JSONL (created if missing)

    Returns:
        Path to the output JSONL file.

    Raises:
        FileNotFoundError: if <data_root>/<dataset>/questions.json does not exist.
        KeyError: if a record lacks the required "question" or "answer" field.
    """
    questions_path = data_root / dataset / "questions.json"
    if not questions_path.exists():
        raise FileNotFoundError(f"Not found: {questions_path}")

    # Explicit UTF-8: question/answer text may contain non-ASCII characters,
    # and the platform default encoding (e.g. cp1252 on Windows) would
    # corrupt or reject them.
    with open(questions_path, encoding="utf-8") as f:
        questions = json.load(f)

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{dataset}_questions.jsonl"

    count = 0
    with open(output_path, "w", encoding="utf-8") as out:
        for q in questions:
            record = {
                "id": q.get("id", ""),
                # "question" and "answer" are required; a KeyError here means
                # the source data is malformed and should fail loudly.
                "question": q["question"],
                "answer": q["answer"],
                "question_type": q.get("question_type", ""),
            }
            out.write(json.dumps(record) + "\n")
            count += 1

    logger.info("%s: %d questions -> %s", dataset, count, output_path)
    return output_path
+
+
def main() -> None:
    """CLI entry point: convert chunks and questions for selected datasets.

    With no --dataset flags, all datasets in DATASETS are processed.
    """
    parser = argparse.ArgumentParser(description="Prepare linear-rag data for HAG")
    parser.add_argument(
        "--dataset",
        type=str,
        action="append",
        # Enforce the advertised choices at parse time; previously a typo'd
        # name only surfaced later as a FileNotFoundError.
        choices=DATASETS,
        default=None,
        help=f"Dataset(s) to process. Choices: {DATASETS}. Can specify multiple times.",
    )
    parser.add_argument(
        "--data-root",
        type=str,
        default="data/linear-rag",
        help="Path to linear-rag clone",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/processed",
        help="Output directory for processed files",
    )
    args = parser.parse_args()

    # action="append" leaves the default as None when the flag is never
    # given; fall back to processing every known dataset.
    datasets = args.dataset if args.dataset else DATASETS
    data_root = Path(args.data_root)
    output_dir = Path(args.output_dir)

    for ds in datasets:
        convert_chunks(ds, data_root, output_dir)
        convert_questions(ds, data_root, output_dir)
+
+
+if __name__ == "__main__":
+ main()