"""Convert linear-rag chunks.json to JSONL corpus for build_memory_bank.py.

The linear-rag dataset stores chunks as a list of strings with format
"idx:text...". This script strips the numeric index prefix and outputs one
{"text": "..."} JSON object per line.

Usage:
    python scripts/prepare_corpus.py --dataset hotpotqa
    python scripts/prepare_corpus.py --dataset hotpotqa --dataset musique --dataset 2wikimultihop
"""

import argparse
import json
import logging
from pathlib import Path

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Datasets processed by default when --dataset is not given.
DATASETS = ["hotpotqa", "musique", "2wikimultihop", "medical"]


def convert_chunks(dataset: str, data_root: Path, output_dir: Path) -> Path:
    """Convert a single dataset's chunks.json to corpus JSONL.

    Args:
        dataset: dataset name (e.g., "hotpotqa")
        data_root: path to linear-rag clone
        output_dir: directory to write output JSONL

    Returns:
        Path to the output JSONL file.

    Raises:
        FileNotFoundError: if <data_root>/<dataset>/chunks.json is missing.
    """
    chunks_path = data_root / dataset / "chunks.json"
    if not chunks_path.exists():
        raise FileNotFoundError(f"Not found: {chunks_path}")

    # Explicit encoding: corpus text may be non-ASCII; never rely on the
    # platform-default locale encoding.
    with open(chunks_path, encoding="utf-8") as f:
        chunks = json.load(f)

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{dataset}_corpus.jsonl"

    count = 0
    with open(output_path, "w", encoding="utf-8") as out:
        for chunk in chunks:
            # Strip the "idx:" prefix, but only when the prefix really is a
            # numeric index. A bare split(":") would also mangle chunks whose
            # text legitimately starts with e.g. "Note: ...".
            prefix, sep, rest = chunk.partition(":")
            text = rest if sep and prefix.strip().isdigit() else chunk
            text = text.strip()
            if text:
                # ensure_ascii=False keeps non-ASCII text readable in the
                # output instead of \uXXXX-escaping every character.
                out.write(json.dumps({"text": text}, ensure_ascii=False) + "\n")
                count += 1

    logger.info("%s: %d chunks -> %s", dataset, count, output_path)
    return output_path


def convert_questions(dataset: str, data_root: Path, output_dir: Path) -> Path:
    """Convert questions.json to a standardized JSONL format.

    Each output record carries: id, question, answer, question_type.
    "question" and "answer" are required in the input; "id" and
    "question_type" default to "".

    Args:
        dataset: dataset name
        data_root: path to linear-rag clone
        output_dir: directory to write output JSONL

    Returns:
        Path to the output JSONL file.

    Raises:
        FileNotFoundError: if <data_root>/<dataset>/questions.json is missing.
        KeyError: if an input record lacks "question" or "answer".
    """
    questions_path = data_root / dataset / "questions.json"
    if not questions_path.exists():
        raise FileNotFoundError(f"Not found: {questions_path}")

    with open(questions_path, encoding="utf-8") as f:
        questions = json.load(f)

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{dataset}_questions.jsonl"

    count = 0
    with open(output_path, "w", encoding="utf-8") as out:
        for q in questions:
            record = {
                "id": q.get("id", ""),
                # Fail loudly on malformed records: question/answer are the
                # fields downstream evaluation cannot work without.
                "question": q["question"],
                "answer": q["answer"],
                "question_type": q.get("question_type", ""),
            }
            out.write(json.dumps(record, ensure_ascii=False) + "\n")
            count += 1

    logger.info("%s: %d questions -> %s", dataset, count, output_path)
    return output_path


def main() -> None:
    """CLI entry point: convert chunks and questions for each selected dataset."""
    parser = argparse.ArgumentParser(description="Prepare linear-rag data for HAG")
    parser.add_argument(
        "--dataset",
        type=str,
        action="append",
        default=None,
        help=f"Dataset(s) to process. Choices: {DATASETS}. Can specify multiple times.",
    )
    parser.add_argument(
        "--data-root",
        type=str,
        default="data/linear-rag",
        help="Path to linear-rag clone",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/processed",
        help="Output directory for processed files",
    )
    args = parser.parse_args()

    # action="append" leaves None when the flag is never passed; fall back to
    # processing every known dataset.
    datasets = args.dataset if args.dataset else DATASETS
    data_root = Path(args.data_root)
    output_dir = Path(args.output_dir)

    for ds in datasets:
        convert_chunks(ds, data_root, output_dir)
        convert_questions(ds, data_root, output_dir)


if __name__ == "__main__":
    main()