diff options
| author | YurenHao0426 <Blackhao0426@gmail.com> | 2026-02-16 14:44:42 -0600 |
|---|---|---|
| committer | YurenHao0426 <Blackhao0426@gmail.com> | 2026-02-16 14:44:42 -0600 |
| commit | 09d50e47860da0035e178a442dc936028808a0b3 (patch) | |
| tree | 9d651b0c7d289a9a0405953f2da989a3c431f147 /scripts/prepare_corpus.py | |
| parent | c90b48e3f8da9dd0f8d2ae82ddf977436bb0cfc3 (diff) | |
- Add centering support to MemoryBank (center_query, apply_centering, mean
persistence in save/load) to remove centroid attractor in Hopfield dynamics
- Add center flag to MemoryBankConfig, device field to PipelineConfig
- Grid search scripts: initial (β≤8), residual, high-β, and centered grids
with dedup-based LLM caching (89-91% call savings)
- Energy landscape visualization: 2D contour, 1D profile, UMAP, PCA heatmap
comparing centered vs uncentered dynamics
- Experiment log (note.md) documenting 4 rounds of results and root cause
analysis of centroid attractor problem
- Key finding: β_critical ≈ 37.6 for centered memory; best configs beat
FAISS baseline by +3-4% F1
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Diffstat (limited to 'scripts/prepare_corpus.py')
| -rw-r--r-- | scripts/prepare_corpus.py | 127 |
1 file changed, 127 insertions, 0 deletions
"""Convert linear-rag chunks.json to JSONL corpus for build_memory_bank.py.

The linear-rag dataset stores chunks as a list of strings with format "idx:text...".
This script strips the index prefix and outputs one {"text": "..."} per line.

Usage:
    python scripts/prepare_corpus.py --dataset hotpotqa
    python scripts/prepare_corpus.py --dataset hotpotqa --dataset musique --dataset 2wikimultihop
"""

import argparse
import json
import logging
from pathlib import Path

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Datasets processed by default when no --dataset flag is given.
DATASETS = ["hotpotqa", "musique", "2wikimultihop", "medical"]


def convert_chunks(dataset: str, data_root: Path, output_dir: Path) -> Path:
    """Convert a single dataset's chunks.json to corpus JSONL.

    Args:
        dataset: dataset name (e.g., "hotpotqa")
        data_root: path to linear-rag clone
        output_dir: directory to write output JSONL

    Returns:
        Path to the output JSONL file.

    Raises:
        FileNotFoundError: if <data_root>/<dataset>/chunks.json does not exist.
    """
    chunks_path = data_root / dataset / "chunks.json"
    if not chunks_path.exists():
        raise FileNotFoundError(f"Not found: {chunks_path}")

    # Explicit encoding: chunk text may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to be UTF-8 (e.g. Windows).
    with open(chunks_path, encoding="utf-8") as f:
        chunks = json.load(f)

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{dataset}_corpus.jsonl"

    count = 0
    with open(output_path, "w", encoding="utf-8") as out:
        for chunk in chunks:
            # Strip the "idx:" prefix; chunks without a colon are kept verbatim.
            text = chunk.split(":", 1)[1] if ":" in chunk else chunk
            text = text.strip()
            if text:  # drop chunks that are empty after stripping
                out.write(json.dumps({"text": text}) + "\n")
                count += 1

    logger.info("%s: %d chunks -> %s", dataset, count, output_path)
    return output_path


def convert_questions(dataset: str, data_root: Path, output_dir: Path) -> Path:
    """Convert questions.json to a standardized JSONL format.

    Args:
        dataset: dataset name
        data_root: path to linear-rag clone
        output_dir: directory to write output JSONL

    Returns:
        Path to the output JSONL file.

    Raises:
        FileNotFoundError: if <data_root>/<dataset>/questions.json does not exist.
        KeyError: if a question record lacks a "question" or "answer" field
            (these are required; "id" and "question_type" default to "").
    """
    questions_path = data_root / dataset / "questions.json"
    if not questions_path.exists():
        raise FileNotFoundError(f"Not found: {questions_path}")

    # Explicit encoding for the same reason as in convert_chunks.
    with open(questions_path, encoding="utf-8") as f:
        questions = json.load(f)

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{dataset}_questions.jsonl"

    count = 0
    with open(output_path, "w", encoding="utf-8") as out:
        for q in questions:
            record = {
                "id": q.get("id", ""),
                "question": q["question"],
                "answer": q["answer"],
                "question_type": q.get("question_type", ""),
            }
            out.write(json.dumps(record) + "\n")
            count += 1

    logger.info("%s: %d questions -> %s", dataset, count, output_path)
    return output_path


def main() -> None:
    """CLI entry point: convert chunks and questions for each requested dataset."""
    parser = argparse.ArgumentParser(description="Prepare linear-rag data for HAG")
    parser.add_argument(
        "--dataset",
        type=str,
        action="append",
        # Validate eagerly: a typo now fails at parse time with a clear
        # argparse error instead of a late FileNotFoundError mid-run.
        choices=DATASETS,
        default=None,
        help=f"Dataset(s) to process. Choices: {DATASETS}. Can specify multiple times.",
    )
    parser.add_argument(
        "--data-root",
        type=str,
        default="data/linear-rag",
        help="Path to linear-rag clone",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/processed",
        help="Output directory for processed files",
    )
    args = parser.parse_args()

    # No --dataset flag means "process everything".
    datasets = args.dataset if args.dataset else DATASETS
    data_root = Path(args.data_root)
    output_dir = Path(args.output_dir)

    for ds in datasets:
        convert_chunks(ds, data_root, output_dir)
        convert_questions(ds, data_root, output_dir)


if __name__ == "__main__":
    main()
