# Recovered from the patch that added scripts/prepare_corpus.py
# (commit 09d50e47860da0035e178a442dc936028808a0b3).
"""Convert linear-rag chunks.json to JSONL corpus for build_memory_bank.py.

The linear-rag dataset stores chunks as a list of strings with format "idx:text...".
This script strips the index prefix and outputs one {"text": "..."} per line.

Usage:
    python scripts/prepare_corpus.py --dataset hotpotqa
    python scripts/prepare_corpus.py --dataset hotpotqa --dataset musique --dataset 2wikimultihop
"""

import argparse
import json
import logging
from pathlib import Path
from typing import Optional, Sequence

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

DATASETS = ["hotpotqa", "musique", "2wikimultihop", "medical"]


def _strip_index_prefix(chunk: str) -> str:
    """Return *chunk* without a leading numeric "idx:" prefix, whitespace-trimmed.

    Only an all-digit prefix is treated as an index. A chunk that has no index
    but happens to contain a colon in its text (e.g. "Title: body") is returned
    whole instead of being truncated at the first colon.
    """
    prefix, sep, rest = chunk.partition(":")
    if sep and prefix.strip().isdigit():
        return rest.strip()
    return chunk.strip()


def convert_chunks(dataset: str, data_root: Path, output_dir: Path) -> Path:
    """Convert a single dataset's chunks.json to corpus JSONL.

    Each non-empty chunk becomes one line of the form {"text": "..."}; chunks
    that are empty after stripping the index prefix and whitespace are skipped.

    Args:
        dataset: dataset name (e.g., "hotpotqa")
        data_root: path to linear-rag clone
        output_dir: directory to write output JSONL (created if missing)

    Returns:
        Path to the output JSONL file.

    Raises:
        FileNotFoundError: if <data_root>/<dataset>/chunks.json does not exist.
    """
    chunks_path = data_root / dataset / "chunks.json"
    if not chunks_path.exists():
        raise FileNotFoundError(f"Not found: {chunks_path}")

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(chunks_path, encoding="utf-8") as f:
        chunks = json.load(f)

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{dataset}_corpus.jsonl"

    count = 0
    with open(output_path, "w", encoding="utf-8") as out:
        for chunk in chunks:
            text = _strip_index_prefix(chunk)
            if text:
                out.write(json.dumps({"text": text}) + "\n")
                count += 1

    logger.info("%s: %d chunks -> %s", dataset, count, output_path)
    return output_path


def convert_questions(dataset: str, data_root: Path, output_dir: Path) -> Path:
    """Convert questions.json to a standardized JSONL format.

    Every output record carries exactly the keys "id", "question", "answer"
    and "question_type". "id" and "question_type" default to "" when absent
    from the input; "question" and "answer" are required.

    Args:
        dataset: dataset name
        data_root: path to linear-rag clone
        output_dir: directory to write output JSONL (created if missing)

    Returns:
        Path to the output JSONL file.

    Raises:
        FileNotFoundError: if <data_root>/<dataset>/questions.json does not exist.
        KeyError: if an input record lacks "question" or "answer".
    """
    questions_path = data_root / dataset / "questions.json"
    if not questions_path.exists():
        raise FileNotFoundError(f"Not found: {questions_path}")

    with open(questions_path, encoding="utf-8") as f:
        questions = json.load(f)

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{dataset}_questions.jsonl"

    count = 0
    with open(output_path, "w", encoding="utf-8") as out:
        for q in questions:
            record = {
                "id": q.get("id", ""),
                "question": q["question"],
                "answer": q["answer"],
                "question_type": q.get("question_type", ""),
            }
            out.write(json.dumps(record) + "\n")
            count += 1

    logger.info("%s: %d questions -> %s", dataset, count, output_path)
    return output_path


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Parse command-line options and convert each requested dataset.

    Args:
        argv: argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behaviour when run as a script.
    """
    parser = argparse.ArgumentParser(description="Prepare linear-rag data for HAG")
    parser.add_argument(
        "--dataset",
        type=str,
        action="append",
        default=None,
        help=f"Dataset(s) to process. Choices: {DATASETS}. Can specify multiple times.",
    )
    parser.add_argument(
        "--data-root",
        type=str,
        default="data/linear-rag",
        help="Path to linear-rag clone",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/processed",
        help="Output directory for processed files",
    )
    args = parser.parse_args(argv)

    # No --dataset given means "process every known dataset".
    datasets = args.dataset if args.dataset else DATASETS
    data_root = Path(args.data_root)
    output_dir = Path(args.output_dir)

    for ds in datasets:
        convert_chunks(ds, data_root, output_dir)
        convert_questions(ds, data_root, output_dir)


if __name__ == "__main__":
    main()