From 09d50e47860da0035e178a442dc936028808a0b3 Mon Sep 17 00:00:00 2001
From: YurenHao0426 <Blackhao0426@gmail.com>
Date: Mon, 16 Feb 2026 14:44:42 -0600
Subject: Add memory centering, grid search experiments, and energy
 visualizations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add centering support to MemoryBank (center_query, apply_centering, mean
  persistence in save/load) to remove centroid attractor in Hopfield dynamics
- Add center flag to MemoryBankConfig, device field to PipelineConfig
- Grid search scripts: initial (β≤8), residual, high-β, and centered grids
  with dedup-based LLM caching (89-91% call savings)
- Energy landscape visualization: 2D contour, 1D profile, UMAP, PCA heatmap
  comparing centered vs uncentered dynamics
- Experiment log (note.md) documenting 4 rounds of results and root cause
  analysis of centroid attractor problem
- Key finding: β_critical ≈ 37.6 for centered memory; best configs beat
  FAISS baseline by +3-4% F1

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 scripts/prepare_corpus.py | 127 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100644 scripts/prepare_corpus.py

(limited to 'scripts/prepare_corpus.py')

diff --git a/scripts/prepare_corpus.py b/scripts/prepare_corpus.py
new file mode 100644
index 0000000..93fc0ce
--- /dev/null
+++ b/scripts/prepare_corpus.py
@@ -0,0 +1,127 @@
+"""Convert linear-rag chunks.json to JSONL corpus for build_memory_bank.py.
+
+The linear-rag chunks are a list of "idx:text..." strings; the index prefix is stripped and one
+{"text": "..."} is written per line. Each dataset's questions.json is also converted to JSONL.
+
+Usage:
+    python scripts/prepare_corpus.py --dataset hotpotqa
+    python scripts/prepare_corpus.py --dataset hotpotqa --dataset musique --dataset 2wikimultihop
+"""
+
+import argparse
+import json
+import logging
+from pathlib import Path
+
+logging.basicConfig(level=logging.INFO)  # INFO so the per-file summary logs are emitted
+logger = logging.getLogger(__name__)
+# Datasets processed when --dataset is not given; also shown in the --help text.
+DATASETS = ["hotpotqa", "musique", "2wikimultihop", "medical"]
+
+
+def convert_chunks(dataset: str, data_root: Path, output_dir: Path) -> Path:
+    """Convert a single dataset's chunks.json to corpus JSONL.
+
+    Args:
+        dataset: dataset name (e.g., "hotpotqa")
+        data_root: path to linear-rag clone
+        output_dir: directory to write output JSONL (created if missing)
+
+    Returns:
+        Path to the output JSONL file.
+
+    Raises:
+        FileNotFoundError: if <data_root>/<dataset>/chunks.json does not exist.
+    """
+    chunks_path = data_root / dataset / "chunks.json"
+    if not chunks_path.exists():
+        raise FileNotFoundError(f"Not found: {chunks_path}")
+
+    chunks = json.loads(chunks_path.read_text(encoding="utf-8"))  # JSON is UTF-8 by spec
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_path = output_dir / f"{dataset}_corpus.jsonl"
+
+    count = 0
+    with open(output_path, "w", encoding="utf-8") as out:
+        for chunk in chunks:
+            # Drop the "idx:" prefix (text up to the first colon); skip empty results.
+            text = (chunk.split(":", 1)[1] if ":" in chunk else chunk).strip()
+            if text:
+                out.write(json.dumps({"text": text}) + "\n")
+                count += 1
+
+    logger.info("%s: %d chunks -> %s", dataset, count, output_path)
+    return output_path
+
+
+def convert_questions(dataset: str, data_root: Path, output_dir: Path) -> Path:
+    """Convert questions.json to a standardized JSONL format.
+
+    Args:
+        dataset: dataset name
+        data_root: path to linear-rag clone
+        output_dir: directory to write output JSONL (created if missing)
+
+    Returns:
+        Path to the output JSONL file.
+
+    Raises:
+        FileNotFoundError: if <data_root>/<dataset>/questions.json does not exist.
+        KeyError: if an entry has no "question" or "answer" key.
+    """
+    questions_path = data_root / dataset / "questions.json"
+    if not questions_path.exists():
+        raise FileNotFoundError(f"Not found: {questions_path}")
+
+    questions = json.loads(questions_path.read_text(encoding="utf-8"))  # JSON is UTF-8 by spec
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_path = output_dir / f"{dataset}_questions.jsonl"
+
+    count = 0
+    with open(output_path, "w", encoding="utf-8") as out:
+        for count, q in enumerate(questions, start=1):
+            record = {
+                "id": q.get("id", ""),  # optional; "" when absent
+                "question": q["question"],
+                "answer": q["answer"],
+                "question_type": q.get("question_type", ""),  # optional; "" when absent
+            }
+            out.write(json.dumps(record) + "\n")
+    logger.info("%s: %d questions -> %s", dataset, count, output_path)
+    return output_path
+
+
+def main() -> None:
+    """Parse CLI options and convert chunks and questions for each selected dataset.
+
+    When --dataset is omitted, every entry of DATASETS is processed.
+    """
+    cli = argparse.ArgumentParser(description="Prepare linear-rag data for HAG")
+    cli.add_argument(
+        "--dataset",
+        action="append",
+        help=f"Dataset(s) to process. Choices: {DATASETS}. Can specify multiple times.",
+    )
+    cli.add_argument(
+        "--data-root",
+        default="data/linear-rag",
+        help="Path to linear-rag clone",
+    )
+    cli.add_argument(
+        "--output-dir",
+        default="data/processed",
+        help="Output directory for processed files",
+    )
+    opts = cli.parse_args()
+
+    # An absent (None) --dataset list falls back to the full default set.
+    selected = opts.dataset or DATASETS
+    source_root = Path(opts.data_root)
+    target_dir = Path(opts.output_dir)
+    for name in selected:
+        convert_chunks(name, source_root, target_dir)
+        convert_questions(name, source_root, target_dir)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script
+    main()
-- 
cgit v1.2.3