summary | refs | log | tree | commit | diff
path: root/scripts/debug_personamem_hash.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/debug_personamem_hash.py')
-rw-r--r-- scripts/debug_personamem_hash.py | 22
1 files changed, 22 insertions, 0 deletions
diff --git a/scripts/debug_personamem_hash.py b/scripts/debug_personamem_hash.py
new file mode 100644
index 0000000..7ef442d
--- /dev/null
+++ b/scripts/debug_personamem_hash.py
@@ -0,0 +1,22 @@
+import hashlib
+import json
+
+def get_line_hash(line_str: str) -> str:
+    """Hex SHA-256 of the whitespace-stripped line, for comparing with shared_context_id."""
+    return hashlib.new("sha256", line_str.strip().encode("utf-8")).hexdigest()
+
+def debug_hash():
+    """Print the first JSONL line's hash, the expected hash, and whether they match."""
+    jsonl_path = "data/raw_datasets/personamem/shared_contexts_32k.jsonl"
+    # Explicit UTF-8 (assumed file encoding; confirm) so hashing does not depend on the locale default.
+    with open(jsonl_path, "r", encoding="utf-8") as f:
+        first_line = f.readline()
+    computed_hash = get_line_hash(first_line)
+    target_hash = "e898d03fec683b1cabf29f57287ff66f8a31842543ecef44b56766844c1c1301"
+    print(f"Computed: {computed_hash}")
+    print(f"Target: {target_hash}")
+    print(f"Match: {computed_hash == target_hash}")
+
+if __name__ == "__main__":  # run the check only when executed as a script, not on import
+ debug_hash()
+