1 files changed, 56 insertions, 0 deletions
diff --git a/scripts/stats_and_extract.py b/scripts/stats_and_extract.py
new file mode 100644
index 0000000..402ade7
--- /dev/null
+++ b/scripts/stats_and_extract.py
@@ -0,0 +1,56 @@
+import json
+import os
+
+INPUT_FILE = "data/raw_datasets/labeled_full_dataset_batch.jsonl"  # JSONL read by extract_and_stats()
+OUTPUT_POS_FILE = "data/raw_datasets/positive_seeds.jsonl"  # positive rows; opened with "w", so overwritten
+
+def extract_and_stats(input_file=None, output_file=None):
+    """Copy rows that carry preferences to a seeds file and print row counts.
+
+    Reads ``input_file`` (default ``INPUT_FILE``) as JSON Lines; rows with a
+    non-empty ``extracted_json.preferences`` are copied to ``output_file``
+    (default ``OUTPUT_POS_FILE``). Invalid-JSON lines are skipped and reported.
+    """
+    src = INPUT_FILE if input_file is None else input_file
+    dst = OUTPUT_POS_FILE if output_file is None else output_file
+    if not os.path.exists(src):
+        print(f"Error: {src} not found.")
+        return
+    print(f"Scanning {src}...")
+    pos_count = neg_count = malformed = 0
+    with open(src, "r", encoding="utf-8") as f_in, \
+         open(dst, "w", encoding="utf-8") as f_out:
+        for line in f_in:
+            if not line.strip():
+                continue
+            try:
+                item = json.loads(line)
+            except json.JSONDecodeError:
+                malformed += 1  # unparseable: keep it out of the row totals
+                continue
+            # A non-object row or a null "extracted_json" has no preferences: negative.
+            extracted = item.get("extracted_json") if isinstance(item, dict) else None
+            prefs = extracted.get("preferences") if isinstance(extracted, dict) else None
+            if prefs:
+                pos_count += 1
+                # Keep one record per line even if the last input line lacks "\n".
+                f_out.write(line if line.endswith("\n") else line + "\n")
+            else:
+                neg_count += 1
+
+    total = pos_count + neg_count
+    ratio = (pos_count / total * 100) if total > 0 else 0
+    print("\n" + "="*30)
+    print("DATASET STATISTICS")
+    print("="*30)
+    print(f"Total Rows:     {total}")
+    print(f"Positive Rows:  {pos_count} ({ratio:.2f}%)")
+    print(f"Negative Rows:  {neg_count}")
+    print(f"Malformed Rows: {malformed}")
+    print("-" * 30)
+    print(f"Positive seeds saved to: {dst}")
+    print("="*30)
+
+if __name__ == "__main__":  # run the scan only when executed as a script
+    extract_and_stats()
+