summary | refs | log | tree | commit | diff
path: root/scripts/stats_and_extract.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/stats_and_extract.py')
-rw-r--r-- scripts/stats_and_extract.py 56
1 files changed, 56 insertions, 0 deletions
diff --git a/scripts/stats_and_extract.py b/scripts/stats_and_extract.py
new file mode 100644
index 0000000..402ade7
--- /dev/null
+++ b/scripts/stats_and_extract.py
@@ -0,0 +1,56 @@
+import json
+import os
+
+INPUT_FILE = "data/raw_datasets/labeled_full_dataset_batch.jsonl"
+OUTPUT_POS_FILE = "data/raw_datasets/positive_seeds.jsonl"
+
+def extract_and_stats(input_file=None, output_file=None):
+    """Copy rows that contain preferences to a seed file and print statistics.
+
+    Reads ``input_file`` (default ``INPUT_FILE``) as JSONL and writes each row
+    with a non-empty ``extracted_json.preferences`` to ``output_file`` (default
+    ``OUTPUT_POS_FILE``). Invalid JSON lines are skipped and counted separately.
+    Returns ``None``; a missing input file only prints an error.
+    """
+    input_file = INPUT_FILE if input_file is None else input_file
+    output_file = OUTPUT_POS_FILE if output_file is None else output_file
+    if not os.path.exists(input_file):
+        print(f"Error: {input_file} not found.")
+        return
+    print(f"Scanning {input_file}...")
+    total = pos_count = neg_count = skipped = 0
+    with open(input_file, "r", encoding="utf-8") as f_in, \
+         open(output_file, "w", encoding="utf-8") as f_out:
+        for line in f_in:
+            if not line.strip():
+                continue
+            try:
+                item = json.loads(line)
+            except json.JSONDecodeError:
+                skipped += 1  # malformed line: report it instead of hiding it
+                continue
+            total += 1
+            # Check the stored value itself; odd shapes are negatives, not crashes.
+            fields = item.get("extracted_json") if isinstance(item, dict) else None
+            prefs = fields.get("preferences") if isinstance(fields, dict) else None
+            if isinstance(prefs, (list, dict, str)) and prefs:
+                pos_count += 1
+                f_out.write(line)
+            else:
+                neg_count += 1
+
+    ratio = (pos_count / total * 100) if total > 0 else 0
+    print("\n" + "="*30)
+    print("DATASET STATISTICS")
+    print("="*30)
+    print(f"Total Rows: {total}")
+    print(f"Positive Rows: {pos_count} ({ratio:.2f}%)")
+    print(f"Negative Rows: {neg_count}")
+    print(f"Skipped Lines: {skipped}")
+    print("-" * 30)
+    print(f"Positive seeds saved to: {output_file}")
+    print("="*30)
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+ extract_and_stats()
+