# Recovered from commit e43b3f8aa36c198b95c1e46bea2eaf3893b13dc3
# ("Initial commit (clean history)"), file scripts/stats_and_extract.py.
"""Report label statistics for a JSONL dataset and extract its positive rows.

Each non-blank input line is expected to be a JSON object.  A row is
"positive" when ``row["extracted_json"]["preferences"]`` is a non-empty
list; positive rows are copied verbatim to the output file.
"""

import json
import os

INPUT_FILE = "data/raw_datasets/labeled_full_dataset_batch.jsonl"
OUTPUT_POS_FILE = "data/raw_datasets/positive_seeds.jsonl"


def _has_preferences(item):
    """Return True when *item* carries a non-empty ``preferences`` list."""
    extracted = item.get("extracted_json")
    # ``extracted_json`` may be missing, null, or some other non-object
    # value; such rows simply have no preferences.
    if not isinstance(extracted, dict):
        return False
    prefs = extracted.get("preferences")
    return isinstance(prefs, list) and bool(prefs)


def extract_and_stats(input_file=None, output_file=None):
    """Scan a labeled JSONL file, save positive rows, and print statistics.

    Args:
        input_file: Path of the JSONL file to scan.  Defaults to
            ``INPUT_FILE`` when omitted.
        output_file: Path the positive rows are written to (overwritten).
            Defaults to ``OUTPUT_POS_FILE`` when omitted.

    Returns:
        None.  Results are reported on stdout.  If the input file does not
        exist, an error message is printed and no output file is created.

    Lines that are not valid JSON, or whose top-level value is not a JSON
    object, are skipped and excluded from the totals; their count is printed
    when non-zero.  Every counted row is either positive or negative, so
    ``Total Rows`` always equals positive + negative.
    """
    in_path = INPUT_FILE if input_file is None else input_file
    out_path = OUTPUT_POS_FILE if output_file is None else output_file

    if not os.path.exists(in_path):
        print(f"Error: {in_path} not found.")
        return

    print(f"Scanning {in_path}...")

    total = 0
    pos_count = 0
    neg_count = 0
    skipped = 0

    with open(in_path, "r", encoding="utf-8") as f_in, \
            open(out_path, "w", encoding="utf-8") as f_out:
        for line in f_in:
            if not line.strip():
                continue

            # Keep the try body minimal: only parsing may legitimately fail.
            try:
                item = json.loads(line)
            except ValueError:  # json.JSONDecodeError is a ValueError
                skipped += 1
                continue

            if not isinstance(item, dict):
                skipped += 1
                continue

            total += 1
            if _has_preferences(item):
                pos_count += 1
                # The last input line may lack a newline; keep the output
                # a well-formed JSONL file with one terminated record per line.
                f_out.write(line if line.endswith("\n") else line + "\n")
            else:
                neg_count += 1

    ratio = (pos_count / total * 100) if total > 0 else 0

    print("\n" + "=" * 30)
    print("DATASET STATISTICS")
    print("=" * 30)
    print(f"Total Rows: {total}")
    print(f"Positive Rows: {pos_count} ({ratio:.2f}%)")
    print(f"Negative Rows: {neg_count}")
    if skipped:
        print(f"Skipped Rows: {skipped} (malformed)")
    print("-" * 30)
    print(f"Positive seeds saved to: {out_path}")
    print("=" * 30)


if __name__ == "__main__":
    extract_and_stats()