"""Extract positive (preference-bearing) rows from a labeled JSONL dataset.

Reads the labeled dataset line by line, copies every row whose
``extracted_json.preferences`` is a non-empty list to a separate "positive
seeds" file, and prints a short summary of the positive/negative split.
"""

import json
import os
from typing import Optional

INPUT_FILE = "data/raw_datasets/labeled_full_dataset_batch.jsonl"
OUTPUT_POS_FILE = "data/raw_datasets/positive_seeds.jsonl"


def _has_preferences(item: object) -> bool:
    """Return True if *item* carries a non-empty ``preferences`` list.

    Any row that does not have the expected shape (not a JSON object,
    ``extracted_json`` missing / null / not an object, ``preferences`` not a
    list) is treated as having no preferences instead of raising.
    """
    if not isinstance(item, dict):
        return False
    extracted = item.get("extracted_json")
    if not isinstance(extracted, dict):
        return False
    prefs = extracted.get("preferences")
    return isinstance(prefs, list) and bool(prefs)


def extract_and_stats(
    input_file: Optional[str] = None,
    output_file: Optional[str] = None,
) -> None:
    """Copy positive rows to a seed file and print dataset statistics.

    Args:
        input_file: Path of the labeled JSONL file to scan. Defaults to the
            module-level ``INPUT_FILE``.
        output_file: Path the positive rows are written to (overwritten if it
            exists). Defaults to the module-level ``OUTPUT_POS_FILE``.

    Returns:
        None. If the input file does not exist, an error is printed and the
        output file is left untouched.
    """
    # Resolve defaults at call time so changes to the module constants are
    # honoured by callers that pass no arguments.
    if input_file is None:
        input_file = INPUT_FILE
    if output_file is None:
        output_file = OUTPUT_POS_FILE

    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found.")
        return

    print(f"Scanning {input_file}...")

    total = 0
    pos_count = 0
    neg_count = 0
    skipped = 0  # lines that are not valid JSON

    with open(input_file, "r", encoding="utf-8") as f_in, \
            open(output_file, "w", encoding="utf-8") as f_out:
        for line in f_in:
            if not line.strip():
                continue

            # Keep the try body minimal: only parsing may legitimately fail.
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue

            total += 1

            # The labeling script sets a 'has_preference' boolean, but the
            # actual preferences list is inspected to be safe.
            if _has_preferences(item):
                pos_count += 1
                # The last line of the input may lack a newline; terminate it
                # so the output stays valid JSONL.
                f_out.write(line if line.endswith("\n") else line + "\n")
            else:
                # Every parsed row is counted exactly once, so that
                # total == pos_count + neg_count always holds.
                neg_count += 1

    ratio = (pos_count / total * 100) if total > 0 else 0

    print("\n" + "=" * 30)
    print("DATASET STATISTICS")
    print("=" * 30)
    print(f"Total Rows:    {total}")
    print(f"Positive Rows: {pos_count} ({ratio:.2f}%)")
    print(f"Negative Rows: {neg_count}")
    if skipped:
        # Only reported when present, so output for clean data is unchanged.
        print(f"Skipped Rows:  {skipped} (malformed JSON)")
    print("-" * 30)
    print(f"Positive seeds saved to: {output_file}")
    print("=" * 30)


if __name__ == "__main__":
    extract_and_stats()