# scripts/stats_and_extract.py

import json
import os

# JSONL dataset that extract_and_stats() scans line by line (path is relative
# to the current working directory).
INPUT_FILE = "data/raw_datasets/labeled_full_dataset_batch.jsonl"
# Destination JSONL file; it is overwritten on each run with the rows whose
# "preferences" list is non-empty.
OUTPUT_POS_FILE = "data/raw_datasets/positive_seeds.jsonl"

def _has_preferences(item: dict) -> bool:
    """Return True when *item* holds a non-empty ``preferences`` list."""
    extracted = item.get("extracted_json")
    if not isinstance(extracted, dict):
        # Missing or null "extracted_json": nothing was extracted for this row.
        return False
    prefs = extracted.get("preferences")
    return isinstance(prefs, list) and bool(prefs)


def extract_and_stats(
    input_file: "str | None" = None,
    output_file: "str | None" = None,
) -> None:
    """Copy positive rows to a seed file and print dataset statistics.

    The input is read as JSONL. A row is *positive* when
    ``row["extracted_json"]["preferences"]`` is a non-empty list; positive
    rows are written verbatim to the output file. Every other JSON-object
    row (including one whose ``extracted_json`` is missing or null) is
    *negative*. Blank lines are ignored. Lines that are not valid JSON, or
    whose top-level value is not a JSON object, are skipped and reported
    separately, so ``Total == Positive + Negative`` always holds.

    Args:
        input_file: Path of the JSONL file to scan. Defaults to the
            module-level ``INPUT_FILE``.
        output_file: Path the positive rows are written to (overwritten).
            Defaults to the module-level ``OUTPUT_POS_FILE``.

    Returns:
        None. Results are reported on stdout. If the input file does not
        exist an error is printed and the output file is left untouched.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth for the default paths.
    if input_file is None:
        input_file = INPUT_FILE
    if output_file is None:
        output_file = OUTPUT_POS_FILE

    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found.")
        return

    print(f"Scanning {input_file}...")

    pos_count = 0
    neg_count = 0
    skipped = 0

    with open(input_file, "r", encoding="utf-8") as f_in, \
         open(output_file, "w", encoding="utf-8") as f_out:

        for line in f_in:
            if not line.strip():
                continue

            # Keep the try body minimal: only the parse can legitimately fail.
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue

            if not isinstance(item, dict):
                # Valid JSON but not a record (e.g. a bare list or string).
                skipped += 1
                continue

            if _has_preferences(item):
                pos_count += 1
                # The last line of the input may lack a newline; always
                # terminate the record so the output stays valid JSONL.
                f_out.write(line if line.endswith("\n") else line + "\n")
            else:
                neg_count += 1

    total = pos_count + neg_count
    ratio = (pos_count / total * 100) if total > 0 else 0

    print("\n" + "=" * 30)
    print("DATASET STATISTICS")
    print("=" * 30)
    print(f"Total Rows:     {total}")
    print(f"Positive Rows:  {pos_count} ({ratio:.2f}%)")
    print(f"Negative Rows:  {neg_count}")
    if skipped:
        # Only shown when something was dropped, so clean datasets print the
        # same report as before.
        print(f"Skipped Rows:   {skipped}")
    print("-" * 30)
    print(f"Positive seeds saved to: {output_file}")
    print("=" * 30)

# Run the extraction only when this file is executed as a script; importing
# the module has no side effects.
if __name__ == "__main__":
    extract_and_stats()