"""Extract positive (preference-bearing) rows from a labeled JSONL dataset.

Source blob: 402ade77c6697e07ee1331a280a1931f9039ff73 (plain)
"""
import json
import os
# Labeled dataset scanned by extract_and_stats(); it is read line by line and
# each non-blank line is parsed as JSON.
INPUT_FILE = "data/raw_datasets/labeled_full_dataset_batch.jsonl"
# Output file that receives, verbatim, every input line whose
# extracted_json.preferences value is non-empty.
OUTPUT_POS_FILE = "data/raw_datasets/positive_seeds.jsonl"
def _has_preferences(item):
    """Return True when *item* holds a non-empty ``preferences`` list.

    The list is looked up at ``item["extracted_json"]["preferences"]``; a
    missing key, a ``null`` value, or a value of the wrong type at either
    level means "no preferences" rather than an error.
    """
    extracted = item.get("extracted_json")
    if not isinstance(extracted, dict):
        return False
    prefs = extracted.get("preferences")
    return isinstance(prefs, list) and len(prefs) > 0


def extract_and_stats(input_file=None, output_file=None):
    """Copy positive rows of a labeled JSONL file to a seed file and print stats.

    A row is *positive* when its ``extracted_json.preferences`` value is a
    non-empty list; every other well-formed row (a JSON object) is *negative*.
    Lines that are blank are ignored. Lines that are not valid JSON, or whose
    top-level value is not a JSON object, are skipped and reported separately
    so that ``total == positive + negative`` always holds.

    Args:
        input_file: Path of the JSONL file to scan. Defaults to ``INPUT_FILE``.
        output_file: Path the positive rows are written to (overwritten).
            Defaults to ``OUTPUT_POS_FILE``.

    Returns:
        None. Results are reported on stdout; if ``input_file`` does not
        exist an error line is printed and nothing is written.
    """
    # Resolve defaults at call time so explicit paths never need the constants.
    if input_file is None:
        input_file = INPUT_FILE
    if output_file is None:
        output_file = OUTPUT_POS_FILE

    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found.")
        return

    print(f"Scanning {input_file}...")

    total = 0
    pos_count = 0
    skipped = 0

    with open(input_file, "r", encoding="utf-8") as f_in, \
            open(output_file, "w", encoding="utf-8") as f_out:
        for line in f_in:
            if not line.strip():
                continue

            # Only the parse itself is guarded; anything else that goes wrong
            # is a real bug and should surface instead of being swallowed.
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue

            if not isinstance(item, dict):
                skipped += 1
                continue

            total += 1
            if _has_preferences(item):
                pos_count += 1
                # The last line of the input may lack a newline; always
                # terminate the record so the output stays valid JSONL.
                f_out.write(line if line.endswith("\n") else line + "\n")

    # Every counted row is either positive or negative by construction.
    neg_count = total - pos_count
    ratio = (pos_count / total * 100) if total > 0 else 0

    print("\n" + "=" * 30)
    print("DATASET STATISTICS")
    print("=" * 30)
    print(f"Total Rows: {total}")
    print(f"Positive Rows: {pos_count} ({ratio:.2f}%)")
    print(f"Negative Rows: {neg_count}")
    if skipped:
        # Printed only when relevant so clean runs keep the original output.
        print(f"Skipped Rows: {skipped} (malformed)")
    print("-" * 30)
    print(f"Positive seeds saved to: {output_file}")
    print("=" * 30)
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    extract_and_stats()
|