| field | value | date |
|---|---|---|
| author | YurenHao0426 <blackhao0426@gmail.com> | 2025-12-17 04:29:37 -0600 |
| committer | YurenHao0426 <blackhao0426@gmail.com> | 2025-12-17 04:29:37 -0600 |
| commit | e43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 (patch) | |
| tree | 6ce8a00d2f8b9ebd83c894a27ea01ac50cfb2ff5 /scripts/assemble_dataset.py | |
Diffstat (limited to 'scripts/assemble_dataset.py')
| mode | path | lines |
|---|---|---|
| -rw-r--r-- | scripts/assemble_dataset.py | 85 |
1 file changed, 85 insertions, 0 deletions
```diff
diff --git a/scripts/assemble_dataset.py b/scripts/assemble_dataset.py
new file mode 100644
index 0000000..024f91f
--- /dev/null
+++ b/scripts/assemble_dataset.py
@@ -0,0 +1,85 @@
+import json
+import os
+import random
+
+# Source Files
+FILE_ORIGINAL = "data/raw_datasets/labeled_full_dataset_batch.jsonl"
+FILE_SYNTHESIS = "data/raw_datasets/synthesized_positives.jsonl"
+
+# Output Files
+OUTPUT_RAW = "data/finetune/preference_extractor_450k.jsonl"
+
+def assemble_dataset():
+    os.makedirs(os.path.dirname(OUTPUT_RAW), exist_ok=True)
+
+    print("Assembling final dataset...")
+
+    records = []
+
+    # 1. Load Original (Pos + Neg)
+    print(f"Loading {FILE_ORIGINAL}...")
+    if os.path.exists(FILE_ORIGINAL):
+        with open(FILE_ORIGINAL, "r", encoding="utf-8") as f:
+            for line in f:
+                if line.strip():
+                    item = json.loads(line)
+                    # Standardize format: {"input": ..., "output": ...}
+                    # Input is the user query. Output is the extracted JSON string.
+
+                    query = item.get("original_query", "")
+                    output_json = item.get("extracted_json", {"preferences": []})
+
+                    # Ensure output is a JSON string with minimal whitespace to save tokens
+                    output_str = json.dumps(output_json, ensure_ascii=False, separators=(",", ":"))
+
+                    records.append({
+                        "input": query,
+                        "output": output_str,
+                        "source": item.get("source", "original")
+                    })
+    else:
+        print(f"Warning: {FILE_ORIGINAL} missing!")
+
+    print(f"Loaded {len(records)} from original.")
+
+    # 2. Load Synthesis (Pos)
+    print(f"Loading {FILE_SYNTHESIS}...")
+    syn_count = 0
+    if os.path.exists(FILE_SYNTHESIS):
+        with open(FILE_SYNTHESIS, "r", encoding="utf-8") as f:
+            for line in f:
+                if line.strip():
+                    item = json.loads(line)
+
+                    query = item.get("original_query", "")
+                    output_json = item.get("extracted_json", {"preferences": []})
+                    output_str = json.dumps(output_json, ensure_ascii=False, separators=(",", ":"))
+
+                    records.append({
+                        "input": query,
+                        "output": output_str,
+                        "source": "synthesis"
+                    })
+                    syn_count += 1
+    else:
+        print(f"Warning: {FILE_SYNTHESIS} missing!")
+
+    print(f"Loaded {syn_count} from synthesis.")
+
+    # 3. Shuffle
+    print("Shuffling...")
+    random.shuffle(records)
+
+    # 4. Save
+    print(f"Saving {len(records)} records to {OUTPUT_RAW}...")
+    with open(OUTPUT_RAW, "w", encoding="utf-8") as f:
+        for r in records:
+            f.write(json.dumps(r, ensure_ascii=False) + "\n")
+
+    print("Done!")
+    print("\nTo upload to Hugging Face, run:")
+    print("huggingface-cli upload <repo_id> data/finetune/preference_extractor_450k.jsonl --repo-type dataset")
+
+if __name__ == "__main__":
+    assemble_dataset()
+
```
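For reference, each raw record is mapped onto a flat `{"input", "output", "source"}` training record, with the extraction target serialized to a compact JSON string. A minimal sketch of that transform on a single record, assuming the field names shown in the diff (`original_query`, `extracted_json`, `source`); the sample values are hypothetical:

```python
import json

# Hypothetical raw record in the shape the script expects
# (as in labeled_full_dataset_batch.jsonl / synthesized_positives.jsonl).
raw = {
    "original_query": "Find me a quiet hotel near the station, no smoking rooms.",
    "extracted_json": {"preferences": ["quiet hotel", "near the station", "non-smoking room"]},
    "source": "original",
}

# Same transform as assemble_dataset(): the extraction target becomes
# a compact JSON string so the fine-tune output is plain text.
record = {
    "input": raw.get("original_query", ""),
    "output": json.dumps(raw.get("extracted_json", {"preferences": []}),
                         ensure_ascii=False, separators=(",", ":")),
    "source": raw.get("source", "original"),
}

print(record["output"])
# {"preferences":["quiet hotel","near the station","non-smoking room"]}
```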

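After assembly, a quick sanity check (not part of this commit) can confirm the output file is well formed: every line must be valid JSON, and every `output` field must itself parse back to JSON with a `preferences` key. A minimal sketch using only the standard library:

```python
import json

OUTPUT_RAW = "data/finetune/preference_extractor_450k.jsonl"

total = 0
empty_inputs = 0
with open(OUTPUT_RAW, "r", encoding="utf-8") as f:
    for lineno, line in enumerate(f, 1):
        record = json.loads(line)
        if not record["input"]:
            empty_inputs += 1
        # The training target must itself be a valid JSON document.
        target = json.loads(record["output"])
        assert "preferences" in target, f"line {lineno}: missing 'preferences' key"
        total += 1

print(f"Checked {total} records; {empty_inputs} had an empty input.")
```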