summaryrefslogtreecommitdiff
path: root/scripts/convert_to_llama_factory.py
diff options
context:
space:
mode:
authorYurenHao0426 <blackhao0426@gmail.com>2025-12-17 04:29:37 -0600
committerYurenHao0426 <blackhao0426@gmail.com>2025-12-17 04:29:37 -0600
commite43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 (patch)
tree6ce8a00d2f8b9ebd83c894a27ea01ac50cfb2ff5 /scripts/convert_to_llama_factory.py
Initial commit (clean history)HEADmain
Diffstat (limited to 'scripts/convert_to_llama_factory.py')
-rw-r--r--scripts/convert_to_llama_factory.py62
1 files changed, 62 insertions, 0 deletions
diff --git a/scripts/convert_to_llama_factory.py b/scripts/convert_to_llama_factory.py
new file mode 100644
index 0000000..d8b7565
--- /dev/null
+++ b/scripts/convert_to_llama_factory.py
@@ -0,0 +1,62 @@
import json
import os

# Source JSONL produced by scripts/assemble_dataset.py: one JSON object per
# line, each carrying "input" and "output" keys (read by convert() below).
INPUT_FILE = "data/finetune/preference_extractor_450k.jsonl"
# Destination: a single JSON list in LLaMA-Factory's Alpaca format.
OUTPUT_FILE = "data/finetune/train_llama_factory.json"

# We embed the system prompt as "instruction" so the model learns to respond to this specific instruction.
# Or, if you plan to put this system prompt in the system slot of the chat template,
# you can leave instruction empty or simplified.
# Given 0.5B model, explicit instruction in the prompt is often helpful.
SYSTEM_INSTRUCTION = (
    "Extract user preferences from the query into JSON format based on the PreferenceList schema. "
    "If no preferences are found, return {\"preferences\": []}."
)
+
def convert(input_file=None, output_file=None, instruction=None):
    """Convert the assembled JSONL dataset into LLaMA-Factory's Alpaca JSON format.

    Reads one JSON object per non-empty line from *input_file*, wraps each into
    an Alpaca-style record (instruction / input / output), and writes the whole
    dataset as a single JSON list to *output_file*.

    Args:
        input_file: Source JSONL path. Defaults to the module-level INPUT_FILE.
        output_file: Destination JSON path. Defaults to OUTPUT_FILE.
        instruction: Instruction text embedded in every record. Defaults to
            SYSTEM_INSTRUCTION.

    Each source record must contain "input" and "output" keys; a KeyError from
    a malformed record propagates deliberately so bad data is noticed.
    """
    # Resolve defaults lazily so the module constants stay the single source
    # of truth while callers (and tests) may override per invocation.
    input_file = INPUT_FILE if input_file is None else input_file
    output_file = OUTPUT_FILE if output_file is None else output_file
    instruction = SYSTEM_INSTRUCTION if instruction is None else instruction

    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found. Run scripts/assemble_dataset.py first.")
        return

    print(f"Reading {input_file}...")
    with open(input_file, "r", encoding="utf-8") as f:
        # Alpaca format: one {instruction, input, output} record per JSONL line.
        dataset = [
            {
                "instruction": instruction,
                "input": item["input"],
                "output": item["output"],
            }
            for item in (json.loads(line) for line in f if line.strip())
        ]

    print(f"Converted {len(dataset)} items.")

    # Make sure the destination directory exists before writing
    # (dirname is "" for a bare filename — skip makedirs in that case).
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Save as JSON list (LLaMA-Factory standard); keep non-ASCII readable.
    print(f"Saving to {output_file}...")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=2, ensure_ascii=False)

    print("Done!")

    print("\nNext steps for LLaMA-Factory:")
    print("1. Copy data/finetune/train_llama_factory.json to your LLaMA-Factory data/ folder.")
    print("2. Add entry to dataset_info.json:")
    print(json.dumps({
        "preference_extractor_v1": {
            "file_name": "train_llama_factory.json",
            "columns": {
                "prompt": "instruction",
                "query": "input",
                "response": "output"
            }
        }
    }, indent=2))
+
if __name__ == "__main__":
    # Script entry point: run the JSONL -> Alpaca JSON conversion directly.
    convert()