# scripts/convert_to_llama_factory.py

import json
import os

# Source JSONL dataset and destination JSON file used by this script.
INPUT_FILE = "data/finetune/preference_extractor_450k.jsonl"
OUTPUT_FILE = "data/finetune/train_llama_factory.json"

# The prompt below is stored in every record's "instruction" field so the
# model learns to respond to this specific instruction. If you plan to put it
# in the system slot of the chat template instead, the instruction can be left
# empty or simplified. For a 0.5B model, an explicit instruction in the prompt
# is often helpful.
SYSTEM_INSTRUCTION = " ".join(
    (
        "Extract user preferences from the query into JSON format based on the PreferenceList schema.",
        'If no preferences are found, return {"preferences": []}.',
    )
)

def convert(input_file=None, output_file=None, instruction=None):
    """Convert a JSONL dataset into an Alpaca-style JSON list for LLaMA-Factory.

    Every non-blank line of the input is parsed as a JSON object and mapped to
    a record with "instruction", "input" and "output" keys. The records are
    written to the output file as one JSON array. If the input file does not
    exist, an error is printed and nothing is written.

    Args:
        input_file: Path of the JSONL file to read. Defaults to INPUT_FILE.
        output_file: Path of the JSON file to write. Defaults to OUTPUT_FILE.
        instruction: Text stored in every record's "instruction" field.
            Defaults to SYSTEM_INSTRUCTION.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed line has no "input" or "output" key.
        TypeError: If a parsed line is not subscriptable by string key.
    """
    # Defaults are resolved at call time so that callers can convert other
    # files (or use another instruction) without touching module globals.
    if input_file is None:
        input_file = INPUT_FILE
    if output_file is None:
        output_file = OUTPUT_FILE
    if instruction is None:
        instruction = SYSTEM_INSTRUCTION

    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found. Run scripts/assemble_dataset.py first.")
        return

    print(f"Reading {input_file}...")
    dataset = []

    with open(input_file, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                item = json.loads(line)
                # Alpaca format
                record = {
                    "instruction": instruction,
                    "input": item["input"],
                    "output": item["output"],
                }
            except (json.JSONDecodeError, KeyError, TypeError) as err:
                # Report where the bad record is, then re-raise unchanged so
                # the exception type seen by callers stays the same.
                print(f"Error: invalid record at {input_file}:{line_no}: {err!r}")
                raise
            dataset.append(record)

    print(f"Converted {len(dataset)} items.")

    # Make sure the destination directory exists before opening the file.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Save as JSON list (LLaMA-Factory standard)
    print(f"Saving to {output_file}...")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=2, ensure_ascii=False)

    print("Done!")

    # The hints below are derived from the actual output path so they stay
    # correct when a non-default destination is used.
    print("\nNext steps for LLaMA-Factory:")
    print(f"1. Copy {output_file} to your LLaMA-Factory data/ folder.")
    print("2. Add entry to dataset_info.json:")
    print(json.dumps({
        "preference_extractor_v1": {
            "file_name": os.path.basename(output_file),
            "columns": {
                "prompt": "instruction",
                "query": "input",
                "response": "output"
            }
        }
    }, indent=2))

# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    convert()