1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
|
import json
import os
# Source: one JSON object per line, each with "input" and "output" keys
# (produced by scripts/assemble_dataset.py).
INPUT_FILE = "data/finetune/preference_extractor_450k.jsonl"
# Destination: a single JSON array in Alpaca format for LLaMA-Factory.
OUTPUT_FILE = "data/finetune/train_llama_factory.json"

# We embed the system prompt as "instruction" so the model learns to respond
# to this specific instruction. Alternatively, if you plan to put this system
# prompt in the system slot of the chat template, you can leave the
# instruction empty or simplified. Given a 0.5B model, an explicit
# instruction in the prompt is often helpful.
SYSTEM_INSTRUCTION = (
    "Extract user preferences from the query into JSON format based on the PreferenceList schema. "
    "If no preferences are found, return {\"preferences\": []}."
)
def convert():
if not os.path.exists(INPUT_FILE):
print(f"Error: {INPUT_FILE} not found. Run scripts/assemble_dataset.py first.")
return
print(f"Reading {INPUT_FILE}...")
dataset = []
with open(INPUT_FILE, "r", encoding="utf-8") as f:
for line in f:
if line.strip():
item = json.loads(line)
# Alpaca format
record = {
"instruction": SYSTEM_INSTRUCTION,
"input": item["input"],
"output": item["output"]
}
dataset.append(record)
print(f"Converted {len(dataset)} items.")
# Save as JSON list (LLaMA-Factory standard)
print(f"Saving to {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
json.dump(dataset, f, indent=2, ensure_ascii=False)
print("Done!")
print("\nNext steps for LLaMA-Factory:")
print("1. Copy data/finetune/train_llama_factory.json to your LLaMA-Factory data/ folder.")
print("2. Add entry to dataset_info.json:")
print(json.dumps({
"preference_extractor_v1": {
"file_name": "train_llama_factory.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output"
}
}
}, indent=2))
# Script entry point: run the conversion only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    convert()
|