import json
import os

INPUT_FILE = "data/finetune/preference_extractor_450k.jsonl"
OUTPUT_FILE = "data/finetune/train_llama_factory.json"

# We embed the system prompt as "instruction" so the model learns to respond to
# this specific instruction. Alternatively, if you plan to put this prompt in the
# system slot of the chat template, you can leave "instruction" empty or simplified.
# For a 0.5B model, an explicit instruction in the prompt is often helpful.
SYSTEM_INSTRUCTION = (
    "Extract user preferences from the query into JSON format based on the PreferenceList schema. "
    "If no preferences are found, return {\"preferences\": []}."
)


def convert():
    if not os.path.exists(INPUT_FILE):
        print(f"Error: {INPUT_FILE} not found. Run scripts/assemble_dataset.py first.")
        return

    print(f"Reading {INPUT_FILE}...")
    dataset = []
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                item = json.loads(line)
                # Alpaca format
                record = {
                    "instruction": SYSTEM_INSTRUCTION,
                    "input": item["input"],
                    "output": item["output"],
                }
                dataset.append(record)

    print(f"Converted {len(dataset)} items.")

    # Save as a JSON list (the LLaMA-Factory standard)
    print(f"Saving to {OUTPUT_FILE}...")
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=2, ensure_ascii=False)

    print("Done!")
    print("\nNext steps for LLaMA-Factory:")
    print("1. Copy data/finetune/train_llama_factory.json to your LLaMA-Factory data/ folder.")
    print("2. Add this entry to dataset_info.json:")
    print(json.dumps({
        "preference_extractor_v1": {
            "file_name": "train_llama_factory.json",
            "columns": {
                "prompt": "instruction",
                "query": "input",
                "response": "output"
            }
        }
    }, indent=2))


if __name__ == "__main__":
    convert()
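
# The dataset_info.json entry printed above registers the dataset under the name
# "preference_extractor_v1". From there, a LoRA SFT run can be launched with
# LLaMA-Factory's CLI. The sketch below is illustrative, not part of this repo:
# the YAML file name and the Qwen2.5-0.5B base model are assumptions (the comment
# above only says "0.5B model"), and the hyperparameters are placeholders; the
# keys follow LLaMA-Factory's example SFT configs.
#
#   llamafactory-cli train sft_preference_extractor.yaml
#
# sft_preference_extractor.yaml (sketch):
#   model_name_or_path: Qwen/Qwen2.5-0.5B-Instruct  # assumed 0.5B base model
#   stage: sft
#   do_train: true
#   finetuning_type: lora
#   lora_target: all
#   dataset: preference_extractor_v1                # name registered in dataset_info.json
#   template: qwen                                  # must match the base model's chat template
#   cutoff_len: 1024
#   output_dir: saves/preference_extractor_v1
#   per_device_train_batch_size: 8
#   num_train_epochs: 1.0
#   learning_rate: 1.0e-4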