"""Prepare JSONL batch-input files and submit them to the OpenAI Batch API.

Reads raw queries from INPUT_FILE, splits them into chunks that stay under
the Batch API per-file request limit, writes each chunk as a batch-input
JSONL file, uploads it, and submits a batch job.  All submitted batch IDs
are saved to disk so a separate monitoring script can poll their status.
"""

import json
import os
import time

from openai import OpenAI

# --- Configuration ---
INPUT_FILE = "data/raw_datasets/combined_raw_queries.jsonl"
BATCH_DIR = "data/raw_datasets/batch_files"
MODEL_NAME = "gpt-5.1"  # Or "gpt-4o"
BATCH_SIZE_LIMIT = 49000  # Safe under 50k limit (Batch API per-file cap)

# --- Load System Prompt ---
# Loaded at import time: the script cannot do anything useful without it.
with open("fine_tuning_prompt_template.txt", "r", encoding="utf-8") as f:
    SYSTEM_PROMPT = f.read()


def _build_request(custom_id: str, query: str) -> dict:
    """Return one /v1/chat/completions request object for the batch file."""
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": MODEL_NAME,
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": query},
            ],
            "temperature": 0.0,
            "response_format": {"type": "json_object"},
        },
    }


def prepare_and_submit_batches():
    """Split INPUT_FILE into chunks, upload each, and submit batch jobs.

    Side effects: writes chunk JSONL files under BATCH_DIR, uploads them
    to OpenAI, creates batch jobs, and saves the resulting batch IDs to
    ``data/raw_datasets/submitted_batch_ids.json``.  Returns ``None``;
    prints progress to stdout and returns early if the API key is unset.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("Error: OPENAI_API_KEY not set.")
        return

    client = OpenAI(api_key=api_key)
    os.makedirs(BATCH_DIR, exist_ok=True)

    print(f"Reading from {INPUT_FILE}...")
    # Read all (non-blank) lines up front so we can slice into fixed chunks.
    all_lines = []
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                all_lines.append(json.loads(line))

    total_items = len(all_lines)
    print(f"Total items: {total_items}")

    batch_ids = []

    # Split and process one chunk per Batch API job.
    for batch_idx, i in enumerate(range(0, total_items, BATCH_SIZE_LIMIT)):
        chunk = all_lines[i : i + BATCH_SIZE_LIMIT]
        chunk_filename = os.path.join(
            BATCH_DIR, f"batch_input_part_{batch_idx}.jsonl"
        )
        print(f"\n--- Processing Batch {batch_idx} ({len(chunk)} items) ---")

        # 1. Create the batch-input JSONL file for this chunk.
        with open(chunk_filename, "w", encoding="utf-8") as f_out:
            for item_idx, item in enumerate(chunk):
                # Global index ("req_{global_index}") lets responses be
                # mapped back to their row in INPUT_FILE later.
                global_idx = i + item_idx
                request_obj = _build_request(f"req_{global_idx}", item["query"])
                f_out.write(json.dumps(request_obj) + "\n")
        print(f"File created: {chunk_filename}")

        # 2. Upload the file.  The original passed a bare open() handle
        #    that was never closed; the `with` block fixes that leak.
        print("Uploading to OpenAI...")
        with open(chunk_filename, "rb") as upload_fh:
            batch_file_obj = client.files.create(file=upload_fh, purpose="batch")
        file_id = batch_file_obj.id
        print(f"Uploaded. File ID: {file_id}")

        # 3. Submit the batch job referencing the uploaded file.
        print("Submitting Batch Job...")
        batch_job = client.batches.create(
            input_file_id=file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={
                "description": f"Pers. Extractor Part {batch_idx}",
                "part_index": str(batch_idx),
            },
        )
        print(f"Submitted. Batch ID: {batch_job.id}")
        batch_ids.append(batch_job.id)

        # Sleep briefly to be nice to the API between submissions.
        time.sleep(1)

    # Save all batch IDs so a status-checking script can find them.
    id_file = "data/raw_datasets/submitted_batch_ids.json"
    with open(id_file, "w") as f:
        json.dump(batch_ids, f, indent=2)

    print(f"\nALL DONE! Submitted {len(batch_ids)} batches.")
    print(f"Batch IDs saved to {id_file}")
    print("Run scripts/check_batch_status.py (you need to write it) to monitor.")


if __name__ == "__main__":
    prepare_and_submit_batches()