import json
import os
import time

from openai import OpenAI

# --- Configuration ---
INPUT_SEEDS = "data/raw_datasets/positive_seeds.jsonl"
BATCH_DIR = "data/raw_datasets/batch_files_synthesis"
MODEL_NAME = "gpt-5.1"  # Or gpt-4o
BATCH_SIZE_LIMIT = 30000  # 31k total, splitting into 2 files is safe

SYNTHESIS_SYSTEM_PROMPT = """You are a data augmentation assistant. Your task is to rewrite a User Query that contains specific preferences into 5 different variations.
The goal is to train a model to recognize these preferences in various contexts.

Variations required:
1. Formal/Polite: Use sophisticated language and polite markers.
2. Casual/Direct: Use slang, abbreviations, or very direct commands.
3. Implicit/Contextual: Embed the preference naturally within a larger context or story, making it harder to spot.
4. Distractor-Heavy: Mix the preference with irrelevant information or another task.
5. Imperative/Short: Extremely concise, almost robotic.

Output strictly a JSON object with a single key "rewrites" containing a list of 5 strings.
Example: {"rewrites": ["string1", "string2", "string3", "string4", "string5"]}
"""


def submit_synthesis_batch():
    """Build and submit OpenAI Batch-API jobs that rewrite seed queries.

    Reads seed records from INPUT_SEEDS (JSONL), splits them into chunks of
    at most BATCH_SIZE_LIMIT, writes one Batch-API request file per chunk
    into BATCH_DIR, uploads each file, creates a 24h batch job for it, and
    finally saves all submitted batch IDs to a JSON file for later polling.

    Returns None. Aborts early (with a printed error) when OPENAI_API_KEY
    is unset or the seed file is missing.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("Error: OPENAI_API_KEY not set.")
        return

    client = OpenAI(api_key=api_key)
    os.makedirs(BATCH_DIR, exist_ok=True)

    if not os.path.exists(INPUT_SEEDS):
        print(f"Error: {INPUT_SEEDS} not found.")
        return

    print(f"Reading seeds from {INPUT_SEEDS}...")
    seeds = []
    with open(INPUT_SEEDS, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                seeds.append(json.loads(line))

    total_items = len(seeds)
    print(f"Total seeds: {total_items}")

    batch_ids = []

    # Split and Submit
    for batch_idx, i in enumerate(range(0, total_items, BATCH_SIZE_LIMIT)):
        chunk = seeds[i : i + BATCH_SIZE_LIMIT]
        chunk_filename = os.path.join(BATCH_DIR, f"synthesis_batch_part_{batch_idx}.jsonl")
        print(f"\n--- Processing Synthesis Batch {batch_idx} ({len(chunk)} items) ---")

        # 1. Create File
        with open(chunk_filename, "w", encoding="utf-8") as f_out:
            for offset, item in enumerate(chunk):
                # The query text is what the model rewrites; the extracted
                # preference JSON is intentionally NOT included in the prompt
                # (the rewrite must preserve the intent from the query alone).
                original_query = item["original_query"]

                # Reuse the seed's custom_id when available so synthesis
                # results can be joined back to their seed; otherwise fall
                # back to the item's *global* index. BUG FIX: the fallback
                # previously used the chunk start index `i` for every item,
                # which produced duplicate custom_ids within a file — the
                # Batch API requires each custom_id to be unique.
                base_id = item.get("custom_id", f"seed_{i + offset}")
                custom_id = f"syn_{base_id}"  # Prefix distinguishes synthesis requests

                user_content = f"Original Query: {original_query}"

                request_obj = {
                    "custom_id": custom_id,
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        "model": MODEL_NAME,
                        "messages": [
                            {"role": "system", "content": SYNTHESIS_SYSTEM_PROMPT},
                            {"role": "user", "content": user_content},
                        ],
                        "temperature": 0.7,  # Higher temp for diversity
                        "response_format": {"type": "json_object"},
                    },
                }
                f_out.write(json.dumps(request_obj) + "\n")

        print(f"File created: {chunk_filename}")

        # 2. Upload — FIX: close the file handle (was a leaked bare open()).
        print("Uploading to OpenAI...")
        with open(chunk_filename, "rb") as f_in:
            batch_file_obj = client.files.create(file=f_in, purpose="batch")
        file_id = batch_file_obj.id
        print(f"Uploaded. File ID: {file_id}")

        # 3. Submit
        print("Submitting Batch Job...")
        batch_job = client.batches.create(
            input_file_id=file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={
                "description": f"Pers. Extractor Synthesis Part {batch_idx}",
                "type": "synthesis",
            },
        )
        print(f"Submitted. Batch ID: {batch_job.id}")
        batch_ids.append(batch_job.id)
        time.sleep(1)  # Be gentle with the API between submissions

    id_file = "data/raw_datasets/submitted_synthesis_batch_ids.json"
    with open(id_file, "w") as f:
        json.dump(batch_ids, f, indent=2)

    print(f"\nALL DONE! Submitted {len(batch_ids)} synthesis batches.")
    print(f"Batch IDs saved to {id_file}")


if __name__ == "__main__":
    submit_synthesis_batch()