From e43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 Mon Sep 17 00:00:00 2001
From: YurenHao0426
Date: Wed, 17 Dec 2025 04:29:37 -0600
Subject: Initial commit (clean history)

---
 scripts/submit_synthesis_batch.py | 131 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 scripts/submit_synthesis_batch.py

(limited to 'scripts/submit_synthesis_batch.py')

diff --git a/scripts/submit_synthesis_batch.py b/scripts/submit_synthesis_batch.py
new file mode 100644
index 0000000..025782d
--- /dev/null
+++ b/scripts/submit_synthesis_batch.py
@@ -0,0 +1,131 @@
+import json
+import os
+import time
+from openai import OpenAI
+
+# --- Configuration ---
+INPUT_SEEDS = "data/raw_datasets/positive_seeds.jsonl"
+BATCH_DIR = "data/raw_datasets/batch_files_synthesis"
+MODEL_NAME = "gpt-5.1"  # Or gpt-4o
+BATCH_SIZE_LIMIT = 30000  # ~31k seeds total, so splitting into 2 files is safe
+
+SYNTHESIS_SYSTEM_PROMPT = """You are a data augmentation assistant.
+Your task is to rewrite a User Query that contains specific preferences into 5 different variations.
+The goal is to train a model to recognize these preferences in various contexts.
+
+Variations required:
+1. Formal/Polite: Use sophisticated language and polite markers.
+2. Casual/Direct: Use slang, abbreviations, or very direct commands.
+3. Implicit/Contextual: Embed the preference naturally within a larger context or story, making it harder to spot.
+4. Distractor-Heavy: Mix the preference with irrelevant information or another task.
+5. Imperative/Short: Extremely concise, almost robotic.
+
+Output strictly a JSON object with a single key "rewrites" containing a list of 5 strings.
+Example: {"rewrites": ["string1", "string2", "string3", "string4", "string5"]}
+"""
+
+def submit_synthesis_batch():
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        print("Error: OPENAI_API_KEY not set.")
+        return
+    client = OpenAI(api_key=api_key)
+
+    os.makedirs(BATCH_DIR, exist_ok=True)
+
+    if not os.path.exists(INPUT_SEEDS):
+        print(f"Error: {INPUT_SEEDS} not found.")
+        return
+
+    print(f"Reading seeds from {INPUT_SEEDS}...")
+
+    seeds = []
+    with open(INPUT_SEEDS, "r", encoding="utf-8") as f:
+        for line in f:
+            if line.strip():
+                seeds.append(json.loads(line))
+
+    total_items = len(seeds)
+    print(f"Total seeds: {total_items}")
+
+    batch_ids = []
+
+    # Split the seeds into chunks and submit one batch job per chunk.
+    for batch_idx, i in enumerate(range(0, total_items, BATCH_SIZE_LIMIT)):
+        chunk = seeds[i : i + BATCH_SIZE_LIMIT]
+        chunk_filename = os.path.join(BATCH_DIR, f"synthesis_batch_part_{batch_idx}.jsonl")
+
+        print(f"\n--- Processing Synthesis Batch {batch_idx} ({len(chunk)} items) ---")
+
+        # 1. Create the JSONL request file for this chunk.
+        with open(chunk_filename, "w", encoding="utf-8") as f_out:
+            for j, item in enumerate(chunk):
+                # Pass the original query to the model; the extracted preference could
+                # also be included to make explicit WHAT must be preserved.
+                original_query = item["original_query"]
+                # extracted_json = item["extracted_json"]  # optional: append it as
+                # "Core Preference: ..." to anchor the rewrites to the core intent.
+
+                # Build a unique custom_id: reuse the seed's custom_id when present
+                # (prefixed with "syn_"), otherwise fall back to the item's global
+                # index. Seeds recovered from an earlier batch usually carry a
+                # custom_id, but that is not guaranteed.
+                base_id = item.get("custom_id", f"seed_{i + j}")
+                custom_id = f"syn_{base_id}"  # prefix distinguishes synthesis requests
+
+                user_content = f"Original Query: {original_query}"
+                # Optionally add: f"\nCore Preference: {json.dumps(extracted_json)}"
+
+                request_obj = {
+                    "custom_id": custom_id,
+                    "method": "POST",
+                    "url": "/v1/chat/completions",
+                    "body": {
+                        "model": MODEL_NAME,
+                        "messages": [
+                            {"role": "system", "content": SYNTHESIS_SYSTEM_PROMPT},
+                            {"role": "user", "content": user_content}
+                        ],
+                        "temperature": 0.7,  # higher temperature for diversity
+                        "response_format": {"type": "json_object"}
+                    }
+                }
+                f_out.write(json.dumps(request_obj) + "\n")
+
+        print(f"File created: {chunk_filename}")
+
+        # 2. Upload the request file to OpenAI.
+        print("Uploading to OpenAI...")
+        batch_file_obj = client.files.create(
+            file=open(chunk_filename, "rb"),
+            purpose="batch"
+        )
+        file_id = batch_file_obj.id
+        print(f"Uploaded. File ID: {file_id}")
+
+        # 3. Submit the batch job against the uploaded file.
+        print("Submitting Batch Job...")
+        batch_job = client.batches.create(
+            input_file_id=file_id,
+            endpoint="/v1/chat/completions",
+            completion_window="24h",
+            metadata={
+                "description": f"Pers. Extractor Synthesis Part {batch_idx}",
+                "type": "synthesis"
+            }
+        )
+        print(f"Submitted. Batch ID: {batch_job.id}")
+        batch_ids.append(batch_job.id)
+
+        time.sleep(1)
+
+    id_file = "data/raw_datasets/submitted_synthesis_batch_ids.json"
+    with open(id_file, "w") as f:
+        json.dump(batch_ids, f, indent=2)
+
+    print(f"\nALL DONE! Submitted {len(batch_ids)} synthesis batches.")
+    print(f"Batch IDs saved to {id_file}")
+
+if __name__ == "__main__":
+    submit_synthesis_batch()
+
--
cgit v1.2.3
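Editor's note: the script above only submits the jobs and records their IDs. The sketch below is not part of this commit; it is a minimal companion under the assumption that the saved ID file is later polled with the OpenAI Batch API. The script name, output directory, and download step are assumptions; only the ID file path comes from the commit.

    # check_synthesis_batches.py -- sketch only, not part of this commit.
    import json
    import os
    from openai import OpenAI

    ID_FILE = "data/raw_datasets/submitted_synthesis_batch_ids.json"   # written by submit_synthesis_batch.py
    OUTPUT_DIR = "data/raw_datasets/batch_outputs_synthesis"           # assumed output location

    def check_and_download():
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        with open(ID_FILE, "r", encoding="utf-8") as f:
            batch_ids = json.load(f)

        for batch_id in batch_ids:
            # Report the current status and request counts for each submitted batch.
            batch = client.batches.retrieve(batch_id)
            counts = batch.request_counts
            print(f"{batch_id}: {batch.status} "
                  f"({counts.completed}/{counts.total} done, {counts.failed} failed)")

            # Download the JSONL results once a batch has finished.
            if batch.status == "completed" and batch.output_file_id:
                content = client.files.content(batch.output_file_id)
                out_path = os.path.join(OUTPUT_DIR, f"{batch_id}_output.jsonl")
                with open(out_path, "w", encoding="utf-8") as out:
                    out.write(content.text)
                print(f"  -> results saved to {out_path}")

    if __name__ == "__main__":
        check_and_download()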