From e43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 Mon Sep 17 00:00:00 2001
From: YurenHao0426
Date: Wed, 17 Dec 2025 04:29:37 -0600
Subject: Initial commit (clean history)

---
 scripts/submit_synthesis_batch.py | 131 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 scripts/submit_synthesis_batch.py

(limited to 'scripts/submit_synthesis_batch.py')

diff --git a/scripts/submit_synthesis_batch.py b/scripts/submit_synthesis_batch.py
new file mode 100644
index 0000000..025782d
--- /dev/null
+++ b/scripts/submit_synthesis_batch.py
@@ -0,0 +1,131 @@
+import json
+import os
+import time
+from openai import OpenAI
+
+# --- Configuration ---
+INPUT_SEEDS = "data/raw_datasets/positive_seeds.jsonl"
+BATCH_DIR = "data/raw_datasets/batch_files_synthesis"
+MODEL_NAME = "gpt-5.1"  # Or gpt-4o
+BATCH_SIZE_LIMIT = 30000  # ~31k seeds total, so splitting into 2 files is safe
+
+SYNTHESIS_SYSTEM_PROMPT = """You are a data augmentation assistant.
+Your task is to rewrite a User Query that contains specific preferences into 5 different variations.
+The goal is to train a model to recognize these preferences in various contexts.
+
+Variations required:
+1. Formal/Polite: Use sophisticated language and polite markers.
+2. Casual/Direct: Use slang, abbreviations, or very direct commands.
+3. Implicit/Contextual: Embed the preference naturally within a larger context or story, making it harder to spot.
+4. Distractor-Heavy: Mix the preference with irrelevant information or another task.
+5. Imperative/Short: Extremely concise, almost robotic.
+
+Output strictly a JSON object with a single key "rewrites" containing a list of 5 strings.
+Example: {"rewrites": ["string1", "string2", "string3", "string4", "string5"]}
+"""
+
+def submit_synthesis_batch():
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        print("Error: OPENAI_API_KEY not set.")
+        return
+    client = OpenAI(api_key=api_key)
+
+    os.makedirs(BATCH_DIR, exist_ok=True)
+
+    if not os.path.exists(INPUT_SEEDS):
+        print(f"Error: {INPUT_SEEDS} not found.")
+        return
+
+    print(f"Reading seeds from {INPUT_SEEDS}...")
+
+    seeds = []
+    with open(INPUT_SEEDS, "r", encoding="utf-8") as f:
+        for line in f:
+            if line.strip():
+                seeds.append(json.loads(line))
+
+    total_items = len(seeds)
+    print(f"Total seeds: {total_items}")
+
+    batch_ids = []
+
+    # Split the seeds into chunks and submit one batch job per chunk.
+    for batch_idx, i in enumerate(range(0, total_items, BATCH_SIZE_LIMIT)):
+        chunk = seeds[i : i + BATCH_SIZE_LIMIT]
+        chunk_filename = os.path.join(BATCH_DIR, f"synthesis_batch_part_{batch_idx}.jsonl")
+
+        print(f"\n--- Processing Synthesis Batch {batch_idx} ({len(chunk)} items) ---")
+
+        # 1. Create the JSONL request file for this chunk.
+        with open(chunk_filename, "w", encoding="utf-8") as f_out:
+            for j, item in enumerate(chunk):
+                # Pass the original query to the model; the extracted preference could
+                # also be included to make explicit WHAT must be preserved.
+                original_query = item["original_query"]
+                # extracted_json = item["extracted_json"]  # optional: append it as
+                # "Core Preference: ..." to anchor the rewrites to the core intent.
+
+                # Build a unique custom_id: reuse the seed's custom_id when present
+                # (prefixed with "syn_"), otherwise fall back to the item's global
+                # index. Seeds recovered from an earlier batch usually carry a
+                # custom_id, but that is not guaranteed.
+                base_id = item.get("custom_id", f"seed_{i + j}")
+                custom_id = f"syn_{base_id}"  # prefix distinguishes synthesis requests
+
+                user_content = f"Original Query: {original_query}"
+                # Optionally add: f"\nCore Preference: {json.dumps(extracted_json)}"
+
+                request_obj = {
+                    "custom_id": custom_id,
+                    "method": "POST",
+                    "url": "/v1/chat/completions",
+                    "body": {
+                        "model": MODEL_NAME,
+                        "messages": [
+                            {"role": "system", "content": SYNTHESIS_SYSTEM_PROMPT},
+                            {"role": "user", "content": user_content}
+                        ],
+                        "temperature": 0.7,  # higher temperature for diversity
+                        "response_format": {"type": "json_object"}
+                    }
+                }
+                f_out.write(json.dumps(request_obj) + "\n")
+
+        print(f"File created: {chunk_filename}")
+
+        # 2. Upload the request file to OpenAI.
+        print("Uploading to OpenAI...")
+        batch_file_obj = client.files.create(
+            file=open(chunk_filename, "rb"),
+            purpose="batch"
+        )
+        file_id = batch_file_obj.id
+        print(f"Uploaded. File ID: {file_id}")
+
+        # 3. Submit the batch job against the uploaded file.
+        print("Submitting Batch Job...")
+        batch_job = client.batches.create(
+            input_file_id=file_id,
+            endpoint="/v1/chat/completions",
+            completion_window="24h",
+            metadata={
+                "description": f"Pers. Extractor Synthesis Part {batch_idx}",
+                "type": "synthesis"
+            }
+        )
+        print(f"Submitted. Batch ID: {batch_job.id}")
+        batch_ids.append(batch_job.id)
+
+        time.sleep(1)
+
+    id_file = "data/raw_datasets/submitted_synthesis_batch_ids.json"
+    with open(id_file, "w") as f:
+        json.dump(batch_ids, f, indent=2)
+
+    print(f"\nALL DONE! Submitted {len(batch_ids)} synthesis batches.")
+    print(f"Batch IDs saved to {id_file}")
+
+if __name__ == "__main__":
+    submit_synthesis_batch()
+
--
cgit v1.2.3
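Editor's note: the script above only submits the jobs and records their IDs. The sketch below is not part of this commit; it is a minimal companion under the assumption that the saved ID file is later polled with the OpenAI Batch API. The script name, output directory, and download step are assumptions; only the ID file path comes from the commit.

    # check_synthesis_batches.py -- sketch only, not part of this commit.
    import json
    import os
    from openai import OpenAI

    ID_FILE = "data/raw_datasets/submitted_synthesis_batch_ids.json"   # written by submit_synthesis_batch.py
    OUTPUT_DIR = "data/raw_datasets/batch_outputs_synthesis"           # assumed output location

    def check_and_download():
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        with open(ID_FILE, "r", encoding="utf-8") as f:
            batch_ids = json.load(f)

        for batch_id in batch_ids:
            # Report the current status and request counts for each submitted batch.
            batch = client.batches.retrieve(batch_id)
            counts = batch.request_counts
            print(f"{batch_id}: {batch.status} "
                  f"({counts.completed}/{counts.total} done, {counts.failed} failed)")

            # Download the JSONL results once a batch has finished.
            if batch.status == "completed" and batch.output_file_id:
                content = client.files.content(batch.output_file_id)
                out_path = os.path.join(OUTPUT_DIR, f"{batch_id}_output.jsonl")
                with open(out_path, "w", encoding="utf-8") as out:
                    out.write(content.text)
                print(f"  -> results saved to {out_path}")

    if __name__ == "__main__":
        check_and_download()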