From e43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 Mon Sep 17 00:00:00 2001
From: YurenHao0426
Date: Wed, 17 Dec 2025 04:29:37 -0600
Subject: Initial commit (clean history)

---
 scripts/submit_retry_batch.py | 88 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 scripts/submit_retry_batch.py

diff --git a/scripts/submit_retry_batch.py b/scripts/submit_retry_batch.py
new file mode 100644
index 0000000..f564c4b
--- /dev/null
+++ b/scripts/submit_retry_batch.py
@@ -0,0 +1,88 @@
+import json
+import os
+import time
+from openai import OpenAI
+
+# --- Configuration ---
+RETRY_INPUT_FILE = "data/raw_datasets/retry_requests.jsonl"
+BATCH_DIR = "data/raw_datasets/batch_files_retry"
+BATCH_SIZE_LIMIT = 10000  # Smaller chunks as requested
+
+def submit_retry_batches():
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        print("Error: OPENAI_API_KEY not set.")
+        return
+    client = OpenAI(api_key=api_key)
+
+    os.makedirs(BATCH_DIR, exist_ok=True)
+
+    if not os.path.exists(RETRY_INPUT_FILE):
+        print(f"Error: {RETRY_INPUT_FILE} not found.")
+        return
+
+    print(f"Reading retry requests from {RETRY_INPUT_FILE}...")
+
+    all_lines = []
+    with open(RETRY_INPUT_FILE, "r", encoding="utf-8") as f:
+        for line in f:
+            if line.strip():
+                all_lines.append(line.strip())  # Keep as a raw string; no need to parse the JSON
+
+    total_items = len(all_lines)
+    print(f"Total retry items: {total_items}")
+
+    batch_ids = []
+
+    # Split and submit
+    for batch_idx, i in enumerate(range(0, total_items, BATCH_SIZE_LIMIT)):
+        chunk = all_lines[i : i + BATCH_SIZE_LIMIT]
+        chunk_filename = os.path.join(BATCH_DIR, f"retry_batch_part_{batch_idx}.jsonl")
+
+        print(f"\n--- Processing Retry Batch {batch_idx} ({len(chunk)} items) ---")
+
+        # 1. Create the chunk file
+        with open(chunk_filename, "w", encoding="utf-8") as f_out:
+            for line in chunk:
+                f_out.write(line + "\n")
+
+        print(f"File created: {chunk_filename}")
+
+        # 2. Upload the file
+        print("Uploading to OpenAI...")
+        with open(chunk_filename, "rb") as f_in:
+            batch_file_obj = client.files.create(
+                file=f_in,
+                purpose="batch"
+            )
+        file_id = batch_file_obj.id
+        print(f"Uploaded. File ID: {file_id}")
+
+        # 3. Submit the batch job
+        print("Submitting Batch Job...")
+        batch_job = client.batches.create(
+            input_file_id=file_id,
+            endpoint="/v1/chat/completions",
+            completion_window="24h",
+            metadata={
+                "description": f"Pers. Extractor RETRY Part {batch_idx}",
+                "retry": "true"
+            }
+        )
+        print(f"Submitted. Batch ID: {batch_job.id}")
+        batch_ids.append(batch_job.id)
+
+        time.sleep(1)
+
+    # Save batch IDs to a separate file so retries are not mixed up with the original run.
+    id_file = "data/raw_datasets/submitted_retry_batch_ids.json"
+    with open(id_file, "w") as f:
+        json.dump(batch_ids, f, indent=2)
+
+    print(f"\nALL DONE! Submitted {len(batch_ids)} retry batches.")
+    print(f"Batch IDs saved to {id_file}")
+    print("Use scripts/check_retry_status.py (need to create/modify) to monitor.")
+
+if __name__ == "__main__":
+    submit_retry_batches()
+
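
Note on the input format: the script treats each line of retry_requests.jsonl as an opaque string, but for the upload to succeed each line must be a complete OpenAI Batch API request object matching the endpoint passed to client.batches.create. A representative line is sketched below; the custom_id and model values are illustrative, not taken from this repository:

    {"custom_id": "retry-0001", "method": "POST", "url": "/v1/chat/completions",
     "body": {"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "..."}]}}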
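
The final print references scripts/check_retry_status.py, which is still to be created. A minimal sketch of what it could look like, assuming the ID file written above: client.batches.retrieve, batch.status, and batch.request_counts are standard OpenAI Python SDK surface, while the function name and output format are assumptions about the eventual script.

    import json
    import os
    from openai import OpenAI

    # Assumed path: matches the id_file written by submit_retry_batch.py.
    ID_FILE = "data/raw_datasets/submitted_retry_batch_ids.json"

    def check_retry_status():
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        with open(ID_FILE, "r") as f:
            batch_ids = json.load(f)
        for batch_id in batch_ids:
            # batches.retrieve returns the job status plus per-request completion counts.
            batch = client.batches.retrieve(batch_id)
            counts = batch.request_counts
            print(f"{batch_id}: {batch.status} "
                  f"({counts.completed}/{counts.total} completed, {counts.failed} failed)")

    if __name__ == "__main__":
        check_retry_status()

Running this periodically (or in a loop with time.sleep) is enough for monitoring; batch jobs move from validating through in_progress to completed, failed, or expired.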