import json
import os
import time

from openai import OpenAI

# --- Configuration ---
RETRY_INPUT_FILE = "data/raw_datasets/retry_requests.jsonl"
BATCH_DIR = "data/raw_datasets/batch_files_retry"
BATCH_SIZE_LIMIT = 10000  # Smaller chunks, as requested.


def submit_retry_batches():
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("Error: OPENAI_API_KEY not set.")
        return
    client = OpenAI(api_key=api_key)

    os.makedirs(BATCH_DIR, exist_ok=True)
    if not os.path.exists(RETRY_INPUT_FILE):
        print(f"Error: {RETRY_INPUT_FILE} not found.")
        return

    print(f"Reading retry requests from {RETRY_INPUT_FILE}...")
    all_lines = []
    with open(RETRY_INPUT_FILE, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                all_lines.append(line.strip())  # Keep as a string; no need to parse the JSON.

    total_items = len(all_lines)
    print(f"Total retry items: {total_items}")

    batch_ids = []

    # Split into chunks and submit each one.
    for batch_idx, i in enumerate(range(0, total_items, BATCH_SIZE_LIMIT)):
        chunk = all_lines[i : i + BATCH_SIZE_LIMIT]
        chunk_filename = os.path.join(BATCH_DIR, f"retry_batch_part_{batch_idx}.jsonl")
        print(f"\n--- Processing Retry Batch {batch_idx} ({len(chunk)} items) ---")

        # 1. Create the chunk file.
        with open(chunk_filename, "w", encoding="utf-8") as f_out:
            for line in chunk:
                f_out.write(line + "\n")
        print(f"File created: {chunk_filename}")

        # 2. Upload the file. Use a context manager so the handle is closed
        # after the upload instead of leaking.
        print("Uploading to OpenAI...")
        with open(chunk_filename, "rb") as f_in:
            batch_file_obj = client.files.create(file=f_in, purpose="batch")
        file_id = batch_file_obj.id
        print(f"Uploaded. File ID: {file_id}")

        # 3. Submit the batch job.
        print("Submitting Batch Job...")
        batch_job = client.batches.create(
            input_file_id=file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={
                "description": f"Pers. Extractor RETRY Part {batch_idx}",
                "retry": "true",
            },
        )
        print(f"Submitted. Batch ID: {batch_job.id}")
        batch_ids.append(batch_job.id)
        time.sleep(1)

    # Save the batch IDs to a separate file for retries, so they do not get
    # mixed up with the IDs from the original run.
    id_file = "data/raw_datasets/submitted_retry_batch_ids.json"
    with open(id_file, "w") as f:
        json.dump(batch_ids, f, indent=2)

    print(f"\nALL DONE! Submitted {len(batch_ids)} retry batches.")
    print(f"Batch IDs saved to {id_file}")
    print("Use scripts/check_retry_status.py (still to be created) to monitor.")


if __name__ == "__main__":
    submit_retry_batches()
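
# The script above points at scripts/check_retry_status.py, which does not
# exist yet. The commented sketch below is one possible minimal version of
# that monitor, kept here as a reference only. The function name and the
# printed fields are assumptions; client.batches.retrieve() and the
# batch.status / batch.request_counts attributes are part of the OpenAI
# Python SDK.
#
#   import json
#   from openai import OpenAI
#
#   def check_retry_statuses():
#       client = OpenAI()  # Reads OPENAI_API_KEY from the environment.
#       with open("data/raw_datasets/submitted_retry_batch_ids.json") as f:
#           batch_ids = json.load(f)
#       for batch_id in batch_ids:
#           batch = client.batches.retrieve(batch_id)
#           counts = batch.request_counts
#           print(f"{batch_id}: {batch.status} "
#                 f"({counts.completed}/{counts.total}, {counts.failed} failed)")
#
#   if __name__ == "__main__":
#       check_retry_statuses()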