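"""Split retry requests into chunks and submit each chunk to the OpenAI Batch API.

Reads data/raw_datasets/retry_requests.jsonl, writes each chunk to a JSONL
file under data/raw_datasets/batch_files_retry/, uploads the chunk files, and
creates one batch job per file. The submitted batch IDs are written to
data/raw_datasets/submitted_retry_batch_ids.json for later monitoring.
"""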
import json
import os
import time
from openai import OpenAI

# --- Configuration ---
RETRY_INPUT_FILE = "data/raw_datasets/retry_requests.jsonl"
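# Each line of the retry file is expected to already be a complete Batch API
# request object (assumed shape), e.g.:
# {"custom_id": "...", "method": "POST", "url": "/v1/chat/completions", "body": {...}}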
BATCH_DIR = "data/raw_datasets/batch_files_retry"
BATCH_SIZE_LIMIT = 10000  # Requests per chunk; deliberately kept well under the Batch API per-file request cap

def submit_retry_batches():
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("Error: OPENAI_API_KEY not set.")
        return
    client = OpenAI(api_key=api_key)

    os.makedirs(BATCH_DIR, exist_ok=True)
    
    if not os.path.exists(RETRY_INPUT_FILE):
        print(f"Error: {RETRY_INPUT_FILE} not found.")
        return

    print(f"Reading retry requests from {RETRY_INPUT_FILE}...")
    
    all_lines = []
    with open(RETRY_INPUT_FILE, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                all_lines.append(line.strip())  # Keep the raw JSON line; no need to parse it here
    
    total_items = len(all_lines)
    print(f"Total retry items: {total_items}")
    
    batch_ids = []
    
    # Split and Submit
    for batch_idx, i in enumerate(range(0, total_items, BATCH_SIZE_LIMIT)):
        chunk = all_lines[i : i + BATCH_SIZE_LIMIT]
        chunk_filename = os.path.join(BATCH_DIR, f"retry_batch_part_{batch_idx}.jsonl")
        
        print(f"\n--- Processing Retry Batch {batch_idx} ({len(chunk)} items) ---")
        
        # 1. Create File
        with open(chunk_filename, "w", encoding="utf-8") as f_out:
            for line in chunk:
                f_out.write(line + "\n")
        
        print(f"File created: {chunk_filename}")
        
        # 2. Upload File
        print("Uploading to OpenAI...")
        with open(chunk_filename, "rb") as f_in:
            batch_file_obj = client.files.create(
                file=f_in,
                purpose="batch"
            )
        file_id = batch_file_obj.id
        print(f"Uploaded. File ID: {file_id}")
        
        # 3. Submit Batch
        print("Submitting Batch Job...")
        batch_job = client.batches.create(
            input_file_id=file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={
                "description": f"Pers. Extractor RETRY Part {batch_idx}",
                "retry": "true"
            }
        )
        print(f"Submitted. Batch ID: {batch_job.id}")
        batch_ids.append(batch_job.id)
        
        time.sleep(1)  # Brief pause between submissions to avoid hammering the API

    # Save the retry batch IDs to a separate file, so they are not
    # confused with the IDs from the original submission run.
    id_file = "data/raw_datasets/submitted_retry_batch_ids.json"
    with open(id_file, "w") as f:
        json.dump(batch_ids, f, indent=2)
    
    print(f"\nALL DONE! Submitted {len(batch_ids)} retry batches.")
    print(f"Batch IDs saved to {id_file}")
    print("Use scripts/check_retry_status.py (need to create/modify) to monitor.")

if __name__ == "__main__":
    submit_retry_batches()
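
# A minimal sketch of the monitoring loop that scripts/check_retry_status.py
# could implement (hypothetical; the script does not exist yet, and this
# assumes the v1 openai client). It polls each submitted batch ID until every
# job reaches a terminal state:
#
#     import json, time
#     from openai import OpenAI
#
#     client = OpenAI()
#     with open("data/raw_datasets/submitted_retry_batch_ids.json") as f:
#         pending = json.load(f)
#     while pending:
#         for batch_id in list(pending):
#             job = client.batches.retrieve(batch_id)
#             print(batch_id, job.status)
#             if job.status in ("completed", "failed", "expired", "cancelled"):
#                 pending.remove(batch_id)
#         time.sleep(60)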