# scripts/run_putnam_evaluation.py

import json
import os
import glob
import argparse
from typing import List, Dict, Any
from openai import OpenAI

# Configuration
# Directory whose top-level "*.json" files are scanned for problems.
DATA_DIR = "LLaMA-Factory/preprocess/PutnamGAP"
# Directory and file name of the generated batch request file (one JSON
# request object per line).
OUTPUT_DIR = "data/putnam_eval"
OUTPUT_FILENAME = "putnam_eval_batch.jsonl"
# Model name placed in the body of every request in the batch file.
MODEL_NAME = "gpt-5"  # User requested gpt-5

# System message sent with every request. It asks the model to judge the kernel
# variant's validity and its relation to the original problem, and to answer in
# the JSON shape spelled out below (requests also set a JSON response format).
SYSTEM_PROMPT = """You are an expert mathematician and a judge for math competitions. You are given an original math problem (and its solution) and a "kernel variant" of that problem (and its solution).

Your task is to:
1. Evaluate the correctness of the kernel variant. Is the problem statement mathematically sound and clear? Is the provided solution correct?
2. Evaluate the relationship between the original problem and the kernel variant. Are they mathematically equivalent? Or is the variant a strong abstraction/generalization/simplification of the original? Do they test the same core concepts?

Output your analysis in the following JSON format:
{
  "variant_validity": {
    "is_problem_valid": boolean,
    "is_solution_correct": boolean,
    "comments": "string"
  },
  "relation_to_original": {
    "is_equivalent": boolean,
    "is_strongly_related": boolean,
    "relationship_description": "string"
  }
}"""

def load_dataset(data_dir: str) -> List[Dict[str, Any]]:
    """Collect original/kernel-variant problem pairs from a directory of JSON files.

    Every ``*.json`` file directly inside ``data_dir`` is parsed. A file
    contributes one item when it is a JSON object containing
    ``variants.kernel_variant`` with a non-empty ``question``. Files that
    cannot be read or parsed, or whose structure is not as expected, are
    reported on stdout and skipped; files without a kernel variant (or with an
    empty variant question) are skipped silently.

    Args:
        data_dir: Directory to scan (not recursive).

    Returns:
        A list of dicts with the keys ``id``, ``original_question``,
        ``original_solution``, ``kernel_variant_question`` and
        ``kernel_variant_solution``. ``id`` is the file's ``index`` field, or
        the file's base name when ``index`` is absent. Items are ordered by
        file path so repeated runs yield the same batch.
    """
    # glob returns paths in arbitrary order; sort for reproducible output.
    files = sorted(glob.glob(os.path.join(data_dir, "*.json")))
    items = []
    print(f"Scanning {len(files)} files in {data_dir}...")
    for fpath in files:
        # Keep the try body limited to reading/parsing so that unrelated bugs
        # are not hidden. ValueError covers JSONDecodeError/UnicodeDecodeError;
        # RecursionError can be raised for pathologically nested documents.
        try:
            with open(fpath, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError, RecursionError) as e:
            print(f"Error reading {fpath}: {e}")
            continue

        if not isinstance(data, dict):
            print(f"Skipping {fpath}: top-level JSON value is not an object")
            continue

        # Check for required fields
        variants = data.get("variants")
        if not isinstance(variants, dict) or "kernel_variant" not in variants:
            continue

        kv = variants["kernel_variant"]
        if not isinstance(kv, dict):
            print(f"Skipping {fpath}: kernel_variant is not an object")
            continue

        kv_q = kv.get("question", "")
        if not kv_q:
            continue

        items.append({
            "id": data.get("index", os.path.basename(fpath)),
            "original_question": data.get("question", ""),
            "original_solution": data.get("solution", ""),
            "kernel_variant_question": kv_q,
            "kernel_variant_solution": kv.get("solution", ""),
        })
    return items

def create_batch_file(items: List[Dict[str, Any]], output_path: str) -> int:
    """Write one chat-completion batch request per item to a JSONL file.

    Each line of the output is a JSON object with ``custom_id``
    (``req_<item id>``), ``method`` ``POST``, ``url`` ``/v1/chat/completions``
    and a ``body`` holding the model name, the system prompt, a user message
    built from the item's four text fields, and a JSON-object response format.

    Args:
        items: Dicts with the keys ``id``, ``original_question``,
            ``original_solution``, ``kernel_variant_question`` and
            ``kernel_variant_solution``.
        output_path: Destination file; it is overwritten if it exists. Its
            parent directory is created when missing.

    Returns:
        The number of requests written.
    """
    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when output_path actually has a directory component.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    count = 0
    with open(output_path, "w", encoding="utf-8") as f:
        for item in items:
            user_content = f"""[Original Problem]
{item['original_question']}

[Original Solution]
{item['original_solution']}

[Kernel Variant Problem]
{item['kernel_variant_question']}

[Kernel Variant Solution]
{item['kernel_variant_solution']}"""

            # Construct request
            # NOTE(review): custom_id is derived solely from item['id']; two
            # items sharing an id would yield duplicate custom_ids -- confirm
            # ids are unique in the dataset.
            request_obj = {
                "custom_id": f"req_{item['id']}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": MODEL_NAME,
                    "messages": [
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": user_content}
                    ],
                    "response_format": {"type": "json_object"}
                }
            }
            f.write(json.dumps(request_obj) + "\n")
            count += 1

    print(f"Created batch file at {output_path} with {count} requests.")
    return count

def submit_batch(file_path: str) -> None:
    """Upload a batch request file to OpenAI, start a batch job, and record its ID.

    The API key is read from the ``OPENAI_API_KEY`` environment variable; when
    it is unset an error is printed and nothing is submitted. After submission
    the batch ID is appended to ``submitted_batch_ids.json`` in the same
    directory as ``file_path``.

    Args:
        file_path: Path of the JSONL batch request file to upload.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("Error: OPENAI_API_KEY not set. Cannot submit.")
        return

    client = OpenAI(api_key=api_key)

    print(f"Uploading {file_path} to OpenAI...")
    with open(file_path, "rb") as f:
        batch_file_obj = client.files.create(
            file=f,
            purpose="batch"
        )
    file_id = batch_file_obj.id
    print(f"Uploaded. File ID: {file_id}")

    print("Submitting Batch Job...")
    batch_job = client.batches.create(
        input_file_id=file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": "PutnamGAP Evaluation"
        }
    )
    print(f"Submitted. Batch ID: {batch_job.id}")

    # Save Batch ID
    id_file = os.path.join(os.path.dirname(file_path), "submitted_batch_ids.json")
    existing_ids = []
    if os.path.exists(id_file):
        # The job is already submitted at this point, so an unreadable or
        # malformed ID file must not prevent the new ID from being recorded:
        # warn and fall back to a fresh list instead of raising.
        try:
            with open(id_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, ValueError) as e:
            print(f"Warning: could not read {id_file} ({e}); starting a new ID list.")
        else:
            if isinstance(loaded, list):
                existing_ids = loaded
            else:
                print(f"Warning: {id_file} does not contain a JSON list; starting a new ID list.")
    existing_ids.append(batch_job.id)
    with open(id_file, "w", encoding="utf-8") as f:
        json.dump(existing_ids, f, indent=2)
    print(f"Batch ID saved to {id_file}")

if __name__ == "__main__":
    # Command-line interface: the single flag decides whether the generated
    # batch file is also submitted.
    cli = argparse.ArgumentParser(description="Prepare and optionally submit PutnamGAP evaluation batch.")
    cli.add_argument("--submit", action="store_true", help="Submit the batch to OpenAI after generating.")
    options = cli.parse_args()

    dataset = load_dataset(DATA_DIR)
    print(f"Found {len(dataset)} items with kernel variants.")

    batch_path = os.path.join(OUTPUT_DIR, OUTPUT_FILENAME)
    if not dataset:
        # Nothing to write, so no batch file is created.
        print("No items found to process.")
    else:
        create_batch_file(dataset, batch_path)
        if not options.submit:
            print("Use --submit to submit the batch to OpenAI.")
        else:
            submit_batch(batch_path)