path: root/scripts/run_putnam_evaluation.py
Diffstat (limited to 'scripts/run_putnam_evaluation.py')
-rw-r--r--  scripts/run_putnam_evaluation.py  164
1 file changed, 164 insertions, 0 deletions
diff --git a/scripts/run_putnam_evaluation.py b/scripts/run_putnam_evaluation.py
new file mode 100644
index 0000000..f320eea
--- /dev/null
+++ b/scripts/run_putnam_evaluation.py
@@ -0,0 +1,164 @@
+import json
+import os
+import glob
+import argparse
+from typing import List, Dict, Any
+from openai import OpenAI
+
+# Configuration
+DATA_DIR = "LLaMA-Factory/preprocess/PutnamGAP"
+OUTPUT_DIR = "data/putnam_eval"
+OUTPUT_FILENAME = "putnam_eval_batch.jsonl"
+MODEL_NAME = "gpt-5" # User requested gpt-5
+
+SYSTEM_PROMPT = """You are an expert mathematician and a judge for math competitions. You are given an original math problem (and its solution) and a "kernel variant" of that problem (and its solution).
+
+Your task is to:
+1. Evaluate the correctness of the kernel variant. Is the problem statement mathematically sound and clear? Is the provided solution correct?
+2. Evaluate the relationship between the original problem and the kernel variant. Are they mathematically equivalent? Or is the variant a strong abstraction/generalization/simplification of the original? Do they test the same core concepts?
+
+Output your analysis in the following JSON format:
+{
+ "variant_validity": {
+ "is_problem_valid": boolean,
+ "is_solution_correct": boolean,
+ "comments": "string"
+ },
+ "relation_to_original": {
+ "is_equivalent": boolean,
+ "is_strongly_related": boolean,
+ "relationship_description": "string"
+ }
+}"""
+
+def load_dataset(data_dir: str) -> List[Dict[str, Any]]:
+    files = glob.glob(os.path.join(data_dir, "*.json"))
+    items = []
+    print(f"Scanning {len(files)} files in {data_dir}...")
+    for fpath in files:
+        try:
+            with open(fpath, "r", encoding="utf-8") as f:
+                data = json.load(f)
+
+            # Check for required fields
+            if "variants" not in data or "kernel_variant" not in data["variants"]:
+                continue
+
+            orig_q = data.get("question", "")
+            orig_s = data.get("solution", "")
+            kv = data["variants"]["kernel_variant"]
+            kv_q = kv.get("question", "")
+            kv_s = kv.get("solution", "")
+
+            if not kv_q:
+                continue
+
+            items.append({
+                "id": data.get("index", os.path.basename(fpath)),
+                "original_question": orig_q,
+                "original_solution": orig_s,
+                "kernel_variant_question": kv_q,
+                "kernel_variant_solution": kv_s
+            })
+        except Exception as e:
+            print(f"Error reading {fpath}: {e}")
+    return items
+
+def create_batch_file(items: List[Dict[str, Any]], output_path: str):
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+    count = 0
+    with open(output_path, "w", encoding="utf-8") as f:
+        for item in items:
+            user_content = f"""[Original Problem]
+{item['original_question']}
+
+[Original Solution]
+{item['original_solution']}
+
+[Kernel Variant Problem]
+{item['kernel_variant_question']}
+
+[Kernel Variant Solution]
+{item['kernel_variant_solution']}"""
+
+            # Construct request
+            request_obj = {
+                "custom_id": f"req_{item['id']}",
+                "method": "POST",
+                "url": "/v1/chat/completions",
+                "body": {
+                    "model": MODEL_NAME,
+                    "messages": [
+                        {"role": "system", "content": SYSTEM_PROMPT},
+                        {"role": "user", "content": user_content}
+                    ],
+                    "response_format": {"type": "json_object"}
+                }
+            }
+            f.write(json.dumps(request_obj) + "\n")
+            count += 1
+
+    print(f"Created batch file at {output_path} with {count} requests.")
+    return count
+
+def submit_batch(file_path: str):
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        print("Error: OPENAI_API_KEY not set. Cannot submit.")
+        return
+
+    client = OpenAI(api_key=api_key)
+
+    print(f"Uploading {file_path} to OpenAI...")
+    with open(file_path, "rb") as f:
+        batch_file_obj = client.files.create(
+            file=f,
+            purpose="batch"
+        )
+    file_id = batch_file_obj.id
+    print(f"Uploaded. File ID: {file_id}")
+
+    print("Submitting Batch Job...")
+    batch_job = client.batches.create(
+        input_file_id=file_id,
+        endpoint="/v1/chat/completions",
+        completion_window="24h",
+        metadata={
+            "description": "PutnamGAP Evaluation"
+        }
+    )
+    print(f"Submitted. Batch ID: {batch_job.id}")
+
+    # Save Batch ID
+    id_file = os.path.join(os.path.dirname(file_path), "submitted_batch_ids.json")
+    existing_ids = []
+    if os.path.exists(id_file):
+        try:
+            with open(id_file, "r") as f:
+                existing_ids = json.load(f)
+        except (json.JSONDecodeError, OSError):
+            pass
+    existing_ids.append(batch_job.id)
+    with open(id_file, "w") as f:
+        json.dump(existing_ids, f, indent=2)
+    print(f"Batch ID saved to {id_file}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Prepare and optionally submit PutnamGAP evaluation batch.")
+    parser.add_argument("--submit", action="store_true", help="Submit the batch to OpenAI after generating.")
+    args = parser.parse_args()
+
+    items = load_dataset(DATA_DIR)
+    print(f"Found {len(items)} items with kernel variants.")
+
+    output_path = os.path.join(OUTPUT_DIR, OUTPUT_FILENAME)
+    if items:
+        create_batch_file(items, output_path)
+        if args.submit:
+            submit_batch(output_path)
+        else:
+            print("Use --submit to submit the batch to OpenAI.")
+    else:
+        print("No items found to process.")
+
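Note: the script above only creates and submits the batch; it does not download results. A minimal retrieval sketch, assuming the data/putnam_eval/submitted_batch_ids.json file written by submit_batch and a hypothetical putnam_eval_results.jsonl output path, could look like this:

    import json
    from openai import OpenAI

    ID_FILE = "data/putnam_eval/submitted_batch_ids.json"        # written by submit_batch above
    RESULTS_FILE = "data/putnam_eval/putnam_eval_results.jsonl"  # hypothetical output path

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    with open(ID_FILE, "r", encoding="utf-8") as f:
        batch_ids = json.load(f)

    with open(RESULTS_FILE, "w", encoding="utf-8") as out:
        for batch_id in batch_ids:
            batch = client.batches.retrieve(batch_id)
            print(f"{batch_id}: {batch.status}")
            # A completed batch exposes an output file; each JSONL line pairs a custom_id
            # with the model's JSON judgment, to be parsed downstream.
            if batch.status == "completed" and batch.output_file_id:
                out.write(client.files.content(batch.output_file_id).text)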