path: root/scripts/run_putnam_evaluation.py
Diffstat (limited to 'scripts/run_putnam_evaluation.py')
-rw-r--r--  scripts/run_putnam_evaluation.py  164
1 file changed, 164 insertions, 0 deletions
diff --git a/scripts/run_putnam_evaluation.py b/scripts/run_putnam_evaluation.py
new file mode 100644
index 0000000..f320eea
--- /dev/null
+++ b/scripts/run_putnam_evaluation.py
@@ -0,0 +1,164 @@
+import json
+import os
+import glob
+import argparse
+from typing import List, Dict, Any
+from openai import OpenAI
+
+# Configuration
+DATA_DIR = "LLaMA-Factory/preprocess/PutnamGAP"
+OUTPUT_DIR = "data/putnam_eval"
+OUTPUT_FILENAME = "putnam_eval_batch.jsonl"
+MODEL_NAME = "gpt-5" # User requested gpt-5
+
+SYSTEM_PROMPT = """You are an expert mathematician and a judge for math competitions. You are given an original math problem (and its solution) and a "kernel variant" of that problem (and its solution).
+
+Your task is to:
+1. Evaluate the correctness of the kernel variant. Is the problem statement mathematically sound and clear? Is the provided solution correct?
+2. Evaluate the relationship between the original problem and the kernel variant. Are they mathematically equivalent? Or is the variant a strong abstraction/generalization/simplification of the original? Do they test the same core concepts?
+
+Output your analysis in the following JSON format:
+{
+ "variant_validity": {
+ "is_problem_valid": boolean,
+ "is_solution_correct": boolean,
+ "comments": "string"
+ },
+ "relation_to_original": {
+ "is_equivalent": boolean,
+ "is_strongly_related": boolean,
+ "relationship_description": "string"
+ }
+}"""
+
+def load_dataset(data_dir: str) -> List[Dict[str, Any]]:
+    files = glob.glob(os.path.join(data_dir, "*.json"))
+    items = []
+    print(f"Scanning {len(files)} files in {data_dir}...")
+    for fpath in files:
+        try:
+            with open(fpath, "r", encoding="utf-8") as f:
+                data = json.load(f)
+
+            # Check for required fields
+            if "variants" not in data or "kernel_variant" not in data["variants"]:
+                continue
+
+            orig_q = data.get("question", "")
+            orig_s = data.get("solution", "")
+            kv = data["variants"]["kernel_variant"]
+            kv_q = kv.get("question", "")
+            kv_s = kv.get("solution", "")
+
+            if not kv_q:
+                continue
+
+            items.append({
+                "id": data.get("index", os.path.basename(fpath)),
+                "original_question": orig_q,
+                "original_solution": orig_s,
+                "kernel_variant_question": kv_q,
+                "kernel_variant_solution": kv_s
+            })
+        except Exception as e:
+            print(f"Error reading {fpath}: {e}")
+    return items
+
+def create_batch_file(items: List[Dict[str, Any]], output_path: str):
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+    count = 0
+    with open(output_path, "w", encoding="utf-8") as f:
+        for item in items:
+            user_content = f"""[Original Problem]
+{item['original_question']}
+
+[Original Solution]
+{item['original_solution']}
+
+[Kernel Variant Problem]
+{item['kernel_variant_question']}
+
+[Kernel Variant Solution]
+{item['kernel_variant_solution']}"""
+
+            # Construct request
+            request_obj = {
+                "custom_id": f"req_{item['id']}",
+                "method": "POST",
+                "url": "/v1/chat/completions",
+                "body": {
+                    "model": MODEL_NAME,
+                    "messages": [
+                        {"role": "system", "content": SYSTEM_PROMPT},
+                        {"role": "user", "content": user_content}
+                    ],
+                    "response_format": {"type": "json_object"}
+                }
+            }
+            f.write(json.dumps(request_obj) + "\n")
+            count += 1
+
+    print(f"Created batch file at {output_path} with {count} requests.")
+    return count
+
+def submit_batch(file_path: str):
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        print("Error: OPENAI_API_KEY not set. Cannot submit.")
+        return
+
+    client = OpenAI(api_key=api_key)
+
+    print(f"Uploading {file_path} to OpenAI...")
+    with open(file_path, "rb") as f:
+        batch_file_obj = client.files.create(
+            file=f,
+            purpose="batch"
+        )
+    file_id = batch_file_obj.id
+    print(f"Uploaded. File ID: {file_id}")
+
+    print("Submitting Batch Job...")
+    batch_job = client.batches.create(
+        input_file_id=file_id,
+        endpoint="/v1/chat/completions",
+        completion_window="24h",
+        metadata={
+            "description": "PutnamGAP Evaluation"
+        }
+    )
+    print(f"Submitted. Batch ID: {batch_job.id}")
+
+    # Save Batch ID
+    id_file = os.path.join(os.path.dirname(file_path), "submitted_batch_ids.json")
+    existing_ids = []
+    if os.path.exists(id_file):
+        try:
+            with open(id_file, "r") as f:
+                existing_ids = json.load(f)
+        except (json.JSONDecodeError, OSError):
+            pass
+    existing_ids.append(batch_job.id)
+    with open(id_file, "w") as f:
+        json.dump(existing_ids, f, indent=2)
+    print(f"Batch ID saved to {id_file}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Prepare and optionally submit PutnamGAP evaluation batch.")
+    parser.add_argument("--submit", action="store_true", help="Submit the batch to OpenAI after generating.")
+    args = parser.parse_args()
+
+    items = load_dataset(DATA_DIR)
+    print(f"Found {len(items)} items with kernel variants.")
+
+    output_path = os.path.join(OUTPUT_DIR, OUTPUT_FILENAME)
+    if items:
+        create_batch_file(items, output_path)
+        if args.submit:
+            submit_batch(output_path)
+        else:
+            print("Use --submit to submit the batch to OpenAI.")
+    else:
+        print("No items found to process.")
+
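Note: the script above only creates and submits the batch; it does not download results. A minimal retrieval sketch, assuming the data/putnam_eval/submitted_batch_ids.json file written by submit_batch and a hypothetical putnam_eval_results.jsonl output path, could look like this:

    import json
    from openai import OpenAI

    ID_FILE = "data/putnam_eval/submitted_batch_ids.json"        # written by submit_batch above
    RESULTS_FILE = "data/putnam_eval/putnam_eval_results.jsonl"  # hypothetical output path

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    with open(ID_FILE, "r", encoding="utf-8") as f:
        batch_ids = json.load(f)

    with open(RESULTS_FILE, "w", encoding="utf-8") as out:
        for batch_id in batch_ids:
            batch = client.batches.retrieve(batch_id)
            print(f"{batch_id}: {batch.status}")
            # A completed batch exposes an output file; each JSONL line pairs a custom_id
            # with the model's JSON judgment, to be parsed downstream.
            if batch.status == "completed" and batch.output_file_id:
                out.write(client.files.content(batch.output_file_id).text)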