| field | value | date |
|---|---|---|
| author | YurenHao0426 <blackhao0426@gmail.com> | 2025-12-17 04:29:37 -0600 |
| committer | YurenHao0426 <blackhao0426@gmail.com> | 2025-12-17 04:29:37 -0600 |
| commit | e43b3f8aa36c198b95c1e46bea2eaf3893b13dc3 (patch) | |
| tree | 6ce8a00d2f8b9ebd83c894a27ea01ac50cfb2ff5 /scripts/run_putnam_evaluation.py | |
Diffstat (limited to 'scripts/run_putnam_evaluation.py')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | scripts/run_putnam_evaluation.py | 164 |
1 files changed, 164 insertions, 0 deletions
```diff
diff --git a/scripts/run_putnam_evaluation.py b/scripts/run_putnam_evaluation.py
new file mode 100644
index 0000000..f320eea
--- /dev/null
+++ b/scripts/run_putnam_evaluation.py
@@ -0,0 +1,164 @@
+import json
+import os
+import glob
+import argparse
+from typing import List, Dict, Any
+from openai import OpenAI
+
+# Configuration
+DATA_DIR = "LLaMA-Factory/preprocess/PutnamGAP"
+OUTPUT_DIR = "data/putnam_eval"
+OUTPUT_FILENAME = "putnam_eval_batch.jsonl"
+MODEL_NAME = "gpt-5"  # User requested gpt-5
+
+SYSTEM_PROMPT = """You are an expert mathematician and a judge for math competitions. You are given an original math problem (and its solution) and a "kernel variant" of that problem (and its solution).
+
+Your task is to:
+1. Evaluate the correctness of the kernel variant. Is the problem statement mathematically sound and clear? Is the provided solution correct?
+2. Evaluate the relationship between the original problem and the kernel variant. Are they mathematically equivalent? Or is the variant a strong abstraction/generalization/simplification of the original? Do they test the same core concepts?
+
+Output your analysis in the following JSON format:
+{
+  "variant_validity": {
+    "is_problem_valid": boolean,
+    "is_solution_correct": boolean,
+    "comments": "string"
+  },
+  "relation_to_original": {
+    "is_equivalent": boolean,
+    "is_strongly_related": boolean,
+    "relationship_description": "string"
+  }
+}"""
+
+def load_dataset(data_dir: str) -> List[Dict[str, Any]]:
+    files = glob.glob(os.path.join(data_dir, "*.json"))
+    items = []
+    print(f"Scanning {len(files)} files in {data_dir}...")
+    for fpath in files:
+        try:
+            with open(fpath, "r", encoding="utf-8") as f:
+                data = json.load(f)
+
+            # Check for required fields
+            if "variants" not in data or "kernel_variant" not in data["variants"]:
+                continue
+
+            orig_q = data.get("question", "")
+            orig_s = data.get("solution", "")
+            kv = data["variants"]["kernel_variant"]
+            kv_q = kv.get("question", "")
+            kv_s = kv.get("solution", "")
+
+            if not kv_q:
+                continue
+
+            items.append({
+                "id": data.get("index", os.path.basename(fpath)),
+                "original_question": orig_q,
+                "original_solution": orig_s,
+                "kernel_variant_question": kv_q,
+                "kernel_variant_solution": kv_s
+            })
+        except Exception as e:
+            print(f"Error reading {fpath}: {e}")
+    return items
+
+def create_batch_file(items: List[Dict[str, Any]], output_path: str):
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+    count = 0
+    with open(output_path, "w", encoding="utf-8") as f:
+        for item in items:
+            user_content = f"""[Original Problem]
+{item['original_question']}
+
+[Original Solution]
+{item['original_solution']}
+
+[Kernel Variant Problem]
+{item['kernel_variant_question']}
+
+[Kernel Variant Solution]
+{item['kernel_variant_solution']}"""
+
+            # Construct request
+            request_obj = {
+                "custom_id": f"req_{item['id']}",
+                "method": "POST",
+                "url": "/v1/chat/completions",
+                "body": {
+                    "model": MODEL_NAME,
+                    "messages": [
+                        {"role": "system", "content": SYSTEM_PROMPT},
+                        {"role": "user", "content": user_content}
+                    ],
+                    "response_format": {"type": "json_object"}
+                }
+            }
+            f.write(json.dumps(request_obj) + "\n")
+            count += 1
+
+    print(f"Created batch file at {output_path} with {count} requests.")
+    return count
+
+def submit_batch(file_path: str):
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        print("Error: OPENAI_API_KEY not set. Cannot submit.")
+        return
+
+    client = OpenAI(api_key=api_key)
+
+    print(f"Uploading {file_path} to OpenAI...")
+    with open(file_path, "rb") as f:
+        batch_file_obj = client.files.create(
+            file=f,
+            purpose="batch"
+        )
+    file_id = batch_file_obj.id
+    print(f"Uploaded. File ID: {file_id}")
+
+    print("Submitting Batch Job...")
+    batch_job = client.batches.create(
+        input_file_id=file_id,
+        endpoint="/v1/chat/completions",
+        completion_window="24h",
+        metadata={
+            "description": "PutnamGAP Evaluation"
+        }
+    )
+    print(f"Submitted. Batch ID: {batch_job.id}")
+
+    # Save Batch ID
+    id_file = os.path.join(os.path.dirname(file_path), "submitted_batch_ids.json")
+    existing_ids = []
+    if os.path.exists(id_file):
+        try:
+            with open(id_file, "r") as f:
+                existing_ids = json.load(f)
+        except:
+            pass
+    existing_ids.append(batch_job.id)
+    with open(id_file, "w") as f:
+        json.dump(existing_ids, f, indent=2)
+    print(f"Batch ID saved to {id_file}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Prepare and optionally submit PutnamGAP evaluation batch.")
+    parser.add_argument("--submit", action="store_true", help="Submit the batch to OpenAI after generating.")
+    args = parser.parse_args()
+
+    items = load_dataset(DATA_DIR)
+    print(f"Found {len(items)} items with kernel variants.")
+
+    output_path = os.path.join(OUTPUT_DIR, OUTPUT_FILENAME)
+    if items:
+        create_batch_file(items, output_path)
+        if args.submit:
+            submit_batch(output_path)
+        else:
+            print("Use --submit to submit the batch to OpenAI.")
+    else:
+        print("No items found to process.")
```
