From fc6d57ffb8d5ddb5820fcc00b5491a585c259ebc Mon Sep 17 00:00:00 2001
From: Yuren Hao
Date: Thu, 4 Sep 2025 22:16:22 -0500
Subject: Initial commit

---
 code_eval/OpenCodeEval/benchmark/BigCodeBench.py | 121 ++++++++++++++++++++++
 1 file changed, 121 insertions(+)
 create mode 100644 code_eval/OpenCodeEval/benchmark/BigCodeBench.py

diff --git a/code_eval/OpenCodeEval/benchmark/BigCodeBench.py b/code_eval/OpenCodeEval/benchmark/BigCodeBench.py
new file mode 100644
index 0000000..abc4faf
--- /dev/null
+++ b/code_eval/OpenCodeEval/benchmark/BigCodeBench.py
@@ -0,0 +1,121 @@
+import os
+from typing import Literal, Optional
+
+ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from OpenCodeEval.benchmark.base import Benchmark, PYTHON_STOP, PYTHON_IMPORTS
+from OpenCodeEval.utils import refine_text, stream_jsonl
+from OpenCodeEval.eval.func_eval import check_correctness
+from OpenCodeEval.eval.sanitize import sanitize
+
+class BigCodeBench(Benchmark):
+
+    name: str = "BigCodeBench"
+    path: Optional[str] = None
+
+    fullset_path = os.path.abspath(os.path.join(ROOT, "../data/BigCodeBench.jsonl"))
+    subset_path = os.path.abspath(os.path.join(ROOT, "../data/BigCodeBench_Hard.jsonl"))
+
+    imports_code = PYTHON_IMPORTS
+    chat_stop = PYTHON_STOP
+    base_stop = ['\n"""', "\nassert"]
+
+    def __init__(self,
+                 name: str = "BigCodeBench",
+                 timeout: float = 10.0,
+                 prompt_type: Literal["Completion", "Instruction"] = "Completion"
+                 ):
+
+        super().__init__()
+
+        self.name = name
+        self.timeout = timeout
+        self.prompt_type = prompt_type
+
+        # Select the hard subset or the full set based on the benchmark name.
+        if self.name == "BigCodeHard":
+            self.path = self.subset_path
+        elif self.name == "BigCodeBench":
+            self.path = self.fullset_path
+        else:
+            raise ValueError(f"Unknown benchmark name: {self.name}")
+
+        self.tasks = self.get_task()
+
+    def get_task(self):
+        """
+        Load the task data from the jsonl file into a dictionary keyed by task id.
+        """
+
+        tasks = {}
+
+        for task_data in stream_jsonl(filename=self.path):
+
+            # "BigCodeBench/42" -> 42
+            task_id = int(task_data["task_id"].split("/")[-1])
+
+            tasks[task_id] = task_data
+
+        return tasks
+
+    def get_prompt(self):
+        """
+        Build the prompts for the LM to generate from.
+        """
+
+        prompts = []
+        for task_id, task_data in self.tasks.items():
+
+            if self.prompt_type == "Completion":
+                prompt = task_data['complete_prompt']
+            elif self.prompt_type == "Instruction":
+                prompt = task_data['instruct_prompt']
+            else:
+                raise ValueError(f"Unknown prompt type: {self.prompt_type}")
+
+            prompts.append(
+                dict(
+                    task_id = task_id,
+                    prompt = refine_text(prompt)
+                )
+            )
+
+        return prompts
+
+    def postprocess_generation(self, generation):
+        """
+        Sanitize a single generation down to the code for the task's entry point.
+        """
+
+        entry_point = self.tasks[generation['task_id']]["entry_point"]
+
+        result = dict(
+            task_id = generation['task_id'],
+            completion_id = generation['completion_id'],
+            solution = sanitize(generation['completion'], entry_point)
+        )
+
+        return result
+
+    def process_results(self, solution):
+        """
+        Evaluate a single postprocessed solution against the task's test cases.
+        """
+
+        task_data = self.tasks[solution['task_id']]
+
+        # Stub the prompt's own function body with `pass`, then append the
+        # sanitized solution, which redefines the entry point.
+        code = (
+            task_data["code_prompt"] + "\n"
+            + "    pass\n" + "\n"
+            + solution['solution'] + "\n"
+        )
+
+        result = check_correctness(solution['task_id'],
+                                   solution['completion_id'],
+                                   code,
+                                   task_data["test"],
+                                   self.timeout)
+
+        return result
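
As a rough, untested sketch of how this class would be driven end to end — assuming the OpenCodeEval imports above resolve, and with a hypothetical fake_model helper standing in for a real LM backend — the flow is get_prompt -> generate -> postprocess_generation -> process_results:

# Minimal driver sketch. `fake_model` is a hypothetical stand-in for an LM;
# the shape of the generation dict is inferred from postprocess_generation.
from OpenCodeEval.benchmark.BigCodeBench import BigCodeBench

def fake_model(prompt: str) -> str:
    # Pretend the model returned a trivial completion.
    return "def task_func():\n    return None\n"

bench = BigCodeBench(name="BigCodeBench", timeout=10.0, prompt_type="Completion")

for item in bench.get_prompt():
    generation = dict(
        task_id = item["task_id"],
        completion_id = 0,
        completion = fake_model(item["prompt"]),
    )
    solution = bench.postprocess_generation(generation)
    print(bench.process_results(solution))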
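
The `pass` stub in process_results is worth a note: a BigCodeBench code_prompt typically ends with imports, a function signature, and a docstring, so stubbing the body keeps the assembled module syntactically complete even when sanitization yields little usable code, while a good solution simply redefines the entry point. With toy values (illustrative only, not real benchmark data), the assembled code string looks like:

# Illustrative assembly only; `task_func` and the prompt text are made up.
# First the code_prompt with its body stubbed to `pass`:
import math
def task_func(x):
    """Return the square root of x."""
    pass

# Then the sanitized solution, appended afterwards, redefines task_func:
def task_func(x):
    return math.sqrt(x)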