summaryrefslogtreecommitdiff
path: root/code_eval/OpenCodeEval/benchmark/BigCodeBench.py
blob: abc4faf5f9d692fc188ec7aa301b7d0eea6c655c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
from typing import Literal

# Absolute path of the directory two levels above the one containing this file
# (three ``dirname`` steps up from the file itself); used as the anchor for the
# dataset paths built from it.
ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from OpenCodeEval.benchmark.base import Benchmark, PYTHON_STOP, PYTHON_IMPORTS
from OpenCodeEval.utils import refine_text, stream_jsonl
from OpenCodeEval.eval.func_eval import check_correctness
from OpenCodeEval.eval.sanitize import sanitize

class BigCodeBench(Benchmark):
    """Benchmark wrapper that loads BigCodeBench tasks and scores solutions.

    Tasks are read from a JSONL file selected by ``name`` ("BigCodeBench" for
    the full set, "BigCodeHard" for the hard subset), prompts are built from
    either the completion or the instruction field of each task, and
    sanitized solutions are executed against each task's ``test`` code.
    """

    name: str = "BigCodeBench"
    # Stays ``None`` until ``__init__`` selects one of the dataset files below.
    path: str = None

    fullset_path = os.path.abspath(os.path.join(ROOT, "../data/BigCodeBench.jsonl"))
    subset_path = os.path.abspath(os.path.join(ROOT, "../data/BigCodeBench_Hard.jsonl"))

    imports_code = PYTHON_IMPORTS
    chat_stop = PYTHON_STOP
    base_stop = ['\n"""', "\nassert"]

    def __init__(self,
                 name: str = "BigCodeBench",
                 timeout: float = 10.0,
                 prompt_type: Literal["Completion", "Instruction"] = "Completion"
                 ):
        """
        Select the dataset file for ``name`` and load its tasks.

        Args:
            name: "BigCodeBench" (full set) or "BigCodeHard" (hard subset).
            timeout: Value forwarded to ``check_correctness`` for each solution.
            prompt_type: Which task field ``get_prompt`` reads: "Completion"
                uses ``complete_prompt``, "Instruction" uses ``instruct_prompt``.

        Raises:
            ValueError: If ``name`` is not recognised and no ``path`` is
                otherwise available to load tasks from.
        """

        super().__init__()

        self.name = name
        self.timeout = timeout
        self.prompt_type = prompt_type

        if self.name == "BigCodeHard":
            self.path = self.subset_path
        elif self.name == "BigCodeBench":
            self.path = self.fullset_path

        # An unrecognised name leaves ``path`` untouched. Only fail when that
        # means there is no file at all, so a subclass that presets ``path``
        # keeps working under its own name.
        if self.path is None:
            raise ValueError(
                f"Unknown benchmark name {self.name!r}; "
                "expected 'BigCodeBench' or 'BigCodeHard'"
            )

        self.tasks = self.get_task()

    def get_task(self):
        """
        Get the task data from the jsonl file into a dictionary.

        Returns:
            A dict mapping the integer suffix of each record's ``task_id``
            (the part after the last "/") to the full record.
        """

        return {
            int(task_data["task_id"].split("/")[-1]): task_data
            for task_data in stream_jsonl(filename=self.path)
        }

    def get_prompt(self):
        """
        Builds the prompt for the LM to generate from.

        Returns:
            A list with one ``{"task_id": ..., "prompt": ...}`` dict per task,
            where the prompt text has been passed through ``refine_text``.

        Raises:
            ValueError: If ``self.prompt_type`` is neither "Completion" nor
                "Instruction".
        """

        # ``Literal`` is not checked at runtime, so validate here instead of
        # letting an unexpected value surface as an unbound local variable.
        if self.prompt_type == "Completion":
            prompt_field = "complete_prompt"
        elif self.prompt_type == "Instruction":
            prompt_field = "instruct_prompt"
        else:
            raise ValueError(
                f"Unsupported prompt_type {self.prompt_type!r}; "
                "expected 'Completion' or 'Instruction'"
            )

        return [
            dict(
                task_id=task_id,
                prompt=refine_text(task_data[prompt_field]),
            )
            for task_id, task_data in self.tasks.items()
        ]

    def postprocess_generation(self, generation):
        """
        Postprocess a single generation.

        Args:
            generation: Dict with ``task_id``, ``completion_id`` and
                ``completion`` keys.

        Returns:
            A dict with the same ``task_id`` and ``completion_id`` plus a
            ``solution`` key holding the completion after ``sanitize`` has
            been applied with the task's ``entry_point``.
        """

        entry_point = self.tasks[generation['task_id']]["entry_point"]

        result = dict(
            task_id = generation['task_id'],
            completion_id = generation['completion_id'],
            solution = sanitize(generation['completion'], entry_point)
        )

        return result

    def process_results(self, solution):
        """
        Evaluate one postprocessed solution against its task's test code.

        Args:
            solution: Dict with ``task_id``, ``completion_id`` and
                ``solution`` keys, as returned by ``postprocess_generation``.

        Returns:
            Whatever ``check_correctness`` returns for the assembled program.
        """

        task_data = self.tasks[solution['task_id']]

        # NOTE(review): presumably ``code_prompt`` ends with the function
        # header, so the ``pass`` line turns it into a valid stub that the
        # solution's own definition then replaces -- confirm against the data.
        code = (
            task_data["code_prompt"] + "\n"
            + "    pass\n" + "\n"
            + solution['solution'] + "\n"
        )

        result = check_correctness(solution['task_id'],
                                   solution['completion_id'],
                                   code,
                                   task_data["test"],
                                   self.timeout)

        return result