import os
import sys
from abc import ABC, abstractmethod

ROOT = os.path.dirname(os.path.abspath(__file__))

# Stop words for truncating Python generations: anything from a main guard,
# a main() definition, or a top-level print onward is cut off.
PYTHON_STOP = [
    "\nif __name__",
    "\ndef main(",
    "\nprint(",
]

# Import statements made available to generated Python solutions.
PYTHON_IMPORTS = [
    "import math",
    "import re",
    "import sys",
    "import copy",
    "import datetime",
    "import itertools",
    "import collections",
    "import heapq",
    "import functools",
    "import hashlib",
    "import numpy",
    "import numpy as np",
    "import string",
    "from typing import *",
    "from collections import *",
]

# Star imports and helpers commonly assumed by LeetCode-style solutions.
LEETCODE_IMPORTS = [
    "from typing import *",
    "from functools import *",
    "from collections import *",
    "from itertools import *",
    "from heapq import *",
    "from bisect import *",
    "from string import *",
    "from operator import *",
    "from math import *",
    "import math",
    "import datetime",
    "inf = float('inf')",
]
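

# Hedged sketch, not part of the original module: import lists like the ones
# above are typically joined into a header and prepended to a model-generated
# solution so the snippet can run as a standalone program. The helper name
# `build_program` is an assumption for illustration only.
def build_program(solution_code, imports=PYTHON_IMPORTS):
    """Prepend an import header to generated code (illustrative sketch)."""
    return "\n".join(imports) + "\n\n" + solution_code

# e.g. build_program("print(math.pi)") yields a program whose header already
# imports math, so the one-line solution executes on its own.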

class Benchmark(ABC):
    """Abstract base class for a code-generation benchmark task.

    Class attributes:
        name: identifier of the benchmark
        split: dataset split to evaluate on
        path: directory containing the benchmark data files
        imports: import statements made available to generated solutions
        chat_stop, base_stop: stop words used as a stopping criterion during
            generation, for chat and base models respectively
    """

    name: str = None
    split: str = None
    path: str = os.path.abspath(os.path.join(os.path.dirname(__file__), "data/"))
    imports = []
    chat_stop = []
    base_stop = []

    def __init__(self):
        pass
    def fewshot_examples(self):
        """Loads and returns the few-shot examples for the task, if any exist."""
        pass

    @abstractmethod
    def get_task(self):
        """Builds and returns the task's samples for the LM to generate from."""
        pass
    @abstractmethod
    def get_prompt(self, doc):
        """Builds the prompt for the LM to generate from.

        :param doc: dict[str, str]
            sample from the test dataset
        """
        pass

    def get_reference(self, doc):
        """Builds the reference solution for the doc.

        :param doc: dict[str, str]
            sample from the test dataset
        """
        pass
    @abstractmethod
    def postprocess_generation(self, task, generation):
        """Defines the postprocessing for a LM generation.

        :param task: the task the generation belongs to
        :param generation: str
            code generation from the LM
        """
        pass
    @abstractmethod
    def process_results(self, generations, references):
        """Takes the list of LM generations, evaluates them against the
        ground-truth references, and returns the metric for the generations
        as in {"metric_name": result}.

        :param generations: list(list(str))
            list of lists containing generations
        :param references: list(str)
            list of str containing references
        :return: dict[str, float]
        """
        pass
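

# Minimal concrete benchmark, a hedged sketch of how the abstract interface
# above fits together. The task name, the inline sample, and the exact-match
# metric are placeholder assumptions for illustration, not a real benchmark.
class EchoBenchmark(Benchmark):
    name = "echo"
    split = "test"
    imports = PYTHON_IMPORTS
    base_stop = PYTHON_STOP

    def get_task(self):
        # A real task would load its samples from `self.path`; one sample is
        # inlined here to keep the sketch self-contained.
        return [{"prompt": "def identity(x):\n", "reference": "    return x\n"}]

    def get_prompt(self, doc):
        return doc["prompt"]

    def get_reference(self, doc):
        return doc["reference"]

    def postprocess_generation(self, task, generation):
        # Truncate the raw completion at the first stop word (helper defined
        # below at module level).
        return _stop_at_stop_token(generation, self.base_stop)

    def process_results(self, generations, references):
        # Fraction of samples where any candidate matches the reference.
        matches = sum(
            any(gen == ref for gen in gens)
            for gens, ref in zip(generations, references)
        )
        return {"exact_match": matches / max(len(references), 1)}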


def _stop_at_stop_token(decoded_string, stop_tokens):
    """Produces the prefix of decoded_string that ends at the first occurrence
    of a stop token.

    WARNING: decoded_string *must not* include the prompt, which may itself
    contain stop tokens.
    """
    # Default to keeping the whole string, then clip at the earliest match.
    min_stop_index = len(decoded_string)
    for stop_token in stop_tokens:
        stop_index = decoded_string.find(stop_token)
        if stop_index != -1 and stop_index < min_stop_index:
            min_stop_index = stop_index
    return decoded_string[:min_stop_index]
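

# Quick sanity check in a hypothetical __main__ block (an assumption, not
# part of the original module): the completion is clipped at the first stop
# word, here the `\nif __name__` of a model-emitted test harness.
if __name__ == "__main__":
    raw = 'def add(a, b):\n    return a + b\n\nif __name__ == "__main__":\n    print(add(1, 2))\n'
    clipped = _stop_at_stop_token(raw, PYTHON_STOP)
    assert clipped == "def add(a, b):\n    return a + b\n"
    print(repr(clipped))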