path: root/code_eval/OpenCodeEval
author    Yuren Hao <yurenh2@timan108.cs.illinois.edu>  2025-09-04 22:16:22 -0500
committer Yuren Hao <yurenh2@timan108.cs.illinois.edu>  2025-09-04 22:16:22 -0500
commit    fc6d57ffb8d5ddb5820fcc00b5491a585c259ebc (patch)
tree      e9841f93a353e2107225cfc721d1ce57c0e594dc /code_eval/OpenCodeEval
Initial commit
Diffstat (limited to 'code_eval/OpenCodeEval')
-rw-r--r--  code_eval/OpenCodeEval/__init__.py  0
-rw-r--r--  code_eval/OpenCodeEval/args.py  91
-rw-r--r--  code_eval/OpenCodeEval/backend/__init__.py  0
-rw-r--r--  code_eval/OpenCodeEval/backend/base.py  55
-rw-r--r--  code_eval/OpenCodeEval/backend/openai.py  115
-rw-r--r--  code_eval/OpenCodeEval/backend/sglang.py  148
-rw-r--r--  code_eval/OpenCodeEval/backend/vllm.py  144
-rw-r--r--  code_eval/OpenCodeEval/benchmark/BigCodeBench.py  113
-rw-r--r--  code_eval/OpenCodeEval/benchmark/Bird.py  123
-rw-r--r--  code_eval/OpenCodeEval/benchmark/HumanEval.py  114
-rw-r--r--  code_eval/OpenCodeEval/benchmark/LeetCode.py  121
-rw-r--r--  code_eval/OpenCodeEval/benchmark/LiveCodeBench.py  76
-rw-r--r--  code_eval/OpenCodeEval/benchmark/MBPP.py  126
-rw-r--r--  code_eval/OpenCodeEval/benchmark/Spider.py  118
-rw-r--r--  code_eval/OpenCodeEval/benchmark/__init__.py  0
-rw-r--r--  code_eval/OpenCodeEval/benchmark/base.py  124
-rw-r--r--  code_eval/OpenCodeEval/benchmark/mbpp.py  153
-rw-r--r--  code_eval/OpenCodeEval/benchmark/understandml.py  152
-rwxr-xr-x  code_eval/OpenCodeEval/data/dataset.sh  28
-rw-r--r--  code_eval/OpenCodeEval/eval/__init__.py  0
-rw-r--r--  code_eval/OpenCodeEval/eval/func_eval.py  232
-rw-r--r--  code_eval/OpenCodeEval/eval/sanitize.py  196
-rw-r--r--  code_eval/OpenCodeEval/eval/sql_test.py  187
-rw-r--r--  code_eval/OpenCodeEval/eval/unit_test.py  472
-rw-r--r--  code_eval/OpenCodeEval/factory.py  89
-rw-r--r--  code_eval/OpenCodeEval/main.py  115
-rw-r--r--  code_eval/OpenCodeEval/utils.py  165
27 files changed, 3257 insertions, 0 deletions
diff --git a/code_eval/OpenCodeEval/__init__.py b/code_eval/OpenCodeEval/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/code_eval/OpenCodeEval/__init__.py
diff --git a/code_eval/OpenCodeEval/args.py b/code_eval/OpenCodeEval/args.py
new file mode 100644
index 0000000..bf2d51f
--- /dev/null
+++ b/code_eval/OpenCodeEval/args.py
@@ -0,0 +1,91 @@
+from loguru import logger
+
+BENCHMARKS = [
+ "HumanEval",
+ "mbpp",
+ "MBPP",
+ "LeetCode",
+ "BigCodeBench",
+ "Bird",
+ "Spider",
+ "understandml"
+]
+
+SPLITS = {
+ "HumanEval": ["base", "plus"],
+ "mbpp": ["full", "sanitized"],
+ "MBPP": ["base", "plus"],
+ "LeetCode": ["contest", "train", "validation", "test"],
+ "BigCodeBench": ["full", "hard"],
+ "Bird": ["train", "dev"],
+ "Spider": ["train", "dev"],
+ "understandml": ["human", "model"]
+}
+
+def check_args(args):
+
+ if args.tokenizer_name is None:
+ args.tokenizer_name = args.model_name
+
+ # A pretrained (Base) model cannot follow instructions, so the Completion prompt is suggested
+ if args.model_type == "Base" and args.prompt_type == "Instruction":
+ logger.warning("Prompt type must be Completion for Base Model")
+
+ # check the split is valid for the task
+ if args.split not in SPLITS[args.task]:
+ logger.error(f"split {args.split} is not valid for {args.task}, please use {SPLITS[args.task]}")
+
+ # check the list_k is valid
+ args.list_k = list(map(int, args.list_k.split(',')))
+ if max(args.list_k) > args.num_samples:
+ logger.warning("max of list_k must be less than num_samples")
+ args.list_k = [k for k in args.list_k if k <= args.num_samples]
+
+ # check the temperature is valid
+ if args.num_samples > 1 and args.temperature == 0.0:
+ logger.error("Temperature is not allowed when num_samples > 1")
+
+ # A chat model cannot continue partial code directly, so the Instruction prompt is suggested
+ if args.model_type == "Chat" and args.prompt_type == "Completion":
+ if args.prompt_prefix == "":
+ logger.warning("Prompt prefix is not set, using default prefix")
+ args.prompt_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:\n```python\n"
+ if args.prompt_suffix == "":
+ logger.warning("Prompt suffix is not set, using default suffix")
+ args.prompt_suffix = "\n```\n"
+
+ return args
+
+def get_args(parser):
+
+ #===================Model Parser===================
+ parser.add_argument("--model_name", default = None, type=str)
+ parser.add_argument("--tokenizer_name", default = None, type=str)
+ parser.add_argument("--trust_remote_code", action="store_true")
+ parser.add_argument("--backend", default="vllm", type=str, choices=["openai", "vllm", "sglang"])
+ parser.add_argument("--task", default="HumanEval", type=str, choices = BENCHMARKS)
+ parser.add_argument("--split", default="base", type = str)
+ parser.add_argument("--prompt_type", default = "Instruction", type=str, choices=["Completion", "Instruction"])
+ parser.add_argument("--model_type", default = "Chat", type = str, choices=["Base", "Chat"])
+ parser.add_argument("--list_k", default = "1,3,5,10", type = str)
+ parser.add_argument("--time_out", default = 3, type = float)
+
+ #===================Compute Parser===================
+ parser.add_argument("--num_gpus", default = 1, type=int)
+ parser.add_argument("--num_workers", default = 1, type=int)
+
+ parser.add_argument("--save_path", default = 'save', type=str)
+ parser.add_argument("--batch_size", default = 164, type=int)
+ parser.add_argument("--num_samples", default = 1, type=int)
+ parser.add_argument("--max_tokens", default = 2048, type=int)
+ parser.add_argument("--temperature", default = 0.0, type=float)
+ parser.add_argument("--top_p", default = 1.0, type=float)
+ parser.add_argument("--seed", default = 1, type=float)
+
+ #===================Prompt Parser===================
+ parser.add_argument("--prompt_prefix", default = "", type=str)
+ parser.add_argument("--prompt_suffix", default = "", type=str)
+ parser.add_argument("--response_prefix", default = "", type=str)
+ parser.add_argument("--response_suffix", default = "", type=str)
+
+ return parser.parse_args() \ No newline at end of file
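A minimal usage sketch for args.py (editorial example, not part of the commit), assuming the package is importable as OpenCodeEval; get_args registers the flags and parses sys.argv, and check_args normalizes the result.

    import argparse

    from OpenCodeEval.args import get_args, check_args

    parser = argparse.ArgumentParser(description="OpenCodeEval")
    args = check_args(get_args(parser))  # get_args calls parser.parse_args() itself
    print(args.task, args.split, args.list_k)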
diff --git a/code_eval/OpenCodeEval/backend/__init__.py b/code_eval/OpenCodeEval/backend/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/code_eval/OpenCodeEval/backend/__init__.py
diff --git a/code_eval/OpenCodeEval/backend/base.py b/code_eval/OpenCodeEval/backend/base.py
new file mode 100644
index 0000000..42091b9
--- /dev/null
+++ b/code_eval/OpenCodeEval/backend/base.py
@@ -0,0 +1,55 @@
+from typing import Callable
+from abc import ABC, abstractmethod
+
+def make_chat_template(
+ prompt: str,
+ response_prefix: str = "",
+ is_chat: bool = True,
+ tokenizer: Callable = None
+ ) -> str:
+
+ if is_chat:
+ prompt = tokenizer.apply_chat_template(
+ [
+ {"role": "user", "content": prompt},
+ ],
+ tokenize = False,
+ add_generation_prompt = True
+ ) + response_prefix
+ if tokenizer.bos_token and prompt.startswith(tokenizer.bos_token):
+ prompt = prompt[len(tokenizer.bos_token):]
+ return prompt
+ else:
+ return prompt
+
+class Generator(ABC):
+
+ model_name: str = None
+
+ def __init__(self, model_name: str) -> None:
+ """
+ :param model_name: str
+ name or path of the model used for generation
+ """
+ self.model_name = model_name
+
+ def fewshot_examples(self):
+ """Loads and returns the few-shot examples for the task if they exist."""
+ pass
+
+ @abstractmethod
+ def set_stop(self):
+ """
+ Set the stop tokens for the model
+ """
+ pass
+
+ @abstractmethod
+ def generate(self):
+ """Builds the prompt for the LM to generate from.
+ :param doc: dict[str: str]
+ sample from the test dataset
+ """
+ pass \ No newline at end of file
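A hedged sketch of make_chat_template with a Hugging Face tokenizer (editorial example; the model name is only a placeholder, and any tokenizer that ships a chat template will do).

    from transformers import AutoTokenizer

    from OpenCodeEval.backend.base import make_chat_template

    tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-7B-Instruct")  # placeholder model
    prompt = make_chat_template(
        prompt="Write a function that reverses a string.",
        response_prefix="```python\n",
        is_chat=True,
        tokenizer=tok,
    )
    print(prompt)  # chat-formatted prompt, BOS stripped, ending with the response prefix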
diff --git a/code_eval/OpenCodeEval/backend/openai.py b/code_eval/OpenCodeEval/backend/openai.py
new file mode 100644
index 0000000..afa45b2
--- /dev/null
+++ b/code_eval/OpenCodeEval/backend/openai.py
@@ -0,0 +1,115 @@
+import os
+
+from typing import List, Dict
+from openai import OpenAI
+from loguru import logger
+from tqdm.contrib.concurrent import thread_map
+
+from OpenCodeEval.backend.base import Generator
+
+class OpenaiGenerator(Generator):
+
+ def __init__(
+ self,
+ model_name: str,
+ model_type: str = "Instruction",
+ batch_size : int = 1,
+ temperature : float = 0.0,
+ top_p : float = 1.0,
+ max_tokens : int = 1024,
+ num_samples : int = 1,
+ ) -> None:
+
+ super().__init__(model_name)
+
+ print("Initializing a decoder model in openai: {} ...".format(model_name))
+ self.model_type = model_type
+ self.batch_size = batch_size
+ self.temperature = temperature
+ self.top_p = top_p
+ self.max_tokens = max_tokens
+ self.num_samples = num_samples
+
+ self.client = OpenAI(
+ base_url = os.getenv("OPENAI_BASE_URL"),
+ api_key = os.getenv("OPENAI_API_KEY")
+ )
+
+ def is_chat(self) -> bool:
+ if self.model_type == "Chat":
+ return True
+ else:
+ return False
+
+ def set_stop(self, eos: List[str]):
+ self.eos = eos
+
+ def connect_server(self, prompt):
+
+ num_tries = 5
+
+ try:
+
+ for _ in range(num_tries):
+
+ result = self.client.chat.completions.create(
+ model = self.model_name,
+ messages=[
+ {"role": "user", "content": prompt['prompt']}
+ ],
+ n = self.num_samples,
+ stream = False,
+ # stop = self.eos,
+ temperature = self.temperature,
+ # top_p = self.top_p,
+ max_tokens = self.max_tokens,
+ extra_headers = {'apikey':os.getenv("OPENAI_API_KEY")},
+ )
+
+ if all(choice.message.content for choice in result.choices):
+ break
+ else:
+ logger.warning("No content in the response, retrying...")
+
+ results = [
+ dict(
+ task_id=prompt['task_id'],
+ completion_id=i,
+ completion=choice.message.content
+ )
+ for i, choice in enumerate(result.choices)
+ ]
+
+ except Exception as e:
+ logger.error("Error: {}".format(e))
+ results = [
+ dict(
+ task_id=prompt['task_id'],
+ completion_id=i,
+ completion = ''
+ )
+ for i in range(self.num_samples)
+ ]
+
+ return results
+
+ def generate(
+ self,
+ prompt_set: List[Dict],
+ response_prefix: str = "",
+ response_suffix: str = ""
+ ) -> List[Dict]:
+
+ logger.info("Example Prompt:\n{}", prompt_set[0]['prompt'])
+
+ results = thread_map(
+ self.connect_server,
+ prompt_set,
+ max_workers = self.batch_size,
+ desc="Generating Completions"
+ )
+
+
+ generations = [item for sublist in results for item in sublist]
+
+ return generations \ No newline at end of file
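A sketch of driving OpenaiGenerator directly (editorial example); it assumes OPENAI_BASE_URL and OPENAI_API_KEY are exported, and the model name is a placeholder.

    from OpenCodeEval.backend.openai import OpenaiGenerator

    gen = OpenaiGenerator(model_name="gpt-4o-mini", model_type="Chat",
                          batch_size=4, max_tokens=512, num_samples=1)
    gen.set_stop([])  # stop tokens are currently unused by this backend
    outputs = gen.generate([{"task_id": 0, "prompt": "Write a Python one-liner that sums a list."}])
    print(outputs[0]["completion"])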
diff --git a/code_eval/OpenCodeEval/backend/sglang.py b/code_eval/OpenCodeEval/backend/sglang.py
new file mode 100644
index 0000000..c759639
--- /dev/null
+++ b/code_eval/OpenCodeEval/backend/sglang.py
@@ -0,0 +1,148 @@
+import os
+
+from loguru import logger
+
+from tqdm import tqdm
+from typing import List, Dict
+
+from OpenCodeEval.backend.base import Generator, make_chat_template
+
+from OpenCodeEval.utils import refine_text
+
+class SglangGenerator(Generator):
+
+ def __init__(
+ self,
+ model_name: str,
+ tokenizer_name: str = None,
+ model_type: str = "Instruction",
+ dtype: str = "bfloat16",
+ batch_size : int = 1,
+ temperature : float = 0.0,
+ top_p : float = 1.0,
+ max_tokens : int = 1024,
+ num_samples: int = 200,
+ num_gpus: int = 1,
+ trust_remote_code: bool = True,
+ ) -> None:
+
+ super().__init__(model_name)
+
+ print("Initializing a decoder model in sglang: {} ...".format(model_name))
+ self.tokenizer_name = tokenizer_name if tokenizer_name is not None else model_name
+ self.model_type = model_type
+ self.batch_size = batch_size
+ self.temperature = temperature
+ self.top_p = top_p
+ self.max_tokens = max_tokens
+ self.num_samples = num_samples
+ self.dtype = dtype
+ self.num_gpus = num_gpus
+ self.trust_remote_code = trust_remote_code
+
+ from sglang import Engine
+
+ self.model = Engine(
+ model_path = self.model_name,
+ tokenizer_path = self.tokenizer_name,
+ dtype = self.dtype,
+ tp_size = self.num_gpus,
+ trust_remote_code = self.trust_remote_code
+ )
+
+ self.tokenizer = self.model.tokenizer_manager.tokenizer
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ self.sampling_params = dict(
+ n = self.num_samples,
+ temperature = self.temperature,
+ max_new_tokens = self.max_tokens,
+ top_p = self.top_p,
+ )
+
+ def is_chat(self) -> bool:
+ if self.model_type == "Chat":
+ if self.tokenizer.chat_template is None:
+ logger.error("When the model type is Chat, the chat template must be set for the tokenizer.")
+ else:
+ return True
+ else:
+ return False
+
+ def set_stop(self, eos: List[str]):
+ self.sampling_params['stop'] = eos
+
+ def release_memory(self):
+
+ import gc
+ import torch
+
+ from sglang.srt.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel
+
+ self.model.shutdown()
+
+ # destroy_model_parallel
+ # destroy_distributed_environment()
+
+ # del self.model
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ def generate(
+ self,
+ prompt_set: List[Dict],
+ response_prefix: str = "",
+ response_suffix: str = ""
+ ) -> List[str]:
+
+ if self.is_chat():
+ for prompt in prompt_set:
+ prompt['prompt'] = make_chat_template(
+ prompt = prompt['prompt'],
+ response_prefix = response_prefix,
+ is_chat = self.is_chat(),
+ tokenizer = self.tokenizer
+ )
+
+ logger.info("Example Prompt:\n{}", prompt_set[0]['prompt'])
+
+ generation_set = []
+
+ for batch_start in tqdm(range(0, len(prompt_set), self.batch_size)):
+
+ batch_prompt = prompt_set[batch_start : batch_start + self.batch_size]
+ batch_outputs = self.model.generate(
+ [prompt['prompt'] for prompt in batch_prompt],
+ self.sampling_params
+ )
+
+ batch_outputs = [
+ [item["text"] for item in batch_outputs[i:i + self.num_samples]]
+ for i in range(0, len(batch_outputs), self.num_samples)
+ ]
+
+ for prompt, output in zip(batch_prompt, batch_outputs):
+
+ # Process completions with prefix/suffix and refine text based on chat mode
+ completions = [
+ refine_text(
+ f"{prompt['prompt']}\n\n{response_prefix}{completion}{response_suffix}"
+ if not self.is_chat()
+ else f"{response_prefix}{completion}{response_suffix}"
+ )
+ for completion in output
+ ]
+
+ for completion_id, completion in enumerate(completions):
+ generation_set.append({
+ 'task_id': prompt['task_id'],
+ 'completion_id': completion_id,
+ 'completion': completion
+ })
+
+ if len(generation_set) != len(prompt_set) * self.num_samples:
+ logger.error("Number of generations does not match the expected number.")
+
+ self.release_memory()
+
+ return generation_set \ No newline at end of file
diff --git a/code_eval/OpenCodeEval/backend/vllm.py b/code_eval/OpenCodeEval/backend/vllm.py
new file mode 100644
index 0000000..3082dfe
--- /dev/null
+++ b/code_eval/OpenCodeEval/backend/vllm.py
@@ -0,0 +1,144 @@
+import os
+
+from loguru import logger
+
+from tqdm import tqdm
+from typing import List, Dict
+
+from OpenCodeEval.backend.base import Generator, make_chat_template
+
+from OpenCodeEval.utils import refine_text
+
+class VllmGenerator(Generator):
+
+ def __init__(
+ self,
+ model_name: str,
+ tokenizer_name: str = None,
+ model_type: str = "Instruction",
+ dtype: str = "bfloat16",
+ batch_size : int = 1,
+ temperature : float = 0.0,
+ top_p : float = 1.0,
+ max_tokens : int = 1024,
+ num_samples: int = 200,
+ num_gpus: int = 1,
+ seed: int = None,
+ trust_remote_code: bool = True,
+ ) -> None:
+
+ super().__init__(model_name)
+
+ print("Initializing a decoder model in vllm: {} ...".format(model_name))
+ self.tokenizer_name = tokenizer_name if tokenizer_name is not None else model_name
+ self.model_type = model_type
+ self.batch_size = batch_size
+ self.temperature = temperature
+ self.top_p = top_p
+ self.max_tokens = max_tokens
+ self.num_samples = num_samples
+ self.dtype = dtype
+ self.num_gpus = num_gpus
+ self.seed = seed
+ self.trust_remote_code = trust_remote_code
+
+ from vllm import LLM, SamplingParams
+
+ self.model = LLM(
+ model = self.model_name,
+ tokenizer = self.tokenizer_name,
+ dtype = self.dtype,
+ tensor_parallel_size = self.num_gpus,
+ trust_remote_code = self.trust_remote_code
+ )
+
+ self.tokenizer = self.model.get_tokenizer()
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ self.sampling_params = SamplingParams(
+ n = self.num_samples,
+ temperature = self.temperature,
+ max_tokens = self.max_tokens,
+ top_p = self.top_p,
+ seed = int(self.seed)
+ )
+
+ def is_chat(self) -> bool:
+ if self.model_type == "Chat":
+ if self.tokenizer.chat_template is None:
+ logger.error("When the model type is Chat, the chat template must be set for the tokenizer.")
+ else:
+ return True
+ else:
+ return False
+
+ def set_stop(self, eos: List[str]):
+ self.sampling_params.stop = eos
+
+ def release_memory(self):
+
+ import gc
+ import torch
+
+ from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel
+
+ destroy_model_parallel()
+ destroy_distributed_environment()
+ del self.model.llm_engine.model_executor
+ del self.model
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ def generate(
+ self,
+ prompt_set: List[Dict],
+ response_prefix: str = "",
+ response_suffix: str = ""
+ ) -> List[str]:
+
+ if self.is_chat():
+ for prompt in prompt_set:
+ prompt['prompt'] = make_chat_template(
+ prompt = prompt['prompt'],
+ response_prefix = response_prefix,
+ is_chat = self.is_chat(),
+ tokenizer = self.tokenizer
+ )
+
+ logger.info("Example Prompt:\n{}", prompt_set[0]['prompt'])
+
+ generation_set = []
+
+ for batch_start in tqdm(range(0, len(prompt_set), self.batch_size)):
+ batch_prompt = prompt_set[batch_start : batch_start + self.batch_size]
+ batch_outputs = self.model.generate(
+ [prompt['prompt'] for prompt in batch_prompt],
+ self.sampling_params,
+ use_tqdm = False,
+ )
+
+ for prompt, output in zip(batch_prompt, batch_outputs):
+
+ # Process completions with prefix/suffix and refine text based on chat mode
+ completions = [
+ refine_text(
+ f"{prompt['prompt']}\n\n{response_prefix}{completion.text}{response_suffix}"
+ if not self.is_chat()
+ else f"{response_prefix}{completion.text}{response_suffix}"
+ )
+ for completion in output.outputs
+ ]
+
+ for completion_id, completion in enumerate(completions):
+ generation_set.append({
+ 'task_id': prompt['task_id'],
+ 'completion_id': completion_id,
+ 'completion': completion
+ })
+
+ if len(generation_set) != len(prompt_set) * self.num_samples:
+ logger.error("Number of generations does not match the expected number.")
+
+ self.release_memory()
+
+ return generation_set \ No newline at end of file
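A hedged sketch of VllmGenerator (editorial example); it needs a GPU and a locally cached model, and the model name is only a placeholder. Note that seed must be supplied because the constructor calls int(self.seed).

    from OpenCodeEval.backend.vllm import VllmGenerator

    gen = VllmGenerator(model_name="Qwen/Qwen2.5-Coder-7B-Instruct",  # placeholder
                        model_type="Chat", batch_size=8, max_tokens=512,
                        num_samples=1, num_gpus=1, seed=1)
    gen.set_stop(["\nif __name__"])
    results = gen.generate([{"task_id": 0, "prompt": "Write a function that reverses a string."}])
    print(results[0]["completion"])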
diff --git a/code_eval/OpenCodeEval/benchmark/BigCodeBench.py b/code_eval/OpenCodeEval/benchmark/BigCodeBench.py
new file mode 100644
index 0000000..abc4faf
--- /dev/null
+++ b/code_eval/OpenCodeEval/benchmark/BigCodeBench.py
@@ -0,0 +1,113 @@
+import os
+from typing import Literal
+
+ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from OpenCodeEval.benchmark.base import Benchmark, PYTHON_STOP, PYTHON_IMPORTS
+from OpenCodeEval.utils import refine_text, stream_jsonl
+from OpenCodeEval.eval.func_eval import check_correctness
+from OpenCodeEval.eval.sanitize import sanitize
+
+class BigCodeBench(Benchmark):
+
+ name: str = "BigCodeBench"
+ path: str = None
+
+ fullset_path = os.path.abspath(os.path.join(ROOT, "../data/BigCodeBench.jsonl"))
+ subset_path = os.path.abspath(os.path.join(ROOT, "../data/BigCodeBench_Hard.jsonl"))
+
+ imports_code = PYTHON_IMPORTS
+ chat_stop = PYTHON_STOP
+ base_stop = ['\n"""', "\nassert"]
+
+ def __init__(self,
+ name: str = "BigCodeBench",
+ timeout:float = 10.0,
+ prompt_type: Literal["Completion", "Instruction"] = "Completion"
+ ):
+
+ super().__init__()
+
+ self.name = name
+ self.timeout = timeout
+ self.prompt_type = prompt_type
+
+ if self.name == "BigCodeHard":
+ self.path = self.subset_path
+ elif self.name == "BigCodeBench":
+ self.path = self.fullset_path
+
+ self.tasks = self.get_task()
+
+ def get_task(self):
+ """
+ Get the task data from the jsonl file into a dictionary.
+ """
+
+ tasks = {}
+
+ for task_data in stream_jsonl(filename=self.path):
+
+ task_id = int(task_data["task_id"].split("/")[-1])
+
+ tasks[task_id] = task_data
+
+ return tasks
+
+ def get_prompt(self):
+ """
+ Builds the prompt for the LM to generate from.
+ """
+
+ prompts = []
+ for task_id, task_data in self.tasks.items():
+
+ if self.prompt_type == "Completion":
+ prompt = task_data['complete_prompt']
+ elif self.prompt_type == "Instruction":
+ prompt = task_data['instruct_prompt']
+
+ prompts.append(
+ dict(
+ task_id = task_id,
+ prompt = refine_text(prompt)
+ )
+ )
+
+ return prompts
+
+ def postprocess_generation(self, generation):
+ """
+ Postprocess the generations.
+ """
+
+ entry_point = self.tasks[generation['task_id']]["entry_point"]
+
+ result = dict(
+ task_id = generation['task_id'],
+ completion_id = generation['completion_id'],
+ solution = sanitize(generation['completion'], entry_point)
+ )
+
+ return result
+
+ def process_results(self, solution):
+ """
+ Takes the list of LM generations and evaluates them against the test cases
+ """
+
+ task_data = self.tasks[solution['task_id']]
+
+ code = (
+ task_data["code_prompt"] + "\n"
+ + " pass\n" + "\n"
+ + solution['solution'] + "\n"
+ )
+
+ result = check_correctness(solution['task_id'],
+ solution['completion_id'],
+ code,
+ task_data["test"],
+ self.timeout)
+
+ return result \ No newline at end of file
diff --git a/code_eval/OpenCodeEval/benchmark/Bird.py b/code_eval/OpenCodeEval/benchmark/Bird.py
new file mode 100644
index 0000000..b4359fb
--- /dev/null
+++ b/code_eval/OpenCodeEval/benchmark/Bird.py
@@ -0,0 +1,123 @@
+import os
+import sys
+
+from loguru import logger
+from typing import Literal
+
+from OpenCodeEval.benchmark.base import Benchmark
+from OpenCodeEval.utils import refine_text, program_extract, stream_jsonl
+from OpenCodeEval.eval.sql_test import check_correctness
+
+class Bird(Benchmark):
+
+ name: str = "Bird"
+
+ def __init__(
+ self,
+ split: Literal["train", "dev"] = "dev",
+ time_out: float = 30.0,
+ prompt_type: str = "Instruction"
+ ):
+
+ super().__init__()
+
+ self.split = split
+ self.time_out = time_out
+ self.prompt_type = prompt_type
+
+ if self.prompt_type == "Completion":
+ logger.error("Completion prompt type not supported for Bird")
+
+ self.database = os.path.join(self.path, f"{self.name}/{self.split}/database")
+ self.path = os.path.join(self.path, f"{self.name}/{self.split}/data.jsonl")
+
+ self.tasks = self.get_task()
+
+ def get_task(self):
+ """
+ Get the task data from the json file into a dictionary.
+ """
+
+ tasks = {}
+
+ for task_data in stream_jsonl(filename = self.path):
+
+ tasks[int(task_data['id'])] = task_data
+
+ return tasks
+
+ def get_prompt(self):
+ """
+ Builds the prompt for the LM to generate from.
+ """
+
+ def construct_prompt(data):
+ instruction = data['o_schema']
+ instruction += f"\n\n-- External Knowledge: {data['evidence']}\n\n"
+ instruction += "-- Using valid SQLite and understanding External Knowledge, answer the following questions for the tables provided above.\n\n"
+ instruction += f"Question: {data['question']}\n"
+ return instruction
+
+ prompts = []
+
+
+ for task_id, task_data in self.tasks.items():
+
+ prompt = construct_prompt(task_data)
+
+ prompts.append(
+ dict(
+ task_id = task_id,
+ prompt = refine_text(prompt)
+ )
+ )
+ return prompts
+
+ def postprocess_generation(self, generation):
+ """
+ Postprocess the generations.
+ """
+
+ def one_line_sql(response):
+
+ response = program_extract(response, "sql", "last").strip()
+
+ lines = response.splitlines()
+ lines = [l.strip() for l in lines if l.strip()]
+ sql = " ".join(lines)
+
+ return sql
+
+ result = dict(
+ task_id = generation['task_id'],
+ completion_id = generation['completion_id'],
+ solution = one_line_sql(generation['completion'])
+ )
+
+ return result
+
+ def process_results(self, solution):
+ """
+ Takes the list of LM generations and evaluates them against the test cases
+ """
+
+ task_data = self.tasks[solution['task_id']]
+
+ db_path = self.database + f"/{task_data['db_id']}/{task_data['db_id']}.sqlite"
+
+ result, passed, sql_return = check_correctness(
+ solution['solution'],
+ task_data['sql'],
+ db_path,
+ self.time_out,
+ "set_match"
+ )
+
+ return dict(
+ task_id = solution['task_id'],
+ completion_id = solution['completion_id'],
+ passed = passed,
+ result = result,
+ solution = solution['solution'],
+ sql_return = sql_return
+ ) \ No newline at end of file
diff --git a/code_eval/OpenCodeEval/benchmark/HumanEval.py b/code_eval/OpenCodeEval/benchmark/HumanEval.py
new file mode 100644
index 0000000..3c3aece
--- /dev/null
+++ b/code_eval/OpenCodeEval/benchmark/HumanEval.py
@@ -0,0 +1,114 @@
+import os
+from typing import Literal
+
+from OpenCodeEval.benchmark.base import Benchmark, PYTHON_STOP, PYTHON_IMPORTS
+from OpenCodeEval.utils import refine_text, stream_jsonl, program_extract
+from OpenCodeEval.eval.func_eval import check_correctness
+from OpenCodeEval.eval.sanitize import sanitize
+
+class HumanEval(Benchmark):
+
+ name: str = "HumanEval"
+
+ imports_code = PYTHON_IMPORTS
+ chat_stop = PYTHON_STOP
+ base_stop = ["\ndef ", "\nclass ", "\nimport ", "\nfrom ", "\nassert "]
+
+ def __init__(
+ self,
+ split: Literal["base", "hard"] = "base",
+ time_out: float = 3.0,
+ prompt_type: str = "Completion"
+ ):
+
+ super().__init__()
+
+ self.split = split
+ self.time_out = time_out
+ self.prompt_type = prompt_type
+
+ self.path = os.path.join(self.path, f"{self.name}/{self.split}.jsonl")
+
+ self.tasks = self.get_task()
+
+ def get_task(self):
+ """
+ Get the task data from the jsonl file into a dictionary.
+ """
+
+ tasks = {}
+
+ for task_data in stream_jsonl(filename=self.path):
+
+ task_id = int(task_data["task_id"].split("/")[-1])
+
+ tasks[task_id] = task_data
+
+ return tasks
+
+ def get_prompt(self):
+ """
+ Builds the prompt for the LM to generate from.
+ """
+
+ assert self.prompt_type == "Completion", "Prompt type must be Completion for HumanEval"
+
+ prompts = []
+ for task_id, task_data in self.tasks.items():
+ prompts.append(
+ dict(
+ task_id = task_id,
+ prompt = refine_text(task_data['prompt'])
+ )
+ )
+ return prompts
+
+ def postprocess_generation(self, generation):
+ """
+ Postprocess the generations.
+ """
+
+ entry_point = self.tasks[generation['task_id']]["entry_point"]
+
+ try:
+ completion = '\n'.join(generation['completion'].splitlines()[-200:])
+
+ if '</think>' in completion:
+ completion = completion.split('</think>')[1]
+
+ solution = sanitize(completion, entry_point)
+ except Exception:
+ solution = program_extract(generation['completion'], program="python", mode="all")
+
+ result = dict(
+ task_id = generation['task_id'],
+ completion_id = generation['completion_id'],
+ solution = solution
+ )
+
+ return result
+
+ def process_results(self, solution):
+ """
+ Takes the list of LM generations and evaluates them against the test cases
+ """
+
+ task_data = self.tasks[solution['task_id']]
+
+ code = (
+ "\n".join(self.imports_code) + "\n"
+ + task_data["prompt"] + "\n"
+ + " pass\n" + "\n"
+ + solution['solution'] + "\n"
+ + task_data['test'] + "\n"
+ + f"check({task_data['entry_point']})"
+ )
+
+ result = check_correctness(
+ solution['task_id'],
+ solution['completion_id'],
+ code,
+ self.time_out
+ )
+
+ return result \ No newline at end of file
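An end-to-end sketch for one HumanEval task (editorial example); it assumes the HumanEval jsonl is present under the benchmark data directory, and the completion is a dummy stand-in for model output.

    from OpenCodeEval.benchmark.HumanEval import HumanEval

    bench = HumanEval(split="base")
    prompts = bench.get_prompt()
    generation = {"task_id": prompts[0]["task_id"], "completion_id": 0,
                  "completion": "def has_close_elements(numbers, threshold):\n    return False\n"}
    solution = bench.postprocess_generation(generation)
    print(bench.process_results(solution)["passed"])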
diff --git a/code_eval/OpenCodeEval/benchmark/LeetCode.py b/code_eval/OpenCodeEval/benchmark/LeetCode.py
new file mode 100644
index 0000000..97b3489
--- /dev/null
+++ b/code_eval/OpenCodeEval/benchmark/LeetCode.py
@@ -0,0 +1,121 @@
+import os
+from typing import Literal
+from loguru import logger
+
+from OpenCodeEval.benchmark.base import Benchmark, PYTHON_IMPORTS, LEETCODE_IMPORTS, PYTHON_STOP
+from OpenCodeEval.utils import refine_text, stream_jsonl
+from OpenCodeEval.eval.func_eval import check_correctness
+from OpenCodeEval.eval.sanitize import sanitize
+class LeetCode(Benchmark):
+
+ name: str = "LeetCode"
+
+ imports_code = PYTHON_IMPORTS + LEETCODE_IMPORTS
+ chat_stop = PYTHON_STOP
+ base_stop = ["\ndef ", "\nclass ", "\nimport ", "\nfrom ", "\nassert "]
+
+ def __init__(
+ self,
+ split: Literal["contest", "train", "validation", "test"] = "contest",
+ time_out: float = 3.0,
+ prompt_type: Literal["Completion", "Instruction"] = "Instruction"
+ ):
+
+ super().__init__()
+
+ self.split = split
+ self.time_out = time_out
+
+ self.prompt_type = prompt_type
+ if self.split != "contest" and self.prompt_type == "Completion":
+ logger.error(f"Completion prompt type not support {self.split} split")
+
+ self.path = os.path.join(self.path, f"{self.name}/{self.split}.jsonl")
+ self.tasks = self.get_task()
+
+ def get_task(self):
+ """
+ Get the task data from the jsonl file into a dictionary.
+ """
+
+ tasks = {}
+
+ for task_data in stream_jsonl(filename=self.path):
+
+ if self.split == "contest":
+ task_id = int(task_data["meta"]["questionId"])
+ else:
+ task_id = int(task_data["meta"]["question_id"])
+ tasks[task_id] = task_data
+
+ return tasks
+
+ def get_prompt(self):
+ """
+ Builds the prompt for the LM to generate from.
+ """
+
+ prompts = []
+ for task_id, task_data in self.tasks.items():
+
+ if self.split == "contest":
+ if self.prompt_type == "Completion":
+ prompt = task_data['prompt']
+ elif self.prompt_type == "Instruction":
+ prompt = task_data['prompt_sft']
+ else:
+ prompt = task_data['meta']['query']
+
+ prompts.append(
+ dict(
+ task_id = task_id,
+ prompt = refine_text(prompt)
+ )
+ )
+
+ return prompts
+
+ def postprocess_generation(self, generation):
+ """
+ Postprocess the generations.
+ """
+
+ return dict(
+ task_id = generation['task_id'],
+ completion_id = generation['completion_id'],
+ solution = sanitize(
+ text = generation['completion'],
+ entrypoint = "Solution",
+ )
+ )
+
+ def process_results(self, solution):
+ """
+ Takes the list of LM generations and evaluates them against the test cases
+ """
+
+ task_data = self.tasks[solution['task_id']]
+
+ if self.split == "contest":
+ code = (
+ "\n".join(self.imports_code) + "\n\n"
+ + solution['solution'] + "\n\n"
+ + task_data['test']
+ )
+ else:
+ code = (
+ "\n".join(self.imports_code) + "\n\n"
+ + task_data['meta']['lang_code'] + "\n"
+ + " pass\n" + "\n"
+ + solution['solution'] + "\n"
+ + task_data['test'] + "\n"
+ + f"check({task_data['entry_point']})"
+ )
+
+ result = check_correctness(solution['task_id'],
+ solution['completion_id'],
+ code,
+ self.time_out)
+
+ return result \ No newline at end of file
diff --git a/code_eval/OpenCodeEval/benchmark/LiveCodeBench.py b/code_eval/OpenCodeEval/benchmark/LiveCodeBench.py
new file mode 100644
index 0000000..8e0ccd9
--- /dev/null
+++ b/code_eval/OpenCodeEval/benchmark/LiveCodeBench.py
@@ -0,0 +1,76 @@
+import os
+from typing import Literal
+
+from loguru import logger
+
+from OpenCodeEval.benchmark.base import Benchmark, PYTHON_STOP, PYTHON_IMPORTS
+from OpenCodeEval.utils import refine_text, stream_jsonl, program_extract
+from OpenCodeEval.eval.func_eval import check_correctness
+from OpenCodeEval.eval.sanitize import sanitize
+
+class LiveCodeBench(Benchmark):
+
+ name: str = "LiveCodeBench"
+ path: str = None
+
+ platform_dict = dict(
+ atcoder = 1,
+ codeforces = 2,
+ leetcode = 3,
+ )
+
+ def __init__(
+ self,
+ split: Literal["v1", "v2", "v3", "v4", "v5"] = "v5",
+ time_out: float = 3.0,
+ prompt_type: str = "Instruction"
+ ):
+
+ super().__init__()
+
+ self.split = split
+ self.time_out = time_out
+ self.prompt_type = prompt_type
+
+ self.path = os.path.join(self.path, self.name)
+
+ self.tasks = self.get_task()
+
+ def get_task_id(self, data):
+ """
+ Get the task id for the task.
+ """
+
+ from datetime import datetime
+
+ date_id = datetime.fromisoformat(data['contest_date'])
+
+ # reformat the date to YYYYMMDD
+ date_id = date_id.strftime("%Y%m%d")
+
+ if data['platform'] == 'atcoder':
+
+ paltform_id = "1"
+ contest, letter = data['question_id'].split('_')
+ contest = ''.join(token for token in contest if token.isdigit())
+ contest = contest.zfill(4)
+
+ task_id = paltform_id + contest + str(ord(letter) - ord('a') + 1)
+
+ elif data['platform'] == 'codeforces':
+ paltform_id = "2"
+ contest, letter = data['question_id'].split('_')
+ task_id = paltform_id + contest + str(ord(letter) - ord('A') + 1)
+
+ elif data['platform'] == 'leetcode':
+ paltform_id = "3"
+ task_id = paltform_id + data['question_id'] + "0"
+
+ else:
+ logger.error(f"Invalid platform: {data['platform']}")
+
+ return int(task_id)
+
+ def get_task(self):
+ """
+ Get the task data from the jsonl file into a dictionary.
+ """
+
+ version = int(self.split.split('v')[1])
+
+ for i in range(1, version + 1):
+ \ No newline at end of file
diff --git a/code_eval/OpenCodeEval/benchmark/MBPP.py b/code_eval/OpenCodeEval/benchmark/MBPP.py
new file mode 100644
index 0000000..0242893
--- /dev/null
+++ b/code_eval/OpenCodeEval/benchmark/MBPP.py
@@ -0,0 +1,126 @@
+import os
+import sys
+
+ROOT = os.path.dirname(os.path.abspath(__file__))
+
+from OpenCodeEval.benchmark.base import Benchmark, PYTHON_STOP, PYTHON_IMPORTS
+from OpenCodeEval.utils import refine_text, stream_jsonl, program_extract
+from OpenCodeEval.eval.func_eval import check_correctness
+from OpenCodeEval.eval.sanitize import sanitize
+
+class MBPP(Benchmark):
+
+ name: str = "MBPP"
+
+ imports_code = PYTHON_IMPORTS
+ chat_stop = PYTHON_STOP
+ base_stop = ['\n"""', "\nassert"]
+ # TODO: add more stop words, e.g. "\nif __name__", "\ndef main(", "\nprint(", '\n```\n']
+
+ def __init__(
+ self,
+ split: str = "base",
+ time_out: float = 3.0,
+ prompt_type: str = "Instruction"
+ ):
+
+ super().__init__()
+
+ self.split = split
+ self.time_out = time_out
+ self.prompt_type = prompt_type
+
+ self.path = os.path.join(self.path, f"{self.name}/data.jsonl")
+
+ self.tasks = self.get_task()
+
+ def get_task(self):
+ """
+ Get the task data from the jsonl file into a dictionary.
+ """
+
+ tasks = {}
+
+ for task_data in stream_jsonl(filename=self.path):
+
+ task_id = int(task_data["task_id"])
+
+ tasks[task_id] = task_data
+
+ return tasks
+
+ def format_prompt(self,
+ problem: str,
+ test: str,
+ ) -> str:
+ problem = f"You are an expert Python programmer, and here is your task:\n{problem}"
+ test = f"Your code should pass the test:\n{test}"
+ prompt = problem + "\n" + test
+ return prompt
+
+ def get_prompt(self):
+ """
+ Builds the prompt for the LM to generate from.
+ """
+
+ assert self.prompt_type == "Instruction", "Prompt type must be Instruction for MBPP"
+
+ prompts = []
+ for task_id, task_data in self.tasks.items():
+ prompts.append(
+ dict(
+ task_id = task_id,
+ prompt = refine_text(self.format_prompt(task_data["text"], task_data["test_list"][0]))
+ )
+ )
+ return prompts
+
+ def postprocess_generation(self, generation):
+ """
+ Postprocess the generations.
+ """
+
+ entry_point = self.tasks[generation['task_id']]["entry_point"]
+
+ try:
+ # generation['completion'] = program_extract(generation['completion'], program="python", mode="last")
+ solution = sanitize(generation['completion'], entry_point)
+ # solution = solution.replace("func0", entry_point)
+ except Exception:
+ solution = program_extract(generation['completion'], program="python", mode="all")
+
+ return dict(
+ task_id = generation['task_id'],
+ completion_id = generation['completion_id'],
+ solution = solution
+ )
+
+
+ def process_results(self, solution):
+ """Takes the list of LM generations and evaluates them against ground truth references,
+ returning the metric for the generations.
+ :param generations: list(list(str))
+ list of lists containing generations
+ :param references: list(str)
+ list of str containing refrences
+ """
+
+ task_data = self.tasks[solution['task_id']]
+
+ if self.split == "base":
+ test_code = "\n".join(task_data['test_imports']) + "\n\n" + "\n".join(task_data['test_list'])
+ elif self.split == "plus":
+ test_code = "\n".join(task_data['test_imports']) + "\n\n" + task_data['test']
+
+ code = (
+ "\n".join(self.imports_code) + "\n"
+ + solution['solution'] + "\n"
+ + test_code
+ )
+
+ result = check_correctness(solution['task_id'],
+ solution['completion_id'],
+ code,
+ self.time_out)
+
+ return result \ No newline at end of file
diff --git a/code_eval/OpenCodeEval/benchmark/Spider.py b/code_eval/OpenCodeEval/benchmark/Spider.py
new file mode 100644
index 0000000..dc18afa
--- /dev/null
+++ b/code_eval/OpenCodeEval/benchmark/Spider.py
@@ -0,0 +1,118 @@
+import os
+import json
+
+from loguru import logger
+from typing import Literal
+
+from OpenCodeEval.benchmark.base import Benchmark
+from OpenCodeEval.utils import refine_text, program_extract, markdown_extract, stream_jsonl
+from OpenCodeEval.eval.sql_test import check_correctness
+
+class Spider(Benchmark):
+
+ name: str = "Spider"
+
+ def __init__(
+ self,
+ split: Literal["train", "dev"] = "dev",
+ time_out: float = 3.0,
+ prompt_type: str = "Instruction"
+ ):
+
+ super().__init__()
+
+ self.split = split
+ self.time_out = time_out
+ self.prompt_type = prompt_type
+
+ if self.prompt_type == "Completion":
+ logger.error("Completion prompt type not supported for Spider")
+
+ self.database = os.path.join(self.path, f"{self.name}/{self.split}/database")
+ self.path = os.path.join(self.path, f"{self.name}/{self.split}/data.jsonl")
+
+ self.tasks = self.get_task()
+
+ def get_task(self):
+ """
+ Get the task data from the json file into a dictionary.
+ """
+
+ tasks = {}
+
+ for task_data in stream_jsonl(filename = self.path):
+
+ tasks[int(task_data['id'])] = task_data
+
+ return tasks
+
+ def get_prompt(self):
+ """
+ Builds the prompt for the LM to generate from.
+ """
+
+ prompts = []
+
+
+ for task_id, task_data in self.tasks.items():
+
+ prompt = task_data['instruction']
+
+ prompts.append(
+ dict(
+ task_id = task_id,
+ prompt = refine_text(prompt)
+ )
+ )
+ return prompts
+
+ def postprocess_generation(self, generation):
+ """
+ Postprocess the generations.
+ """
+
+ solution = ' '.join(program_extract(
+ text = generation['completion'],
+ program = 'sql',
+ mode = 'last').splitlines()
+ )
+
+ if solution == "":
+ solution = ' '.join(markdown_extract(
+ text = generation['completion'],
+ mode = 'last').splitlines()
+ )
+
+ result = dict(
+ task_id = generation['task_id'],
+ completion_id = generation['completion_id'],
+ solution = solution
+ )
+
+ return result
+
+ def process_results(self, solution):
+ """
+ Takes the list of LM generations and evaluates them against the test cases
+ """
+
+ task_data = self.tasks[solution['task_id']]
+
+ db_path = os.path.join(self.database, f"{task_data['db_id']}/{task_data['db_id']}.sqlite")
+
+ result, passed,sql_return = check_correctness(
+ solution['solution'],
+ task_data['output'],
+ db_path,
+ self.time_out,
+ "exact_match"
+ )
+
+ return dict(
+ task_id = solution['task_id'],
+ completion_id = solution['completion_id'],
+ passed = passed,
+ result = result,
+ solution = solution['solution'],
+ sql_return = sql_return
+ ) \ No newline at end of file
diff --git a/code_eval/OpenCodeEval/benchmark/__init__.py b/code_eval/OpenCodeEval/benchmark/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/code_eval/OpenCodeEval/benchmark/__init__.py
diff --git a/code_eval/OpenCodeEval/benchmark/base.py b/code_eval/OpenCodeEval/benchmark/base.py
new file mode 100644
index 0000000..4bd9750
--- /dev/null
+++ b/code_eval/OpenCodeEval/benchmark/base.py
@@ -0,0 +1,124 @@
+import os
+import sys
+
+ROOT = os.path.dirname(os.path.abspath(__file__))
+
+PYTHON_STOP = [ "\nif __name__",
+ "\ndef main(",
+ "\nprint("
+ ]
+
+PYTHON_IMPORTS = [ "import math",
+ "import re",
+ "import sys",
+ "import copy",
+ "import datetime",
+ "import itertools",
+ "import collections",
+ "import heapq",
+ "import functools",
+ "import hashlib",
+ "import numpy",
+ "import numpy as np",
+ "import string",
+ "from typing import *",
+ "from collections import *"
+ ]
+
+LEETCODE_IMPORTS = [
+ 'from typing import *',
+ 'from functools import *',
+ 'from collections import *',
+ 'from itertools import *',
+ 'from heapq import *',
+ 'from bisect import *',
+ 'from string import *',
+ 'from operator import *',
+ 'from math import *',
+ 'import math',
+ 'import datetime',
+ "inf = float('inf')",
+]
+
+from abc import ABC, abstractmethod
+
+class Benchmark(ABC):
+
+ name: str = None
+ split: str = None
+ path: str = os.path.abspath(os.path.join(os.path.dirname(__file__), "data/"))
+
+ imports_code = []
+ chat_stop = []
+ base_stop = []
+
+ def __init__(self):
+ """
+ Base class for benchmarks; subclasses load their tasks and define
+ prompting, post-processing, and evaluation.
+ """
+ pass
+
+ def fewshot_examples(self):
+ """Loads and returns the few-shot examples for the task if they exist."""
+ pass
+
+ @abstractmethod
+ def get_task(self):
+ """Builds the task for the LM to generate from.
+ """
+ pass
+
+ @abstractmethod
+ def get_prompt(self, doc):
+ """Builds the prompt for the LM to generate from.
+ :param doc: dict[str: str]
+ sample from the test dataset
+ """
+ pass
+
+
+ def get_reference(self, doc):
+ """Builds the reference solution for the doc.
+ :param doc: dict[str: str]
+ sample from the test dataset
+ """
+ pass
+
+ @abstractmethod
+ def postprocess_generation(self, generation):
+ """Defines the postprocessing for a LM generation.
+ :param generation: dict
+ code generation from LM for a single task
+ """
+ pass
+
+ @abstractmethod
+ def process_results(self, solution):
+ """Takes a post-processed LM solution and evaluates it against the
+ ground-truth test cases, returning the result as in {"metric_name": result}.
+ :param solution: dict
+ post-processed solution for a single task
+ :return: dict[str: float]
+ """
+ pass
+
+ @staticmethod
+ def _stop_at_stop_token(decoded_string, stop_tokens):
+ """
+ Produces the prefix of decoded_string that ends at the first occurrence of
+ a stop_token.
+ WARNING: the decoded_string *must not* include the prompt, which may have stop tokens
+ itself.
+ """
+ min_stop_index = len(decoded_string)
+ for stop_token in stop_tokens:
+ stop_index = decoded_string.find(stop_token)
+ if stop_index != -1 and stop_index < min_stop_index:
+ min_stop_index = stop_index
+ return decoded_string[:min_stop_index] \ No newline at end of file
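A small sketch of the stop-token helper together with PYTHON_STOP (editorial example): everything from the first stop token onward is dropped.

    from OpenCodeEval.benchmark.base import Benchmark, PYTHON_STOP

    text = "def add(a, b):\n    return a + b\n\nif __name__ == '__main__':\n    print(add(1, 2))\n"
    print(Benchmark._stop_at_stop_token(text, PYTHON_STOP))  # keeps only the function definition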
diff --git a/code_eval/OpenCodeEval/benchmark/mbpp.py b/code_eval/OpenCodeEval/benchmark/mbpp.py
new file mode 100644
index 0000000..ee522a2
--- /dev/null
+++ b/code_eval/OpenCodeEval/benchmark/mbpp.py
@@ -0,0 +1,153 @@
+import os
+
+from OpenCodeEval.benchmark.base import Benchmark, PYTHON_STOP, PYTHON_IMPORTS
+from OpenCodeEval.utils import refine_text, stream_jsonl
+from OpenCodeEval.eval.func_eval import check_correctness
+from OpenCodeEval.eval.sanitize import sanitize
+
+from typing import List, Literal
+
+class mbpp(Benchmark):
+
+ name: str = "mbpp"
+
+ imports_code = PYTHON_IMPORTS
+ chat_stop = PYTHON_STOP
+ base_stop = ['\n"""', "\nassert"]
+
+ def __init__(
+ self,
+ split: Literal["full", "sanitized"] = "full",
+ time_out: float = 3.0,
+ prompt_type: str = "Instruction"
+ ):
+
+ super().__init__()
+
+ self.split = split
+ self.time_out = time_out
+ self.prompt_type = prompt_type
+
+ self.path = os.path.join(self.path, f"{self.name}/{self.split}.jsonl")
+ self.tasks = self.get_task()
+
+ self.few_shots_prompt = self.get_few_shots_prompts() if split == "full" else ""
+
+ def get_task(self):
+ """
+ Get the task data from the jsonl file into a dictionary.
+ """
+
+ tasks = {}
+
+ for task_data in stream_jsonl(filename = self.path):
+ task_id = int(task_data["task_id"])
+
+ task_data['text'] = refine_text(task_data['text'])
+
+ tasks[task_id] = task_data
+
+ return tasks
+
+ def fewshot_examples(self):
+
+ few_shots_start = 1
+ few_shots_end = 4
+
+ few_shots = []
+
+ for task_id, task_data in self.tasks.items():
+ if task_id >= few_shots_start and task_id < few_shots_end:
+ few_shots.append(task_data)
+
+ return few_shots
+
+ def format_prompt(self,
+ problem: str,
+ tests: List[str],
+ code: str = None
+ ) -> str:
+ problem = f"You are an expert Python programmer, and here is your task:\n{problem}"
+ test = "\n".join(tests)
+ test = f"Your code should pass these tests:\n{test}\n"
+ prompt = problem + test
+ if code:
+ code = refine_text(code)
+ code = f"\n```python\n{code}\n```\n"
+ prompt = prompt + code
+ else:
+ prompt = prompt + "\n```python\n"
+ return prompt
+
+ def get_few_shots_prompts(self):
+
+ few_shots_prompts = []
+ for few_shot in self.fewshot_examples():
+ few_shots_prompts.append(self.format_prompt(few_shot["text"], few_shot["test_list"], few_shot["code"]))
+
+ return '\n'.join(few_shots_prompts)
+
+ def get_prompt(self):
+ """
+ Builds the prompt for the LM to generate from.
+ """
+
+ assert self.prompt_type == "Instruction", "Prompt type must be Instruction for mbpp"
+
+ if self.split == "full":
+ test_start = 10
+ test_end = 510
+ elif self.split == "sanitized":
+ test_start = 0
+ test_end = 974
+
+ prompts = []
+
+ for task_id, task_data in self.tasks.items():
+ if task_id >= test_start and task_id < test_end:
+
+ prompt = self.few_shots_prompt + '\n' + self.format_prompt(task_data["text"], task_data["test_list"])
+ prompts.append({
+ 'task_id': task_id,
+ 'prompt': prompt
+ })
+
+ return prompts
+
+ def postprocess_generation(self, generation):
+ """
+ Postprocess the generations.
+ """
+
+ if generation['completion'].startswith(self.few_shots_prompt):
+ generation['completion'] = generation['completion'][len(self.few_shots_prompt):]
+
+ # if "```python" not in generation['completion']:
+ # generation['completion'] = ""
+
+ return dict(
+ task_id = generation['task_id'],
+ completion_id = generation['completion_id'],
+ solution = sanitize(generation['completion'])
+ )
+
+ def process_results(self, solution):
+ """
+ Takes the list of LM generations and evaluates them against the test cases
+ """
+
+ task_data = self.tasks[solution['task_id']]
+
+ code = (
+ "\n".join(self.imports_code) + "\n"
+ + task_data['test_setup_code'] + "\n"
+ + solution['solution'] + "\n"
+ + "\n".join(task_data['test_list'])
+ )
+
+ result = check_correctness(solution['task_id'],
+ solution['completion_id'],
+ code,
+ self.time_out)
+
+ return result
diff --git a/code_eval/OpenCodeEval/benchmark/understandml.py b/code_eval/OpenCodeEval/benchmark/understandml.py
new file mode 100644
index 0000000..590db3f
--- /dev/null
+++ b/code_eval/OpenCodeEval/benchmark/understandml.py
@@ -0,0 +1,152 @@
+import os
+
+from OpenCodeEval.benchmark.base import Benchmark, PYTHON_STOP, PYTHON_IMPORTS
+from OpenCodeEval.utils import program_extract, stream_jsonl
+from OpenCodeEval.eval.func_eval import check_correctness
+from OpenCodeEval.eval.sanitize import sanitize
+
+from typing import List, Literal
+
+from typing import *
+from functools import *
+from collections import *
+from itertools import *
+from heapq import *
+from bisect import *
+from string import *
+from operator import *
+from math import *
+
+import numpy as np
+from numpy import *
+
+import datetime
+import copy
+
+inf = float('inf')
+
+if os.environ.get("MAX_LINES"):
+ MAX_LINES = int(os.environ.get("MAX_LINES"))
+else:
+ MAX_LINES = 200
+
+def base_prompt(data):
+ prompt = 'You are an expert Python programmer, and here is your task:\n'
+ prompt = prompt + f'# Task: {data["title"]}\n'
+ prompt = prompt + f'# Description:\n{data["description"]}\n'
+ # prompt = prompt + f'# Examples:\n'
+ # for example_idx, (example, reasoning) in enumerate(zip(data["examples"], data["reasoning"])):
+ # prompt = prompt + f'## Example {example_idx + 1}:\n'
+ # prompt = prompt + f'### Input:\n{example["input"]}\n'
+ # prompt = prompt + f'### Output:\n{example["output"]}\n'
+ # prompt = prompt + f'### Reasoning:\n{reasoning}\n'
+ input_code = (data["import_code"] + "\n" + data["starter_code"]).strip()
+ prompt = prompt + f'# Your code should start with:\n```python\n{input_code}\n```\n'
+ if data['output_constrains'].strip():
+ prompt = prompt + f'# Output Constraints:\n{data["output_constrains"].strip()}\n'
+
+ return prompt
+
+
+
+class understandml(Benchmark):
+
+ name: str = "understandml"
+
+ imports_code = PYTHON_IMPORTS
+ chat_stop = PYTHON_STOP
+ base_stop = ['\n"""', "\nassert"]
+
+ def __init__(
+ self,
+ split: Literal["human", "model"] = "model",
+ time_out: float = 3.0,
+ prompt_type: str = "Instruction"
+ ):
+
+ super().__init__()
+
+ self.split = split
+ self.time_out = time_out
+ self.prompt_type = prompt_type
+
+ self.path = os.path.join(self.path, f"{self.name}/{self.split}_benchmark.jsonl")
+ self.tasks = self.get_task()
+
+ def get_task(self):
+ """
+ Get the task data from the jsonl file into a dictionary.
+ """
+
+ tasks = {}
+
+ for task_data in stream_jsonl(filename = self.path):
+ task_id = int(task_data["id"])
+
+ tasks[task_id] = task_data
+
+ return tasks
+
+ def get_prompt(self):
+ """
+ Builds the prompt for the LM to generate from.
+ """
+
+ assert self.prompt_type == "Instruction", "Prompt type must be Instruction for understandml"
+
+ prompts = []
+
+ for task_id, task_data in self.tasks.items():
+
+ prompt = base_prompt(task_data)
+ prompts.append({
+ 'task_id': task_id,
+ 'prompt': prompt
+ })
+
+ return prompts
+
+ def postprocess_generation(self, generation):
+ """
+ Postprocess the generations.
+ """
+
+ entry_point = self.tasks[generation['task_id']]["entry_point"]
+
+ try:
+ completion = '\n'.join(generation['completion'].splitlines()[-MAX_LINES:])
+
+ if '</think>' in completion:
+ completion = completion.split('</think>')[1]
+
+ solution = sanitize(completion, entry_point)
+ except Exception:
+ solution = program_extract(generation['completion'], program="python", mode="all")
+
+ result = dict(
+ task_id = generation['task_id'],
+ completion_id = generation['completion_id'],
+ solution = solution
+ )
+
+ return result
+
+ def process_results(self, solution):
+ """
+ Takes the list of LM generations and evaluates them against the test cases
+ """
+
+ task_data = self.tasks[solution['task_id']]
+
+ code = (
+ task_data['import_code'] + "\n"
+ + solution['solution'] + "\n"
+ + "\n".join(task_data['test_cases'])
+ )
+
+ result = check_correctness(solution['task_id'],
+ solution['completion_id'],
+ code,
+ self.time_out)
+
+ return result \ No newline at end of file
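A sketch of base_prompt on a hand-made task record (editorial example; the field values are invented for illustration, and importing the module pulls in numpy).

    from OpenCodeEval.benchmark.understandml import base_prompt

    task = {"title": "Reverse a string",
            "description": "Return the input string reversed.",
            "import_code": "",
            "starter_code": "def reverse_string(s: str) -> str:",
            "output_constrains": "Return a str."}
    print(base_prompt(task))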
diff --git a/code_eval/OpenCodeEval/data/dataset.sh b/code_eval/OpenCodeEval/data/dataset.sh
new file mode 100755
index 0000000..0e85f8f
--- /dev/null
+++ b/code_eval/OpenCodeEval/data/dataset.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# List of URLs for the files to download
+urls=(
+ "https://github.com/bigcode-project/bigcodebench-annotation/releases/download/v0.1.1/BigCodeBench-Hard.jsonl.gz"
+ "https://github.com/bigcode-project/bigcodebench-annotation/releases/download/v0.1.1/BigCodeBench.jsonl.gz"
+ "https://github.com/evalplus/humanevalplus_release/releases/download/v0.1.10/HumanEvalPlus.jsonl.gz"
+ "https://github.com/evalplus/mbppplus_release/releases/download/v0.2.0/MbppPlus.jsonl.gz"
+ "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz"
+)
+
+# Download and unzip each file
+for url in "${urls[@]}"; do
+ # Get the file name
+ filename=$(basename "$url")
+
+ # Remove any existing archive and extracted file to avoid duplicates
+ [ -f "$filename" ] && rm "$filename"
+ [ -f "${filename%.gz}" ] && rm "${filename%.gz}"
+
+ echo "Downloading $url..."
+ wget "$url"
+
+ echo "Unzipping $filename..."
+ gunzip "$filename"
+done
+
+echo "All files have been downloaded and unzipped."
diff --git a/code_eval/OpenCodeEval/eval/__init__.py b/code_eval/OpenCodeEval/eval/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/code_eval/OpenCodeEval/eval/__init__.py
diff --git a/code_eval/OpenCodeEval/eval/func_eval.py b/code_eval/OpenCodeEval/eval/func_eval.py
new file mode 100644
index 0000000..493c0c6
--- /dev/null
+++ b/code_eval/OpenCodeEval/eval/func_eval.py
@@ -0,0 +1,232 @@
+import io
+import os
+
+import signal
+import tempfile
+import platform
+import contextlib
+import faulthandler
+import multiprocessing
+
+from typing import Optional, Callable, Dict
+
+
+# WARNING
+# This program exists to execute untrusted model-generated code. Although
+# it is highly unlikely that model-generated code will do something overtly
+# malicious in response to this test suite, model-generated code may act
+# destructively due to a lack of model capability or alignment.
+# Users are strongly encouraged to sandbox this evaluation suite so that it
+# does not perform destructive actions on their host or network. For more
+# information on how OpenAI sandboxes its code, see the accompanying paper.
+# Once you have read this disclaimer and taken appropriate precautions,
+# uncomment line 58 and proceed at your own risk
+
+def check_correctness(task_id: int,
+ completion_id: int,
+ solution: str,
+ time_out: float,
+ ) -> Dict:
+ """
+ Evaluates the functional correctness of a completion by running the test
+ suite provided in the problem.
+
+ :param completion_id: an optional completion ID so we can match
+ the results later even if execution finishes asynchronously.
+ """
+
+ def unsafe_execute():
+
+ with create_tempdir():
+
+ # These system calls are needed when cleaning up tempdir.
+ import os
+ import shutil
+ rmtree = shutil.rmtree
+ rmdir = os.rmdir
+ chdir = os.chdir
+
+ # Disable functionalities that can make destructive changes to the test.
+ reliability_guard()
+
+ # Construct the check program and run it.
+ check_program = solution
+
+ try:
+ exec_globals = {}
+ with swallow_io():
+ with time_limit(time_out):
+ exec(check_program, exec_globals)
+ result.append("passed")
+ except TimeoutException:
+ result.append("timed out")
+ except BaseException as e:
+ result.append(f"failed: {e}")
+
+ # Needed for cleaning up.
+ shutil.rmtree = rmtree
+ os.rmdir = rmdir
+ os.chdir = chdir
+
+ manager = multiprocessing.Manager()
+ result = manager.list()
+
+ p = multiprocessing.Process(target=unsafe_execute)
+ p.start()
+ p.join(timeout=time_out + 1)
+ if p.is_alive():
+ p.kill()
+
+ if not result:
+ result.append("timed out")
+
+ return dict(
+ task_id = task_id,
+ completion_id = completion_id,
+ passed = result[0] == "passed",
+ result = result[0],
+ solution = solution
+ )
+
+
+@contextlib.contextmanager
+def time_limit(seconds: float):
+ def signal_handler(signum, frame):
+ raise TimeoutException("Timed out!")
+ signal.setitimer(signal.ITIMER_REAL, seconds)
+ signal.signal(signal.SIGALRM, signal_handler)
+ try:
+ yield
+ finally:
+ signal.setitimer(signal.ITIMER_REAL, 0)
+
+
+@contextlib.contextmanager
+def swallow_io():
+ stream = WriteOnlyStringIO()
+ with contextlib.redirect_stdout(stream):
+ with contextlib.redirect_stderr(stream):
+ with redirect_stdin(stream):
+ yield
+
+
+@contextlib.contextmanager
+def create_tempdir():
+ with tempfile.TemporaryDirectory() as dirname:
+ with chdir(dirname):
+ yield dirname
+
+
+class TimeoutException(Exception):
+ pass
+
+
+class WriteOnlyStringIO(io.StringIO):
+ """ StringIO that throws an exception when it's read from """
+
+ def read(self, *args, **kwargs):
+ raise IOError
+
+ def readline(self, *args, **kwargs):
+ raise IOError
+
+ def readlines(self, *args, **kwargs):
+ raise IOError
+
+ def readable(self, *args, **kwargs):
+ """ Returns True if the IO object can be read. """
+ return False
+
+
+class redirect_stdin(contextlib._RedirectStream): # type: ignore
+ _stream = 'stdin'
+
+
+@contextlib.contextmanager
+def chdir(root):
+ if root == ".":
+ yield
+ return
+ cwd = os.getcwd()
+ os.chdir(root)
+ try:
+ yield
+ except BaseException as exc:
+ raise exc
+ finally:
+ os.chdir(cwd)
+
+
+def reliability_guard(maximum_memory_bytes: Optional[int] = None):
+ """
+ This disables various destructive functions and prevents the generated code
+ from interfering with the test (e.g. fork bomb, killing other processes,
+ removing filesystem files, etc.)
+
+ WARNING
+ This function is NOT a security sandbox. Untrusted code, including, model-
+ generated code, should not be blindly executed outside of one. See the
+ Codex paper for more information about OpenAI's code sandbox, and proceed
+ with caution.
+ """
+
+ if maximum_memory_bytes is not None:
+ import resource
+ resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
+ resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
+ if not platform.uname().system == 'Darwin':
+ resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
+
+ faulthandler.disable()
+
+ import builtins
+ builtins.exit = None
+ builtins.quit = None
+
+ import os
+ os.environ['OMP_NUM_THREADS'] = '1'
+
+ os.kill = None
+ os.system = None
+ os.putenv = None
+ os.remove = None
+ os.removedirs = None
+ os.rmdir = None
+ os.fchdir = None
+ os.setuid = None
+ os.fork = None
+ os.forkpty = None
+ os.killpg = None
+ os.rename = None
+ os.renames = None
+ os.truncate = None
+ os.replace = None
+ os.unlink = None
+ os.fchmod = None
+ os.fchown = None
+ os.chmod = None
+ os.chown = None
+ os.chroot = None
+ os.fchdir = None
+ os.lchflags = None
+ os.lchmod = None
+ os.lchown = None
+ os.getcwd = None
+ os.chdir = None
+
+ import shutil
+ shutil.rmtree = None
+ shutil.move = None
+ shutil.chown = None
+
+ import subprocess
+ subprocess.Popen = None # type: ignore
+
+    builtins.help = None  # avoid relying on __builtins__ being a dict
+
+ import sys
+ sys.modules['ipdb'] = None
+ sys.modules['joblib'] = None
+ sys.modules['resource'] = None
+ sys.modules['psutil'] = None
+ sys.modules['tkinter'] = None \ No newline at end of file
diff --git a/code_eval/OpenCodeEval/eval/sanitize.py b/code_eval/OpenCodeEval/eval/sanitize.py
new file mode 100644
index 0000000..a82ea65
--- /dev/null
+++ b/code_eval/OpenCodeEval/eval/sanitize.py
@@ -0,0 +1,196 @@
+"""Post-processing LLM-generated Python code implemented using AST."""
+import ast
+import traceback
+import signal
+
+from typing import Dict, List, Optional, Set, Tuple
+from OpenCodeEval.utils import refine_text
+
+def syntax_check(code, verbose = False):
+ try:
+ ast.parse(code)
+ return True
+ except (SyntaxError, MemoryError):
+ if verbose:
+ traceback.print_exc()
+ return False
+
+def extract_longest_valid_code(text: str) -> str:
+ lines = text.splitlines()
+
+ max_valid_lines = 0
+ max_valid_snippet = ""
+
+ for i in range(len(lines)):
+ for j in range(i, len(lines)):
+ current_snippet = "\n".join(lines[i:j+1])
+ if syntax_check(current_snippet):
+ valid_line_count = sum(1 for line in lines[i:j+1] if line.strip())
+ if valid_line_count > max_valid_lines:
+ max_valid_lines = valid_line_count
+ max_valid_snippet = current_snippet
+
+ return max_valid_snippet
+
+def get_deps(nodes: List[Tuple[str, ast.AST]]) -> Dict[str, Set[str]]:
+ name2deps = {}
+ for name, node in nodes:
+ deps = set()
+ stack = [node]
+ while stack:
+ current = stack.pop()
+ for child in ast.iter_child_nodes(current):
+ if isinstance(child, ast.Name):
+ deps.add(child.id)
+ elif isinstance(child, ast.Attribute):
+ deps.add(child.attr)
+ else:
+ stack.append(child)
+ name2deps[name] = deps
+ return name2deps
+
+def get_function_dependency(entrypoint: str, call_graph: Dict[str, Set[str]]) -> Set[str]:
+ visited = set()
+ to_visit = [entrypoint]
+
+ while to_visit:
+ current = to_visit.pop(0)
+ if current not in visited:
+ visited.add(current)
+ to_visit.extend(call_graph.get(current, set()) - visited)
+
+ return visited
+
+def get_definition_name(node: ast.AST) -> Optional[str]:
+ if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
+ return node.name
+ elif isinstance(node, ast.Assign):
+ targets = node.targets
+ if targets and isinstance(targets[0], ast.Name):
+ return targets[0].id
+ return None
+
+def has_return_statement(node: ast.AST) -> bool:
+ return any(isinstance(n, ast.Return) for n in ast.walk(node))
+
+def has_yield_statement(node: ast.AST) -> bool:
+ return any(isinstance(n, ast.Yield) for n in ast.walk(node))
+
+class TimeoutException(Exception): pass
+
+def _timeout_handler(signum, frame):
+ raise TimeoutException()
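+# NOTE: the signal-based sanitize below is wrapped in a string literal (effectively commented out);
+# the multiprocessing-based sanitize defined afterwards is the implementation actually used.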
+"""
+def sanitize(text: str, entrypoint: Optional[str] = None) -> str:
+    # install the signal handler for the timeout
+    signal.signal(signal.SIGALRM, _timeout_handler)
+    signal.alarm(30)  # deliver SIGALRM after 30 seconds
+
+ text = refine_text(text)
+
+ try:
+ code = extract_longest_valid_code(text)
+
+ tree = ast.parse(code)
+
+ definitions = {}
+
+ imports = []
+
+ for node in tree.body:
+ if isinstance(node, (ast.Import, ast.ImportFrom)):
+ imports.append(node)
+ elif isinstance(node, ast.ClassDef):
+ name = node.name
+ definitions[name] = ('class', node)
+ elif isinstance(node, ast.FunctionDef):
+ name = node.name
+ if has_return_statement(node) or has_yield_statement(node):
+ definitions[name] = ('function', node)
+ elif isinstance(node, ast.Assign):
+ name = get_definition_name(node)
+ if name:
+ definitions[name] = ('variable', node)
+
+ if entrypoint:
+ name2deps = get_deps([(name, node) for name, (_, node) in definitions.items()])
+ reachable = get_function_dependency(entrypoint, name2deps)
+
+ sanitized_output = []
+
+ for node in imports:
+ sanitized_output.append(ast.unparse(node))
+
+ for name, (_, node) in definitions.items():
+ if not entrypoint or name in reachable:
+ sanitized_output.append(ast.unparse(node))
+
+ return "\n".join(sanitized_output)
+
+ except Exception as e:
+ print(f"Error extracting longest valid code: {e}")
+ return ""
+ finally:
+        signal.alarm(0)  # cancel the timer
+"""
+from multiprocessing import Process, Queue
+from queue import Empty
+
+def sanitize(text: str, entrypoint: Optional[str] = None) -> str:
+ def _sanitize_worker(q, text, entrypoint):
+ try:
+            # the processing logic is unchanged from the signal-based version above
+ text = refine_text(text)
+ code = extract_longest_valid_code(text)
+ tree = ast.parse(code)
+ definitions = {}
+ imports = []
+ for node in tree.body:
+ if isinstance(node, (ast.Import, ast.ImportFrom)):
+ imports.append(node)
+ elif isinstance(node, ast.ClassDef):
+ name = node.name
+ definitions[name] = ('class', node)
+ elif isinstance(node, ast.FunctionDef):
+ name = node.name
+ if has_return_statement(node) or has_yield_statement(node):
+ definitions[name] = ('function', node)
+ elif isinstance(node, ast.Assign):
+ name = get_definition_name(node)
+ if name:
+ definitions[name] = ('variable', node)
+
+ if entrypoint:
+ name2deps = get_deps([(name, node) for name, (_, node) in definitions.items()])
+ reachable = get_function_dependency(entrypoint, name2deps)
+
+ sanitized_output = []
+ for node in imports:
+ sanitized_output.append(ast.unparse(node))
+ for name, (_, node) in definitions.items():
+ if not entrypoint or name in reachable:
+ sanitized_output.append(ast.unparse(node))
+
+ q.put("\n".join(sanitized_output))
+
+ except Exception as e:
+ print(f"Error extracting longest valid code: {e}")
+ q.put("")
+
+    # run the worker in a separate process so the whole sanitization can be timed out
+ q = Queue()
+ p = Process(target=_sanitize_worker, args=(q, text, entrypoint))
+ p.start()
+
+ try:
+        # wait up to 3 seconds for the result
+ result = q.get(timeout=3)
+
+ p.terminate()
+        p.join()  # make sure the worker process has exited
+ return result
+    except Empty:  # Queue.get raises queue.Empty when the timeout expires
+        print("Function timed out after 3 seconds")
+        p.terminate()  # forcefully terminate the worker process
+ p.join()
+ return ""
+ \ No newline at end of file
diff --git a/code_eval/OpenCodeEval/eval/sql_test.py b/code_eval/OpenCodeEval/eval/sql_test.py
new file mode 100644
index 0000000..9b0724e
--- /dev/null
+++ b/code_eval/OpenCodeEval/eval/sql_test.py
@@ -0,0 +1,187 @@
+import re
+import random
+import sqlite3
+
+from loguru import logger
+from itertools import product
+from collections import defaultdict
+from typing import Tuple, List, Set
+from func_timeout import func_timeout, FunctionTimedOut
+
+def permute_tuple(element: Tuple, perm: Tuple) -> Tuple:
+ assert len(element) == len(perm)
+ return tuple([element[i] for i in perm])
+
+
+def unorder_row(row: Tuple) -> Tuple:
+ return tuple(sorted(row, key=lambda x: str(x) + str(type(x))))
+
+
+# unorder each row in the table
+# [result_1 and result_2 have the same bag of unordered rows]
+# is a necessary condition of
+# [result_1 and result_2 being equivalent in denotation]
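+# e.g. the rows (1, 'a') and ('a', 1) unorder to the same tuple, so this check alone cannot
+# distinguish a genuine match from a column permutation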
+def quick_rej(result1: List[Tuple], result2: List[Tuple], order_matters: bool) -> bool:
+ s1 = [unorder_row(row) for row in result1]
+ s2 = [unorder_row(row) for row in result2]
+ if order_matters:
+ return s1 == s2
+ else:
+ return set(s1) == set(s2)
+
+# return whether two bags of rows are equivalent (multiset equality)
+def multiset_eq(l1: List, l2: List) -> bool:
+ if len(l1) != len(l2):
+ return False
+ d = defaultdict(int)
+ for e in l1:
+ d[e] = d[e] + 1
+ for e in l2:
+ d[e] = d[e] - 1
+ if d[e] < 0:
+ return False
+ return True
+
+def get_constraint_permutation(tab1_sets_by_columns: List[Set], result2: List[Tuple]):
+ num_cols = len(result2[0])
+ perm_constraints = [{i for i in range(num_cols)} for _ in range(num_cols)]
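+    # with at most 3 columns the full permutation space (at most 3! = 6 orderings) is cheap to enumerate directly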
+ if num_cols <= 3:
+ return product(*perm_constraints)
+
+ # we sample 20 rows and constrain the space of permutations
+ for _ in range(20):
+ random_tab2_row = random.choice(result2)
+
+ for tab1_col in range(num_cols):
+ for tab2_col in set(perm_constraints[tab1_col]):
+ if random_tab2_row[tab2_col] not in tab1_sets_by_columns[tab1_col]:
+ perm_constraints[tab1_col].remove(tab2_col)
+ return product(*perm_constraints)
+
+# check whether two denotations are equivalent
+def result_eq(result1: List[Tuple], result2: List[Tuple], order_matters: bool) -> bool:
+ if len(result1) == 0 and len(result2) == 0:
+ return True
+
+    # if the lengths differ, they are definitely different bags of rows
+ if len(result1) != len(result2):
+ return False
+
+ num_cols = len(result1[0])
+
+ # if the results do not have the same number of columns, they are different
+ if len(result2[0]) != num_cols:
+ return False
+
+ # unorder each row and compare whether the denotation is the same
+ # this can already find most pair of denotations that are different
+ if not quick_rej(result1, result2, order_matters):
+ return False
+
+ # the rest of the problem is in fact more complicated than one might think
+ # we want to find a permutation of column order and a permutation of row order,
+ # s.t. result_1 is the same as result_2
+ # we return true if we can find such column & row permutations
+ # and false if we cannot
+ tab1_sets_by_columns = [{row[i] for row in result1} for i in range(num_cols)]
+
+    # on a high level, we enumerate all possible column permutations that might make result_1 == result_2
+    # we shrink the column permutation space with get_constraint_permutation
+    # if any of these permutations makes result_1 and result_2 equivalent, then they are equivalent
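+    # e.g. if result1 stores columns as (id, name) and result2 as (name, id), the permutation (1, 0) re-aligns result2 with result1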
+ for perm in get_constraint_permutation(tab1_sets_by_columns, result2):
+ if len(perm) != len(set(perm)):
+ continue
+ if num_cols == 1:
+ result2_perm = result2
+ else:
+ result2_perm = [permute_tuple(element, perm) for element in result2]
+ if order_matters:
+ if result1 == result2_perm:
+ return True
+ else:
+ # in fact the first condition must hold if the second condition holds
+ # but the first is way more efficient implementation-wise
+ # and we use it to quickly reject impossible candidates
+ if set(result1) == set(result2_perm) and multiset_eq(result1, result2_perm):
+ return True
+ return False
+
+# postprocess the model predictions to avoid execution errors
+# e.g. removing spaces between ">" and "="
+def postprocess(query: str) -> str:
+ query = query.replace("> =", ">=").replace("< =", "<=").replace("! =", "!=")
+ return query
+
+def find_detail(reason):
+ patterns = [
+ "ambiguous column name",
+ "no such column",
+ "no such table",
+ "no such function",
+ "syntax error",
+ ]
+
+ for p in patterns:
+ if p in reason:
+ return p
+ return "others"
+
+def execute_sql(predicted_sql, ground_truth, db_path, method):
+    # Connect to the database
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+ cursor.execute(predicted_sql)
+ predicted_res = cursor.fetchall()
+ cursor.execute(ground_truth)
+ ground_truth_res = cursor.fetchall()
+ result = "failed"
+ passed = False
+
+ if predicted_res is None:
+ sql_return = [[]]
+ else:
+ sql_return = predicted_res
+
+ if method == "set_match":
+
+ if set(predicted_res) == set(ground_truth_res):
+ result = "passed"
+ passed = True
+ return result, passed, sql_return
+
+ elif method == "exact_match":
+
+ order_matters = "orderby" in re.sub(r"\s+", ground_truth.lower(), "")
+ if result_eq(predicted_res, ground_truth_res, order_matters = order_matters):
+ result = "passed"
+ passed = True
+ else:
+ passed = False
+
+ return result, passed, sql_return
+
+    else:
+        logger.error(f"Unknown evaluation method: {method}")
+        return result, passed, sql_return
+
+def check_correctness(predicted_sql, ground_truth, db_path, meta_time_out, method):
+ try:
+ result, passed, sql_return = func_timeout(
+ meta_time_out,
+ execute_sql,
+ args = (
+ predicted_sql,
+ ground_truth,
+ db_path,
+ method
+ )
+ )
+ except FunctionTimedOut:
+ result = "timeout"
+ passed = False
+ sql_return = [[]]
+ except Exception as e:
+ result = f"error:{e}"
+ passed = False
+ sql_return = [[]]
+
+ return result, passed, sql_return \ No newline at end of file
diff --git a/code_eval/OpenCodeEval/eval/unit_test.py b/code_eval/OpenCodeEval/eval/unit_test.py
new file mode 100644
index 0000000..5dd1f49
--- /dev/null
+++ b/code_eval/OpenCodeEval/eval/unit_test.py
@@ -0,0 +1,472 @@
+# The MIT License
+#
+# Copyright (c) OpenAI (https://openai.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+import io
+import os
+import sys
+import time
+import types
+import unittest
+import contextlib
+import faulthandler
+import platform
+import signal
+import tempfile
+import subprocess
+import multiprocessing
+import numpy as np
+
+from multiprocessing import Array, Value, Manager
+from typing import Any, Dict, List, Tuple, Union
+
+@contextlib.contextmanager
+def swallow_subprocess_output():
+ """Context manager to swallow stdout and stderr for subprocesses."""
+ original_popen = subprocess.Popen
+ original_run = subprocess.run
+
+ def _popen_patch(*args, **kwargs):
+ if 'capture_output' in kwargs and kwargs['capture_output']:
+ # Avoid setting stdout or stderr if capture_output is True
+ kwargs.pop('stdout', None)
+ kwargs.pop('stderr', None)
+ else:
+ kwargs.setdefault('stdout', subprocess.PIPE)
+ kwargs.setdefault('stderr', subprocess.PIPE)
+ return original_popen(*args, **kwargs)
+
+ def _run_patch(*args, **kwargs):
+ if 'capture_output' in kwargs and kwargs['capture_output']:
+ # Avoid setting stdout or stderr if capture_output is True
+ kwargs.pop('stdout', None)
+ kwargs.pop('stderr', None)
+ else:
+ kwargs.setdefault('stdout', subprocess.PIPE)
+ kwargs.setdefault('stderr', subprocess.PIPE)
+ return original_run(*args, **kwargs)
+
+ subprocess.Popen = _popen_patch
+ subprocess.run = _run_patch
+ try:
+ yield
+ finally:
+ subprocess.Popen = original_popen
+ subprocess.run = original_run
+
+@contextlib.contextmanager
+def swallow_io():
+ stream = WriteOnlyStringIO()
+ with contextlib.redirect_stdout(stream):
+ with contextlib.redirect_stderr(stream):
+ with redirect_stdin(stream):
+ with swallow_subprocess_output():
+ yield
+
+
+@contextlib.contextmanager
+def time_limit(seconds: float):
+ def signal_handler(signum, frame):
+ raise TimeoutException("Timed out!")
+
+ signal.setitimer(signal.ITIMER_REAL, seconds)
+ signal.signal(signal.SIGALRM, signal_handler)
+ try:
+ yield
+ finally:
+ signal.setitimer(signal.ITIMER_REAL, 0)
+
+
+@contextlib.contextmanager
+def create_tempdir():
+ with tempfile.TemporaryDirectory() as dirname:
+ with chdir(dirname):
+ yield dirname
+
+
+@contextlib.contextmanager
+def chdir(root):
+ if root == ".":
+ yield
+ return
+ cwd = os.getcwd()
+ os.chdir(root)
+ try:
+ yield
+ except BaseException as exc:
+ raise exc
+ finally:
+ os.chdir(cwd)
+
+
+@contextlib.contextmanager
+def safe_environment():
+ # Save original functions
+ original_kill = os.kill
+ original_killpg = os.killpg
+ original_system = os.system
+ original_subprocess_call = subprocess.call
+ original_subprocess_check_output = subprocess.check_output
+ original_subprocess_run = subprocess.run
+ original_subprocess_popen = subprocess.Popen
+ original_os_popen = os.popen
+ original_os_execv = os.execv
+ original_os_execvp = os.execvp
+ original_os_execvpe = os.execvpe
+
+ current_pid = os.getpid()
+ current_pgid = os.getpgid(current_pid)
+ manager = multiprocessing.Manager()
+ child_pids = manager.list()
+
+ def safe_kill(pid, sig):
+ try:
+ pgid = os.getpgid(pid)
+ if pid == current_pid or pid in child_pids:
+ original_kill(pid, sig)
+ else:
+ print(f"Prevented attempt to kill PID {pid} with signal {sig}")
+ except ProcessLookupError:
+ pass
+
+ def safe_killpg(pgid, sig):
+ if pgid == current_pgid or pgid in {os.getpgid(pid) for pid in child_pids}:
+ original_killpg(pgid, sig)
+ else:
+ print(f"Prevented attempt to kill PGID {pgid} with signal {sig}")
+
+ def safe_system(command):
+ print(f"Intercepted system command: {command}")
+ if 'kill' in command or 'killall' in command:
+ return 0 # Simulate successful execution without doing anything
+ return original_system(command)
+
+ def safe_subprocess_call(command, *args, **kwargs):
+ print(f"Intercepted subprocess call: {command}")
+ if 'kill' in command or 'killall' in command:
+ return 0 # Simulate successful execution without doing anything
+ return original_subprocess_call(command, *args, **kwargs)
+
+ def safe_subprocess_check_output(command, *args, **kwargs):
+ print(f"Intercepted command: {command}")
+ if 'ps' in command:
+ return b"" # Simulate no processes found
+ return original_subprocess_check_output(command, *args, **kwargs)
+
+ def safe_subprocess_run(*args, **kwargs):
+ print(f"Intercepted subprocess run command: {args}")
+ if 'kill' in args[0] or 'killall' in args[0]:
+ return subprocess.CompletedProcess(args, 0, b'', b'') # Simulate successful execution
+ return original_subprocess_run(*args, **kwargs)
+
+ class SafePopen(subprocess.Popen):
+ def __init__(self, *args, **kwargs):
+ print(f"Intercepted Popen command: {args}")
+ kwargs['preexec_fn'] = os.setsid # Start the process in a new session
+ super().__init__(*args, **kwargs)
+ child_pids.append(self.pid)
+
+ def communicate(self, *args, **kwargs):
+ try:
+ return super().communicate(*args, **kwargs)
+ except subprocess.TimeoutExpired:
+ print("Timeout expired, intercepted and returning None")
+ return None, None
+
+ def kill(self):
+ print(f"Intercepted kill call for PID {self.pid}")
+ safe_kill(self.pid, signal.SIGTERM)
+
+ def terminate(self):
+ print(f"Intercepted terminate call for PID {self.pid}")
+ safe_kill(self.pid, signal.SIGTERM)
+
+ def safe_os_popen(command):
+ print(f"Intercepted os.popen command: {command}")
+ if 'kill' in command or 'killall' in command:
+ return os.popen('echo Intercepted')
+ return original_os_popen(command)
+
+ def safe_exec(*args, **kwargs):
+ print(f"Intercepted exec command: {args}")
+
+ # Override the risky functions with the safe versions
+ os.kill = safe_kill
+ os.killpg = safe_killpg
+ os.system = safe_system
+ subprocess.call = safe_subprocess_call
+ subprocess.check_output = safe_subprocess_check_output
+ subprocess.run = safe_subprocess_run
+ subprocess.Popen = SafePopen
+ os.popen = safe_os_popen
+ os.execv = safe_exec
+ os.execvp = safe_exec
+ os.execvpe = safe_exec
+
+ try:
+ yield
+ finally:
+ for pid in child_pids:
+ try:
+ os.kill(pid, signal.SIGTERM)
+ for _ in range(10):
+ time.sleep(0.1)
+ try:
+ os.kill(pid, 0)
+ except ProcessLookupError:
+ break
+ else:
+ os.kill(pid, signal.SIGKILL)
+ except ProcessLookupError:
+ pass
+ except Exception as e:
+ print(f"Error handling process {pid}: {e}")
+
+ os.kill = original_kill
+ os.killpg = original_killpg
+ os.system = original_system
+ subprocess.call = original_subprocess_call
+ subprocess.check_output = original_subprocess_check_output
+ subprocess.run = original_subprocess_run
+ subprocess.Popen = original_subprocess_popen
+ os.popen = original_os_popen
+ os.execv = original_os_execv
+ os.execvp = original_os_execvp
+ os.execvpe = original_os_execvpe
+
+
+class TimeoutException(Exception):
+ pass
+
+
+class WriteOnlyStringIO(io.StringIO):
+ """StringIO that throws an exception when it's read from"""
+
+ def read(self, *args, **kwargs):
+ raise IOError
+
+ def readline(self, *args, **kwargs):
+ raise IOError
+
+ def readlines(self, *args, **kwargs):
+ raise IOError
+
+ def readable(self, *args, **kwargs):
+ """Returns True if the IO object can be read."""
+ return False
+
+
+class redirect_stdin(contextlib._RedirectStream): # type: ignore
+ _stream = "stdin"
+
+
+def reliability_guard(max_as_limit, max_data_limit, max_stack_limit):
+ """
+ This disables various destructive functions and prevents the generated code
+ from interfering with the test (e.g. fork bomb, killing other processes,
+ removing filesystem files, etc.)
+
+ WARNING
+    This function is NOT a security sandbox. Untrusted code, including model-
+ generated code, should not be blindly executed outside of one. See the
+ Codex paper for more information about OpenAI's code sandbox, and proceed
+ with caution.
+ """
+
+ import os
+ import time
+ from datetime import datetime
+
+ os.environ['TZ'] = 'UTC'
+ time.tzset()
+
+ os.environ["OMP_NUM_THREADS"] = "1"
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3"
+ os.environ['TF_ENABLE_ONEDNN_OPTS'] = "0"
+
+ if max_as_limit and max_data_limit and max_stack_limit:
+ import resource
+
+ max_as_limit = max_as_limit * 1024 * 1024
+ max_data_limit = max_data_limit * 1024 * 1024
+ max_stack_limit = max_stack_limit * 1024 * 1024
+
+ resource.setrlimit(
+ resource.RLIMIT_AS, (max_as_limit, max_as_limit)
+ )
+ resource.setrlimit(
+ resource.RLIMIT_DATA, (max_data_limit, max_data_limit)
+ )
+ if not platform.uname().system == "Darwin":
+ resource.setrlimit(
+ resource.RLIMIT_STACK, (max_stack_limit, max_stack_limit)
+ )
+
+ faulthandler.disable()
+
+ import builtins
+
+ builtins.exit = None
+ builtins.quit = None
+
+    try:
+        import matplotlib.pyplot as plt
+        plt.close('all')
+    except ImportError:
+        pass
+
+
+PASS = "pass"
+FAIL = "fail"
+TIMEOUT = "timeout"
+
+_SUCCESS = 0
+_FAILED = 1
+_TIMEOUT = 2
+_UNKNOWN = 3
+
+_mapping = {_SUCCESS: PASS, _FAILED: FAIL, _TIMEOUT: TIMEOUT, _UNKNOWN: None}
+
+
+def unsafe_execute(
+ code: str,
+ test_code: str,
+ timeout: float,
+ stat, # Value
+ details, # Array
+):
+ with safe_environment(), create_tempdir():
+ # These system calls are needed when cleaning up tempdir.
+ import os
+ import shutil
+ import builtins
+
+ rmtree = shutil.rmtree
+ rmdir = os.rmdir
+ chdir = os.chdir
+ # Disable functionalities that can make destructive changes to the test.
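+        # limits are given in MiB (reliability_guard converts to bytes): ~30 GiB for address space and data, 10 MiB for stack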
+ reliability_guard(max_as_limit = 30720, max_data_limit = 30720, max_stack_limit = 10)
+ module_name = "__test__"
+ new_module = types.ModuleType(module_name)
+ # Set necessary attributes for the module
+ new_module.__dict__.update({
+ '__builtins__': builtins,
+ '__file__': f"{module_name}.py",
+ '__package__': None,
+ '__doc__': None,
+ 'sys': sys,
+ 'os': os,
+ 'environ': os.environ,
+ })
+
+ try:
+ full_code = code + "\n" + test_code
+
+ with swallow_io():
+ exec(compile(full_code, f"{module_name}.py", 'exec'), new_module.__dict__)
+ sys.modules[module_name] = new_module
+ TestCases = getattr(new_module, 'TestCases')
+ loader = unittest.TestLoader()
+ suite = loader.loadTestsFromTestCase(TestCases)
+ test_result = unittest.TestResult()
+
+ with time_limit(timeout):
+ suite.run(test_result)
+
+ issues = test_result.failures + test_result.errors
+ for test, trace in issues:
+ details[test.id().split(".")[-1]] = trace
+ stat.value = _SUCCESS
+ except BaseException as e:
+ details["ALL"] = str(e)
+ stat.value = _FAILED
+ # Needed for cleaning up.
+ shutil.rmtree = rmtree
+ os.rmdir = rmdir
+ os.chdir = chdir
+
+
+import psutil
+
+def terminate_process_tree(pid):
+ try:
+ parent = psutil.Process(pid)
+ children = parent.children(recursive=True)
+ for child in children:
+ try:
+ if child.is_running():
+ os.kill(child.pid, signal.SIGKILL)
+ except psutil.NoSuchProcess:
+ continue
+ if parent.is_running():
+ os.kill(parent.pid, signal.SIGKILL)
+ except psutil.NoSuchProcess:
+ pass
+
+def check_correctness(
+ task_id: int,
+ solution_id: int,
+ solution: str,
+ test: str,
+ timeout: float,
+) -> Dict[str, Any]:
+
+ result = {
+ "task_id": task_id,
+ "solution_id": solution_id
+ }
+
+ # shared memory objects
+ stat = Value("i", _UNKNOWN)
+ manager = Manager()
+ details = manager.dict()
+
+ p = multiprocessing.Process(
+ target=unsafe_execute,
+ args=(
+ solution,
+ test,
+ timeout,
+ stat,
+ details,
+ ),
+ )
+
+ p.start()
+ p.join(timeout=timeout+1)
+
+ if p.is_alive():
+ terminate_process_tree(p.pid)
+ stat.value = _TIMEOUT
+
+ stat = _mapping[stat.value]
+ details = dict(details)
+
+ if not stat:
+ stat = TIMEOUT
+ if stat == PASS:
+ if details:
+ stat = FAIL
+
+ result["passed"] = stat == PASS
+ result["result"] = details
+ result["solution"] = solution
+
+ manager.shutdown()
+
+ return result \ No newline at end of file
diff --git a/code_eval/OpenCodeEval/factory.py b/code_eval/OpenCodeEval/factory.py
new file mode 100644
index 0000000..5d3fefc
--- /dev/null
+++ b/code_eval/OpenCodeEval/factory.py
@@ -0,0 +1,89 @@
+from loguru import logger
+
+from OpenCodeEval.benchmark.HumanEval import HumanEval
+from OpenCodeEval.benchmark.mbpp import mbpp
+from OpenCodeEval.benchmark.MBPP import MBPP
+from OpenCodeEval.benchmark.LeetCode import LeetCode
+from OpenCodeEval.benchmark.BigCodeBench import BigCodeBench
+from OpenCodeEval.benchmark.Bird import Bird
+from OpenCodeEval.benchmark.Spider import Spider
+
+from OpenCodeEval.benchmark.understandml import understandml
+
+from OpenCodeEval.backend.vllm import VllmGenerator
+from OpenCodeEval.backend.sglang import SglangGenerator
+from OpenCodeEval.backend.openai import OpenaiGenerator
+
+class BenchmarkFactory:
+
+ @staticmethod
+ def get_task(args):
+ task_map = {
+ "HumanEval": HumanEval,
+ "mbpp": mbpp,
+ "MBPP": MBPP,
+ "LeetCode": LeetCode,
+ "BigCodeBench": BigCodeBench,
+ "Bird": Bird,
+ "Spider": Spider,
+ "understandml": understandml
+ }
+
+ # Check if the task exists in the map
+ if args.task not in task_map:
+ logger.error(f"Unknown Task type: {args.task}")
+
+ # Get the corresponding class
+ task_class = task_map[args.task]
+
+ return task_class(
+ split = args.split,
+ time_out = args.time_out,
+ prompt_type = args.prompt_type
+ )
+
+class BackendFactory:
+
+ @staticmethod
+ def get_backend(args):
+ if args.backend == "vllm":
+ return VllmGenerator(
+ model_name = args.model_name,
+ model_type = args.model_type,
+ tokenizer_name = args.tokenizer_name,
+ num_gpus = args.num_gpus,
+ batch_size = args.batch_size,
+ temperature = args.temperature,
+ top_p = args.top_p,
+ num_samples = args.num_samples,
+ max_tokens = args.max_tokens,
+ seed = args.seed,
+ trust_remote_code = args.trust_remote_code,
+ )
+
+ elif args.backend == "sglang":
+ return SglangGenerator(
+ model_name = args.model_name,
+ model_type = args.model_type,
+ tokenizer_name = args.tokenizer_name,
+ num_gpus = args.num_gpus,
+ batch_size = args.batch_size,
+ temperature = args.temperature,
+ top_p = args.top_p,
+ num_samples = args.num_samples,
+ max_tokens = args.max_tokens,
+ trust_remote_code = args.trust_remote_code,
+ )
+
+ elif args.backend == "openai":
+ return OpenaiGenerator(
+ model_name = args.model_name,
+ model_type = args.model_type,
+ batch_size = args.batch_size,
+ temperature = args.temperature,
+ top_p = args.top_p,
+ num_samples = args.num_samples,
+ max_tokens = args.max_tokens,
+ )
+ else:
+ logger.error(f"Unknown Backend type: {args.backend}")
diff --git a/code_eval/OpenCodeEval/main.py b/code_eval/OpenCodeEval/main.py
new file mode 100644
index 0000000..d78ddec
--- /dev/null
+++ b/code_eval/OpenCodeEval/main.py
@@ -0,0 +1,115 @@
+import os
+import argparse
+
+from OpenCodeEval.args import get_args, check_args
+from OpenCodeEval.utils import refine_text, write_jsonl, stream_jsonl, calculate_pass_at_k
+
+from OpenCodeEval.factory import BenchmarkFactory, BackendFactory
+
+from tqdm.contrib.concurrent import process_map, thread_map
+
+def main():
+
+ parser = argparse.ArgumentParser()
+ args = get_args(parser)
+ args = check_args(args)
+
+ save_path = args.save_path
+ os.makedirs(save_path, exist_ok = True)
+
+ task = BenchmarkFactory.get_task(args)
+
+ # get prompts
+ prompts = task.get_prompt()
+
+ for prompt in prompts:
+ prompt['prompt'] = refine_text(args.prompt_prefix + prompt['prompt'] + args.prompt_suffix)
+ prompts = sorted(prompts, key = lambda x: x['task_id'])
+ write_jsonl(os.path.join(save_path, "prompts.jsonl"), prompts)
+
+ if args.split == 'plus':
+ base_path = save_path.replace('plus', 'base')
+ if os.path.exists(os.path.join(base_path, "generations.jsonl")):
+ generations = list(stream_jsonl(os.path.join(base_path, "generations.jsonl")))
+ write_jsonl(os.path.join(save_path, "generations.jsonl"), generations)
+
+    # check if the generations file already exists
+ if not os.path.exists(os.path.join(save_path, "generations.jsonl")):
+
+ prompts = list(stream_jsonl(os.path.join(save_path, "prompts.jsonl")))
+ # get generations
+ decoder = BackendFactory.get_backend(args)
+ if decoder.is_chat():
+ decoder.set_stop(task.chat_stop)
+ else:
+ decoder.set_stop(task.chat_stop + task.base_stop)
+
+ generations = decoder.generate(
+ prompts,
+ args.response_prefix,
+ args.response_suffix
+ )
+
+ generations = sorted([data for data in generations if data['completion']], key = lambda x: (x['task_id'], x['completion_id']))
+ write_jsonl(os.path.join(save_path, "generations.jsonl"), generations)
+
+ else:
+
+ generated_ids = [data['task_id'] for data in stream_jsonl(os.path.join(save_path, "generations.jsonl"))]
+ prompts = [data for data in stream_jsonl(os.path.join(save_path, "prompts.jsonl")) if data['task_id'] not in generated_ids]
+ if len(prompts) > 0:
+
+ # get generations
+ decoder = BackendFactory.get_backend(args)
+ if decoder.is_chat():
+ decoder.set_stop(task.chat_stop)
+ else:
+ decoder.set_stop(task.chat_stop + task.base_stop)
+
+ continue_generations = decoder.generate(
+ prompts,
+ args.response_prefix,
+ args.response_suffix
+ )
+
+ generations = sorted([data for data in continue_generations if data['completion']] + list(stream_jsonl(os.path.join(save_path, "generations.jsonl"))), key = lambda x: (x['task_id'], x['completion_id']))
+ write_jsonl(os.path.join(save_path, "generations.jsonl"), generations)
+
+
+ # post-process generations
+ # if not os.path.exists(os.path.join(save_path, "solutions.jsonl")):
+ if True:
+
+ generations = list(stream_jsonl(os.path.join(save_path, "generations.jsonl")))
+ solutions = thread_map(
+ task.postprocess_generation,
+ generations,
+ max_workers = args.num_workers,
+ desc = "Post-processing Generations"
+ )
+ solutions = sorted(solutions, key = lambda x: (x['task_id'], x['completion_id']))
+ write_jsonl(os.path.join(save_path, "solutions.jsonl"), solutions)
+
+ # evaluate solutions
+ # if not os.path.exists(os.path.join(save_path, "evaluations.jsonl")):
+ if True:
+ solutions = list(stream_jsonl(os.path.join(save_path, "solutions.jsonl")))
+ evaluations = thread_map(
+ task.process_results,
+ solutions,
+ max_workers = args.num_workers,
+ desc = "Evaluating Solutions"
+ )
+ evaluations = sorted(evaluations, key = lambda x: (x['task_id'], x['completion_id']))
+ write_jsonl(os.path.join(save_path, "evaluations.jsonl"), evaluations)
+
+ # calculate pass@k
+ # if not os.path.exists(os.path.join(save_path, "results.jsonl")):
+ if True:
+ evaluations = list(stream_jsonl(os.path.join(save_path, "evaluations.jsonl")))
+ results = calculate_pass_at_k(evaluations, args.num_samples, args.list_k)
+ write_jsonl(os.path.join(save_path, "results.jsonl"), results)
+
+
+if __name__ == "__main__":
+ main() \ No newline at end of file
diff --git a/code_eval/OpenCodeEval/utils.py b/code_eval/OpenCodeEval/utils.py
new file mode 100644
index 0000000..4c65d49
--- /dev/null
+++ b/code_eval/OpenCodeEval/utils.py
@@ -0,0 +1,165 @@
+import os
+import re
+import gzip
+import json
+import itertools
+import numpy as np
+
+from tqdm import tqdm
+from collections import defaultdict
+from typing import Dict, List, Union, Iterable, Callable, Literal
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+def refine_text(text: str, add_new_line: bool = True) -> str:
+ text = text.replace("\t", " ")
+ text = text.replace("\r\n", "\n").replace("\r", "\n")
+ if add_new_line:
+ return text.strip() + "\n"
+ else:
+ return text.strip()
+
+def multi_process_function(function: Callable,
+ parameters: List,
+ num_workers: int = 1,
+ desc: str = "Completing tasks"):
+
+ if num_workers > len(parameters) or num_workers > os.cpu_count():
+ num_workers = min(os.cpu_count(), len(parameters))
+
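+    # despite its name, this helper dispatches the work to a thread pool rather than separate processes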
+ with ThreadPoolExecutor(num_workers) as executor:
+ futures = []
+ for param in parameters:
+ future = executor.submit(function, param)
+ futures.append(future)
+
+ results = []
+ for future in tqdm(as_completed(futures), total=len(futures), desc=desc):
+ result = future.result()
+ results.append(result)
+
+ return results
+
+def markdown_extract(text: str, mode: Literal["first", "last", "all"] = "all") -> str:
+ """
+ Extract content enclosed by triple backticks (```) in a Markdown text.
+
+ Args:
+ text (str): The Markdown text to extract from.
+ mode (Literal["first", "last", "all"]):
+ - "first": Extract the first block of content.
+ - "last": Extract the last block of content.
+ - "all": Extract all blocks of content and join them with double newlines.
+
+ Returns:
+ str: Extracted content based on the specified mode.
+ """
+ # Match content inside triple backticks, ignoring the ``` lines
+ pattern = r"```[ \t]*[\r\n]+[ \t]*(.*?)[ \t]*[\r\n]+[ \t]*```"
+ matches = re.findall(pattern, text, re.DOTALL)
+
+ if matches:
+ if mode == "first":
+ return matches[0]
+ elif mode == "last":
+ return matches[-1]
+ elif mode == "all":
+ return "\n\n".join(matches)
+ else:
+ return ""
+
+def program_extract(
+ text: str,
+ program: str = "python",
+ mode: Literal["first", "last", "all"] = "all"
+) -> str:
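+    """Like markdown_extract, but only matches fenced blocks tagged with the given language."""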
+
+ program_pattern = rf"```{program}[ \t]*[\r\n]+[ \t]*(.*?)[ \t]*[\r\n]+[ \t]*```"
+ program_re = re.compile(program_pattern, re.DOTALL | re.IGNORECASE)
+
+ matches = program_re.findall(text)
+ if matches:
+ if mode == "first":
+ return matches[0]
+ elif mode == "last":
+ return matches[-1]
+ elif mode == "all":
+ return "\n\n".join(matches)
+ else:
+ return ""
+
+def stream_jsonl(filename: str) -> Iterable[Dict]:
+ """
+ Parses each jsonl line and yields it as a dictionary
+ """
+ if filename.endswith(".gz"):
+ with open(filename, "rb") as gzfp:
+ with gzip.open(gzfp, 'rt') as fp:
+ for line in fp:
+ if any(not x.isspace() for x in line):
+ yield json.loads(line)
+ else:
+ with open(filename, "r", encoding="utf-8") as fp:
+ for line in fp:
+ if any(not x.isspace() for x in line):
+ yield json.loads(line)
+
+
+def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
+ """
+ Writes an iterable of dictionaries to jsonl
+ """
+ if append:
+ mode = 'ab'
+ else:
+ mode = 'wb'
+ filename = os.path.expanduser(filename)
+ if filename.endswith(".gz"):
+ with open(filename, mode) as fp:
+ with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp:
+ for x in data:
+ gzfp.write((json.dumps(x) + "\n").encode('utf-8'))
+ else:
+ with open(filename, mode) as fp:
+ for x in data:
+ fp.write((json.dumps(x) + "\n").encode('utf-8'))
+
+def estimate_pass_at_k(
+ num_samples: Union[int, List[int], np.ndarray],
+ num_correct: Union[List[int], np.ndarray],
+ k: int
+) -> np.ndarray:
+ """
+ Estimates pass@k of each problem and returns them in an array.
+ """
+
+ def estimator(n: int, c: int, k: int) -> float:
+ """
+ Calculates 1 - comb(n - c, k) / comb(n, k).
+ """
+ if n - c < k:
+ return 1.0
+ return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
+ if isinstance(num_samples, int):
+ num_samples_it = itertools.repeat(num_samples, len(num_correct))
+ else:
+ assert len(num_samples) == len(num_correct)
+ num_samples_it = iter(num_samples)
+
+ return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
+
+def calculate_pass_at_k(
+ evaluations: List[Dict],
+ num_samples: int,
+ list_k: List[int]
+ ) -> List[Dict]:
+
+ task_results = defaultdict(int)
+ for evaluation in evaluations:
+ task_results[evaluation['task_id']] += evaluation['passed']
+ task_results = list(task_results.values())
+ benchmark_results = []
+ for k in list_k:
+ pass_rate = float(np.mean(estimate_pass_at_k(num_samples = num_samples, num_correct = task_results, k = k)))
+ benchmark_results.append({f"Pass@{k}": pass_rate})
+ return benchmark_results \ No newline at end of file