| author | = <=> | 2025-06-04 11:49:37 +0800 |
|---|---|---|
| committer | = <=> | 2025-06-04 11:49:37 +0800 |
| commit | 947d9dfdf16ae37109898111a5caacae7377b96d | |
| tree | ff4e884020fb7d968a6192106f370b215647f569 | |
| parent | 5e163b529a78d528b745b8b57ba794b7b2bba97a | |
update code and kk eval
36 files changed, 3985 insertions, 1 deletion
@@ -5,4 +5,7 @@ archived/*
 *.log
 ./*.sh
 *.ipynb
-log/*
\ No newline at end of file
+log/*
+*.pyc
+tmp
+code_eval/OpenCodeEval/benchmark/data
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/__init__.py b/code_eval/OpenCodeEval/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/code_eval/OpenCodeEval/__init__.py diff --git a/code_eval/OpenCodeEval/args.py b/code_eval/OpenCodeEval/args.py new file mode 100644 index 0000000..bf2d51f --- /dev/null +++ b/code_eval/OpenCodeEval/args.py @@ -0,0 +1,91 @@ +from loguru import logger + +BENCHMARKS = [ + "HumanEval", + "mbpp", + "MBPP", + "LeetCode", + "BigCodeBench", + "Bird", + "Spider", + "understandml" +] + +SPLITS = { + "HumanEval": ["base", "plus"], + "mbpp": ["full", "sanitized"], + "MBPP": ["base", "plus"], + "LeetCode": ["contest", "train", "validation", "test"], + "BigCodeBench": ["full", "hard"], + "Bird": ["train", "dev"], + "Spider": ["train", "dev"], + "understandml": ["human", "model"] +} + +def check_args(args): + + if args.tokenizer_name is None: + args.tokenizer_name = args.model_name + + # When eval the pretrained model, it can not response to the instrcution, so suggest using the Completion prompt + if args.model_type == "Base" and args.prompt_type == "Instruction": + logger.warning("Prompt type must be Completion for Base Model") + + # check the split is valid for the task + if args.split not in SPLITS[args.task]: + logger.error(f"split {args.split} is not valid for {args.task}, please use {SPLITS[args.task]}") + + # check the list_k is valid + args.list_k = list(map(int, args.list_k.split(','))) + if max(args.list_k) > args.num_samples: + logger.warning("max of list_k must be less than num_samples") + args.list_k = [k for k in args.list_k if k <= args.num_samples] + + # check the temperature is valid + if args.num_samples > 1 and args.temperature == 0.0: + logger.error("Temperature is not allowed when num_samples > 1") + + #When eval the chat model, it can not reply to the uncompleted code, so suggest using the Instruction prompt + if args.model_type == "Chat" and args.prompt_type == "Completion": + if args.prompt_prefix == "": + logger.warning("Prompt prefix is not set, using default prefix") + args.prompt_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:\n```python\n" + if args.prompt_suffix == "": + logger.warning("Prompt suffix is not set, using default suffix") + args.prompt_suffix = "\n```\n" + + return args + +def get_args(parser): + + #===================Model Parser=================== + parser.add_argument("--model_name", default = None, type=str) + parser.add_argument("--tokenizer_name", default = None, type=str) + parser.add_argument("--trust_remote_code", action="store_true") + parser.add_argument("--backend", default="vllm", type=str, choices=["openai", "vllm", "sglang"]) + parser.add_argument("--task", default="HumanEval", type=str, choices = BENCHMARKS) + parser.add_argument("--split", default="base", type = str) + parser.add_argument("--prompt_type", default = "Instruction", type=str, choices=["Completion", "Instruction"]) + parser.add_argument("--model_type", default = "Chat", type = str, choices=["Base", "Chat"]) + parser.add_argument("--list_k", default = "1,3,5,10", type = str) + parser.add_argument("--time_out", default = 3, type = float) + + #===================Computer Parser=================== + parser.add_argument("--num_gpus", default = 1, type=int) + parser.add_argument("--num_workers", default = 1, type=int) + + parser.add_argument("--save_path", default = 'save', type=str) + parser.add_argument("--batch_size", default = 164, type=int) 
+ parser.add_argument("--num_samples", default = 1, type=int) + parser.add_argument("--max_tokens", default = 2048, type=int) + parser.add_argument("--temperature", default = 0.0, type=float) + parser.add_argument("--top_p", default = 1.0, type=float) + parser.add_argument("--seed", default = 1, type=float) + + #===================Prompt Parser=================== + parser.add_argument("--prompt_prefix", default = "", type=str) + parser.add_argument("--prompt_suffix", default = "", type=str) + parser.add_argument("--response_prefix", default = "", type=str) + parser.add_argument("--response_suffix", default = "", type=str) + + return parser.parse_args()
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/backend/__init__.py b/code_eval/OpenCodeEval/backend/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/code_eval/OpenCodeEval/backend/__init__.py diff --git a/code_eval/OpenCodeEval/backend/base.py b/code_eval/OpenCodeEval/backend/base.py new file mode 100644 index 0000000..42091b9 --- /dev/null +++ b/code_eval/OpenCodeEval/backend/base.py @@ -0,0 +1,55 @@ +from typing import Callable +from abc import ABC, abstractmethod + +def make_chat_template( + prompt: str, + response_prefix: str = "", + is_chat: bool = True, + tokenizer: Callable = None + ) -> str: + + if is_chat: + prompt = tokenizer.apply_chat_template( + [ + {"role": "user", "content": prompt}, + ], + tokenize = False, + add_generation_prompt = True + ) + response_prefix + if tokenizer.bos_token and prompt.startswith(tokenizer.bos_token): + prompt = prompt[len(tokenizer.bos_token):] + return prompt + else: + return prompt + +class Generator(ABC): + + model_name: str = None + + def __init__(self, model_name: str) -> None: + """ + :param stop_words: list + list of stop words if the generation uses a stopping criteria during generation + :param requires_execution: bool + wheter the task requires code execution during evaluation or not + """ + self.model_name = model_name + + def fewshot_examples(self): + """Loads and returns the few-shot examples for the task if they exist.""" + pass + + @abstractmethod + def set_stop(self): + """ + Set the stop tokens for the model + """ + pass + + @abstractmethod + def generate(self): + """Builds the prompt for the LM to generate from. + :param doc: dict[str: str] + sample from the test dataset + """ + pass
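A small sketch of `make_chat_template` in isolation, assuming a Hugging Face tokenizer that ships a chat template; the checkpoint name is a placeholder:

```python
# Illustrative only: any tokenizer with a chat_template behaves the same way.
from transformers import AutoTokenizer

from OpenCodeEval.backend.base import make_chat_template

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-7B-Instruct")  # placeholder

prompt = make_chat_template(
    prompt="Write a function that reverses a string.",
    response_prefix="",  # check_args supplies a markdown code-fence prefix when needed
    is_chat=True,
    tokenizer=tokenizer,
)
print(prompt)  # chat-formatted prompt, with the BOS token stripped if present
```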
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/backend/openai.py b/code_eval/OpenCodeEval/backend/openai.py new file mode 100644 index 0000000..afa45b2 --- /dev/null +++ b/code_eval/OpenCodeEval/backend/openai.py @@ -0,0 +1,115 @@ +import os + +from typing import List, Dict +from openai import OpenAI +from loguru import logger +from tqdm.contrib.concurrent import thread_map + +from OpenCodeEval.backend.base import Generator + +class OpenaiGenerator(Generator): + + def __init__( + self, + model_name: str, + model_type: str = "Instruction", + batch_size : int = 1, + temperature : float = 0.0, + top_p : float = 1.0, + max_tokens : int = 1024, + num_samples : int = 1, + ) -> None: + + super().__init__(model_name) + + print("Initializing a decoder model in openai: {} ...".format(model_name)) + self.model_type = model_type + self.batch_size = batch_size + self.temperature = temperature + self.top_p = top_p + self.max_tokens = max_tokens + self.num_samples = num_samples + + self.client = OpenAI( + base_url = os.getenv("OPENAI_BASE_URL"), + api_key = os.getenv("OPENAI_API_KEY") + ) + + def is_chat(self) -> bool: + if self.model_type == "Chat": + return True + else: + return False + + def set_stop(self, eos: List[str]): + self.eos = eos + + def connect_server(self, prompt): + + num_tries = 5 + + try: + + for _ in range(num_tries): + + result = self.client.chat.completions.create( + model = self.model_name, + messages=[ + {"role": "user", "content": prompt['prompt']} + ], + n = self.num_samples, + stream = False, + # stop = self.eos, + temperature = self.temperature, + # top_p = self.top_p, + max_tokens = self.max_tokens, + extra_headers = {'apikey':os.getenv("OPENAI_API_KEY")}, + ) + + if all(choice.message.content for choice in result.choices): + break + else: + logger.warning("No content in the response, retrying...") + + results = [ + dict( + task_id=prompt['task_id'], + completion_id=i, + completion=choice.message.content + ) + for i, choice in enumerate(result.choices) + ] + + except Exception as e: + logger.error("Error: {}".format(e)) + results = [ + dict( + task_id=prompt['task_id'], + completion_id=i, + completion = '' + ) + for i in range(self.num_samples) + ] + + return results + + def generate( + self, + prompt_set: List[Dict], + response_prefix: str = "", + response_suffix: str = "" + ) -> List[Dict]: + + logger.info("Example Prompt:\n{}", prompt_set[0]['prompt']) + + results = thread_map( + self.connect_server, + prompt_set, + max_workers = self.batch_size, + desc="Generating Completions" + ) + + + generations = [item for sublist in results for item in sublist] + + return generations
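A hedged usage sketch for this OpenAI-compatible backend; the base URL, API key, and model id are placeholders, and `prompt_set` mirrors the dict shape produced by the benchmarks' `get_prompt`:

```python
import os

from OpenCodeEval.backend.openai import OpenaiGenerator

# Placeholders: point these at whichever OpenAI-compatible server you use.
os.environ.setdefault("OPENAI_BASE_URL", "http://localhost:8000/v1")
os.environ.setdefault("OPENAI_API_KEY", "sk-placeholder")

generator = OpenaiGenerator(
    model_name="gpt-4o-mini",  # placeholder model id
    batch_size=4,              # number of concurrent request threads
    temperature=0.0,
    max_tokens=512,
    num_samples=1,
)
generator.set_stop([])  # stop strings are stored but currently not sent

prompt_set = [{"task_id": 0, "prompt": "Write a Python function that returns 42."}]
generations = generator.generate(prompt_set)
print(generations[0]["completion"])
```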
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/backend/sglang.py b/code_eval/OpenCodeEval/backend/sglang.py new file mode 100644 index 0000000..c759639 --- /dev/null +++ b/code_eval/OpenCodeEval/backend/sglang.py @@ -0,0 +1,148 @@ +import os + +from loguru import logger + +from tqdm import tqdm +from typing import List, Dict + +from OpenCodeEval.backend.base import Generator, make_chat_template + +from OpenCodeEval.utils import refine_text + +class SglangGenerator(Generator): + + def __init__( + self, + model_name: str, + tokenizer_name: str = None, + model_type: str = "Instruction", + dtype: str = "bfloat16", + batch_size : int = 1, + temperature : float = 0.0, + top_p : float = 1.0, + max_tokens : int = 1024, + num_samples: int = 200, + num_gpus: int = 1, + trust_remote_code: bool = True, + ) -> None: + + super().__init__(model_name) + + print("Initializing a decoder model in sglang: {} ...".format(model_name)) + self.tokenizer_name = tokenizer_name if tokenizer_name is not None else model_name + self.model_type = model_type + self.batch_size = batch_size + self.temperature = temperature + self.top_p = top_p + self.max_tokens = max_tokens + self.num_samples = num_samples + self.dtype = dtype + self.num_gpus = num_gpus + self.trust_remote_code = trust_remote_code + + from sglang import Engine + + self.model = Engine( + model_path = self.model_name, + tokenizer_path = self.tokenizer_name, + dtype = self.dtype, + tp_size = self.num_gpus, + trust_remote_code = self.trust_remote_code + ) + + self.tokenizer = self.model.tokenizer_manager.tokenizer + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + self.sampling_params = dict( + n = self.num_samples, + temperature = self.temperature, + max_new_tokens = self.max_tokens, + top_p = self.top_p, + ) + + def is_chat(self) -> bool: + if self.model_type == "Chat": + if self.tokenizer.chat_template is None: + logger.error("When the model type is Chat, the chat template must be set for the tokenizer.") + else: + return True + else: + return False + + def set_stop(self, eos: List[str]): + self.sampling_params['stop'] = eos + + def release_memory(self): + + import gc + import torch + + from sglang.srt.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel + + self.model.shutdown() + + # destroy_model_parallel + # destroy_distributed_environment() + + # del self.model + gc.collect() + torch.cuda.empty_cache() + + def generate( + self, + prompt_set: List[Dict], + response_prefix: str = "", + response_suffix: str = "" + ) -> List[str]: + + if self.is_chat(): + for prompt in prompt_set: + prompt['prompt'] = make_chat_template( + prompt = prompt['prompt'], + response_prefix = response_prefix, + is_chat = self.is_chat(), + tokenizer = self.tokenizer + ) + + logger.info("Example Prompt:\n{}", prompt_set[0]['prompt']) + + generation_set = [] + + for batch_start in tqdm(range(0, len(prompt_set), self.batch_size)): + + batch_prompt = prompt_set[batch_start : batch_start + self.batch_size] + batch_outputs = self.model.generate( + [prompt['prompt'] for prompt in batch_prompt], + self.sampling_params + ) + + batch_outputs = [ + [item["text"] for item in batch_outputs[i:i + self.num_samples]] + for i in range(0, len(batch_outputs), self.num_samples) + ] + + for prompt, output in zip(batch_prompt, batch_outputs): + + # Process completions with prefix/suffix and refine text based on chat mode + completions = [ + refine_text( + f"{prompt['prompt']}\n\n{response_prefix}{completion}{response_suffix}" + if not 
self.is_chat() + else f"{response_prefix}{completion}{response_suffix}" + ) + for completion in output + ] + + for completion_id, completion in enumerate(completions): + generation_set.append({ + 'task_id': prompt['task_id'], + 'completion_id': completion_id, + 'completion': completion + }) + + if len(generation_set) != len(prompt_set) * self.num_samples: + logger.error("Number of generations does not match the expected number.") + + self.release_memory() + + return generation_set
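The slicing in `generate` above assumes the engine returns `num_samples` consecutive outputs per prompt; a toy, self-contained sketch of that regrouping step:

```python
# Toy data standing in for the engine's flat output list (two prompts, two samples each).
num_samples = 2
batch_outputs = [
    {"text": "p0-sample0"}, {"text": "p0-sample1"},
    {"text": "p1-sample0"}, {"text": "p1-sample1"},
]

# Same regrouping as SglangGenerator.generate: one sublist per prompt.
grouped = [
    [item["text"] for item in batch_outputs[i:i + num_samples]]
    for i in range(0, len(batch_outputs), num_samples)
]
print(grouped)  # [['p0-sample0', 'p0-sample1'], ['p1-sample0', 'p1-sample1']]
```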
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/backend/vllm.py b/code_eval/OpenCodeEval/backend/vllm.py new file mode 100644 index 0000000..3082dfe --- /dev/null +++ b/code_eval/OpenCodeEval/backend/vllm.py @@ -0,0 +1,144 @@ +import os + +from loguru import logger + +from tqdm import tqdm +from typing import List, Dict + +from OpenCodeEval.backend.base import Generator, make_chat_template + +from OpenCodeEval.utils import refine_text + +class VllmGenerator(Generator): + + def __init__( + self, + model_name: str, + tokenizer_name: str = None, + model_type: str = "Instruction", + dtype: str = "bfloat16", + batch_size : int = 1, + temperature : float = 0.0, + top_p : float = 1.0, + max_tokens : int = 1024, + num_samples: int = 200, + num_gpus: int = 1, + seed: int = None, + trust_remote_code: bool = True, + ) -> None: + + super().__init__(model_name) + + print("Initializing a decoder model in vllm: {} ...".format(model_name)) + self.tokenizer_name = tokenizer_name if tokenizer_name is not None else model_name + self.model_type = model_type + self.batch_size = batch_size + self.temperature = temperature + self.top_p = top_p + self.max_tokens = max_tokens + self.num_samples = num_samples + self.dtype = dtype + self.num_gpus = num_gpus + self.seed = seed + self.trust_remote_code = trust_remote_code + + from vllm import LLM, SamplingParams + + self.model = LLM( + model = self.model_name, + tokenizer = self.tokenizer_name, + dtype = self.dtype, + tensor_parallel_size = self.num_gpus, + trust_remote_code = self.trust_remote_code + ) + + self.tokenizer = self.model.get_tokenizer() + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + self.sampling_params = SamplingParams( + n = self.num_samples, + temperature = self.temperature, + max_tokens = self.max_tokens, + top_p = self.top_p, + seed = int(self.seed) + ) + + def is_chat(self) -> bool: + if self.model_type == "Chat": + if self.tokenizer.chat_template is None: + logger.error("When the model type is Chat, the chat template must be set for the tokenizer.") + else: + return True + else: + return False + + def set_stop(self, eos: List[str]): + self.sampling_params.stop = eos + + def release_memory(self): + + import gc + import torch + + from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel + + destroy_model_parallel + destroy_distributed_environment() + del self.model.llm_engine.model_executor + del self.model + gc.collect() + torch.cuda.empty_cache() + + def generate( + self, + prompt_set: List[Dict], + response_prefix: str = "", + response_suffix: str = "" + ) -> List[str]: + + if self.is_chat(): + for prompt in prompt_set: + prompt['prompt'] = make_chat_template( + prompt = prompt['prompt'], + response_prefix = response_prefix, + is_chat = self.is_chat(), + tokenizer = self.tokenizer + ) + + logger.info("Example Prompt:\n{}", prompt_set[0]['prompt']) + + generation_set = [] + + for batch_start in tqdm(range(0, len(prompt_set), self.batch_size)): + batch_prompt = prompt_set[batch_start : batch_start + self.batch_size] + batch_outputs = self.model.generate( + [prompt['prompt'] for prompt in batch_prompt], + self.sampling_params, + use_tqdm = False, + ) + + for prompt, output in zip(batch_prompt, batch_outputs): + + # Process completions with prefix/suffix and refine text based on chat mode + completions = [ + refine_text( + f"{prompt['prompt']}\n\n{response_prefix}{completion.text}{response_suffix}" + if not self.is_chat() + else f"{response_prefix}{completion.text}{response_suffix}" + ) 
+ for completion in output.outputs + ] + + for completion_id, completion in enumerate(completions): + generation_set.append({ + 'task_id': prompt['task_id'], + 'completion_id': completion_id, + 'completion': completion + }) + + if len(generation_set) != len(prompt_set) * self.num_samples: + logger.error("Number of generations does not match the expected number.") + + self.release_memory() + + return generation_set
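A usage sketch for the vLLM backend; the checkpoint is a placeholder and the stop string is only an example, mirroring the benchmark-defined stop lists:

```python
from OpenCodeEval.backend.vllm import VllmGenerator

generator = VllmGenerator(
    model_name="Qwen/Qwen2.5-Coder-7B-Instruct",  # placeholder checkpoint
    model_type="Chat",
    batch_size=8,
    temperature=0.0,
    max_tokens=512,
    num_samples=1,
    num_gpus=1,
    seed=1,  # note: the constructor casts this to int, so None would fail
)
generator.set_stop(["\nif __name__"])  # example stop string

prompt_set = [{"task_id": 0, "prompt": "Write a Python function that returns 42."}]
generations = generator.generate(prompt_set)
print(generations[0]["completion"])
```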
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/benchmark/BigCodeBench.py b/code_eval/OpenCodeEval/benchmark/BigCodeBench.py new file mode 100644 index 0000000..abc4faf --- /dev/null +++ b/code_eval/OpenCodeEval/benchmark/BigCodeBench.py @@ -0,0 +1,113 @@ +import os +from typing import Literal + +ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from OpenCodeEval.benchmark.base import Benchmark, PYTHON_STOP, PYTHON_IMPORTS +from OpenCodeEval.utils import refine_text, stream_jsonl +from OpenCodeEval.eval.func_eval import check_correctness +from OpenCodeEval.eval.sanitize import sanitize + +class BigCodeBench(Benchmark): + + name: str = "BigCodeBench" + path: str = None + + fullset_path = os.path.abspath(os.path.join(ROOT, "../data/BigCodeBench.jsonl")) + subset_path = os.path.abspath(os.path.join(ROOT, "../data/BigCodeBench_Hard.jsonl")) + + imports_code = PYTHON_IMPORTS + chat_stop = PYTHON_STOP + base_stop = ['\n"""', "\nassert"] + + def __init__(self, + name: str = "BigCodeBench", + timeout:float = 10.0, + prompt_type: Literal["Completion", "Instruction"] = "Completion" + ): + + super().__init__() + + self.name = name + self.timeout = timeout + self.prompt_type = prompt_type + + if self.name == "BigCodeHard": + self.path = self.subset_path + elif self.name == "BigCodeBench": + self.path = self.fullset_path + + self.tasks = self.get_task() + + def get_task(self): + """ + Get the task data from the jsonl file into a dictionary. + """ + + tasks = {} + + for task_data in stream_jsonl(filename=self.path): + + task_id = int(task_data["task_id"].split("/")[-1]) + + tasks[task_id] = task_data + + return tasks + + def get_prompt(self): + """ + Builds the prompt for the LM to generate from. + """ + + prompts = [] + for task_id, task_data in self.tasks.items(): + + if self.prompt_type == "Completion": + prompt = task_data['complete_prompt'] + elif self.prompt_type == "Instruction": + prompt = task_data['instruct_prompt'] + + prompts.append( + dict( + task_id = task_id, + prompt = refine_text(prompt) + ) + ) + + return prompts + + def postprocess_generation(self, generation): + """ + Postprocess the generations. + """ + + entry_point = self.tasks[generation['task_id']]["entry_point"] + + result = dict( + task_id = generation['task_id'], + completion_id = generation['completion_id'], + solution = sanitize(generation['completion'], entry_point) + ) + + return result + + def process_results(self, solution): + """ + Takes the list of LM generations and evaluates them against the test cases + """ + + task_data = self.tasks[solution['task_id']] + + code = ( + task_data["code_prompt"] + "\n" + + " pass\n" + "\n" + + solution['solution'] + "\n" + ) + + result = check_correctness(solution['task_id'], + solution['completion_id'], + code, + task_data["test"], + self.timeout) + + return result
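A sketch of driving this benchmark class; it assumes the BigCodeBench jsonl files have already been fetched into the data directory (see dataset.sh below) and uses an obviously fake completion just to show the prompt and sanitize steps:

```python
from OpenCodeEval.benchmark.BigCodeBench import BigCodeBench

# name="BigCodeHard" selects the hard subset instead of the full set.
bench = BigCodeBench(name="BigCodeBench", timeout=10.0, prompt_type="Instruction")

prompts = bench.get_prompt()
print(len(prompts), prompts[0]["task_id"])

# Fake completion, only to show the sanitize step; BigCodeBench tasks use
# task_func as their entry point, so the definition below survives postprocessing.
generation = {
    "task_id": prompts[0]["task_id"],
    "completion_id": 0,
    "completion": "def task_func():\n    return 42",
}
if __name__ == "__main__":  # sanitize spawns a worker process
    print(bench.postprocess_generation(generation)["solution"])
```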
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/benchmark/Bird.py b/code_eval/OpenCodeEval/benchmark/Bird.py new file mode 100644 index 0000000..b4359fb --- /dev/null +++ b/code_eval/OpenCodeEval/benchmark/Bird.py @@ -0,0 +1,123 @@ +import os +import sys + +from loguru import logger +from typing import Literal + +from OpenCodeEval.benchmark.base import Benchmark +from OpenCodeEval.utils import refine_text, program_extract, stream_jsonl +from OpenCodeEval.eval.sql_test import check_correctness + +class Bird(Benchmark): + + name: str = "Bird" + + def __init__( + self, + split: Literal["train", "dev"] = "dev", + time_out: float = 30.0, + prompt_type: str = "Instruction" + ): + + super().__init__() + + self.split = split + self.time_out = time_out + self.prompt_type = prompt_type + + if self.prompt_type == "Completion": + logger.error("Completion prompt type not supported for Bird") + + self.database = os.path.join(self.path, f"{self.name}/{self.split}/database") + self.path = os.path.join(self.path, f"{self.name}/{self.split}/data.jsonl") + + self.tasks = self.get_task() + + def get_task(self): + """ + Get the task data from the json file into a dictionary. + """ + + tasks = {} + + for task_data in stream_jsonl(filename = self.path): + + tasks[int(task_data['id'])] = task_data + + return tasks + + def get_prompt(self): + """ + Builds the prompt for the LM to generate from. + """ + + def construct_prompt(data): + instruction = data['o_schema'] + instruction += f"\n\n-- External Knowledge: {data['evidence']}\n\n" + instruction += "-- Using valid SQLite and understanding External Knowledge, answer the following questions for the tables provided above.\n\n" + instruction += f"Question: {data['question']}\n" + return instruction + + prompts = [] + + + for task_id, task_data in self.tasks.items(): + + prompt = construct_prompt(task_data) + + prompts.append( + dict( + task_id = task_id, + prompt = refine_text(prompt) + ) + ) + return prompts + + def postprocess_generation(self, generation): + """ + Postprocess the generations. + """ + + def one_line_sql(response): + + response = program_extract(response, "sql", "last").strip() + + lines = response.splitlines() + lines = [l.strip() for l in lines if l.strip()] + sql = " ".join(lines) + + return sql + + result = dict( + task_id = generation['task_id'], + completion_id = generation['completion_id'], + solution = one_line_sql(generation['completion']) + ) + + return result + + def process_results(self, solution): + """ + Takes the list of LM generations and evaluates them against the test cases + """ + + task_data = self.tasks[solution['task_id']] + + db_path = self.database + f"/{task_data['db_id']}/{task_data['db_id']}.sqlite" + + result, passed, sql_return = check_correctness( + solution['solution'], + task_data['sql'], + db_path, + self.time_out, + "set_match" + ) + + return dict( + task_id = solution['task_id'], + completion_id = solution['completion_id'], + passed = passed, + result = result, + solution = solution['solution'], + sql_return = sql_return + )
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/benchmark/HumanEval.py b/code_eval/OpenCodeEval/benchmark/HumanEval.py new file mode 100644 index 0000000..3c3aece --- /dev/null +++ b/code_eval/OpenCodeEval/benchmark/HumanEval.py @@ -0,0 +1,114 @@ +import os +from typing import Literal + +from OpenCodeEval.benchmark.base import Benchmark, PYTHON_STOP, PYTHON_IMPORTS +from OpenCodeEval.utils import refine_text, stream_jsonl, program_extract +from OpenCodeEval.eval.func_eval import check_correctness +from OpenCodeEval.eval.sanitize import sanitize + +class HumanEval(Benchmark): + + name: str = "HumanEval" + + imports_code = PYTHON_IMPORTS + chat_stop = PYTHON_STOP + base_stop = ["\ndef ", "\nclass ", "\nimport ", "\nfrom ", "\nassert "] + + def __init__( + self, + split: Literal["base", "hard"] = "base", + time_out: float = 3.0, + prompt_type: str = "Completion" + ): + + super().__init__() + + self.split = split + self.time_out = time_out + self.prompt_type = prompt_type + + self.path = os.path.join(self.path, f"{self.name}/{self.split}.jsonl") + + self.tasks = self.get_task() + + def get_task(self): + """ + Get the task data from the jsonl file into a dictionary. + """ + + tasks = {} + + for task_data in stream_jsonl(filename=self.path): + + task_id = int(task_data["task_id"].split("/")[-1]) + + tasks[task_id] = task_data + + return tasks + + def get_prompt(self): + """ + Builds the prompt for the LM to generate from. + """ + + assert self.prompt_type == "Completion", "Prompt type must be Completion for HumanEval" + + prompts = [] + for task_id, task_data in self.tasks.items(): + prompts.append( + dict( + task_id = task_id, + prompt = refine_text(task_data['prompt']) + ) + ) + return prompts + + def postprocess_generation(self, generation): + """ + Postprocess the generations. + """ + + entry_point = self.tasks[generation['task_id']]["entry_point"] + + try: + completion = '\n'.join(generation['completion'].splitlines()[-200:]) + + if '</think>' in completion: + completion = completion.split('</think>')[1] + + solution = sanitize(completion, entry_point) + except Exception: + solution = program_extract(generation['completion'], program="python", mode="all") + + result = dict( + task_id = generation['task_id'], + completion_id = generation['completion_id'], + solution = solution + ) + + return result + + def process_results(self, solution): + """ + Takes the list of LM generations and evaluates them against the test cases + """ + + task_data = self.tasks[solution['task_id']] + + code = ( + "\n".join(self.imports_code) + "\n" + + task_data["prompt"] + "\n" + + " pass\n" + "\n" + + solution['solution'] + "\n" + + task_data['test'] + "\n" + + f"check({task_data['entry_point']})" + ) + + result = check_correctness( + solution['task_id'], + solution['completion_id'], + code, + self.time_out + ) + + return result
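To make the postprocessing path concrete (drop everything before `</think>`, then run the AST-based `sanitize` from eval/sanitize.py), here is a toy completion; the expected output noted in the comment is an assumption, not a recorded run:

```python
from OpenCodeEval.eval.sanitize import sanitize

# Toy completion in the shape produced by a reasoning-style chat model.
completion = (
    "Some chain of thought...</think>\n"
    "Here is the solution:\n"
    "def add(a, b):\n"
    "    return a + b\n"
)

if __name__ == "__main__":  # sanitize spawns a worker process
    code = completion.split("</think>")[1]  # same split as postprocess_generation
    print(sanitize(code, "add"))
    # expected: just the add() definition, with the prose line stripped
```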
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/benchmark/LeetCode.py b/code_eval/OpenCodeEval/benchmark/LeetCode.py new file mode 100644 index 0000000..97b3489 --- /dev/null +++ b/code_eval/OpenCodeEval/benchmark/LeetCode.py @@ -0,0 +1,121 @@ +import os +from typing import Literal +from loguru import logger + +from OpenCodeEval.benchmark.base import Benchmark, PYTHON_IMPORTS, LEETCODE_IMPORTS, PYTHON_STOP +from OpenCodeEval.utils import refine_text, stream_jsonl +from OpenCodeEval.eval.func_eval import check_correctness +from OpenCodeEval.eval.sanitize import sanitize +class LeetCode(Benchmark): + + name: str = "LeetCode" + + imports_code = PYTHON_IMPORTS + LEETCODE_IMPORTS + chat_stop = PYTHON_STOP + base_stop = ["\ndef ", "\nclass ", "\nimport ", "\nfrom ", "\nassert "] + + def __init__( + self, + split: Literal["contest", "train", "validation", "test"] = "contest", + time_out: float = 3.0, + prompt_type: Literal["Completion", "Instruction"] = "Instruction" + ): + + super().__init__() + + self.name = name + self.split = split + self.time_out = time_out + + self.prompt_type = prompt_type + if self.split != "contest" and self.prompt_type == "Completion": + logger.error(f"Completion prompt type not support {self.split} split") + + self.path = os.path.join(self.path, f"{self.name}/{self.split}.jsonl") + self.tasks = self.get_task() + + def get_task(self): + """ + Get the task data from the jsonl file into a dictionary. + """ + + tasks = {} + + for task_data in stream_jsonl(filename=self.path): + + if self.split == "contest": + task_id = int(task_data["meta"]["questionId"]) + else: + task_id = int(task_data["meta"]["question_id"]) + tasks[task_id] = task_data + + return tasks + + def get_prompt(self): + """ + Builds the prompt for the LM to generate from. + """ + + prompts = [] + for task_id, task_data in self.tasks.items(): + + if self.split == "contest": + if self.prompt_type == "Completion": + prompt = task_data['prompt'] + elif self.prompt_type == "Instruction": + prompt = task_data['prompt_sft'] + else: + prompt = task_data['meta']['query'] + + prompts.append( + dict( + task_id = task_id, + prompt = refine_text(prompt) + ) + ) + + return prompts + + def postprocess_generation(self, generation): + """ + Postprocess the generations. + """ + + return dict( + task_id = generation['task_id'], + completion_id = generation['completion_id'], + solution = sanitize( + text = generation['completion'], + entrypoint = "Solution", + ) + ) + + def process_results(self, solution): + """ + Takes the list of LM generations and evaluates them against the test cases + """ + + task_data = self.tasks[solution['task_id']] + + if self.split == "contest": + code = ( + "\n".join(self.imports_code) + "\n\n" + + solution['solution'] + "\n\n" + + task_data['test'] + ) + else: + code = ( + "\n".join(self.imports_code) + "\n\n" + + task_data['meta']['lang_code'] + "\n" + + " pass\n" + "\n" + + solution['solution'] + "\n" + + task_data['test'] + "\n" + + f"check({task_data['entry_point']})" + ) + + result = check_correctness(solution['task_id'], + solution['completion_id'], + code, + self.time_out) + + return result
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/benchmark/LiveCodeBench.py b/code_eval/OpenCodeEval/benchmark/LiveCodeBench.py new file mode 100644 index 0000000..8e0ccd9 --- /dev/null +++ b/code_eval/OpenCodeEval/benchmark/LiveCodeBench.py @@ -0,0 +1,76 @@ +import os +from typing import Literal + +from OpenCodeEval.benchmark.base import Benchmark, PYTHON_STOP, PYTHON_IMPORTS +from OpenCodeEval.utils import refine_text, stream_jsonl, program_extract +from OpenCodeEval.eval.func_eval import check_correctness +from OpenCodeEval.eval.sanitize import sanitize + +class LiveCodeBench(Benchmark): + + name: str = "LiveCodeBench" + path: str = None + + platform_dict = dict( + atcoder = 1, + codeforces = 2, + leetcode = 3, + ) + + def __init__( + self, + split: Literal["v1", "v2", "v3", "v4", "v5"] = "v5", + time_out: float = 3.0, + prompt_type: str = "Instruction" + ): + + super().__init__() + + self.path = os.path.join(self.path, self.name) + + self.tasks = self.get_task() + + def get_task_id(self, data): + """ + Get the task id for the task. + """ + + from datetime import datetime + + date_id = datetime.fromisoformat(data['contest_date']) + + # refromat the date to YYYYMMDD + date_id = date_id.strftime("%Y%m%d") + + if data['platform'] == 'atcoder': + + paltform_id = "1" + contest, letter = data['question_id'].split('_') + contest = ''.join(token for token in contest if token.isdigit()) + contest = contest.zfill(4) + + task_id = paltform_id + contest + str(ord(letter) - ord('a') + 1) + + elif data['platform'] == 'codeforces': + paltform_id = "2" + contest, letter = data['question_id'].split('_') + task_id = paltform_id + contest + str(ord(letter) - ord('A') + 1) + + elif data['platform'] == 'leetcode': + paltform_id = "3" + task_id = paltform_id + data['question_id'] + "0" + + else: + logger.error(f"Invalid platform: {data['platform']}") + + return int(task_id) + + def get_task(self): + """ + Get the task data from the jsonl file into a dictionary. + """ + + version = int(self.split.split('v')[1]) + + for i in range(1, version + 1): +
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/benchmark/MBPP.py b/code_eval/OpenCodeEval/benchmark/MBPP.py new file mode 100644 index 0000000..0242893 --- /dev/null +++ b/code_eval/OpenCodeEval/benchmark/MBPP.py @@ -0,0 +1,126 @@ +import os +import sys + +ROOT = os.path.dirname(os.path.abspath(__file__)) + +from OpenCodeEval.benchmark.base import Benchmark, PYTHON_STOP, PYTHON_IMPORTS +from OpenCodeEval.utils import refine_text, stream_jsonl, program_extract +from OpenCodeEval.eval.func_eval import check_correctness +from OpenCodeEval.eval.sanitize import sanitize + +class MBPP(Benchmark): + + name: str = "MBPP" + + imports_code = PYTHON_IMPORTS + chat_stop = PYTHON_STOP + base_stop = ['\n"""', "\nassert"] + # TODO: add more stop words, e.g. "\nif __name__", "\ndef main(", "\nprint(", '\n```\n'] + + def __init__( + self, + split: str = "base", + time_out: float = 3.0, + prompt_type: str = "Instruction" + ): + + super().__init__() + + self.split = split + self.time_out = time_out + self.prompt_type = prompt_type + + self.path = os.path.join(self.path, f"{self.name}/data.jsonl") + + self.tasks = self.get_task() + + def get_task(self): + """ + Get the task data from the jsonl file into a dictionary. + """ + + tasks = {} + + for task_data in stream_jsonl(filename=self.path): + + task_id = int(task_data["task_id"]) + + tasks[task_id] = task_data + + return tasks + + def format_prompt(self, + promblem: str, + test: str, + ) -> str: + promblem = f"You are an expert Python programmer, and here is your task:\n{promblem}" + test = f"Your code should pass the test:\n{test}" + prompt = promblem + "\n" + test + return prompt + + def get_prompt(self): + """ + Builds the prompt for the LM to generate from. + """ + + assert self.prompt_type == "Instruction", "Prompt type must be Instruction for MBPP" + + prompts = [] + for task_id, task_data in self.tasks.items(): + prompts.append( + dict( + task_id = task_id, + prompt = refine_text(self.format_prompt(task_data["text"], task_data["test_list"][0])) + ) + ) + return prompts + + def postprocess_generation(self, generation): + """ + Postprocess the generations. + """ + + entry_point = self.tasks[generation['task_id']]["entry_point"] + + try: + # generation['completion'] = program_extract(generation['completion'], program="python", mode="last") + solution = sanitize(generation['completion'], entry_point) + # solution = solution.replace("func0", entry_point) + except Exception: + solution = program_extract(generation['completion'], program="python", mode="all") + + return dict( + task_id = generation['task_id'], + completion_id = generation['completion_id'], + solution = solution + ) + + + def process_results(self, solution): + """Takes the list of LM generations and evaluates them against ground truth references, + returning the metric for the generations. + :param generations: list(list(str)) + list of lists containing generations + :param references: list(str) + list of str containing refrences + """ + + task_data = self.tasks[solution['task_id']] + + if self.split == "base": + test_code = "\n".join(task_data['test_imports']) + "\n\n" + "\n".join(task_data['test_list']) + elif self.split == "plus": + test_code = "\n".join(task_data['test_imports']) + "\n\n" + task_data['test'] + + code = ( + "\n".join(self.imports_code) + "\n" + + solution['solution'] + "\n" + + test_code + ) + + result = check_correctness(solution['task_id'], + solution['completion_id'], + code, + self.time_out) + + return result
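For clarity, the prompt shape produced by `format_prompt` above, reconstructed with an illustrative task and test case:

```python
# Mirrors MBPP.format_prompt: the task description, then the first test case.
problem = "Write a function to find the shared elements from the given two lists."
test = "assert set(similar_elements((3, 4, 5, 6), (5, 7, 4, 10))) == set((4, 5))"

prompt = (
    f"You are an expert Python programmer, and here is your task:\n{problem}\n"
    f"Your code should pass the test:\n{test}"
)
print(prompt)
```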
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/benchmark/Spider.py b/code_eval/OpenCodeEval/benchmark/Spider.py new file mode 100644 index 0000000..dc18afa --- /dev/null +++ b/code_eval/OpenCodeEval/benchmark/Spider.py @@ -0,0 +1,118 @@ +import os +import json + +from loguru import logger +from typing import Literal + +from OpenCodeEval.benchmark.base import Benchmark +from OpenCodeEval.utils import refine_text, program_extract, markdown_extract, stream_jsonl +from OpenCodeEval.eval.sql_test import check_correctness + +class Spider(Benchmark): + + name: str = "Spider" + + def __init__( + self, + split: Literal["train", "dev"] = "dev", + time_out: float = 3.0, + prompt_type: str = "Instruction" + ): + + super().__init__() + + self.split = split + self.time_out = time_out + self.prompt_type = prompt_type + + if self.prompt_type == "Completion": + logger.error("Completion prompt type not supported for Spider") + + self.database = os.path.join(self.path, f"{self.name}/{self.split}/database") + self.path = os.path.join(self.path, f"{self.name}/{self.split}/data.jsonl") + + self.tasks = self.get_task() + + def get_task(self): + """ + Get the task data from the json file into a dictionary. + """ + + tasks = {} + + for task_data in stream_jsonl(filename = self.path): + + tasks[int(task_data['id'])] = task_data + + return tasks + + def get_prompt(self): + """ + Builds the prompt for the LM to generate from. + """ + + prompts = [] + + + for task_id, task_data in self.tasks.items(): + + prompt = task_data['instruction'] + + prompts.append( + dict( + task_id = task_id, + prompt = refine_text(prompt) + ) + ) + return prompts + + def postprocess_generation(self, generation): + """ + Postprocess the generations. + """ + + solution = ' '.join(program_extract( + text = generation['completion'], + program = 'sql', + mode = 'last').splitlines() + ) + + if solution == "": + solution = ' '.join(markdown_extract( + text = generation['completion'], + mode = 'last').splitlines() + ) + + result = dict( + task_id = generation['task_id'], + completion_id = generation['completion_id'], + solution = solution + ) + + return result + + def process_results(self, solution): + """ + Takes the list of LM generations and evaluates them against the test cases + """ + + task_data = self.tasks[solution['task_id']] + + db_path = os.path.join(self.database, f"{task_data['db_id']}/{task_data['db_id']}.sqlite") + + result, passed,sql_return = check_correctness( + solution['solution'], + task_data['output'], + db_path, + self.time_out, + "exact_match" + ) + + return dict( + task_id = solution['task_id'], + completion_id = solution['completion_id'], + passed = passed, + result = result, + solution = solution['solution'], + sql_return = sql_return + )
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/benchmark/__init__.py b/code_eval/OpenCodeEval/benchmark/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/code_eval/OpenCodeEval/benchmark/__init__.py diff --git a/code_eval/OpenCodeEval/benchmark/base.py b/code_eval/OpenCodeEval/benchmark/base.py new file mode 100644 index 0000000..4bd9750 --- /dev/null +++ b/code_eval/OpenCodeEval/benchmark/base.py @@ -0,0 +1,124 @@ +import os +import sys + +ROOT = os.path.dirname(os.path.abspath(__file__)) + +PYTHON_STOP = [ "\nif __name__", + "\ndef main(", + "\nprint(" + ] + +PYTHON_IMPORTS = [ "import math", + "import re", + "import sys", + "import copy", + "import datetime", + "import itertools", + "import collections", + "import heapq", + "import functools", + "import hashlib", + "import numpy", + "import numpy as np", + "import string", + "from typing import *", + "from collections import *" + ] + +LEETCODE_IMPORTS = [ + 'from typing import *', + 'from functools import *', + 'from collections import *', + 'from itertools import *', + 'from heapq import *', + 'from bisect import *', + 'from string import *', + 'from operator import *', + 'from math import *', + 'import math', + 'import datetime', + "inf = float('inf')", +] + +from abc import ABC, abstractmethod + +class Benchmark(ABC): + + name: str = None + split: str = None + path: str = os.path.abspath(os.path.join(os.path.dirname(__file__), "data/")) + + imports = [] + chat_stop = [] + base_stop = [] + + def __init__(self): + """ + :param stop_words: list + list of stop words if the generation uses a stopping criteria during generation + :param requires_execution: bool + wheter the task requires code execution during evaluation or not + """ + pass + + def fewshot_examples(self): + """Loads and returns the few-shot examples for the task if they exist.""" + pass + + @abstractmethod + def get_task(self): + """Builds the task for the LM to generate from. + """ + pass + + @abstractmethod + def get_prompt(self, doc): + """Builds the prompt for the LM to generate from. + :param doc: dict[str: str] + sample from the test dataset + """ + pass + + + def get_reference(self, doc): + """Builds the reference solution for the doc. + :param doc: dict[str: str] + sample from the test dataset + """ + pass + + @abstractmethod + def postprocess_generation(self, task, generation): + """Defines the postprocessing for a LM generation. + :param generation: str + code generation from LM + :param idx: int + index of doc in the dataset to which the generation belongs + """ + pass + + @abstractmethod + def process_results(self, generations, references): + """Takes the list of LM generations and evaluates them against ground truth references, + returning the metric for the generations as in {"metric_name": result}. + :param generations: list(list(str)) + list of lists containing generations + :param references: list(str) + list of str containing refrences + :return: dict[str: float] + """ + pass + + def _stop_at_stop_token(decoded_string, stop_tokens): + """ + Produces the prefix of decoded_string that ends at the first occurrence of + a stop_token. + WARNING: the decoded_string *must not* include the prompt, which may have stop tokens + itself. + """ + min_stop_index = len(decoded_string) + for stop_token in stop_tokens: + stop_index = decoded_string.find(stop_token) + if stop_index != -1 and stop_index < min_stop_index: + min_stop_index = stop_index + return decoded_string[:min_stop_index]
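The `_stop_at_stop_token` helper (defined without `self`, so it acts like a static function) truncates a completion at the earliest stop string; a standalone sketch of that logic with toy input:

```python
decoded = (
    "def add(a, b):\n"
    "    return a + b\n"
    "\n"
    "if __name__ == '__main__':\n"
    "    print(add(1, 2))\n"
)
stop_tokens = ["\nif __name__", "\ndef main(", "\nprint("]

# Same logic as Benchmark._stop_at_stop_token: cut at the earliest stop token found.
min_stop_index = len(decoded)
for stop_token in stop_tokens:
    stop_index = decoded.find(stop_token)
    if stop_index != -1 and stop_index < min_stop_index:
        min_stop_index = stop_index
print(decoded[:min_stop_index])  # keeps only the function definition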
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/benchmark/mbpp.py b/code_eval/OpenCodeEval/benchmark/mbpp.py new file mode 100644 index 0000000..ee522a2 --- /dev/null +++ b/code_eval/OpenCodeEval/benchmark/mbpp.py @@ -0,0 +1,153 @@ +import os + +from OpenCodeEval.benchmark.base import Benchmark, PYTHON_STOP, PYTHON_IMPORTS +from OpenCodeEval.utils import refine_text, stream_jsonl +from OpenCodeEval.eval.func_eval import check_correctness +from OpenCodeEval.eval.sanitize import sanitize + +from typing import List, Literal + +class mbpp(Benchmark): + + name: str = "mbpp" + + imports_code = PYTHON_IMPORTS + chat_stop = PYTHON_STOP + base_stop = ['\n"""', "\nassert"] + + def __init__( + self, + split: Literal["full", "sanitized"] = "full", + time_out: float = 3.0, + prompt_type: str = "Instruction" + ): + + super().__init__() + + self.split = split + self.time_out = time_out + self.prompt_type = prompt_type + + self.path = os.path.join(self.path, f"{self.name}/{self.split}.jsonl") + self.tasks = self.get_task() + + self.few_shots_prompt = self.get_few_shots_prompts() if split == "full" else "" + + def get_task(self): + """ + Get the task data from the jsonl file into a dictionary. + """ + + tasks = {} + + for task_data in stream_jsonl(filename = self.path): + task_id = int(task_data["task_id"]) + + task_data['text'] = refine_text(task_data['text']) + + tasks[task_id] = task_data + + return tasks + + def fewshot_examples(self): + + few_shots_start = 1 + few_shots_end = 4 + + few_shots = [] + + for task_id, task_data in self.tasks.items(): + if task_id >= few_shots_start and task_id < few_shots_end: + few_shots.append(task_data) + + return few_shots + + def format_prompt(self, + promblem: str, + tests: List[str], + code: str = None + ) -> str: + promblem = f"You are an expert Python programmer, and here is your task:\n{promblem}" + test = "\n".join(tests) + test = f"Your code should pass these tests:\n{test}\n" + prompt = promblem + test + if code: + code = refine_text(code) + code = f"\n```python\n{code}\n```\n" + prompt = prompt + code + else: + prompt = prompt + "\n```python\n" + return prompt + + def get_few_shots_prompts(self): + + few_shots_prompts = [] + for few_shot in self.fewshot_examples(): + few_shots_prompts.append(self.format_prompt(few_shot["text"], few_shot["test_list"], few_shot["code"])) + + return '\n'.join(few_shots_prompts) + + def get_prompt(self): + """ + Builds the prompt for the LM to generate from. + """ + + assert self.prompt_type == "Instruction", "Prompt type must be Instruction for mbpp" + + if self.split == "full": + test_start = 10 + test_end = 510 + elif self.split == "sanitized": + test_start = 0 + test_end = 974 + + prompts = [] + + for task_id, task_data in self.tasks.items(): + if task_id >= test_start and task_id < test_end: + + prompt = self.few_shots_prompt + '\n' + self.format_prompt(task_data["text"], task_data["test_list"]) + prompts.append({ + 'task_id': task_id, + 'prompt': prompt + }) + + return prompts + + def postprocess_generation(self, generation): + """ + Postprocess the generations. 
+ """ + + if generation['completion'].startswith(self.few_shots_prompt): + generation['completion'] = generation['completion'][len(self.few_shots_prompt):] + + # if "```python" not in generation['completion']: + # generation['completion'] = "" + + return dict( + task_id = generation['task_id'], + completion_id = generation['completion_id'], + solution = sanitize(generation['completion']) + ) + + def process_results(self, solution): + """ + Takes the list of LM generations and evaluates them against the test cases + """ + + task_data = self.tasks[solution['task_id']] + + code = ( + "\n".join(self.imports_code) + "\n" + + task_data['test_setup_code'] + "\n" + + solution['solution'] + "\n" + + "\n".join(task_data['test_list']) + ) + + result = check_correctness(solution['task_id'], + solution['completion_id'], + code, + self.time_out) + + return result diff --git a/code_eval/OpenCodeEval/benchmark/understandml.py b/code_eval/OpenCodeEval/benchmark/understandml.py new file mode 100644 index 0000000..590db3f --- /dev/null +++ b/code_eval/OpenCodeEval/benchmark/understandml.py @@ -0,0 +1,152 @@ +import os + +from OpenCodeEval.benchmark.base import Benchmark, PYTHON_STOP, PYTHON_IMPORTS +from OpenCodeEval.utils import program_extract, stream_jsonl +from OpenCodeEval.eval.func_eval import check_correctness +from OpenCodeEval.eval.sanitize import sanitize + +from typing import List, Literal + +from typing import * +from functools import * +from collections import * +from itertools import * +from heapq import * +from bisect import * +from string import * +from operator import * +from math import * + +import numpy as np +from numpy import * + +import datetime +import copy + +inf = float('inf') + +if os.environ.get("MAX_LINES"): + MAX_LINES = int(os.environ.get("MAX_LINES")) +else: + MAX_LINES = 200 + +def base_prompt(data): + prompt = 'You are an expert Python programmer, and here is your task:\n' + prompt = prompt + f'# Task: {data["title"]}\n' + prompt = prompt + f'# Description:\n{data["description"]}\n' + # prompt = prompt + f'# Examples:\n' + # for example_idx, (example, reasoning) in enumerate(zip(data["examples"], data["reasoning"])): + # prompt = prompt + f'## Example {example_idx + 1}:\n' + # prompt = prompt + f'### Input:\n{example["input"]}\n' + # prompt = prompt + f'### Output:\n{example["output"]}\n' + # prompt = prompt + f'### Reasoning:\n{reasoning}\n' + input_code = (data["import_code"] + "\n" + data["starter_code"]).strip() + prompt = prompt + f'# Your code should start with:\n```python\n{input_code}\n```\n' + if data['output_constrains'].strip(): + prompt = prompt + f'# Output Constraints:\n{data["output_constrains"].strip()}\n' + + return prompt + + + +class understandml(Benchmark): + + name: str = "understandml" + + imports_code = PYTHON_IMPORTS + chat_stop = PYTHON_STOP + base_stop = ['\n"""', "\nassert"] + + def __init__( + self, + split: Literal["human", "model"] = "model", + time_out: float = 3.0, + prompt_type: str = "Instruction" + ): + + super().__init__() + + self.split = split + self.time_out = time_out + self.prompt_type = prompt_type + + self.path = os.path.join(self.path, f"{self.name}/{self.split}_benchmark.jsonl") + self.tasks = self.get_task() + + def get_task(self): + """ + Get the task data from the jsonl file into a dictionary. + """ + + tasks = {} + + for task_data in stream_jsonl(filename = self.path): + task_id = int(task_data["id"]) + + tasks[task_id] = task_data + + return tasks + + def get_prompt(self): + """ + Builds the prompt for the LM to generate from. 
+ """ + + assert self.prompt_type == "Instruction", "Prompt type must be Instruction for mbpp" + + prompts = [] + + for task_id, task_data in self.tasks.items(): + + prompt = base_prompt(task_data) + prompts.append({ + 'task_id': task_id, + 'prompt': prompt + }) + + return prompts + + def postprocess_generation(self, generation): + """ + Postprocess the generations. + """ + + entry_point = self.tasks[generation['task_id']]["entry_point"] + + try: + completion = '\n'.join(generation['completion'].splitlines()[-MAX_LINES:]) + + if '</think>' in completion: + completion = completion.split('</think>')[1] + + solution = sanitize(completion, entry_point) + except Exception: + solution = program_extract(generation['completion'], program="python", mode="all") + + result = dict( + task_id = generation['task_id'], + completion_id = generation['completion_id'], + solution = solution + ) + + return result + + def process_results(self, solution): + """ + Takes the list of LM generations and evaluates them against the test cases + """ + + task_data = self.tasks[solution['task_id']] + + code = ( + task_data['import_code'] + "\n" + + solution['solution'] + "\n" + + "\n".join(task_data['test_cases']) + ) + + result = check_correctness(solution['task_id'], + solution['completion_id'], + code, + self.time_out) + + return result
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/data/dataset.sh b/code_eval/OpenCodeEval/data/dataset.sh new file mode 100755 index 0000000..0e85f8f --- /dev/null +++ b/code_eval/OpenCodeEval/data/dataset.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# 定义要下载的文件的 URL 列表 +urls=( + "https://github.com/bigcode-project/bigcodebench-annotation/releases/download/v0.1.1/BigCodeBench-Hard.jsonl.gz" + "https://github.com/bigcode-project/bigcodebench-annotation/releases/download/v0.1.1/BigCodeBench.jsonl.gz" + "https://github.com/evalplus/humanevalplus_release/releases/download/v0.1.10/HumanEvalPlus.jsonl.gz" + "https://github.com/evalplus/mbppplus_release/releases/download/v0.2.0/MbppPlus.jsonl.gz" + "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz" +) + +# 下载并解压每个文件 +for url in "${urls[@]}"; do + # 获取文件名 + filename=$(basename "$url") + + # 删除已有的压缩文件和解压后的文件,确保不会重复 + [ -f "$filename" ] && rm "$filename" + [ -f "${filename%.gz}" ] && rm "${filename%.gz}" + + echo "Downloading $url..." + wget "$url" + + echo "Unzipping $filename..." + gunzip "$filename" +done + +echo "All files have been downloaded and unzipped." diff --git a/code_eval/OpenCodeEval/eval/__init__.py b/code_eval/OpenCodeEval/eval/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/code_eval/OpenCodeEval/eval/__init__.py diff --git a/code_eval/OpenCodeEval/eval/func_eval.py b/code_eval/OpenCodeEval/eval/func_eval.py new file mode 100644 index 0000000..493c0c6 --- /dev/null +++ b/code_eval/OpenCodeEval/eval/func_eval.py @@ -0,0 +1,232 @@ +import io +import os + +import signal +import tempfile +import platform +import contextlib +import faulthandler +import multiprocessing + +from typing import Optional, Callable, Dict + + +# WARNING +# This program exists to execute untrusted model-generated code. Although +# it is highly unlikely that model-generated code will do something overtly +# malicious in response to this test suite, model-generated code may act +# destructively due to a lack of model capability or alignment. +# Users are strongly encouraged to sandbox this evaluation suite so that it +# does not perform destructive actions on their host or network. For more +# information on how OpenAI sandboxes its code, see the accompanying paper. +# Once you have read this disclaimer and taken appropriate precautions, +# uncomment the 58 line and proceed at your own risk + +def check_correctness(task_id: int, + completion_id: int, + solution: str, + time_out: float, + ) -> Dict: + """ + Evaluates the functional correctness of a completion by running the test + suite provided in the problem. + + :param completion_id: an optional completion ID so we can match + the results later even if execution finishes asynchronously. + """ + + def unsafe_execute(): + + with create_tempdir(): + + # These system calls are needed when cleaning up tempdir. + import os + import shutil + rmtree = shutil.rmtree + rmdir = os.rmdir + chdir = os.chdir + + # Disable functionalities that can make destructive changes to the test. + reliability_guard() + + # Construct the check program and run it. + check_program = solution + + try: + exec_globals = {} + with swallow_io(): + with time_limit(time_out): + exec(check_program, exec_globals) + result.append("passed") + except TimeoutException: + result.append("timed out") + except BaseException as e: + result.append(f"failed: {e}") + + # Needed for cleaning up. 
+ shutil.rmtree = rmtree + os.rmdir = rmdir + os.chdir = chdir + + manager = multiprocessing.Manager() + result = manager.list() + + p = multiprocessing.Process(target=unsafe_execute) + p.start() + p.join(timeout=time_out + 1) + if p.is_alive(): + p.kill() + + if not result: + result.append("timed out") + + return dict( + task_id = task_id, + completion_id = completion_id, + passed = result[0] == "passed", + result = result[0], + solution = solution + ) + + +@contextlib.contextmanager +def time_limit(seconds: float): + def signal_handler(signum, frame): + raise TimeoutException("Timed out!") + signal.setitimer(signal.ITIMER_REAL, seconds) + signal.signal(signal.SIGALRM, signal_handler) + try: + yield + finally: + signal.setitimer(signal.ITIMER_REAL, 0) + + +@contextlib.contextmanager +def swallow_io(): + stream = WriteOnlyStringIO() + with contextlib.redirect_stdout(stream): + with contextlib.redirect_stderr(stream): + with redirect_stdin(stream): + yield + + +@contextlib.contextmanager +def create_tempdir(): + with tempfile.TemporaryDirectory() as dirname: + with chdir(dirname): + yield dirname + + +class TimeoutException(Exception): + pass + + +class WriteOnlyStringIO(io.StringIO): + """ StringIO that throws an exception when it's read from """ + + def read(self, *args, **kwargs): + raise IOError + + def readline(self, *args, **kwargs): + raise IOError + + def readlines(self, *args, **kwargs): + raise IOError + + def readable(self, *args, **kwargs): + """ Returns True if the IO object can be read. """ + return False + + +class redirect_stdin(contextlib._RedirectStream): # type: ignore + _stream = 'stdin' + + +@contextlib.contextmanager +def chdir(root): + if root == ".": + yield + return + cwd = os.getcwd() + os.chdir(root) + try: + yield + except BaseException as exc: + raise exc + finally: + os.chdir(cwd) + + +def reliability_guard(maximum_memory_bytes: Optional[int] = None): + """ + This disables various destructive functions and prevents the generated code + from interfering with the test (e.g. fork bomb, killing other processes, + removing filesystem files, etc.) + + WARNING + This function is NOT a security sandbox. Untrusted code, including, model- + generated code, should not be blindly executed outside of one. See the + Codex paper for more information about OpenAI's code sandbox, and proceed + with caution. 
+ """ + + if maximum_memory_bytes is not None: + import resource + resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)) + resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)) + if not platform.uname().system == 'Darwin': + resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)) + + faulthandler.disable() + + import builtins + builtins.exit = None + builtins.quit = None + + import os + os.environ['OMP_NUM_THREADS'] = '1' + + os.kill = None + os.system = None + os.putenv = None + os.remove = None + os.removedirs = None + os.rmdir = None + os.fchdir = None + os.setuid = None + os.fork = None + os.forkpty = None + os.killpg = None + os.rename = None + os.renames = None + os.truncate = None + os.replace = None + os.unlink = None + os.fchmod = None + os.fchown = None + os.chmod = None + os.chown = None + os.chroot = None + os.fchdir = None + os.lchflags = None + os.lchmod = None + os.lchown = None + os.getcwd = None + os.chdir = None + + import shutil + shutil.rmtree = None + shutil.move = None + shutil.chown = None + + import subprocess + subprocess.Popen = None # type: ignore + + __builtins__['help'] = None + + import sys + sys.modules['ipdb'] = None + sys.modules['joblib'] = None + sys.modules['resource'] = None + sys.modules['psutil'] = None + sys.modules['tkinter'] = None
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/eval/sanitize.py b/code_eval/OpenCodeEval/eval/sanitize.py new file mode 100644 index 0000000..a82ea65 --- /dev/null +++ b/code_eval/OpenCodeEval/eval/sanitize.py @@ -0,0 +1,196 @@ +"""Post-processing LLM-generated Python code implemented using AST.""" +import ast +import traceback +import signal + +from typing import Dict, List, Optional, Set, Tuple +from OpenCodeEval.utils import refine_text + +def syntax_check(code, verbose = False): + try: + ast.parse(code) + return True + except (SyntaxError, MemoryError): + if verbose: + traceback.print_exc() + return False + +def extract_longest_valid_code(text: str) -> str: + lines = text.splitlines() + + max_valid_lines = 0 + max_valid_snippet = "" + + for i in range(len(lines)): + for j in range(i, len(lines)): + current_snippet = "\n".join(lines[i:j+1]) + if syntax_check(current_snippet): + valid_line_count = sum(1 for line in lines[i:j+1] if line.strip()) + if valid_line_count > max_valid_lines: + max_valid_lines = valid_line_count + max_valid_snippet = current_snippet + + return max_valid_snippet + +def get_deps(nodes: List[Tuple[str, ast.AST]]) -> Dict[str, Set[str]]: + name2deps = {} + for name, node in nodes: + deps = set() + stack = [node] + while stack: + current = stack.pop() + for child in ast.iter_child_nodes(current): + if isinstance(child, ast.Name): + deps.add(child.id) + elif isinstance(child, ast.Attribute): + deps.add(child.attr) + else: + stack.append(child) + name2deps[name] = deps + return name2deps + +def get_function_dependency(entrypoint: str, call_graph: Dict[str, Set[str]]) -> Set[str]: + visited = set() + to_visit = [entrypoint] + + while to_visit: + current = to_visit.pop(0) + if current not in visited: + visited.add(current) + to_visit.extend(call_graph.get(current, set()) - visited) + + return visited + +def get_definition_name(node: ast.AST) -> Optional[str]: + if isinstance(node, (ast.FunctionDef, ast.ClassDef)): + return node.name + elif isinstance(node, ast.Assign): + targets = node.targets + if targets and isinstance(targets[0], ast.Name): + return targets[0].id + return None + +def has_return_statement(node: ast.AST) -> bool: + return any(isinstance(n, ast.Return) for n in ast.walk(node)) + +def has_yield_statement(node: ast.AST) -> bool: + return any(isinstance(n, ast.Yield) for n in ast.walk(node)) + +class TimeoutException(Exception): pass + +def _timeout_handler(signum, frame): + raise TimeoutException() +""" +def sanitize(text: str, entrypoint: Optional[str] = None) -> str: + # 设置信号处理器 + signal.signal(signal.SIGALRM, _timeout_handler) + signal.alarm(30) # 30秒后发送SIGALRM + + text = refine_text(text) + + try: + code = extract_longest_valid_code(text) + + tree = ast.parse(code) + + definitions = {} + + imports = [] + + for node in tree.body: + if isinstance(node, (ast.Import, ast.ImportFrom)): + imports.append(node) + elif isinstance(node, ast.ClassDef): + name = node.name + definitions[name] = ('class', node) + elif isinstance(node, ast.FunctionDef): + name = node.name + if has_return_statement(node) or has_yield_statement(node): + definitions[name] = ('function', node) + elif isinstance(node, ast.Assign): + name = get_definition_name(node) + if name: + definitions[name] = ('variable', node) + + if entrypoint: + name2deps = get_deps([(name, node) for name, (_, node) in definitions.items()]) + reachable = get_function_dependency(entrypoint, name2deps) + + sanitized_output = [] + + for node in imports: + sanitized_output.append(ast.unparse(node)) + 
+        for name, (_, node) in definitions.items():
+            if not entrypoint or name in reachable:
+                sanitized_output.append(ast.unparse(node))
+
+        return "\n".join(sanitized_output)
+
+    except Exception as e:
+        print(f"Error extracting longest valid code: {e}")
+        return ""
+    finally:
+        signal.alarm(0)  # cancel the timer
+"""
+from multiprocessing import Process, Queue
+from queue import Empty
+
+def sanitize(text: str, entrypoint: Optional[str] = None) -> str:
+    def _sanitize_worker(q, text, entrypoint):
+        try:
+            # Same extraction logic as the signal-based version kept above
+            text = refine_text(text)
+            code = extract_longest_valid_code(text)
+            tree = ast.parse(code)
+            definitions = {}
+            imports = []
+            for node in tree.body:
+                if isinstance(node, (ast.Import, ast.ImportFrom)):
+                    imports.append(node)
+                elif isinstance(node, ast.ClassDef):
+                    name = node.name
+                    definitions[name] = ('class', node)
+                elif isinstance(node, ast.FunctionDef):
+                    name = node.name
+                    if has_return_statement(node) or has_yield_statement(node):
+                        definitions[name] = ('function', node)
+                elif isinstance(node, ast.Assign):
+                    name = get_definition_name(node)
+                    if name:
+                        definitions[name] = ('variable', node)
+
+            if entrypoint:
+                name2deps = get_deps([(name, node) for name, (_, node) in definitions.items()])
+                reachable = get_function_dependency(entrypoint, name2deps)
+
+            sanitized_output = []
+            for node in imports:
+                sanitized_output.append(ast.unparse(node))
+            for name, (_, node) in definitions.items():
+                if not entrypoint or name in reachable:
+                    sanitized_output.append(ast.unparse(node))
+
+            q.put("\n".join(sanitized_output))
+
+        except Exception as e:
+            print(f"Error extracting longest valid code: {e}")
+            q.put("")
+
+    # Run the worker in a separate process so the extraction can be timed out
+    q = Queue()
+    p = Process(target=_sanitize_worker, args=(q, text, entrypoint))
+    p.start()
+
+    try:
+        # Wait up to 3 seconds for the result
+        result = q.get(timeout=3)
+
+        p.terminate()
+        p.join()  # make sure the worker process has exited
+        return result
+    except Empty:
+        # Queue.get raises queue.Empty (not multiprocessing.TimeoutError) on timeout
+        print("Function timed out after 3 seconds")
+        p.terminate()  # force-terminate the worker
+        p.join()
+        return ""
+
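For reference, below is a minimal usage sketch of `sanitize`, not part of the patch itself. It assumes the package is installed so that `OpenCodeEval.eval.sanitize` is importable, and it relies on the default fork start method (the worker is a nested function and cannot be pickled under spawn); the input string is hypothetical.

```python
# Hypothetical usage sketch of sanitize(); assumes OpenCodeEval is importable
# and the fork start method (Linux default) for the subprocess-based timeout.
from OpenCodeEval.eval.sanitize import sanitize

raw = (
    "Sure, here is a solution.\n"
    "import math\n"
    "def helper(x):\n"
    "    return x * 2\n"
    "def solve(n):\n"
    "    return helper(n) + math.floor(n / 2)\n"
    "Hope this helps!\n"
)

# Keeps the import plus the definitions reachable from `solve`; the surrounding prose is dropped.
print(sanitize(raw, entrypoint="solve"))
```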
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/eval/sql_test.py b/code_eval/OpenCodeEval/eval/sql_test.py new file mode 100644 index 0000000..9b0724e --- /dev/null +++ b/code_eval/OpenCodeEval/eval/sql_test.py @@ -0,0 +1,187 @@ +import re +import random +import sqlite3 + +from loguru import logger +from itertools import product +from collections import defaultdict +from typing import Tuple, List, Set +from func_timeout import func_timeout, FunctionTimedOut + +def permute_tuple(element: Tuple, perm: Tuple) -> Tuple: + assert len(element) == len(perm) + return tuple([element[i] for i in perm]) + + +def unorder_row(row: Tuple) -> Tuple: + return tuple(sorted(row, key=lambda x: str(x) + str(type(x)))) + + +# unorder each row in the table +# [result_1 and result_2 has the same bag of unordered row] +# is a necessary condition of +# [result_1 and result_2 are equivalent in denotation] +def quick_rej(result1: List[Tuple], result2: List[Tuple], order_matters: bool) -> bool: + s1 = [unorder_row(row) for row in result1] + s2 = [unorder_row(row) for row in result2] + if order_matters: + return s1 == s2 + else: + return set(s1) == set(s2) + +# return whether two bag of relations are equivalent +def multiset_eq(l1: List, l2: List) -> bool: + if len(l1) != len(l2): + return False + d = defaultdict(int) + for e in l1: + d[e] = d[e] + 1 + for e in l2: + d[e] = d[e] - 1 + if d[e] < 0: + return False + return True + +def get_constraint_permutation(tab1_sets_by_columns: List[Set], result2: List[Tuple]): + num_cols = len(result2[0]) + perm_constraints = [{i for i in range(num_cols)} for _ in range(num_cols)] + if num_cols <= 3: + return product(*perm_constraints) + + # we sample 20 rows and constrain the space of permutations + for _ in range(20): + random_tab2_row = random.choice(result2) + + for tab1_col in range(num_cols): + for tab2_col in set(perm_constraints[tab1_col]): + if random_tab2_row[tab2_col] not in tab1_sets_by_columns[tab1_col]: + perm_constraints[tab1_col].remove(tab2_col) + return product(*perm_constraints) + +# check whether two denotations are correct +def result_eq(result1: List[Tuple], result2: List[Tuple], order_matters: bool) -> bool: + if len(result1) == 0 and len(result2) == 0: + return True + + # if length is not the same, then they are definitely different bag of rows + if len(result1) != len(result2): + return False + + num_cols = len(result1[0]) + + # if the results do not have the same number of columns, they are different + if len(result2[0]) != num_cols: + return False + + # unorder each row and compare whether the denotation is the same + # this can already find most pair of denotations that are different + if not quick_rej(result1, result2, order_matters): + return False + + # the rest of the problem is in fact more complicated than one might think + # we want to find a permutation of column order and a permutation of row order, + # s.t. 
result_1 is the same as result_2
+    # we return true if we can find such column & row permutations
+    # and false if we cannot
+    tab1_sets_by_columns = [{row[i] for row in result1} for i in range(num_cols)]
+
+    # on a high level, we enumerate all possible column permutations that might make result_1 == result_2
+    # we shrink the column-permutation search space with get_constraint_permutation
+    # if one of the permutations makes result_1 and result_2 equivalent, then they are equivalent
+    for perm in get_constraint_permutation(tab1_sets_by_columns, result2):
+        if len(perm) != len(set(perm)):
+            continue
+        if num_cols == 1:
+            result2_perm = result2
+        else:
+            result2_perm = [permute_tuple(element, perm) for element in result2]
+        if order_matters:
+            if result1 == result2_perm:
+                return True
+        else:
+            # in fact the first condition must hold if the second condition holds
+            # but the first is way more efficient implementation-wise
+            # and we use it to quickly reject impossible candidates
+            if set(result1) == set(result2_perm) and multiset_eq(result1, result2_perm):
+                return True
+    return False
+
+# postprocess the model predictions to avoid execution errors
+# e.g. removing spaces between ">" and "="
+def postprocess(query: str) -> str:
+    query = query.replace("> =", ">=").replace("< =", "<=").replace("! =", "!=")
+    return query
+
+def find_detail(reason):
+    patterns = [
+        "ambiguous column name",
+        "no such column",
+        "no such table",
+        "no such function",
+        "syntax error",
+    ]
+
+    for p in patterns:
+        if p in reason:
+            return p
+    return "others"
+
+def execute_sql(predicted_sql, ground_truth, db_path, method):
+    # Connect to the database and run both queries
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.execute(predicted_sql)
+    predicted_res = cursor.fetchall()
+    cursor.execute(ground_truth)
+    ground_truth_res = cursor.fetchall()
+    conn.close()
+    result = "failed"
+    passed = False
+
+    if predicted_res is None:
+        sql_return = [[]]
+    else:
+        sql_return = predicted_res
+
+    if method == "set_match":
+
+        if set(predicted_res) == set(ground_truth_res):
+            result = "passed"
+            passed = True
+        return result, passed, sql_return
+
+    elif method == "exact_match":
+
+        # strip whitespace before looking for ORDER BY in the gold query
+        order_matters = "orderby" in re.sub(r"\s+", "", ground_truth.lower())
+        if result_eq(predicted_res, ground_truth_res, order_matters = order_matters):
+            result = "passed"
+            passed = True
+        else:
+            passed = False
+
+        return result, passed, sql_return
+
+    else:
+        logger.error(f"Unknown evaluation method: {method}")
+        return result, passed, sql_return
+
+def check_correctness(predicted_sql, ground_truth, db_path, meta_time_out, method):
+    try:
+        result, passed, sql_return = func_timeout(
+            meta_time_out,
+            execute_sql,
+            args = (
+                predicted_sql,
+                ground_truth,
+                db_path,
+                method
+            )
+        )
+    except FunctionTimedOut:
+        result = "timeout"
+        passed = False
+        sql_return = [[]]
+    except Exception as e:
+        result = f"error:{e}"
+        passed = False
+        sql_return = [[]]
+
+    return result, passed, sql_return
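As a quick illustration of the equivalence check above (hypothetical rows, not benchmark data; the import path assumes the package layout shown in this patch), `result_eq` accepts results whose columns are permuted but rejects results whose rows differ:

```python
# Illustrative sketch only.
from OpenCodeEval.eval.sql_test import result_eq

r1 = [(1, "alice"), (2, "bob")]
r2 = [("alice", 1), ("bob", 2)]   # same rows, columns swapped

print(result_eq(r1, r2, order_matters=False))                            # True: a column permutation aligns them
print(result_eq(r1, [(1, "alice"), (2, "carol")], order_matters=False))  # False: the rows differ
```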
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/eval/unit_test.py b/code_eval/OpenCodeEval/eval/unit_test.py new file mode 100644 index 0000000..5dd1f49 --- /dev/null +++ b/code_eval/OpenCodeEval/eval/unit_test.py @@ -0,0 +1,472 @@ +# The MIT License +# +# Copyright (c) OpenAI (https://openai.com) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +import io +import os +import sys +import time +import types +import unittest +import contextlib +import faulthandler +import platform +import signal +import tempfile +import subprocess +import multiprocessing +import numpy as np + +from multiprocessing import Array, Value, Manager +from typing import Any, Dict, List, Tuple, Union + +@contextlib.contextmanager +def swallow_subprocess_output(): + """Context manager to swallow stdout and stderr for subprocesses.""" + original_popen = subprocess.Popen + original_run = subprocess.run + + def _popen_patch(*args, **kwargs): + if 'capture_output' in kwargs and kwargs['capture_output']: + # Avoid setting stdout or stderr if capture_output is True + kwargs.pop('stdout', None) + kwargs.pop('stderr', None) + else: + kwargs.setdefault('stdout', subprocess.PIPE) + kwargs.setdefault('stderr', subprocess.PIPE) + return original_popen(*args, **kwargs) + + def _run_patch(*args, **kwargs): + if 'capture_output' in kwargs and kwargs['capture_output']: + # Avoid setting stdout or stderr if capture_output is True + kwargs.pop('stdout', None) + kwargs.pop('stderr', None) + else: + kwargs.setdefault('stdout', subprocess.PIPE) + kwargs.setdefault('stderr', subprocess.PIPE) + return original_run(*args, **kwargs) + + subprocess.Popen = _popen_patch + subprocess.run = _run_patch + try: + yield + finally: + subprocess.Popen = original_popen + subprocess.run = original_run + +@contextlib.contextmanager +def swallow_io(): + stream = WriteOnlyStringIO() + with contextlib.redirect_stdout(stream): + with contextlib.redirect_stderr(stream): + with redirect_stdin(stream): + with swallow_subprocess_output(): + yield + + +@contextlib.contextmanager +def time_limit(seconds: float): + def signal_handler(signum, frame): + raise TimeoutException("Timed out!") + + signal.setitimer(signal.ITIMER_REAL, seconds) + signal.signal(signal.SIGALRM, signal_handler) + try: + yield + finally: + signal.setitimer(signal.ITIMER_REAL, 0) + + +@contextlib.contextmanager +def create_tempdir(): + with tempfile.TemporaryDirectory() as dirname: + with chdir(dirname): + yield dirname + + +@contextlib.contextmanager +def chdir(root): + 
if root == ".": + yield + return + cwd = os.getcwd() + os.chdir(root) + try: + yield + except BaseException as exc: + raise exc + finally: + os.chdir(cwd) + + +@contextlib.contextmanager +def safe_environment(): + # Save original functions + original_kill = os.kill + original_killpg = os.killpg + original_system = os.system + original_subprocess_call = subprocess.call + original_subprocess_check_output = subprocess.check_output + original_subprocess_run = subprocess.run + original_subprocess_popen = subprocess.Popen + original_os_popen = os.popen + original_os_execv = os.execv + original_os_execvp = os.execvp + original_os_execvpe = os.execvpe + + current_pid = os.getpid() + current_pgid = os.getpgid(current_pid) + manager = multiprocessing.Manager() + child_pids = manager.list() + + def safe_kill(pid, sig): + try: + pgid = os.getpgid(pid) + if pid == current_pid or pid in child_pids: + original_kill(pid, sig) + else: + print(f"Prevented attempt to kill PID {pid} with signal {sig}") + except ProcessLookupError: + pass + + def safe_killpg(pgid, sig): + if pgid == current_pgid or pgid in {os.getpgid(pid) for pid in child_pids}: + original_killpg(pgid, sig) + else: + print(f"Prevented attempt to kill PGID {pgid} with signal {sig}") + + def safe_system(command): + print(f"Intercepted system command: {command}") + if 'kill' in command or 'killall' in command: + return 0 # Simulate successful execution without doing anything + return original_system(command) + + def safe_subprocess_call(command, *args, **kwargs): + print(f"Intercepted subprocess call: {command}") + if 'kill' in command or 'killall' in command: + return 0 # Simulate successful execution without doing anything + return original_subprocess_call(command, *args, **kwargs) + + def safe_subprocess_check_output(command, *args, **kwargs): + print(f"Intercepted command: {command}") + if 'ps' in command: + return b"" # Simulate no processes found + return original_subprocess_check_output(command, *args, **kwargs) + + def safe_subprocess_run(*args, **kwargs): + print(f"Intercepted subprocess run command: {args}") + if 'kill' in args[0] or 'killall' in args[0]: + return subprocess.CompletedProcess(args, 0, b'', b'') # Simulate successful execution + return original_subprocess_run(*args, **kwargs) + + class SafePopen(subprocess.Popen): + def __init__(self, *args, **kwargs): + print(f"Intercepted Popen command: {args}") + kwargs['preexec_fn'] = os.setsid # Start the process in a new session + super().__init__(*args, **kwargs) + child_pids.append(self.pid) + + def communicate(self, *args, **kwargs): + try: + return super().communicate(*args, **kwargs) + except subprocess.TimeoutExpired: + print("Timeout expired, intercepted and returning None") + return None, None + + def kill(self): + print(f"Intercepted kill call for PID {self.pid}") + safe_kill(self.pid, signal.SIGTERM) + + def terminate(self): + print(f"Intercepted terminate call for PID {self.pid}") + safe_kill(self.pid, signal.SIGTERM) + + def safe_os_popen(command): + print(f"Intercepted os.popen command: {command}") + if 'kill' in command or 'killall' in command: + return os.popen('echo Intercepted') + return original_os_popen(command) + + def safe_exec(*args, **kwargs): + print(f"Intercepted exec command: {args}") + + # Override the risky functions with the safe versions + os.kill = safe_kill + os.killpg = safe_killpg + os.system = safe_system + subprocess.call = safe_subprocess_call + subprocess.check_output = safe_subprocess_check_output + subprocess.run = safe_subprocess_run + 
subprocess.Popen = SafePopen + os.popen = safe_os_popen + os.execv = safe_exec + os.execvp = safe_exec + os.execvpe = safe_exec + + try: + yield + finally: + for pid in child_pids: + try: + os.kill(pid, signal.SIGTERM) + for _ in range(10): + time.sleep(0.1) + try: + os.kill(pid, 0) + except ProcessLookupError: + break + else: + os.kill(pid, signal.SIGKILL) + except ProcessLookupError: + pass + except Exception as e: + print(f"Error handling process {pid}: {e}") + + os.kill = original_kill + os.killpg = original_killpg + os.system = original_system + subprocess.call = original_subprocess_call + subprocess.check_output = original_subprocess_check_output + subprocess.run = original_subprocess_run + subprocess.Popen = original_subprocess_popen + os.popen = original_os_popen + os.execv = original_os_execv + os.execvp = original_os_execvp + os.execvpe = original_os_execvpe + + +class TimeoutException(Exception): + pass + + +class WriteOnlyStringIO(io.StringIO): + """StringIO that throws an exception when it's read from""" + + def read(self, *args, **kwargs): + raise IOError + + def readline(self, *args, **kwargs): + raise IOError + + def readlines(self, *args, **kwargs): + raise IOError + + def readable(self, *args, **kwargs): + """Returns True if the IO object can be read.""" + return False + + +class redirect_stdin(contextlib._RedirectStream): # type: ignore + _stream = "stdin" + + +def reliability_guard(max_as_limit, max_data_limit, max_stack_limit): + """ + This disables various destructive functions and prevents the generated code + from interfering with the test (e.g. fork bomb, killing other processes, + removing filesystem files, etc.) + + WARNING + This function is NOT a security sandbox. Untrusted code, including, model- + generated code, should not be blindly executed outside of one. See the + Codex paper for more information about OpenAI's code sandbox, and proceed + with caution. + """ + + import os + import time + from datetime import datetime + + os.environ['TZ'] = 'UTC' + time.tzset() + + os.environ["OMP_NUM_THREADS"] = "1" + os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" + os.environ['TF_ENABLE_ONEDNN_OPTS'] = "0" + + if max_as_limit and max_data_limit and max_stack_limit: + import resource + + max_as_limit = max_as_limit * 1024 * 1024 + max_data_limit = max_data_limit * 1024 * 1024 + max_stack_limit = max_stack_limit * 1024 * 1024 + + resource.setrlimit( + resource.RLIMIT_AS, (max_as_limit, max_as_limit) + ) + resource.setrlimit( + resource.RLIMIT_DATA, (max_data_limit, max_data_limit) + ) + if not platform.uname().system == "Darwin": + resource.setrlimit( + resource.RLIMIT_STACK, (max_stack_limit, max_stack_limit) + ) + + faulthandler.disable() + + import builtins + + builtins.exit = None + builtins.quit = None + + import matplotlib.pyplot as plt + plt.close('all') + + +PASS = "pass" +FAIL = "fail" +TIMEOUT = "timeout" + +_SUCCESS = 0 +_FAILED = 1 +_TIMEOUT = 2 +_UNKNOWN = 3 + +_mapping = {_SUCCESS: PASS, _FAILED: FAIL, _TIMEOUT: TIMEOUT, _UNKNOWN: None} + + +def unsafe_execute( + code: str, + test_code: str, + timeout: float, + stat, # Value + details, # Array +): + with safe_environment(), create_tempdir(): + # These system calls are needed when cleaning up tempdir. + import os + import shutil + import builtins + + rmtree = shutil.rmtree + rmdir = os.rmdir + chdir = os.chdir + # Disable functionalities that can make destructive changes to the test. 
+ reliability_guard(max_as_limit = 30720, max_data_limit = 30720, max_stack_limit = 10) + module_name = "__test__" + new_module = types.ModuleType(module_name) + # Set necessary attributes for the module + new_module.__dict__.update({ + '__builtins__': builtins, + '__file__': f"{module_name}.py", + '__package__': None, + '__doc__': None, + 'sys': sys, + 'os': os, + 'environ': os.environ, + }) + + try: + full_code = code + "\n" + test_code + + with swallow_io(): + exec(compile(full_code, f"{module_name}.py", 'exec'), new_module.__dict__) + sys.modules[module_name] = new_module + TestCases = getattr(new_module, 'TestCases') + loader = unittest.TestLoader() + suite = loader.loadTestsFromTestCase(TestCases) + test_result = unittest.TestResult() + + with time_limit(timeout): + suite.run(test_result) + + issues = test_result.failures + test_result.errors + for test, trace in issues: + details[test.id().split(".")[-1]] = trace + stat.value = _SUCCESS + except BaseException as e: + details["ALL"] = str(e) + stat.value = _FAILED + # Needed for cleaning up. + shutil.rmtree = rmtree + os.rmdir = rmdir + os.chdir = chdir + + +import psutil + +def terminate_process_tree(pid): + try: + parent = psutil.Process(pid) + children = parent.children(recursive=True) + for child in children: + try: + if child.is_running(): + os.kill(child.pid, signal.SIGKILL) + except psutil.NoSuchProcess: + continue + if parent.is_running(): + os.kill(parent.pid, signal.SIGKILL) + except psutil.NoSuchProcess: + pass + +def check_correctness( + task_id: int, + solution_id: int, + solution: str, + test: str, + timeout: float, +) -> Tuple[str, np.ndarray]: + + result = { + "task_id": task_id, + "solution_id": solution_id + } + + # shared memory objects + stat = Value("i", _UNKNOWN) + manager = Manager() + details = manager.dict() + + p = multiprocessing.Process( + target=unsafe_execute, + args=( + solution, + test, + timeout, + stat, + details, + ), + ) + + p.start() + p.join(timeout=timeout+1) + + if p.is_alive(): + terminate_process_tree(p.pid) + stat.value = _TIMEOUT + + stat = _mapping[stat.value] + details = dict(details) + + if not stat: + stat = TIMEOUT + if stat == PASS: + if details: + stat = FAIL + + result["passed"] = stat == PASS + result["result"] = details + result["solution"] = solution + + manager.shutdown() + + return result
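A minimal, hypothetical driver for `check_correctness` is sketched below. It assumes the package and its evaluation dependencies (numpy, psutil, matplotlib) are installed and a fork-capable platform; note the test class must be named `TestCases`, since the executor looks it up by that name.

```python
# Sketch only: runs a trivial solution against a trivial unittest suite.
from OpenCodeEval.eval.unit_test import check_correctness

solution = "def add(a, b):\n    return a + b\n"
test = (
    "import unittest\n"
    "class TestCases(unittest.TestCase):\n"
    "    def test_add(self):\n"
    "        self.assertEqual(add(1, 2), 3)\n"
)

if __name__ == "__main__":
    result = check_correctness(task_id=0, solution_id=0, solution=solution, test=test, timeout=5.0)
    print(result["passed"])   # True when the suite runs with no failures or errors
```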
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/factory.py b/code_eval/OpenCodeEval/factory.py new file mode 100644 index 0000000..5d3fefc --- /dev/null +++ b/code_eval/OpenCodeEval/factory.py @@ -0,0 +1,89 @@ +from loguru import logger + +from OpenCodeEval.benchmark.HumanEval import HumanEval +from OpenCodeEval.benchmark.mbpp import mbpp +from OpenCodeEval.benchmark.MBPP import MBPP +from OpenCodeEval.benchmark.LeetCode import LeetCode +from OpenCodeEval.benchmark.BigCodeBench import BigCodeBench +from OpenCodeEval.benchmark.Bird import Bird +from OpenCodeEval.benchmark.Spider import Spider + +from OpenCodeEval.benchmark.understandml import understandml + +from OpenCodeEval.backend.vllm import VllmGenerator +from OpenCodeEval.backend.sglang import SglangGenerator +from OpenCodeEval.backend.openai import OpenaiGenerator + +class BenchmarkFactory: + + @staticmethod + def get_task(args): + task_map = { + "HumanEval": HumanEval, + "mbpp": mbpp, + "MBPP": MBPP, + "LeetCode": LeetCode, + "BigCodeBench": BigCodeBench, + "Bird": Bird, + "Spider": Spider, + "understandml": understandml + } + + # Check if the task exists in the map + if args.task not in task_map: + logger.error(f"Unknown Task type: {args.task}") + + # Get the corresponding class + task_class = task_map[args.task] + + return task_class( + split = args.split, + time_out = args.time_out, + prompt_type = args.prompt_type + ) + +class BackendFactory: + + @staticmethod + def get_backend(args): + if args.backend == "vllm": + return VllmGenerator( + model_name = args.model_name, + model_type = args.model_type, + tokenizer_name = args.tokenizer_name, + num_gpus = args.num_gpus, + batch_size = args.batch_size, + temperature = args.temperature, + top_p = args.top_p, + num_samples = args.num_samples, + max_tokens = args.max_tokens, + seed = args.seed, + trust_remote_code = args.trust_remote_code, + ) + + elif args.backend == "sglang": + return SglangGenerator( + model_name = args.model_name, + model_type = args.model_type, + tokenizer_name = args.tokenizer_name, + num_gpus = args.num_gpus, + batch_size = args.batch_size, + temperature = args.temperature, + top_p = args.top_p, + num_samples = args.num_samples, + max_tokens = args.max_tokens, + trust_remote_code = args.trust_remote_code, + ) + + elif args.backend == "openai": + return OpenaiGenerator( + model_name = args.model_name, + model_type = args.model_type, + batch_size = args.batch_size, + temperature = args.temperature, + top_p = args.top_p, + num_samples = args.num_samples, + max_tokens = args.max_tokens, + ) + else: + logger.error(f"Unknown Backend type: {args.backend}") diff --git a/code_eval/OpenCodeEval/main.py b/code_eval/OpenCodeEval/main.py new file mode 100644 index 0000000..d78ddec --- /dev/null +++ b/code_eval/OpenCodeEval/main.py @@ -0,0 +1,115 @@ +import os +import argparse + +from OpenCodeEval.args import get_args, check_args +from OpenCodeEval.utils import refine_text, write_jsonl, stream_jsonl, calculate_pass_at_k + +from OpenCodeEval.factory import BenchmarkFactory, BackendFactory + +from tqdm.contrib.concurrent import process_map, thread_map + +def main(): + + parser = argparse.ArgumentParser() + args = get_args(parser) + args = check_args(args) + + save_path = args.save_path + os.makedirs(save_path, exist_ok = True) + + task = BenchmarkFactory.get_task(args) + + # get prompts + prompts = task.get_prompt() + + for prompt in prompts: + prompt['prompt'] = refine_text(args.prompt_prefix + prompt['prompt'] + args.prompt_suffix) + prompts = sorted(prompts, 
key = lambda x: x['task_id']) + write_jsonl(os.path.join(save_path, "prompts.jsonl"), prompts) + + if args.split == 'plus': + base_path = save_path.replace('plus', 'base') + if os.path.exists(os.path.join(base_path, "generations.jsonl")): + generations = list(stream_jsonl(os.path.join(base_path, "generations.jsonl"))) + write_jsonl(os.path.join(save_path, "generations.jsonl"), generations) + + # check if generations exits + if not os.path.exists(os.path.join(save_path, "generations.jsonl")): + + prompts = list(stream_jsonl(os.path.join(save_path, "prompts.jsonl"))) + # get generations + decoder = BackendFactory.get_backend(args) + if decoder.is_chat(): + decoder.set_stop(task.chat_stop) + else: + decoder.set_stop(task.chat_stop + task.base_stop) + + generations = decoder.generate( + prompts, + args.response_prefix, + args.response_suffix + ) + + generations = sorted([data for data in generations if data['completion']], key = lambda x: (x['task_id'], x['completion_id'])) + write_jsonl(os.path.join(save_path, "generations.jsonl"), generations) + + else: + + generated_ids = [data['task_id'] for data in stream_jsonl(os.path.join(save_path, "generations.jsonl"))] + prompts = [data for data in stream_jsonl(os.path.join(save_path, "prompts.jsonl")) if data['task_id'] not in generated_ids] + if len(prompts) > 0: + + # get generations + decoder = BackendFactory.get_backend(args) + if decoder.is_chat(): + decoder.set_stop(task.chat_stop) + else: + decoder.set_stop(task.chat_stop + task.base_stop) + + continue_generations = decoder.generate( + prompts, + args.response_prefix, + args.response_suffix + ) + + generations = sorted([data for data in continue_generations if data['completion']] + list(stream_jsonl(os.path.join(save_path, "generations.jsonl"))), key = lambda x: (x['task_id'], x['completion_id'])) + write_jsonl(os.path.join(save_path, "generations.jsonl"), generations) + + + # post-process generations + # if not os.path.exists(os.path.join(save_path, "solutions.jsonl")): + if True: + + generations = list(stream_jsonl(os.path.join(save_path, "generations.jsonl"))) + solutions = thread_map( + task.postprocess_generation, + generations, + max_workers = args.num_workers, + desc = "Post-processing Generations" + ) + solutions = sorted(solutions, key = lambda x: (x['task_id'], x['completion_id'])) + write_jsonl(os.path.join(save_path, "solutions.jsonl"), solutions) + + # evaluate solutions + # if not os.path.exists(os.path.join(save_path, "evaluations.jsonl")): + if True: + solutions = list(stream_jsonl(os.path.join(save_path, "solutions.jsonl"))) + evaluations = thread_map( + task.process_results, + solutions, + max_workers = args.num_workers, + desc = "Evaluating Solutions" + ) + evaluations = sorted(evaluations, key = lambda x: (x['task_id'], x['completion_id'])) + write_jsonl(os.path.join(save_path, "evaluations.jsonl"), evaluations) + + # calculate pass@k + # if not os.path.exists(os.path.join(save_path, "results.jsonl")): + if True: + evaluations = list(stream_jsonl(os.path.join(save_path, "evaluations.jsonl"))) + results = calculate_pass_at_k(evaluations, args.num_samples, args.list_k) + write_jsonl(os.path.join(save_path, "results.jsonl"), results) + + +if __name__ == "__main__": + main()
\ No newline at end of file diff --git a/code_eval/OpenCodeEval/utils.py b/code_eval/OpenCodeEval/utils.py new file mode 100644 index 0000000..4c65d49 --- /dev/null +++ b/code_eval/OpenCodeEval/utils.py @@ -0,0 +1,165 @@ +import os +import re +import gzip +import json +import itertools +import numpy as np + +from tqdm import tqdm +from collections import defaultdict +from typing import Dict, List, Union, Iterable, Callable, Literal +from concurrent.futures import ThreadPoolExecutor, as_completed + +def refine_text(text: str, add_new_line: bool = True) -> str: + text = text.replace("\t", " ") + text = text.replace("\r\n", "\n").replace("\r", "\n") + if add_new_line: + return text.strip() + "\n" + else: + return text.strip() + +def multi_process_function(function: Callable, + parameters: List, + num_workers: int = 1, + desc: str = "Completing tasks"): + + if num_workers > len(parameters) or num_workers > os.cpu_count(): + num_workers = min(os.cpu_count(), len(parameters)) + + with ThreadPoolExecutor(num_workers) as executor: + futures = [] + for param in parameters: + future = executor.submit(function, param) + futures.append(future) + + results = [] + for future in tqdm(as_completed(futures), total=len(futures), desc=desc): + result = future.result() + results.append(result) + + return results + +def markdown_extract(text: str, mode: Literal["first", "last", "all"] = "all") -> str: + """ + Extract content enclosed by triple backticks (```) in a Markdown text. + + Args: + text (str): The Markdown text to extract from. + mode (Literal["first", "last", "all"]): + - "first": Extract the first block of content. + - "last": Extract the last block of content. + - "all": Extract all blocks of content and join them with double newlines. + + Returns: + str: Extracted content based on the specified mode. 
+ """ + # Match content inside triple backticks, ignoring the ``` lines + pattern = r"```[ \t]*[\r\n]+[ \t]*(.*?)[ \t]*[\r\n]+[ \t]*```" + matches = re.findall(pattern, text, re.DOTALL) + + if matches: + if mode == "first": + return matches[0] + elif mode == "last": + return matches[-1] + elif mode == "all": + return "\n\n".join(matches) + else: + return "" + +def program_extract( + text: str, + program: str = "python", + mode: Literal["first", "last", "all"] = "all" +) -> str: + + program_pattern = rf"```{program}[ \t]*[\r\n]+[ \t]*(.*?)[ \t]*[\r\n]+[ \t]*```" + program_re = re.compile(program_pattern, re.DOTALL | re.IGNORECASE) + + matches = program_re.findall(text) + if matches: + if mode == "first": + return matches[0] + elif mode == "last": + return matches[-1] + elif mode == "all": + return "\n\n".join(matches) + else: + return "" + +def stream_jsonl(filename: str) -> Iterable[Dict]: + """ + Parses each jsonl line and yields it as a dictionary + """ + if filename.endswith(".gz"): + with open(filename, "rb") as gzfp: + with gzip.open(gzfp, 'rt') as fp: + for line in fp: + if any(not x.isspace() for x in line): + yield json.loads(line) + else: + with open(filename, "r", encoding="utf-8") as fp: + for line in fp: + if any(not x.isspace() for x in line): + yield json.loads(line) + + +def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False): + """ + Writes an iterable of dictionaries to jsonl + """ + if append: + mode = 'ab' + else: + mode = 'wb' + filename = os.path.expanduser(filename) + if filename.endswith(".gz"): + with open(filename, mode) as fp: + with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp: + for x in data: + gzfp.write((json.dumps(x) + "\n").encode('utf-8')) + else: + with open(filename, mode) as fp: + for x in data: + fp.write((json.dumps(x) + "\n").encode('utf-8')) + +def estimate_pass_at_k( + num_samples: Union[int, List[int], np.ndarray], + num_correct: Union[List[int], np.ndarray], + k: int +) -> np.ndarray: + """ + Estimates pass@k of each problem and returns them in an array. + """ + + def estimator(n: int, c: int, k: int) -> float: + """ + Calculates 1 - comb(n - c, k) / comb(n, k). + """ + if n - c < k: + return 1.0 + return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) + + if isinstance(num_samples, int): + num_samples_it = itertools.repeat(num_samples, len(num_correct)) + else: + assert len(num_samples) == len(num_correct) + num_samples_it = iter(num_samples) + + return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]) + +def calculate_pass_at_k( + evaluations: List[Dict], + num_samples: int, + list_k: List[int] + ) -> List[Dict]: + + task_results = defaultdict(int) + for evaluation in evaluations: + task_results[evaluation['task_id']] += evaluation['passed'] + task_results = list(task_results.values()) + benchmark_results = [] + for k in list_k: + pass_rate = float(np.mean(estimate_pass_at_k(num_samples = num_samples, num_correct = task_results, k = k))) + benchmark_results.append({f"Pass@{k}": pass_rate}) + return benchmark_results
\ No newline at end of file diff --git a/code_eval/README.md b/code_eval/README.md new file mode 100644 index 0000000..32daaa9 --- /dev/null +++ b/code_eval/README.md @@ -0,0 +1,27 @@
+## Quick Start
+
+1. Download benchmark datasets:
+
+```bash
+cd OpenCodeEval/data
+bash dataset.sh
+```
+
+2. Install dependencies:
+
+```bash
+pip install -e .
+```
+
+3. **Configure Evaluation Scripts**
+   - Replace the placeholders in the evaluation scripts with the actual model name and path.
+   - Adjust any other necessary settings (e.g., evaluation parameters, output paths) to suit your requirements.
+
+4. Execute the evaluation script for your desired benchmark. For example, to evaluate with the `test-humaneval-ckpt-list.sh` script:
+
+```bash
+bash test-humaneval-ckpt-list.sh
+```
+
+   > **Note**: Ensure that all configurations are correctly set before running the script to avoid errors.
diff --git a/code_eval/test-humaneval-ckpt-list.sh b/code_eval/test-humaneval-ckpt-list.sh new file mode 100644 index 0000000..03514bd --- /dev/null +++ b/code_eval/test-humaneval-ckpt-list.sh @@ -0,0 +1,28 @@
+export VLLM_ENABLE_V1_MULTIPROCESSING=0
+
+# Declare an associative array to store model mappings
+# TODO: Replace with the actual model names and paths
+declare -A model_dict=(
+    ["model_name_1"]="/path/to/model1"
+    ["model_name_2"]="/path/to/model2"
+)
+
+# Outer loop: iterate over model_dict
+for exp_name in "${!model_dict[@]}"; do
+    model="${model_dict[$exp_name]}"
+    echo "Evaluating model: $model"
+
+    python OpenCodeEval/main.py --model_name $model \
+        --task "HumanEval" \
+        --save "test/output_humaneval_${exp_name}" \
+        --num_gpus 1 \
+        --batch_size 164 \
+        --max_tokens 4096 \
+        --temperature 0.1 \
+        --seed 0 \
+        --prompt_type "Completion" \
+        --model_type "Chat" \
+        --prompt_prefix $'Please provide a self-contained Python script that solves the following problem in a markdown code block:\n```python\n' \
+        --prompt_suffix $'\n```\n'
+
+done
\ No newline at end of file diff --git a/code_eval/test-mbpp-ckpt-list.sh b/code_eval/test-mbpp-ckpt-list.sh new file mode 100644 index 0000000..b7bb5fd --- /dev/null +++ b/code_eval/test-mbpp-ckpt-list.sh @@ -0,0 +1,32 @@
+export VLLM_ENABLE_V1_MULTIPROCESSING=0
+
+# Declare an associative array to store model mappings
+# TODO: Replace with the actual model names and paths
+declare -A model_dict=(
+    ["model_name_1"]="/path/to/model1"
+    ["model_name_2"]="/path/to/model2"
+)
+
+# Outer loop: iterate over model_dict
+for exp_name in "${!model_dict[@]}"; do
+    model="${model_dict[$exp_name]}"
+    echo "Evaluating model: $model"
+
+    python OpenCodeEval/main.py --model_name $model \
+        --task "MBPP" \
+        --save "test_mbpp/output_mbpp_step_${exp_name}" \
+        --num_gpus 1 \
+        --batch_size 378 \
+        --max_tokens 4096 \
+        --temperature 0.0 \
+        --seed 0 \
+        --time_out 3.0 \
+        --prompt_type "Instruction" \
+        --model_type "Chat" \
+        --prompt_prefix "" \
+        --prompt_suffix "" \
+        --trust_remote_code
+
+done
+
+
diff --git a/kk_eval/compute_score.py b/kk_eval/compute_score.py new file mode 100644 index 0000000..133e9ab --- /dev/null +++ b/kk_eval/compute_score.py @@ -0,0 +1,132 @@
+import re
+from typing import Dict, Tuple, Optional
+
+def extract_solution(solution_str: str) -> Tuple[Optional[str], str]:
+    """Extracts the final answer from the model's response string.
+
+    Args:
+        solution_str: Raw response string from the language model
+
+    Returns:
+        Tuple containing (extracted_answer, processed_string)
+    """
+    # Split response to isolate assistant output
+    # if "Assistant:" in solution_str:
+    #     processed_str = solution_str.split("Assistant:", 1)[1]
+    # elif "<|im_start|>assistant" in solution_str:
+    #     processed_str = solution_str.split("<|im_start|>assistant", 1)[1]
+    # else:
+    #     print("[Error] Failed to locate model response header")
+    #     return None, solution_str
+    processed_str = solution_str
+
+    # Extract final answer using XML-style tags
+    answer_pattern = r'<answer>(.*?)</answer>'
+    matches = list(re.finditer(answer_pattern, processed_str, re.DOTALL))
+
+    if not matches:
+        print("[Error] No valid answer tags found")
+        return None, processed_str
+
+    final_answer = matches[-1].group(1).strip()
+    return final_answer, processed_str
+
+def parse_solution_text_format(solution_text: str) -> Dict[str, str]:
+    """Parses ground truth solution text into status dictionary.
+
+    Args:
+        solution_text: Formatted solution text from dataset
+
+    Returns:
+        Dictionary mapping character names to their roles (knight/knave)
+    """
+    status_dict = {}
+    print("\n[Ground Truth Parsing]")
+
+    for line in solution_text.split('\n'):
+        line = line.strip()
+        if not line:
+            continue
+
+        match = re.search(r'\b([A-Za-z]+)\b.*?\b(knight|knave)\b', line, re.IGNORECASE)
+        if match:
+            name, role = match.groups()
+            status_dict[name] = role.lower()
+            print(f" Found: {name} → {role}")
+        else:
+            print(f" [Warning] Unparseable line: '{line}'")
+
+    return status_dict
+
+def parse_model_answer(answer_text: str, expected_names: list) -> Optional[Dict[str, str]]:
+    """Parses model's answer text into status dictionary.
+
+    Args:
+        answer_text: Text extracted from model's <answer> tags
+        expected_names: List of character names requiring identification
+
+    Returns:
+        Dictionary mapping character names to predicted roles, or None if incomplete
+    """
+    status_dict = {}
+    print("\n[Model Answer Parsing]")
+    print(f" Expected characters: {expected_names}")
+
+    for name in expected_names:
+        pattern = re.compile(
+            rf'\b{re.escape(name)}\b.*?\b(knight|knave)\b',
+            re.IGNORECASE
+        )
+        match = pattern.search(answer_text)
+
+        if match:
+            role = match.group(1).lower()
+            status_dict[name] = role
+            print(f" Found: {name} → {role}")
+        else:
+            print(f" [Error] Missing identification for {name}")
+            return None
+
+    return status_dict
+
+def validate_response_structure(processed_str: str) -> bool:
+    """Performs comprehensive validation of response structure.
+
+    Args:
+        processed_str: Processed response string from the model
+
+    Returns:
+        Boolean indicating whether all formatting requirements are met
+    """
+    print("\n[Structure Validation]")
+    # NOTE: the early return below short-circuits the tag checks that follow,
+    # so formatting is currently always treated as valid.
+    validation_passed = True
+    return validation_passed
+
+    # Check required tags
+    tags = {
+        'think_end': ('</think>', 1),
+        'answer_start': ('<answer>', 1),
+        'answer_end': ('</answer>', 1)
+    }
+
+    positions = {}
+    for tag_name, (tag_str, expected_count) in tags.items():
+        count = processed_str.count(tag_str)
+        positions[tag_name] = pos = processed_str.find(tag_str)
+
+        print(f" {tag_str}: count={count}, position={pos}")
+
+        if count != expected_count:
+            print(f" [Error] {tag_str} appears {count} times (expected {expected_count})")
+            validation_passed = False
+
+    # Verify tag order
+    if (
+        positions['think_end'] > positions['answer_start'] or
+        positions['answer_start'] > positions['answer_end']):
+        print(" [Error] Incorrect tag order: Expected ...</think><answer>...</answer>")
+        validation_passed = False
+    else:
+        print(" Tag sequence validation passed")
+
+    return validation_passed
diff --git a/kk_eval/eval_kk-ckpt_list.sh b/kk_eval/eval_kk-ckpt_list.sh new file mode 100644 index 0000000..c3315fa --- /dev/null +++ b/kk_eval/eval_kk-ckpt_list.sh @@ -0,0 +1,50 @@
+# Base configuration variables
+export VLLM_ENABLE_V1_MULTIPROCESSING=0
+config="vllm"
+num_limit=100
+max_token=3072
+ntrain=0
+split="test"
+log_path="log/test"
+
+# Create the log directory
+mkdir -p ${log_path}
+
+# Declare an associative array to store model mappings
+# TODO: Replace with the actual model names and paths
+declare -A model_dict=(
+    ["model_name_1"]="/path/to/model1"
+    ["model_name_2"]="/path/to/model2"
+)
+
+
+# Outer loop: iterate over model_dict
+for exp_name in "${!model_dict[@]}"; do
+    model="${model_dict[$exp_name]}"
+
+    # Inner loop: iterate over the different eval_nppl values
+    for eval_nppl in 2 3 4 5 6 7 8; do
+        log_file="${log_path}/${exp_name}_nppl${eval_nppl}.log"  # log file name includes the model name and eval_nppl
+        echo "Starting job for model: $model, eval_nppl: $eval_nppl, logging to $log_file"
+
+        # Launch the evaluation job
+        CUDA_VISIBLE_DEVICES=1 PYTHONUNBUFFERED=1 python main_eval_instruct.py \
+            --batch_size 100 \
+            --model ${model} \
+            --max_token ${max_token} \
+            --ntrain ${ntrain} \
+            --config "${config}_${exp_name}_nppl${eval_nppl}" \
+            --limit ${num_limit} \
+            --split ${split} \
+            --temperature 0.0 \
+            --top_p 1.0 \
+            --seed 0 \
+            --problem_type "clean" \
+            --output_file "${log_path}/${exp_name}_nppl${eval_nppl}.json" \
+            --eval_nppl ${eval_nppl} > "$log_file" 2>&1
+    done
+done
+
+
+# Wait for any remaining background jobs to finish
+wait
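Returning to `compute_score.py` above, a small hypothetical example of the parsing helpers (the strings are illustrative; the import assumes the code is run from the `kk_eval/` directory, as the scripts do):

```python
# Hypothetical example of the KK answer-parsing helpers.
from compute_score import extract_solution, parse_model_answer

response = (
    "<think>Reasoning about the two statements...</think>"
    "<answer>Jack is a knave, and Sophia is a knight.</answer>"
)

answer_text, _ = extract_solution(response)
print(answer_text)                                          # "Jack is a knave, and Sophia is a knight."
print(parse_model_answer(answer_text, ["Jack", "Sophia"]))  # {'Jack': 'knave', 'Sophia': 'knight'}
```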
\ No newline at end of file diff --git a/kk_eval/kk_processor.py b/kk_eval/kk_processor.py new file mode 100644 index 0000000..626b265 --- /dev/null +++ b/kk_eval/kk_processor.py @@ -0,0 +1,208 @@ +import numpy as np +from kk_prompt import system_instruction, demonstration_2char, system_instruction_no_reason, demonstration_2char_no_reason + +from compute_score import extract_solution, parse_solution_text_format, parse_model_answer, validate_response_structure + +def num_tokens_from_string(string): + import tiktoken + """Returns the number of tokens in a text string.""" + encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") + num_tokens = len(encoding.encode(string)) + return num_tokens + + +def parse_cot_eval(pred_str, ans, + conclusion_patterns=['CONCLUSION:'], + verbose=False, + finish_patterns=["### Reason", "Let's think step by step again", "let's go back and check", "###"], + reformat_gold_conditions=None): + + def judge_string(input_str, reformat_gold_conditions, wrong_reason, finish_patterns): + correct_count = 0 + is_correct = False + beyond_id = len(reformat_gold_conditions)+1 + beyond_id_pattern = f"({beyond_id})" + + for finish_pattern in finish_patterns: + if finish_pattern in input_str: + input_str = input_str.split(finish_pattern)[0] + + if beyond_id_pattern in input_str: + is_correct = False + wrong_reason = "beyond_list" + elif "if" in input_str: + is_correct = False + wrong_reason = "contain_if" + else: + is_correct = True + for gold_condition in reformat_gold_conditions: + if gold_condition not in input_str: + is_correct = False + wrong_reason = "wrong_identity" + else: + correct_count += 1 + correct_ratio = correct_count/len(reformat_gold_conditions) + + return is_correct, wrong_reason, correct_ratio + + def check_numbers_in_string(s, N): + for i in range(1, N + 1): + if f"({i})" not in s: + return False + return True + + original_str = pred_str + pred_str = pred_str.split("### Question")[0] + pred_answer = pred_str + is_correct = False + correct_ratio = 0 + if reformat_gold_conditions is None: + gold = ans.replace(" and ", "").replace(".", "") + gold_conditions = gold.split(",") + reformat_gold_conditions = [] + for condition in gold_conditions: + gold_condition = condition.strip() # Remove leading and trailing spaces + reformat_gold_conditions.append(gold_condition) + + wrong_reason = "no_conclusion_matched" + for pattern in conclusion_patterns: + pred = pred_str.split(pattern) + if len(pred) > 1: + if len(pred[1]) > 0: # if the matched the answer is not empty + pred_answer = pred[1] + is_correct, wrong_reason, correct_ratio = judge_string( + pred_answer, reformat_gold_conditions, wrong_reason, finish_patterns) + break + if is_correct == False and wrong_reason == "no_conclusion_matched": + if check_numbers_in_string(pred_str, len(reformat_gold_conditions)): # the answer contains (1)..(2).. 
+ is_correct, wrong_reason, correct_ratio = judge_string( + pred_str, reformat_gold_conditions, wrong_reason, finish_patterns) + if is_correct == False and verbose == True: + print("wrong_reason:",wrong_reason) + print("********* \nprediction before parse:\n", original_str) + print("********* \nprediction after parse:\n", pred_answer) + + return is_correct, pred_answer, wrong_reason, correct_ratio, reformat_gold_conditions + + +def parse_cot_eval_instruct(pred_str, ans, + conclusion_patterns=['<answer>'], + verbose=False, + finish_patterns=["</answer>"], + reformat_gold_conditions=None, + expected_names=None, + solution_text_format=None): + print("\n" + "="*80) + print(" Processing New Sample ".center(80, '=')) + + # Parse ground truth data + gt_status = parse_solution_text_format(solution_text_format) + expected_names = list(gt_status.keys()) + print(f"[Ground Truth] Final identities: {gt_status}") + + # Extract model answer + answer_text, processed_str = extract_solution(pred_str) + print(f"\n[Model Response]\n{processed_str}") + + # Validate response structure + format_correct = validate_response_structure(processed_str) + print(f"\n Format validation: {'PASS' if format_correct else 'FAIL'}") + + # Validate answer content + answer_score = 0 + is_correct = False + correct_ratio = 0 + wrong_reason = "no_conclusion_matched" + if format_correct and answer_text: + pred_status = parse_model_answer(answer_text, expected_names) + if pred_status: + print(f"\n[Content Validation]") + print(f" Expected: {gt_status}") + print(f" Predicted: {pred_status}") + + if pred_status == gt_status: + answer_score = 2 + is_correct = True + correct_ratio = 1 + print(" Content validation: FULL MATCH") + else: + answer_score = -1.5 + correct_ratio = 0 + wrong_reason = "wrong_identity" + print(" Content validation: MISMATCH") + else: + answer_score = -2 + correct_ratio = 0 + wrong_reason = "no_conclusion_matched" + print( "Fail to parse answer") + else: + print("\n[Content Validation] Skipped due to format errors or missing answer") + + if is_correct == False and verbose == True: + print("wrong_reason:",wrong_reason) + print("********* \nprediction before parse:\n", pred_str) + print("********* \nprediction after parse:\n", answer_text) + + return is_correct, answer_text, wrong_reason, correct_ratio, reformat_gold_conditions + + +class KKProcessor: + def __init__(self, cot=True, no_linebreak=True): + self.cot = cot + self.no_linebreak = no_linebreak + + def format_example(self, test_records, idx, model_name=None): + + item = test_records[idx] + + prompt = "### Question: "+item["quiz"] + "\n" + if self.cot: + if model_name in ["deepseek-ai/deepseek-math-7b-instruct", "AI-MO/NuminaMath-7B-CoT"]: + prompt += "Please reason step by step, and put your final answer within \\boxed{}." + else: + prompt += "### Answer: Let's think step by step. 
" + else: + if self.no_linebreak: + prompt += "### Answer:" + else: + prompt += "### Answer:\n" + answer = item["solution_text"] + return prompt, answer + + def gen_test_prompt(self, ntrain, test_records, idx, model_name=None): + if self.cot: + train_prompt = system_instruction + else: + train_prompt = system_instruction_no_reason + + if ntrain == 1: + if self.cot: + train_prompt += "\n\n"+demonstration_2char + else: + train_prompt += "\n\n"+demonstration_2char_no_reason + elif ntrain > 1: + raise NotImplementedError + + prompt_end, answer = self.format_example(test_records, idx, model_name) + prompt = train_prompt + "\n\n" + prompt_end + + return prompt, answer + + def _parse_cot_eval(self, pred_str, ans, model_name=None): + conclusion_patterns = ['CONCLUSION:', 'Conclusion:', 'conclusion:'] + + if model_name in ["deepseek-ai/deepseek-math-7b-instruct", "AI-MO/NuminaMath-7B-CoT"]: + conclusion_patterns = ['boxed{', 'CONCLUSION:', 'Conclusion:', 'conclusion:'] + + is_correct, pred_answer, wrong_reason, correct_ratio, reformat_gold_conditions = parse_cot_eval( + pred_str, ans, conclusion_patterns=conclusion_patterns, verbose=False) + + return is_correct, pred_answer, reformat_gold_conditions + + def _parse_cot_eval_instruct(self, pred_str, ans, model_name=None, expected_names=None, solution_text_format=None): + conclusion_patterns = ['<answer>'] + + is_correct, pred_answer, wrong_reason, correct_ratio, reformat_gold_conditions = parse_cot_eval_instruct( + pred_str, ans, conclusion_patterns=conclusion_patterns, verbose=True, expected_names=expected_names, solution_text_format=solution_text_format) + + return is_correct, pred_answer, reformat_gold_conditions diff --git a/kk_eval/kk_prompt.py b/kk_eval/kk_prompt.py new file mode 100644 index 0000000..cb9e2d8 --- /dev/null +++ b/kk_eval/kk_prompt.py @@ -0,0 +1,38 @@ +system_instruction='''Your task is to solve a logical reasoning problem. You are given set of statements from which you must logically deduce the identity of a set of characters. + +You must infer the identity of each character. First, explain your reasoning. At the end of your answer, you must clearly state the identity of each character by following the format: + +CONCLUSION: +(1) ... +(2) ... +(3) ... +''' + + +system_instruction_no_reason='''Your task is to solve a logical reasoning problem. You are given set of statements from which you must logically deduce the identity of a set of characters. + +You must infer the identity of each character. At the end of your answer, you must clearly state the identity of each character by following the format: + +CONCLUSION: +(1) ... +(2) ... +(3) ... +''' + +demonstration_2char_no_reason='''### Question: A very special island is inhabited only by knights and knaves. Knights always tell the truth, and knaves always lie. You meet 2 inhabitants: Jack, and Sophia. Jack tells you that Sophia is not a knave. Sophia says that If Jack is a knight then Sophia is a knight. So who is a knight and who is a knave? +### Answer: +CONCLUSION: +(1) Jack is a knight +(2) Sophia is a knight +''' + + + +demonstration_2char='''### Question: A very special island is inhabited only by knights and knaves. Knights always tell the truth, and knaves always lie. You meet 2 inhabitants: Ella, and Penelope. In a statement by Ella: \"Ella is a knight or Penelope is a knight\". According to Penelope, \"Ella is a knave if and only if Penelope is a knight\". So who is a knight and who is a knave? 
+### Answer: Let's think step by step, by considering whether each person is lying and if that leads to contradiction. Assume Ella is a knight. Penelope cannot be a knight, because this would contradict the claim of their own. Penelope cannot be a knave, because this would contradict the false claim of their own. We have exhausted all possibilities for Penelope, so let us go back and reconsider Ella. Assume Ella is a knave. Penelope cannot be a knight, because this would contradict the false claim of Ella. Assume Penelope is a knave. This leads to a feasible solution. +CONCLUSION: +(1) Ella is a knave +(2) Penelope is a knave +''' + + diff --git a/kk_eval/main_eval_instruct.py b/kk_eval/main_eval_instruct.py new file mode 100644 index 0000000..7eb1322 --- /dev/null +++ b/kk_eval/main_eval_instruct.py @@ -0,0 +1,209 @@ + +import argparse +import json +import os +import numpy as np +import random +import torch +import time +from vllm import LLM, SamplingParams +from kk_processor import KKProcessor +import datasets +from concurrent.futures import ThreadPoolExecutor + +def load_jsonl(file_path): + """Load data from a JSONL file.""" + if not os.path.exists(file_path): + return [] + with open(file_path, "r") as file: + return [json.loads(line) for line in file] + +def write_jsonl(output_file, data): + """Write data to a JSONL file.""" + with open(output_file, "w+") as file: + for item in data: + file.write(json.dumps(item) + "\n") + +def init_seed(seed=42): + """Initialize random seeds for reproducibility.""" + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + +def load_eval_records(args, subject): + """Load evaluation records based on arguments.""" + if args.problem_type != "clean": + records = datasets.load_dataset('K-and-K/perturbed-knights-and-knaves', + data_files=f"{args.split}/{args.problem_type}/{subject}.jsonl")["train"] + else: + records = datasets.load_dataset('K-and-K/knights-and-knaves', + data_files=f"{args.split}/{subject}.jsonl")["train"] + return records + +def eval_subject(args, subject, llm, test_records, kk_proc, exist_result_records): + """Evaluate one subject.""" + cors = [] + start_index = len(exist_result_records) + print(f"Processing {subject} starting from index {start_index}") + + # Prepare all prompts using multi-threading + prompts, labels = [], [] + new_prompts = [] + with ThreadPoolExecutor() as executor: + futures = [ + executor.submit(kk_proc.gen_test_prompt, args.ntrain, test_records, i, args.model) + for i in range(start_index, len(test_records)) + ] + for future in futures: + prompt, label = future.result() + prompts.append(prompt) + labels.append(label) + question = prompt.split("### Question:")[1].strip().split("### Answer:")[0].strip() + new_prompt = f"""<|im_start|>system\nYou are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and<answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. Now the user asks you to solve a logical reasoning problem. After thinking, when you finally reach a conclusion, clearly state the identity of each character within <answer> </answer> tags. i.e., <answer> (1) ...\n(2) ... 
</answer>.\n<|im_end|>\n<|im_start|>user\n{question}\n<|im_end|>\n<|im_start|>assistant\n<think>""" + new_prompts.append(new_prompt) + # Batch inference using vLLM + sampling_params = SamplingParams( + temperature=args.temperature, + top_p=args.top_p, + max_tokens=args.max_token, + seed=args.seed + ) + outputs = llm.generate(new_prompts, sampling_params) + responses = [output.outputs[0].text for output in outputs] + + # Process results and save to exist_result_records + for i, (new_prompt, label, response) in enumerate(zip(new_prompts, labels, responses), start=start_index): + cor, parsed_pred, reformat_gold_conditions = kk_proc._parse_cot_eval_instruct(response, label, args.model, test_records[i]['names'], test_records[i]['solution_text_format']) + cors.append(cor) + new_item = { + 'quiz': test_records[i]['quiz'], + 'names': test_records[i]['names'], + 'solution': test_records[i]['solution'], + 'solution_text': test_records[i]['solution_text'], + 'solution_text_format': test_records[i]['solution_text_format'], + 'index': test_records[i]['index'], + 'predicts': parsed_pred, + 'labels': reformat_gold_conditions, + 'correct': cor, + 'response': response, + 'prompts': new_prompt, + } + exist_result_records.append(new_item) + + acc = np.mean(cors) + print(f"Subject: {subject}, Accuracy: {acc:.3f}") + return cors, acc, exist_result_records + +def load_limited_test_records(args, subject, exist_result_records): + """Load limited test records based on given arguments.""" + test_records = load_eval_records(args, subject) + print("test_records:", test_records) + if args.limit is not None: + test_records = test_records.select(range(min(args.limit, len(test_records)))) + if args.limit <= len(exist_result_records): + return None # Already processed all samples + return test_records + +def save_final_acc_results(all_cors, results, fname): + """Save final accuracy results to a file.""" + if all_cors: + weighted_acc = np.mean(np.concatenate(all_cors)) + results["weighted_accuracy"] = weighted_acc + print(f"Overall Average Accuracy: {weighted_acc:.3f}") + with open(fname, "w") as f: + json.dump(results, f) + +def load_previous_acc_results(fname): + """Load previous accuracy results.""" + acc_results = {"subject": {}} + if os.path.isfile(fname): + with open(fname, 'r', encoding='utf-8') as file: + acc_results = json.load(file) + return acc_results + +def get_subjects_to_eval(args): + """Get subjects to evaluate.""" + subjects = [] + if args.split == "test": + if args.eval_nppl == 0: + subjects = [f"people{nppl}_num100" for nppl in range(2, 9)] + else: + subjects = [f"people{args.eval_nppl}_num100"] + elif args.split == "train": + if args.eval_nppl == 2: + subjects = ["people2_num200"] + elif args.eval_nppl > 2: + subjects = [f"people{args.eval_nppl}_num1000"] + return subjects + +def main(args): + model_short_name = "/".join(args.model.split("/")[-2:]) + prefix = os.path.join(args.save_dir, f"{model_short_name}_{args.ntrain}shot") + args.config += f"_token{args.max_token}{('_cot' if args.cot else '')}" \ + f"_{args.split}{('_' + args.problem_type if args.problem_type != 'clean' else '')}" + output_folder = os.path.join(prefix, args.config) + acc_fname = args.output_file # os.path.join(prefix, f"result_{args.config}.json") + os.makedirs(output_folder, exist_ok=True) + + print("Configuration:", args.config) + print("Output Folder:", output_folder) + + kk_proc = KKProcessor(cot=args.cot, no_linebreak=args.no_linebreak) + subjects = get_subjects_to_eval(args) + acc_results = load_previous_acc_results(acc_fname) 
+ + # Initialize vLLM model + llm = LLM( + model=args.model, + tensor_parallel_size=args.ngpus, + max_model_len=args.max_token, + gpu_memory_utilization=0.85, + enforce_eager=False + ) + + all_cors = [] + for subject in subjects: + result_outfile = os.path.join(output_folder, f"{subject}.jsonl") + exist_result_records = load_jsonl(result_outfile) + test_records = load_limited_test_records(args, subject, exist_result_records) + print("exist_result_records:", exist_result_records) + print("test_records:", test_records) + if test_records is None: + continue + + cors, acc, result_records = eval_subject(args, subject, llm, test_records, kk_proc, exist_result_records) + write_jsonl(result_outfile, result_records) + all_cors.append(cors) + acc_results["subject"][subject] = acc + + save_final_acc_results(all_cors, acc_results, acc_fname) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Evaluation script for KK dataset") + parser.add_argument("--ntrain", "-k", type=int, default=0, help="Number of training examples") + parser.add_argument("--data_dir", "-d", type=str, default="data", help="Data directory") + parser.add_argument("--save_dir", "-s", type=str, default="result_qa", help="Save directory") + parser.add_argument("--model", "-m", type=str, required=True, help="Model name or path") + parser.add_argument("--max_token", type=int, default=1024, help="Maximum number of tokens") + parser.add_argument("--limit", type=int, default=None, help="Limit the number of examples") + parser.add_argument("--cot", action="store_true", help="Use chain-of-thought prompting") + parser.add_argument("--no_linebreak", action="store_true", help="Remove line breaks") + parser.add_argument("--batch_size", type=int, default=8, help="Batch size for VLLM") + parser.add_argument("--split", type=str, default="test", choices=["test", "train"], help="Data split to use") + parser.add_argument("--eval_nppl", type=int, default=0, help="Number of people to evaluate") + parser.add_argument("--problem_type", type=str, default="clean", help="Problem perturbation type") + parser.add_argument("--temperature", type=float, default=1.0, help="Temperature for sampling") + parser.add_argument("--top_p", type=float, default=1.0, help="Top-p (nucleus) sampling") + parser.add_argument("--config", type=str, default="default_config", help="Configuration string for saving results") + parser.add_argument("--ngpus", type=int, default=1, help="Number of GPUs for parallel inference") + parser.add_argument("--output_file", type=str, default="output_file", help="Output File Path") + parser.add_argument("--seed", type=int, default=42, help="Seed") + args = parser.parse_args() + + init_seed(seed=args.seed) + main(args) + +
\ No newline at end of file |
