| author | Yuren Hao <yurenh2@timan108.cs.illinois.edu> | 2025-09-04 22:16:22 -0500 |
|---|---|---|
| committer | Yuren Hao <yurenh2@timan108.cs.illinois.edu> | 2025-09-04 22:16:22 -0500 |
| commit | fc6d57ffb8d5ddb5820fcc00b5491a585c259ebc (patch) | |
| tree | e9841f93a353e2107225cfc721d1ce57c0e594dc /Qwen2.5-Eval/evaluation/model_utils.py | |
Initial commit
Diffstat (limited to 'Qwen2.5-Eval/evaluation/model_utils.py')
| -rwxr-xr-x | Qwen2.5-Eval/evaluation/model_utils.py | 235 |
1 file changed, 235 insertions, 0 deletions
diff --git a/Qwen2.5-Eval/evaluation/model_utils.py b/Qwen2.5-Eval/evaluation/model_utils.py
new file mode 100755
index 0000000..1097d75
--- /dev/null
+++ b/Qwen2.5-Eval/evaluation/model_utils.py
@@ -0,0 +1,235 @@
+"""
+https://github.com/allenai/open-instruct
+"""
+import torch
+import tqdm
+from transformers import StoppingCriteria, StoppingCriteriaList
+
+
+class KeywordsStoppingCriteria(StoppingCriteria):
+    def __init__(self, keywords_str, tokenizer):
+        StoppingCriteria.__init__(self)
+        self.current_context = []
+        self.tokenizer = tokenizer
+        self.keywords_str = keywords_str
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        if len(self.current_context) == 0:
+            self.current_context = [[] for _ in range(input_ids.shape[0])]
+
+        # self.current_context.append(input_ids[0][-1].item())
+        sequences_should_be_stopped = []
+        for i in range(input_ids.shape[0]):
+            _id = input_ids[i][-1].item()
+            self.current_context[i].append(_id)
+            current_context = self.tokenizer.decode(self.current_context[i])
+            should_be_stopped = False
+            for word in self.keywords_str:
+                if word in current_context:
+                    should_be_stopped = True
+                    break
+            sequences_should_be_stopped.append(should_be_stopped)
+        return all(sequences_should_be_stopped)
+
+
+class KeyWordsCriteriaTrunc(StoppingCriteria):
+    def __init__(self, stop_id_sequences, prompt_length):
+        assert isinstance(stop_id_sequences[0], list), "stop_id_sequences should be a list of list of ids"
+        self.stop_sequences = stop_id_sequences
+        self.prompt_length = prompt_length
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        sequences_should_be_stopped = []
+        for i in range(input_ids.shape[0]):
+            ids = input_ids[i][self.prompt_length:].tolist()
+            should_be_stopped = False
+            for stop_sequence in self.stop_sequences:
+                if input_ids.shape[0] == 1:
+                    _ids = ids[-len(stop_sequence):]
+                else:
+                    _ids = ids
+                for j in range(len(_ids), 0, -len(stop_sequence)):
+                    if _ids[max(j - len(stop_sequence), 0): j] == stop_sequence:
+                        should_be_stopped = True
+                        break
+                if should_be_stopped:
+                    break
+            sequences_should_be_stopped.append(should_be_stopped)
+        return all(sequences_should_be_stopped)
+
+
+class KeyWordsCriteria(StoppingCriteria):
+    def __init__(self, stop_id_sequences):
+        assert isinstance(stop_id_sequences[0], list), "stop_id_sequences should be a list of list of ids"
+        self.stop_sequences = stop_id_sequences
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        sequences_should_be_stopped = []
+        for i in range(input_ids.shape[0]):
+            sequence_should_be_stopped = False
+            for stop_sequence in self.stop_sequences:
+                if input_ids[i][-len(stop_sequence):].tolist() == stop_sequence:
+                    sequence_should_be_stopped = True
+                    break
+            sequences_should_be_stopped.append(sequence_should_be_stopped)
+        return all(sequences_should_be_stopped)
+
+
+@torch.no_grad()
+def generate_completions(model, tokenizer, prompts, batch_size=1, stop_id_sequences=None, add_special_tokens=True, disable_tqdm=False, **generation_kwargs):
+    generations = []
+    if not disable_tqdm:
+        progress = tqdm.tqdm(total=len(prompts), desc="Generating Completions")
+
+    num_return_sequences = generation_kwargs.get("num_return_sequences", 1)
+    for i in range(0, len(prompts), batch_size):
+        batch_prompts = prompts[i:i+batch_size]
+        tokenized_prompts = tokenizer(batch_prompts, padding="longest", return_tensors="pt", add_special_tokens=add_special_tokens)
+        batch_input_ids = tokenized_prompts.input_ids
+        attention_mask = tokenized_prompts.attention_mask
+
+        if model.device.type == "cuda":
+            batch_input_ids = batch_input_ids.cuda()
+            attention_mask = attention_mask.cuda()
+
+        # try:
+        stop_criteria = KeywordsStoppingCriteria(stop_id_sequences, tokenizer)
+        batch_outputs = model.generate(
+            input_ids=batch_input_ids,
+            attention_mask=attention_mask,
+            stopping_criteria=StoppingCriteriaList([stop_criteria]),
+            # stopping_criteria=[KeyWordsCriteria(stop_id_sequences)] if stop_id_sequences else None,
+            # stopping_criteria=[KeyWordsCriteriaTrunc(stop_id_sequences, batch_input_ids.size(1))] if stop_id_sequences else None,
+            **generation_kwargs
+        )
+
+        # the stopping criteria are applied at the batch level, so if other examples are not stopped, the entire batch will continue to generate.
+        # so some outputs still have the stop sequence, which we need to remove.
+        # if stop_id_sequences:
+        #     for output_idx in range(batch_outputs.shape[0]):
+        #         for token_idx in range(batch_input_ids.shape[1], batch_outputs.shape[1]):
+        #             if any(batch_outputs[output_idx, token_idx: token_idx+len(stop_sequence)].tolist() == stop_sequence for stop_sequence in stop_id_sequences):
+        #                 batch_outputs[output_idx, token_idx:] = tokenizer.pad_token_id
+        #                 break
+
+        # remove the prompt from the output
+        # we need to re-encode the prompt because we need to make sure the special tokens are treated the same way as in the outputs.
+        # we changed our previous way of truncating the output token ids directly because some tokenizers (e.g., llama) won't add a space token before the first token.
+        # space is important for some tasks (e.g., code completion).
+        batch_outputs = tokenizer.batch_decode(batch_outputs, skip_special_tokens=True)
+        batch_prompts = tokenizer.batch_decode(batch_input_ids, skip_special_tokens=True)
+        # duplicate the prompts to match the number of return sequences
+        batch_prompts = [prompt for prompt in batch_prompts for _ in range(num_return_sequences)]
+        batch_generations = [
+            output[len(prompt):] for prompt, output in zip(batch_prompts, batch_outputs)
+        ]
+
+        # remove the remaining stop sequence from the output.
+        for idx, prediction in enumerate(batch_generations):
+            for stop_sequence in stop_id_sequences:
+                batch_generations[idx] = prediction.split(stop_sequence)[0]
+
+        generations += batch_generations
+
+        if not disable_tqdm:
+            progress.update(len(batch_prompts)//num_return_sequences)
+
+    assert len(generations) == len(prompts) * num_return_sequences, "number of generations should be equal to number of prompts * num_return_sequences"
+    return generations
+
+
+def load_hf_lm_and_tokenizer(
+        model_name_or_path,
+        tokenizer_name_or_path=None,
+        device_map="auto",
+        load_in_8bit=False,
+        load_in_half=True,
+        gptq_model=False,
+        use_fast_tokenizer=False,
+        padding_side="left",
+        use_safetensors=False,
+    ):
+    import torch
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    if not tokenizer_name_or_path:
+        tokenizer_name_or_path = model_name_or_path
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=use_fast_tokenizer, padding_side=padding_side, trust_remote_code=True)
+    # tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, legacy=False, use_fast=use_fast_tokenizer, padding_side=padding_side, trust_remote_code=True)
+
+    # set pad token to eos token if pad token is not set
+    if tokenizer.pad_token is None:
+        if tokenizer.unk_token:
+            tokenizer.pad_token = tokenizer.unk_token
+            tokenizer.pad_token_id = tokenizer.unk_token_id
+        elif tokenizer.eos_token:
+            tokenizer.pad_token = tokenizer.eos_token
+            tokenizer.pad_token_id = tokenizer.eos_token_id
+        else:
+            raise ValueError("You are using a new tokenizer without a pad token. "
+                             "This is not supported by this script.")
+
+    # if tokenizer.pad_token is None:
+    #     tokenizer.pad_token = tokenizer.unk_token
+    #     tokenizer.pad_token_id = tokenizer.unk_token_id
+
+    if gptq_model:
+        from auto_gptq import AutoGPTQForCausalLM
+        model_wrapper = AutoGPTQForCausalLM.from_quantized(
+            model_name_or_path, device="cuda:0", use_triton=True
+        )
+        model = model_wrapper.model
+    elif load_in_8bit:
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name_or_path,
+            device_map=device_map,
+            load_in_8bit=True
+        )
+    else:
+        # return "", tokenizer
+        # default: load in float16
+        model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
+                                                     torch_dtype=torch.float16,
+                                                     device_map=device_map,
+                                                     trust_remote_code=True,
+                                                     use_safetensors=use_safetensors)
+        if torch.cuda.is_available():
+            model = model.cuda()
+        if load_in_half:
+            model = model.half()
+    model.eval()
+    return model, tokenizer
+
+
+def _test_generate_completions():
+    model_name_or_path = "../models/codellama_7b/v1-16k"
+    llm, tokenizer = load_hf_lm_and_tokenizer(
+        model_name_or_path=model_name_or_path,
+        load_in_half=True,
+        use_fast_tokenizer=True,
+        use_safetensors=True,
+    )
+    # some math word problems
+    prompts = [
+        "---\n1+1=2\n---2+2=4\n---3+3=6\n---4+4=8\n---5+5=10\n---6+6=",
+        "---\n1+1=2\n---12+12=24\n---3+3=6\n---12345+12345=",
+        # "A train leaves Chicago at 7am and travels at 60mph. Another train leaves Chicago at 9am and travels at 80mph. When will the second train overtake the first?",
+        # "The sum of two numbers is 10. The difference of the same two numbers is 4. What are the two numbers?",
+    ]
+
+    stop_sequences = ["\n\n\n", "---"]
+    # Because many tokenizers will treat the word after a space differently from the original word alone,
+    # to be consistent, we add a space before tokenization and remove it after tokenization.
+    # stop_id_sequences = [tokenizer.encode(" " + x, add_special_tokens=False)[1:] for x in stop_sequences]
+    outputs = generate_completions(
+        model=llm,
+        tokenizer=tokenizer,
+        prompts=prompts,
+        max_new_tokens=128,
+        batch_size=16,
+        # stop_id_sequences=stop_id_sequences,
+        stop_id_sequences=stop_sequences,
+    )
+    print(outputs)
+
+if __name__ == "__main__":
+    _test_generate_completions()
\ No newline at end of file
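
For reference, a minimal usage sketch of the utilities added by this commit (the sketch itself is not part of the diff). It assumes the script sits next to model_utils.py in Qwen2.5-Eval/evaluation/, and the checkpoint path is a hypothetical placeholder. Note that the active code path wraps stop_id_sequences in KeywordsStoppingCriteria, which matches decoded text, so plain strings are passed despite the parameter name; the token-id based KeyWordsCriteria / KeyWordsCriteriaTrunc variants remain commented out in generate_completions.

# usage_sketch.py -- hypothetical example, not part of the commit
from model_utils import load_hf_lm_and_tokenizer, generate_completions

# Load a local causal-LM checkpoint in float16 (the path below is a placeholder).
model, tokenizer = load_hf_lm_and_tokenizer(
    model_name_or_path="path/to/local/checkpoint",
    load_in_half=True,
    use_fast_tokenizer=True,
)

# Plain string stop sequences work here because KeywordsStoppingCriteria
# checks whether any stop string appears in the decoded running context.
outputs = generate_completions(
    model=model,
    tokenizer=tokenizer,
    prompts=["Question: 12 + 30 = ?\nAnswer:"],
    stop_id_sequences=["\n\n"],
    batch_size=1,
    max_new_tokens=32,
    do_sample=False,
)
print(outputs[0])

Because stopping is decided per batch, some completions can still contain a stop string; the split loop at the end of generate_completions strips it from the decoded text before the generations are returned.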
