From ca69b367cc6e11d84f743285bf5e03f99f4927b4 Mon Sep 17 00:00:00 2001 From: zhang Date: Mon, 20 Jun 2022 23:14:13 +0800 Subject: tokenizer 02 --- .../02_tokenizer_encode_plus_token_type_ids.ipynb | 617 +++++++++++++++++++++ 1 file changed, 617 insertions(+) create mode 100644 hugface/02_tokenizer_encode_plus_token_type_ids.ipynb (limited to 'hugface') diff --git a/hugface/02_tokenizer_encode_plus_token_type_ids.ipynb b/hugface/02_tokenizer_encode_plus_token_type_ids.ipynb new file mode 100644 index 0000000..06a0a58 --- /dev/null +++ b/hugface/02_tokenizer_encode_plus_token_type_ids.ipynb @@ -0,0 +1,617 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. 加载模型" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T14:56:22.932982Z", + "start_time": "2022-06-20T14:56:22.930078Z" + } + }, + "outputs": [], + "source": [ + "from transformers import BertTokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T14:56:31.483548Z", + "start_time": "2022-06-20T14:56:31.480598Z" + } + }, + "outputs": [], + "source": [ + "model_name = 'bert-base-uncased'" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T14:56:50.395407Z", + "start_time": "2022-06-20T14:56:42.028614Z" + } + }, + "outputs": [], + "source": [ + "tokenizer = BertTokenizer.from_pretrained(model_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T14:56:51.186108Z", + "start_time": "2022-06-20T14:56:51.181772Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T14:57:32.114749Z", + "start_time": "2022-06-20T14:57:32.109702Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'unk_token': '[UNK]',\n", + " 'sep_token': '[SEP]',\n", + " 'pad_token': '[PAD]',\n", + " 'cls_token': '[CLS]',\n", + " 'mask_token': '[MASK]'}" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.special_tokens_map" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T14:58:08.894192Z", + "start_time": "2022-06-20T14:58:08.891071Z" + } + }, + "outputs": [], + "source": [ + "special_tokens = list(tokenizer.special_tokens_map.values())" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T14:58:11.659126Z", + "start_time": "2022-06-20T14:58:11.653540Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "special_tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T14:58:28.232023Z", + "start_time": "2022-06-20T14:58:28.227861Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[100, 102, 0, 101, 103]" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.convert_tokens_to_ids(special_tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T14:59:20.727082Z", + "start_time": "2022-06-20T14:59:20.722891Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[101, 100, 102, 0, 101, 103, 102]" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.encode(special_tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T14:59:48.802903Z", + "start_time": "2022-06-20T14:59:48.798063Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'[CLS] [UNK] [SEP] [PAD] [CLS] [MASK] [SEP]'" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.decode([101, 100, 102, 0, 101, 103, 102])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. 认识文本语料" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- newsgroups_train.DESCR\n", + "- newsgroups_train.data\n", + "- newsgroups_train.target\n", + "- newsgroups_train.target_names" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T15:00:35.577160Z", + "start_time": "2022-06-20T15:00:35.574035Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.datasets import fetch_20newsgroups" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T15:00:42.921294Z", + "start_time": "2022-06-20T15:00:42.653867Z" + } + }, + "outputs": [], + "source": [ + "newsgroups_train = fetch_20newsgroups(subset='train')" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T15:01:21.114905Z", + "start_time": "2022-06-20T15:01:21.110446Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "list" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(newsgroups_train.data)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T15:01:28.937567Z", + "start_time": "2022-06-20T15:01:28.933964Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "11314" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(newsgroups_train.data)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T15:01:40.160373Z", + "start_time": "2022-06-20T15:01:40.155860Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "11314" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(newsgroups_train.target) " + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T15:01:57.389065Z", + "start_time": "2022-06-20T15:01:57.385988Z" + } + }, + "outputs": [], + "source": [ + "from collections import Counter" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T15:02:03.345302Z", + "start_time": "2022-06-20T15:02:03.337241Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({7: 594,\n", + " 4: 578,\n", + " 1: 584,\n", + " 14: 593,\n", + " 16: 546,\n", + " 13: 594,\n", + " 3: 590,\n", + " 2: 591,\n", + " 8: 598,\n", + " 19: 377,\n", + " 6: 585,\n", + " 0: 480,\n", + " 12: 591,\n", + " 5: 593,\n", + " 10: 600,\n", + " 9: 597,\n", + " 15: 599,\n", + " 17: 564,\n", + " 18: 465,\n", + " 11: 595})" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Counter(newsgroups_train.target)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T15:02:20.691128Z", + "start_time": "2022-06-20T15:02:20.686558Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['alt.atheism',\n", + " 'comp.graphics',\n", + " 'comp.os.ms-windows.misc',\n", + " 'comp.sys.ibm.pc.hardware',\n", + " 'comp.sys.mac.hardware',\n", + " 'comp.windows.x',\n", + " 'misc.forsale',\n", + " 'rec.autos',\n", + " 'rec.motorcycles',\n", + " 'rec.sport.baseball',\n", + " 'rec.sport.hockey',\n", + " 'sci.crypt',\n", + " 'sci.electronics',\n", + " 'sci.med',\n", + " 'sci.space',\n", + " 'soc.religion.christian',\n", + " 'talk.politics.guns',\n", + " 'talk.politics.mideast',\n", + " 'talk.politics.misc',\n", + " 'talk.religion.misc']" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "newsgroups_train.target_names" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. tokenizer 补充" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T14:34:05.319605Z", + "start_time": "2022-06-20T14:34:05.303549Z" + } + }, + "source": [ + "- input_ids, attention_mask\n", + " - mask:bert 另外一个预训练任务,mlm;\n", + "- encode_plus, token_type_ids\n", + " - token_type_ids: 0:表示第一句,1:表示第二句;可以通过 tokenizer()(tokenizer.\\_\\_call\\_\\_:都是0的);也可以通过 encode_plus 生成/返回(前一句为0,后一句为1);\n", + " - 句子对,一般使用在 nsp(next sentence predict,bert 预训练任务)" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T15:02:48.909751Z", + "start_time": "2022-06-20T15:02:48.906613Z" + } + }, + "outputs": [], + "source": [ + "test_news = newsgroups_train.data[:3]" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T15:03:04.579584Z", + "start_time": "2022-06-20T15:03:04.575396Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1981" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(test_news[2])" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T15:07:13.117543Z", + "start_time": "2022-06-20T15:07:13.108526Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'input_ids': [101, 2013, 1024, 3393, 2099, 2595, 3367, 1030, 11333, 2213, 1012, 8529, 2094, 1012, 3968, 2226, 1006, 2073, 1005, 1055, 2026, 2518, 1007, 3395, 1024, 2054, 2482, 2003, 2023, 999, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# single sentence 级别的\n", + "tokenizer(test_news[0], truncation=True, max_length=32)" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T15:08:30.245910Z", + "start_time": "2022-06-20T15:08:30.231817Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.\n" + ] + }, + { + "data": { + "text/plain": [ + "{'input_ids': [101, 2013, 1024, 3393, 2099, 2595, 3367, 1030, 11333, 2213, 1012, 8529, 2094, 1012, 3968, 2226, 102, 2013, 1024, 3124, 5283, 2080, 1030, 9806, 1012, 1057, 1012, 2899, 1012, 3968, 2226, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}" + ] + }, + "execution_count": 118, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# sentence pair 级别\n", + "tokenizer.encode_plus(text=test_news[0], text_pair=test_news[1], max_length=32, truncation=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-20T15:08:51.172868Z", + "start_time": "2022-06-20T15:08:51.167536Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'[CLS] from : lerxst @ wam. umd. edu [SEP] from : guykuo @ carson. u. washington. edu [SEP]'" + ] + }, + "execution_count": 119, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.decode([101, 2013, 1024, 3393, 2099, 2595, 3367, 1030, 11333, 2213, 1012, 8529, 2094, 1012, 3968, 2226, 102, 2013, 1024, 3124, 5283, 2080, 1030, 9806, 1012, 1057, 1012, 2899, 1012, 3968, 2226, 102])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- cgit v1.2.3