diff options
Diffstat (limited to 'fine_tune/bert/tutorials/01_mlm.ipynb')
| -rw-r--r-- | fine_tune/bert/tutorials/01_mlm.ipynb | 762 |
1 files changed, 762 insertions, 0 deletions
diff --git a/fine_tune/bert/tutorials/01_mlm.ipynb b/fine_tune/bert/tutorials/01_mlm.ipynb new file mode 100644 index 0000000..95ab5be --- /dev/null +++ b/fine_tune/bert/tutorials/01_mlm.ipynb @@ -0,0 +1,762 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-27T13:45:18.997095Z", + "start_time": "2022-06-27T13:45:18.394848Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/chunhuizhang/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n", + " return f(*args, **kwds)\n", + "/Users/chunhuizhang/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n", + " return f(*args, **kwds)\n", + "/Users/chunhuizhang/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n", + " return f(*args, **kwds)\n" + ] + } + ], + "source": [ + "from transformers import BertTokenizer, BertForMaskedLM\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-27T13:40:25.320915Z", + "start_time": "2022-06-27T13:40:25.318612Z" + } + }, + "outputs": [], + "source": [ + "model_name = 'bert-base-uncased'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-27T13:41:09.358613Z", + "start_time": "2022-06-27T13:40:57.925124Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n", + "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" + ] + } + ], + "source": [ + "tokenizer = BertTokenizer.from_pretrained(model_name)\n", + "mlm = BertForMaskedLM.from_pretrained(model_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-27T13:42:59.026410Z", + "start_time": "2022-06-27T13:42:59.020472Z" + } + }, + "outputs": [], + "source": [ + "with open('./data/text/meditations/clean.txt', 'r') as f:\n", + " texts = f.read().split('\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-27T13:43:00.127330Z", + "start_time": "2022-06-27T13:43:00.123932Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "507" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(texts)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-27T13:44:00.414360Z", + "start_time": "2022-06-27T13:44:00.408114Z" + } + }, + "outputs": [], + "source": [ + "max_len = 0\n", + "text_len_list = []\n", + "for text in texts:\n", + " text_len = len(text.split(' '))\n", + " text_len_list.append(text_len)\n", + "# print(text_len)\n", + "# if text_len > max_len:\n", + "# max_len = text_len" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-27T13:43:18.412453Z", + "start_time": "2022-06-27T13:43:18.409195Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "880" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "max_len" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-27T13:47:34.304180Z", + "start_time": "2022-06-27T13:47:34.295875Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[14,\n", + " 13,\n", + " 34,\n", + " 29,\n", + " 70,\n", + " 104,\n", + " 194,\n", + " 126,\n", + " 141,\n", + " 67,\n", + " 32,\n", + " 47,\n", + " 52,\n", + " 156,\n", + " 193,\n", + " 880,\n", + " 656,\n", + " 6,\n", + " 170,\n", + " 128,\n", + " 143,\n", + " 88,\n", + " 137,\n", + " 46,\n", + " 69,\n", + " 36,\n", + " 68,\n", + " 161,\n", + " 262,\n", + " 196,\n", + " 143,\n", + " 213,\n", + " 42,\n", + " 191,\n", + " 241,\n", + " 3,\n", + " 174,\n", + " 306,\n", + " 171,\n", + " 503,\n", + " 129,\n", + " 315,\n", + " 167,\n", + " 80,\n", + " 49,\n", + " 101,\n", + " 307,\n", + " 84,\n", + " 77,\n", + " 68,\n", + " 36,\n", + " 241,\n", + " 112,\n", + " 18,\n", + " 435,\n", + " 127,\n", + " 181,\n", + " 56,\n", + " 67,\n", + " 30,\n", + " 29,\n", + " 14,\n", + " 95,\n", + " 32,\n", + " 86,\n", + " 24,\n", + " 27,\n", + " 19,\n", + " 32,\n", + " 28,\n", + " 56,\n", + " 119,\n", + " 125,\n", + " 201,\n", + " 27,\n", + " 27,\n", + " 69,\n", + " 128,\n", + " 34,\n", + " 81,\n", + " 41,\n", + " 17,\n", + " 131,\n", + " 49,\n", + " 51,\n", + " 184,\n", + " 160,\n", + " 21,\n", + " 15,\n", + " 77,\n", + " 38,\n", + " 20,\n", + " 140,\n", + " 66,\n", + " 14,\n", + " 20,\n", + " 42,\n", + " 37,\n", + " 67,\n", + " 124,\n", + " 66,\n", + " 181,\n", + " 23,\n", + " 198,\n", + " 153,\n", + " 41,\n", + " 313,\n", + " 24,\n", + " 87,\n", + " 84,\n", + " 196,\n", + " 283,\n", + " 42,\n", + " 499,\n", + " 195,\n", + " 177,\n", + " 64,\n", + " 69,\n", + " 197,\n", + " 102,\n", + " 66,\n", + " 184,\n", + " 178,\n", + " 21,\n", + " 59,\n", + " 55,\n", + " 128,\n", + " 63,\n", + " 62,\n", + " 114,\n", + " 46,\n", + " 40,\n", + " 102,\n", + " 61,\n", + " 115,\n", + " 100,\n", + " 58,\n", + " 46,\n", + " 84,\n", + " 45,\n", + " 66,\n", + " 142,\n", + " 84,\n", + " 36,\n", + " 76,\n", + " 79,\n", + " 52,\n", + " 72,\n", + " 14,\n", + " 24,\n", + " 20,\n", + " 14,\n", + " 22,\n", + " 41,\n", + " 53,\n", + " 97,\n", + " 43,\n", + " 68,\n", + " 172,\n", + " 177,\n", + " 187,\n", + " 343,\n", + " 38,\n", + " 68,\n", + " 40,\n", + " 122,\n", + " 46,\n", + " 30,\n", + " 79,\n", + " 35,\n", + " 63,\n", + " 99,\n", + " 81,\n", + " 36,\n", + " 23,\n", + " 314,\n", + " 48,\n", + " 93,\n", + " 72,\n", + " 10,\n", + " 79,\n", + " 108,\n", + " 38,\n", + " 63,\n", + " 29,\n", + " 97,\n", + " 122,\n", + " 159,\n", + " 44,\n", + " 316,\n", + " 49,\n", + " 170,\n", + " 82,\n", + " 54,\n", + " 95,\n", + " 35,\n", + " 31,\n", + " 23,\n", + " 15,\n", + " 39,\n", + " 16,\n", + " 59,\n", + " 28,\n", + " 35,\n", + " 79,\n", + " 106,\n", + " 86,\n", + " 43,\n", + " 126,\n", + " 27,\n", + " 47,\n", + " 30,\n", + " 91,\n", + " 34,\n", + " 15,\n", + " 120,\n", + " 53,\n", + " 39,\n", + " 134,\n", + " 53,\n", + " 81,\n", + " 65,\n", + " 34,\n", + " 15,\n", + " 69,\n", + " 72,\n", + " 66,\n", + " 40,\n", + " 93,\n", + " 73,\n", + " 27,\n", + " 58,\n", + " 23,\n", + " 39,\n", + " 20,\n", + " 56,\n", + " 61,\n", + " 55,\n", + " 12,\n", + " 33,\n", + " 16,\n", + " 8,\n", + " 17,\n", + " 17,\n", + " 9,\n", + " 9,\n", + " 72,\n", + " 59,\n", + " 103,\n", + " 38,\n", + " 69,\n", + " 72,\n", + " 46,\n", + " 32,\n", + " 37,\n", + " 52,\n", + " 46,\n", + " 93,\n", + " 124,\n", + " 27,\n", + " 21,\n", + " 132,\n", + " 19,\n", + " 54,\n", + " 32,\n", + " 44,\n", + " 48,\n", + " 141,\n", + " 13,\n", + " 168,\n", + " 111,\n", + " 201,\n", + " 27,\n", + " 66,\n", + " 29,\n", + " 22,\n", + " 41,\n", + " 33,\n", + " 60,\n", + " 238,\n", + " 63,\n", + " 60,\n", + " 14,\n", + " 98,\n", + " 60,\n", + " 209,\n", + " 53,\n", + " 17,\n", + " 49,\n", + " 38,\n", + " 58,\n", + " 27,\n", + " 76,\n", + " 60,\n", + " 52,\n", + " 88,\n", + " 42,\n", + " 48,\n", + " 79,\n", + " 31,\n", + " 47,\n", + " 19,\n", + " 17,\n", + " 41,\n", + " 22,\n", + " 131,\n", + " 64,\n", + " 39,\n", + " 64,\n", + " 57,\n", + " 21,\n", + " 92,\n", + " 125,\n", + " 13,\n", + " 164,\n", + " 83,\n", + " 99,\n", + " 100,\n", + " 12,\n", + " 32,\n", + " 58,\n", + " 177,\n", + " 20,\n", + " 49,\n", + " 67,\n", + " 64,\n", + " 81,\n", + " 144,\n", + " 105,\n", + " 97,\n", + " 176,\n", + " 40,\n", + " 101,\n", + " 89,\n", + " 40,\n", + " 68,\n", + " 45,\n", + " 69,\n", + " 180,\n", + " 54,\n", + " 15,\n", + " 36,\n", + " 15,\n", + " 473,\n", + " 119,\n", + " 308,\n", + " 21,\n", + " 19,\n", + " 27,\n", + " 15,\n", + " 59,\n", + " 400,\n", + " 57,\n", + " 64,\n", + " 39,\n", + " 27,\n", + " 33,\n", + " 29,\n", + " 31,\n", + " 24,\n", + " 25,\n", + " 23,\n", + " 14,\n", + " 118,\n", + " 71,\n", + " 80,\n", + " 36,\n", + " 40,\n", + " 30,\n", + " 91,\n", + " 96,\n", + " 60,\n", + " 183,\n", + " 109,\n", + " 53,\n", + " 79,\n", + " 42,\n", + " 53,\n", + " 87,\n", + " 70,\n", + " 74,\n", + " 18,\n", + " 79,\n", + " 249,\n", + " 175,\n", + " 503,\n", + " 205,\n", + " 38,\n", + " 63,\n", + " 131,\n", + " 26,\n", + " 35,\n", + " 266,\n", + " 352,\n", + " 396,\n", + " 146,\n", + " 62,\n", + " 167,\n", + " 111,\n", + " 31,\n", + " 84,\n", + " 40,\n", + " 74,\n", + " 20,\n", + " 35,\n", + " 34,\n", + " 64,\n", + " 28,\n", + " 49,\n", + " 43,\n", + " 64,\n", + " 57,\n", + " 81,\n", + " 103,\n", + " 83,\n", + " 22,\n", + " 46,\n", + " 25,\n", + " 81,\n", + " 228,\n", + " 85,\n", + " 397,\n", + " 32,\n", + " 143,\n", + " 119,\n", + " 282,\n", + " 36,\n", + " 97,\n", + " 248,\n", + " 113,\n", + " 65,\n", + " 29,\n", + " 32,\n", + " 98,\n", + " 16,\n", + " 11,\n", + " 15,\n", + " 42,\n", + " 59,\n", + " 27,\n", + " 223,\n", + " 137,\n", + " 104,\n", + " 48,\n", + " 48,\n", + " 168,\n", + " 22,\n", + " 154,\n", + " 198,\n", + " 36,\n", + " 91,\n", + " 40,\n", + " 80,\n", + " 53,\n", + " 48,\n", + " 27,\n", + " 99,\n", + " 31,\n", + " 180,\n", + " 172,\n", + " 62,\n", + " 122,\n", + " 240,\n", + " 125,\n", + " 20,\n", + " 18,\n", + " 20,\n", + " 39,\n", + " 25,\n", + " 50,\n", + " 45,\n", + " 33,\n", + " 51,\n", + " 63,\n", + " 22,\n", + " 97,\n", + " 17,\n", + " 45,\n", + " 250,\n", + " 87,\n", + " 187,\n", + " 64,\n", + " 104,\n", + " 70,\n", + " 124,\n", + " 43,\n", + " 38,\n", + " 46,\n", + " 45,\n", + " 13,\n", + " 22,\n", + " 38,\n", + " 17,\n", + " 110,\n", + " 32,\n", + " 113,\n", + " 23,\n", + " 35,\n", + " 49,\n", + " 21,\n", + " 54,\n", + " 38,\n", + " 219,\n", + " 188,\n", + " 14,\n", + " 151,\n", + " 131]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_len_list" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-27T13:49:01.304890Z", + "start_time": "2022-06-27T13:49:01.159757Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAQ1klEQVR4nO3df6xfdX3H8edrFMGpEZC7prbNitrNoImF3CFE/2AwJ6JZMXEEsmhnSOoSyHAx28D9oSYjgURlmGxkVdBqnMAQR4NMh5XE+IfgBSvyQ+ZVymhT6FUBZWZE8L0/vp/K1/a298f3/rCfPh/JN/ecz+dzvud9Tg8vTj/3fL9NVSFJ6svvLHcBkqSFZ7hLUocMd0nqkOEuSR0y3CWpQyuWuwCAE088sdatW7fcZUjSYeWee+75cVWNTdf3WxHu69atY2JiYrnLkKTDSpJHD9bntIwkdchwl6QOGe6S1KEZwz3JsUnuTvLdJA8k+Uhr/0ySR5LsaK8NrT1JPpFkMsl9SU5d7IOQJP2m2fxC9VngrKp6JsnRwDeT/Gfr+9uqunm/8W8D1rfXG4Fr209J0hKZ8c69Bp5pq0e316G+bWwj8Nm23beA45KsGr1USdJszWrOPclRSXYAe4E7ququ1nVFm3q5OskxrW018NjQ5rta2/7vuTnJRJKJqampEQ5BkrS/WYV7VT1fVRuANcBpSV4PXA68Fvgj4ATg7+ey46raUlXjVTU+NjbtM/iSpHma09MyVfUUcCdwTlXtaVMvzwKfBk5rw3YDa4c2W9PaJElLZMZfqCYZA35ZVU8leTHwFuCqJKuqak+SAOcB97dNtgGXJLmBwS9Sn66qPYtUP+su+/JivfWMdl759mXbtyQdymyellkFbE1yFIM7/Zuq6rYkX2/BH2AH8Fdt/O3AucAk8AvgvQtftiTpUGYM96q6DzhlmvazDjK+gItHL02SNF9+QlWSOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQzOGe5Jjk9yd5LtJHkjykdZ+UpK7kkwmuTHJi1r7MW19svWvW9xDkCTtbzZ37s8CZ1XVG4ANwDlJTgeuAq6uqtcATwIXtfEXAU+29qvbOEnSEpox3GvgmbZ6dHsVcBZwc2vfCpzXlje2dVr/2UmyYBVLkmY0qzn3JEcl2QHsBe4Afgg8VVXPtSG7gNVteTXwGEDrfxp4xTTvuTnJRJKJqamp0Y5CkvQbZhXuVfV8VW0A1gCnAa8ddcdVtaWqxqtqfGxsbNS3kyQNmdPTMlX1FHAncAZwXJIVrWsNsLst7wbWArT+lwM/WZBqJUmzMpunZcaSHNeWXwy8BXiIQci/qw3bBNzalre1dVr/16uqFrJoSdKhrZh5CKuArUmOYvA/g5uq6rYkDwI3JPlH4DvAdW38dcDnkkwCPwUuWIS6JUmHMGO4V9V9wCnTtP+Iwfz7/u3/B/z5glQnSZoXP6EqSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1KEZwz3J2iR3JnkwyQNJLm3tH06yO8mO9jp3aJvLk0wmeTjJWxfzACRJB1oxizHPAR+oqnuTvAy4J8kdre/qqvro8OAkJwMXAK8DXgl8LckfVNXzC1m4JOngZrxzr6o9VXVvW/458BCw+hCbbARuqKpnq+oRYBI4bSGKlSTNzpzm3JOsA04B7mpNlyS5L8n1SY5vbauBx4Y228U0/zNIsjnJRJKJqampORcuSTq4WYd7kpcCXwTeX1U/A64FXg1sAPYAH5vLjqtqS1WNV9X42NjYXDaVJM1gVuGe5GgGwf75qroFoKqeqKrnq+pXwCd5YeplN7B2aPM1rU2StERm87RMgOuAh6rq40Ptq4aGvRO4vy1vAy5IckySk4D1wN0LV7IkaSazeVrmTcC7ge8l2dHaPghcmGQDUMBO4H0AVfVAkpuABxk8aXOxT8pI0tKaMdyr6ptApum6/RDbXAFcMUJdkqQR+AlVSeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA7NGO5J1ia5M8mDSR5IcmlrPyHJHUl+0H4e39qT5BNJJpPcl+TUxT4ISdJvms2d+3PAB6rqZOB04OIkJwOXAduraj2wva0DvA1Y316bgWsXvGpJ0iHNGO5Vtaeq7m3LPwceAlYDG4GtbdhW4Ly2vBH4bA18CzguyaoFr1ySdFBzmnNPsg44BbgLWFlVe1rX48DKtrwaeGxos12tbf/32pxkIsnE1NTUHMuWJB3KrMM9yUuBLwLvr6qfDfdVVQE1lx1X1ZaqGq+q8bGxsblsKkmawazCPcnRDIL981V1S2t+Yt90S/u5t7XvBtYObb6mtUmSlshsnpYJcB3wUFV9fKhrG7CpLW8Cbh1qf097auZ04Omh6RtJ0hJYMYsxbwLeDXwvyY7W9kHgSuCmJBcBjwLnt77bgXOBSeAXwHsXtGJJ0oxmDPeq+iaQg3SfPc34Ai4esS5J0gj8hKokdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtSh2YM9yTXJ9mb5P6htg8n2Z1kR3udO9R3eZLJJA8neetiFS5JOrjZ3Ll/Bjhnmvarq2pDe90OkORk4ALgdW2bf0ly1EIVK0manRnDvaq+Afx0lu+3Ebihqp6tqkeASeC0EeqTJM3DKHPulyS5r03bHN/aVgOPDY3Z1doOkGRzkokkE1NTUyOUIUna33zD/Vrg1cAGYA/wsbm+QVVtqarxqhofGxubZxmSpOnMK9yr6omqer6qfgV8khemXnYDa4eGrmltkqQlNK9wT7JqaPWdwL4nabYBFyQ5JslJwHrg7tFKlCTN1YqZBiT5AnAmcGKSXcCHgDOTbAAK2Am8D6CqHkhyE/Ag8BxwcVU9vzilS5IOZsZwr6oLp2m+7hDjrwCuGKUoSdJo/ISqJHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUodWzDQgyfXAO4C9VfX61nYCcCOwDtgJnF9VTyYJcA1wLvAL4C+r6t7FKX35rbvsy8uy351Xvn1Z9ivp8DGbO/fPAOfs13YZsL2q1gPb2zrA24D17bUZuHZhypQkzcWM4V5V3wB+ul/zRmBrW94KnDfU/tka+BZwXJJVC1WsJGl25jvnvrKq9rTlx4GVbXk18NjQuF2t7QBJNieZSDIxNTU1zzIkSdMZ+ReqVVVAzWO7LVU1XlXjY2Njo5YhSRoy33B/Yt90S/u5t7XvBtYOjVvT2iRJS2i+4b4N2NSWNwG3DrW/JwOnA08PTd9IkpbIbB6F/AJwJnBikl3Ah4ArgZuSXAQ8Cpzfht/O4DHISQaPQr53EWqWJM1gxnCvqgsP0nX2NGMLuHjUoiRJo/ETqpLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdWjHKxkl2Aj8Hngeeq6rxJCcANwLrgJ3A+VX15GhlSpLmYiHu3P+4qjZU1XhbvwzYXlXrge1tXZK0hBZjWmYjsLUtbwXOW4R9SJIOYdRwL+C/ktyTZHNrW1lVe9ry48DK6TZMsjnJRJKJqampEcuQJA0bac4deHNV7U7ye8AdSb4/3FlVlaSm27CqtgBbAMbHx6cdI0man5Hu3Ktqd/u5F/gScBrwRJJVAO3n3lGLlCTNzbzDPclLkrxs3zLwp8D9wDZgUxu2Cbh11CIlSXMzyrTMSuBLSfa9z79V1VeSfBu4KclFwKPA+aOXKUmai3mHe1X9CHjDNO0/Ac4epShJ0mj8hKokdchwl6QOjfoopJbBusu+vGz73nnl25dt35Jmzzt3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhv/JXc7JcXzfsVw1Lc+OduyR1yHCXpA4Z7pLUoUWbc09yDnANcBTwqaq6crH2pf4t5z8tuFz8PYNGsSh37kmOAv4ZeBtwMnBhkpMXY1+SpAMt1p37acBkVf0IIMkNwEbgwUXan6QFciT+A+w9HvNihftq4LGh9V3AG4cHJNkMbG6rzyR5eB77ORH48bwq7Jvn5UCH3TnJVUuym9+q87JExzyTJT0nIx7z7x+sY9mec6+qLcCWUd4jyURVjS9QSd3wvBzIczI9z8uBejkni/W0zG5g7dD6mtYmSVoCixXu3wbWJzkpyYuAC4Bti7QvSdJ+FmVapqqeS3IJ8FUGj0JeX1UPLMKuRprW6Zjn5UCek+l5Xg7UxTlJVS13DZKkBeYnVCWpQ4a7JHXosA33JOckeTjJZJLLlruepZJkbZI7kzyY5IEkl7b2E5LckeQH7efxrT1JPtHO031JTl3eI1g8SY5K8p0kt7X1k5Lc1Y79xvbLfZIc09YnW/+65ax7MSU5LsnNSb6f5KEkZxzp10qSv2n/7dyf5AtJju3xWjksw/0I/3qD54APVNXJwOnAxe3YLwO2V9V6YHtbh8E5Wt9em4Frl77kJXMp8NDQ+lXA1VX1GuBJ4KLWfhHwZGu/uo3r1TXAV6rqtcAbGJyfI/ZaSbIa+GtgvKpez+CBjwvo8VqpqsPuBZwBfHVo/XLg8uWua5nOxa3AW4CHgVWtbRXwcFv+V+DCofG/HtfTi8FnKbYDZwG3AWHwKcMV+18zDJ7iOqMtr2jjstzHsAjn5OXAI/sf25F8rfDCp+dPaH/2twFv7fFaOSzv3Jn+6w1WL1Mty6b9FfEU4C5gZVXtaV2PAyvb8pFyrv4J+DvgV239FcBTVfVcWx8+7l+fk9b/dBvfm5OAKeDTbbrqU0lewhF8rVTVbuCjwP8Aexj82d9Dh9fK4RruR7wkLwW+CLy/qn423FeD24wj5hnXJO8A9lbVPctdy2+ZFcCpwLVVdQrwv7wwBQMckdfK8Qy+xPAk4JXAS4BzlrWoRXK4hvsR/fUGSY5mEOyfr6pbWvMTSVa1/lXA3tZ+JJyrNwF/lmQncAODqZlrgOOS7Pug3vBx//qctP6XAz9ZyoKXyC5gV1Xd1dZvZhD2R/K18ifAI1U1VVW/BG5hcP10d60cruF+xH69QZIA1wEPVdXHh7q2AZva8iYGc/H72t/TnoQ4HXh66K/kXaiqy6tqTVWtY3AtfL2q/gK4E3hXG7b/Odl3rt7Vxnd391pVjwOPJfnD1nQ2g6/dPmKvFQbTMacn+d3239K+c9LftbLck/4j/GLkXOC/gR8C/7Dc9Szhcb+ZwV+j7wN2tNe5DOYBtwM/AL4GnNDGh8GTRT8EvsfgKYFlP45FPD9nAre15VcBdwOTwL8Dx7T2Y9v6ZOt/1XLXvYjnYwMw0a6X/wCOP9KvFeAjwPeB+4HPAcf0eK349QOS1KHDdVpGknQIhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nq0P8DUKojecC+u+0AAAAASUVORK5CYII=\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "(cnt, bars, _) = plt.hist(text_len_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-27T13:49:10.416996Z", + "start_time": "2022-06-27T13:49:10.413790Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "11" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(bars)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} |
