{ "cells": [ { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2022-06-27T13:45:18.997095Z", "start_time": "2022-06-27T13:45:18.394848Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/chunhuizhang/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n", " return f(*args, **kwds)\n", "/Users/chunhuizhang/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n", " return f(*args, **kwds)\n", "/Users/chunhuizhang/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n", " return f(*args, **kwds)\n" ] } ], "source": [ "from transformers import BertTokenizer, BertForMaskedLM\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2022-06-27T13:40:25.320915Z", "start_time": "2022-06-27T13:40:25.318612Z" } }, "outputs": [], "source": [ "model_name = 'bert-base-uncased'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2022-06-27T13:41:09.358613Z", "start_time": "2022-06-27T13:40:57.925124Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n", "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" ] } ], "source": [ "# Load the tokenizer and the masked-LM model for the checkpoint named by model_name.\n", "tokenizer = BertTokenizer.from_pretrained(model_name)\n", "mlm = BertForMaskedLM.from_pretrained(model_name)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2022-06-27T13:42:59.026410Z", "start_time": "2022-06-27T13:42:59.020472Z" } }, "outputs": [], "source": [ "# One passage per line; explicit encoding so the read does not depend on the platform default.\n", "with open('./data/text/meditations/clean.txt', 'r', encoding='utf-8') as f:\n", "    texts = f.read().split('\\n')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2022-06-27T13:43:00.127330Z", "start_time": "2022-06-27T13:43:00.123932Z" } }, "outputs": [ { "data": { "text/plain": [ "507" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(texts)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2022-06-27T13:44:00.414360Z", "start_time": "2022-06-27T13:44:00.408114Z" } }, "outputs": [], "source": [ "# Space-delimited word count per passage: a rough length proxy, not a BERT token count.\n", "text_len_list = [len(text.split(' ')) for text in texts]\n", "# Derived from the list itself so the value is reproducible on a fresh kernel.\n", "max_len = max(text_len_list, default=0)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2022-06-27T13:43:18.412453Z", "start_time": "2022-06-27T13:43:18.409195Z" } }, "outputs": [ { "data": { "text/plain": [ "880" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "max_len" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2022-06-27T13:47:34.304180Z", "start_time": "2022-06-27T13:47:34.295875Z" } }, 
"outputs": [ { "data": { "text/plain": [ "[14,\n", " 13,\n", " 34,\n", " 29,\n", " 70,\n", " 104,\n", " 194,\n", " 126,\n", " 141,\n", " 67,\n", " 32,\n", " 47,\n", " 52,\n", " 156,\n", " 193,\n", " 880,\n", " 656,\n", " 6,\n", " 170,\n", " 128,\n", " 143,\n", " 88,\n", " 137,\n", " 46,\n", " 69,\n", " 36,\n", " 68,\n", " 161,\n", " 262,\n", " 196,\n", " 143,\n", " 213,\n", " 42,\n", " 191,\n", " 241,\n", " 3,\n", " 174,\n", " 306,\n", " 171,\n", " 503,\n", " 129,\n", " 315,\n", " 167,\n", " 80,\n", " 49,\n", " 101,\n", " 307,\n", " 84,\n", " 77,\n", " 68,\n", " 36,\n", " 241,\n", " 112,\n", " 18,\n", " 435,\n", " 127,\n", " 181,\n", " 56,\n", " 67,\n", " 30,\n", " 29,\n", " 14,\n", " 95,\n", " 32,\n", " 86,\n", " 24,\n", " 27,\n", " 19,\n", " 32,\n", " 28,\n", " 56,\n", " 119,\n", " 125,\n", " 201,\n", " 27,\n", " 27,\n", " 69,\n", " 128,\n", " 34,\n", " 81,\n", " 41,\n", " 17,\n", " 131,\n", " 49,\n", " 51,\n", " 184,\n", " 160,\n", " 21,\n", " 15,\n", " 77,\n", " 38,\n", " 20,\n", " 140,\n", " 66,\n", " 14,\n", " 20,\n", " 42,\n", " 37,\n", " 67,\n", " 124,\n", " 66,\n", " 181,\n", " 23,\n", " 198,\n", " 153,\n", " 41,\n", " 313,\n", " 24,\n", " 87,\n", " 84,\n", " 196,\n", " 283,\n", " 42,\n", " 499,\n", " 195,\n", " 177,\n", " 64,\n", " 69,\n", " 197,\n", " 102,\n", " 66,\n", " 184,\n", " 178,\n", " 21,\n", " 59,\n", " 55,\n", " 128,\n", " 63,\n", " 62,\n", " 114,\n", " 46,\n", " 40,\n", " 102,\n", " 61,\n", " 115,\n", " 100,\n", " 58,\n", " 46,\n", " 84,\n", " 45,\n", " 66,\n", " 142,\n", " 84,\n", " 36,\n", " 76,\n", " 79,\n", " 52,\n", " 72,\n", " 14,\n", " 24,\n", " 20,\n", " 14,\n", " 22,\n", " 41,\n", " 53,\n", " 97,\n", " 43,\n", " 68,\n", " 172,\n", " 177,\n", " 187,\n", " 343,\n", " 38,\n", " 68,\n", " 40,\n", " 122,\n", " 46,\n", " 30,\n", " 79,\n", " 35,\n", " 63,\n", " 99,\n", " 81,\n", " 36,\n", " 23,\n", " 314,\n", " 48,\n", " 93,\n", " 72,\n", " 10,\n", " 79,\n", " 108,\n", " 38,\n", " 63,\n", " 29,\n", " 97,\n", " 122,\n", " 159,\n", " 44,\n", 
" 316,\n", " 49,\n", " 170,\n", " 82,\n", " 54,\n", " 95,\n", " 35,\n", " 31,\n", " 23,\n", " 15,\n", " 39,\n", " 16,\n", " 59,\n", " 28,\n", " 35,\n", " 79,\n", " 106,\n", " 86,\n", " 43,\n", " 126,\n", " 27,\n", " 47,\n", " 30,\n", " 91,\n", " 34,\n", " 15,\n", " 120,\n", " 53,\n", " 39,\n", " 134,\n", " 53,\n", " 81,\n", " 65,\n", " 34,\n", " 15,\n", " 69,\n", " 72,\n", " 66,\n", " 40,\n", " 93,\n", " 73,\n", " 27,\n", " 58,\n", " 23,\n", " 39,\n", " 20,\n", " 56,\n", " 61,\n", " 55,\n", " 12,\n", " 33,\n", " 16,\n", " 8,\n", " 17,\n", " 17,\n", " 9,\n", " 9,\n", " 72,\n", " 59,\n", " 103,\n", " 38,\n", " 69,\n", " 72,\n", " 46,\n", " 32,\n", " 37,\n", " 52,\n", " 46,\n", " 93,\n", " 124,\n", " 27,\n", " 21,\n", " 132,\n", " 19,\n", " 54,\n", " 32,\n", " 44,\n", " 48,\n", " 141,\n", " 13,\n", " 168,\n", " 111,\n", " 201,\n", " 27,\n", " 66,\n", " 29,\n", " 22,\n", " 41,\n", " 33,\n", " 60,\n", " 238,\n", " 63,\n", " 60,\n", " 14,\n", " 98,\n", " 60,\n", " 209,\n", " 53,\n", " 17,\n", " 49,\n", " 38,\n", " 58,\n", " 27,\n", " 76,\n", " 60,\n", " 52,\n", " 88,\n", " 42,\n", " 48,\n", " 79,\n", " 31,\n", " 47,\n", " 19,\n", " 17,\n", " 41,\n", " 22,\n", " 131,\n", " 64,\n", " 39,\n", " 64,\n", " 57,\n", " 21,\n", " 92,\n", " 125,\n", " 13,\n", " 164,\n", " 83,\n", " 99,\n", " 100,\n", " 12,\n", " 32,\n", " 58,\n", " 177,\n", " 20,\n", " 49,\n", " 67,\n", " 64,\n", " 81,\n", " 144,\n", " 105,\n", " 97,\n", " 176,\n", " 40,\n", " 101,\n", " 89,\n", " 40,\n", " 68,\n", " 45,\n", " 69,\n", " 180,\n", " 54,\n", " 15,\n", " 36,\n", " 15,\n", " 473,\n", " 119,\n", " 308,\n", " 21,\n", " 19,\n", " 27,\n", " 15,\n", " 59,\n", " 400,\n", " 57,\n", " 64,\n", " 39,\n", " 27,\n", " 33,\n", " 29,\n", " 31,\n", " 24,\n", " 25,\n", " 23,\n", " 14,\n", " 118,\n", " 71,\n", " 80,\n", " 36,\n", " 40,\n", " 30,\n", " 91,\n", " 96,\n", " 60,\n", " 183,\n", " 109,\n", " 53,\n", " 79,\n", " 42,\n", " 53,\n", " 87,\n", " 70,\n", " 74,\n", " 18,\n", " 79,\n", " 249,\n", " 175,\n", " 
503,\n", " 205,\n", " 38,\n", " 63,\n", " 131,\n", " 26,\n", " 35,\n", " 266,\n", " 352,\n", " 396,\n", " 146,\n", " 62,\n", " 167,\n", " 111,\n", " 31,\n", " 84,\n", " 40,\n", " 74,\n", " 20,\n", " 35,\n", " 34,\n", " 64,\n", " 28,\n", " 49,\n", " 43,\n", " 64,\n", " 57,\n", " 81,\n", " 103,\n", " 83,\n", " 22,\n", " 46,\n", " 25,\n", " 81,\n", " 228,\n", " 85,\n", " 397,\n", " 32,\n", " 143,\n", " 119,\n", " 282,\n", " 36,\n", " 97,\n", " 248,\n", " 113,\n", " 65,\n", " 29,\n", " 32,\n", " 98,\n", " 16,\n", " 11,\n", " 15,\n", " 42,\n", " 59,\n", " 27,\n", " 223,\n", " 137,\n", " 104,\n", " 48,\n", " 48,\n", " 168,\n", " 22,\n", " 154,\n", " 198,\n", " 36,\n", " 91,\n", " 40,\n", " 80,\n", " 53,\n", " 48,\n", " 27,\n", " 99,\n", " 31,\n", " 180,\n", " 172,\n", " 62,\n", " 122,\n", " 240,\n", " 125,\n", " 20,\n", " 18,\n", " 20,\n", " 39,\n", " 25,\n", " 50,\n", " 45,\n", " 33,\n", " 51,\n", " 63,\n", " 22,\n", " 97,\n", " 17,\n", " 45,\n", " 250,\n", " 87,\n", " 187,\n", " 64,\n", " 104,\n", " 70,\n", " 124,\n", " 43,\n", " 38,\n", " 46,\n", " 45,\n", " 13,\n", " 22,\n", " 38,\n", " 17,\n", " 110,\n", " 32,\n", " 113,\n", " 23,\n", " 35,\n", " 49,\n", " 21,\n", " 54,\n", " 38,\n", " 219,\n", " 188,\n", " 14,\n", " 151,\n", " 131]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text_len_list" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "ExecuteTime": { "end_time": "2022-06-27T13:49:01.304890Z", "start_time": "2022-06-27T13:49:01.159757Z" } }, "outputs": [ { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAQ1klEQVR4nO3df6xfdX3H8edrFMGpEZC7prbNitrNoImF3CFE/2AwJ6JZMXEEsmhnSOoSyHAx28D9oSYjgURlmGxkVdBqnMAQR4NMh5XE+IfgBSvyQ+ZVymhT6FUBZWZE8L0/vp/K1/a298f3/rCfPh/JN/ecz+dzvud9Tg8vTj/3fL9NVSFJ6svvLHcBkqSFZ7hLUocMd0nqkOEuSR0y3CWpQyuWuwCAE088sdatW7fcZUjSYeWee+75cVWNTdf3WxHu69atY2JiYrnLkKTDSpJHD9bntIwkdchwl6QOGe6S1KEZwz3JsUnuTvLdJA8k+Uhr/0ySR5LsaK8NrT1JPpFkMsl9SU5d7IOQJP2m2fxC9VngrKp6JsnRwDeT/Gfr+9uqunm/8W8D1rfXG4Fr209J0hKZ8c69Bp5pq0e316G+bWwj8Nm23beA45KsGr1USdJszWrOPclRSXYAe4E7ququ1nVFm3q5OskxrW018NjQ5rta2/7vuTnJRJKJqampEQ5BkrS/WYV7VT1fVRuANcBpSV4PXA68Fvgj4ATg7+ey46raUlXjVTU+NjbtM/iSpHma09MyVfUUcCdwTlXtaVMvzwKfBk5rw3YDa4c2W9PaJElLZMZfqCYZA35ZVU8leTHwFuCqJKuqak+SAOcB97dNtgGXJLmBwS9Sn66qPYtUP+su+/JivfWMdl759mXbtyQdymyellkFbE1yFIM7/Zuq6rYkX2/BH2AH8Fdt/O3AucAk8AvgvQtftiTpUGYM96q6DzhlmvazDjK+gItHL02SNF9+QlWSOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQzOGe5Jjk9yd5LtJHkjykdZ+UpK7kkwmuTHJi1r7MW19svWvW9xDkCTtbzZ37s8CZ1XVG4ANwDlJTgeuAq6uqtcATwIXtfEXAU+29qvbOEnSEpox3GvgmbZ6dHsVcBZwc2vfCpzXlje2dVr/2UmyYBVLkmY0qzn3JEcl2QHsBe4Afgg8VVXPtSG7gNVteTXwGEDrfxp4xTTvuTnJRJKJqamp0Y5CkvQbZhXuVfV8VW0A1gCnAa8ddcdVtaWqxqtqfGxsbNS3kyQNmdPTMlX1FHAncAZwXJIVrWsNsLst7wbWArT+lwM/WZBqJUmzMpunZcaSHNeWXwy8BXiIQci/qw3bBNzalre1dVr/16uqFrJoSdKhrZh5CKuArUmOYvA/g5uq6rYkDwI3JPlH4DvAdW38dcDnkkwCPwUuWIS6JUmHMGO4V9V9wCnTtP+Iwfz7/u3/B/z5glQnSZoXP6EqSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1KEZwz3J2iR3JnkwyQNJLm3tH06yO8mO9jp3aJvLk0wmeTjJWxfzACRJB1oxizHPAR+oqnuTvAy4J8kdre/qqvro8OAkJwMXAK8DXgl8LckfVNXzC1m4JOngZrxzr6o9VXVvW/458BCw+hCbbARuqKpnq+oRYBI4bSGKlSTNzpzm3JOsA04B7mpNlyS5L8n1SY5vbauBx4Y228U0/zNIsjnJRJKJqampORcuSTq4WYd7kpcCXwTeX1U/A64FXg1sAPYAH5vLjqtqS1WNV9X42NjYXDaVJM1gVuGe5Gg
Gwf75qroFoKqeqKrnq+pXwCd5YeplN7B2aPM1rU2StERm87RMgOuAh6rq40Ptq4aGvRO4vy1vAy5IckySk4D1wN0LV7IkaSazeVrmTcC7ge8l2dHaPghcmGQDUMBO4H0AVfVAkpuABxk8aXOxT8pI0tKaMdyr6ptApum6/RDbXAFcMUJdkqQR+AlVSeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA7NGO5J1ia5M8mDSR5IcmlrPyHJHUl+0H4e39qT5BNJJpPcl+TUxT4ISdJvms2d+3PAB6rqZOB04OIkJwOXAduraj2wva0DvA1Y316bgWsXvGpJ0iHNGO5Vtaeq7m3LPwceAlYDG4GtbdhW4Ly2vBH4bA18CzguyaoFr1ySdFBzmnNPsg44BbgLWFlVe1rX48DKtrwaeGxos12tbf/32pxkIsnE1NTUHMuWJB3KrMM9yUuBLwLvr6qfDfdVVQE1lx1X1ZaqGq+q8bGxsblsKkmawazCPcnRDIL981V1S2t+Yt90S/u5t7XvBtYObb6mtUmSlshsnpYJcB3wUFV9fKhrG7CpLW8Cbh1qf097auZ04Omh6RtJ0hJYMYsxbwLeDXwvyY7W9kHgSuCmJBcBjwLnt77bgXOBSeAXwHsXtGJJ0oxmDPeq+iaQg3SfPc34Ai4esS5J0gj8hKokdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtSh2YM9yTXJ9mb5P6htg8n2Z1kR3udO9R3eZLJJA8neetiFS5JOrjZ3Ll/Bjhnmvarq2pDe90OkORk4ALgdW2bf0ly1EIVK0manRnDvaq+Afx0lu+3Ebihqp6tqkeASeC0EeqTJM3DKHPulyS5r03bHN/aVgOPDY3Z1doOkGRzkokkE1NTUyOUIUna33zD/Vrg1cAGYA/wsbm+QVVtqarxqhofGxubZxmSpOnMK9yr6omqer6qfgV8khemXnYDa4eGrmltkqQlNK9wT7JqaPWdwL4nabYBFyQ5JslJwHrg7tFKlCTN1YqZBiT5AnAmcGKSXcCHgDOTbAAK2Am8D6CqHkhyE/Ag8BxwcVU9vzilS5IOZsZwr6oLp2m+7hDjrwCuGKUoSdJo/ISqJHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUodWzDQgyfXAO4C9VfX61nYCcCOwDtgJnF9VTyYJcA1wLvAL4C+r6t7FKX35rbvsy8uy351Xvn1Z9ivp8DGbO/fPAOfs13YZsL2q1gPb2zrA24D17bUZuHZhypQkzcWM4V5V3wB+ul/zRmBrW94KnDfU/tka+BZwXJJVC1WsJGl25jvnvrKq9rTlx4GVbXk18NjQuF2t7QBJNieZSDIxNTU1zzIkSdMZ+ReqVVVAzWO7LVU1XlXjY2Njo5YhSRoy33B/Yt90S/u5t7XvBtYOjVvT2iRJS2i+4b4N2NSWNwG3DrW/JwOnA08PTd9IkpbIbB6F/AJwJnBikl3Ah4ArgZuSXAQ8Cpzfht/O4DHISQaPQr53EWqWJM1gxnCvqgsP0nX2NGMLuHjUoiRJo/ETqpLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdWjHKxkl2Aj8Hngeeq6rxJCcANwLrgJ3A+VX15GhlSpLmYiHu3P+
4qjZU1XhbvwzYXlXrge1tXZK0hBZjWmYjsLUtbwXOW4R9SJIOYdRwL+C/ktyTZHNrW1lVe9ry48DK6TZMsjnJRJKJqampEcuQJA0bac4deHNV7U7ye8AdSb4/3FlVlaSm27CqtgBbAMbHx6cdI0man5Hu3Ktqd/u5F/gScBrwRJJVAO3n3lGLlCTNzbzDPclLkrxs3zLwp8D9wDZgUxu2Cbh11CIlSXMzyrTMSuBLSfa9z79V1VeSfBu4KclFwKPA+aOXKUmai3mHe1X9CHjDNO0/Ac4epShJ0mj8hKokdchwl6QOjfoopJbBusu+vGz73nnl25dt35Jmzzt3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhv/JXc7JcXzfsVw1Lc+OduyR1yHCXpA4Z7pLUoUWbc09yDnANcBTwqaq6crH2pf4t5z8tuFz8PYNGsSh37kmOAv4ZeBtwMnBhkpMXY1+SpAMt1p37acBkVf0IIMkNwEbgwUXan6QFciT+A+w9HvNihftq4LGh9V3AG4cHJNkMbG6rzyR5eB77ORH48bwq7Jvn5UCH3TnJVUuym9+q87JExzyTJT0nIx7z7x+sY9mec6+qLcCWUd4jyURVjS9QSd3wvBzIczI9z8uBejkni/W0zG5g7dD6mtYmSVoCixXu3wbWJzkpyYuAC4Bti7QvSdJ+FmVapqqeS3IJ8FUGj0JeX1UPLMKuRprW6Zjn5UCek+l5Xg7UxTlJVS13DZKkBeYnVCWpQ4a7JHXosA33JOckeTjJZJLLlruepZJkbZI7kzyY5IEkl7b2E5LckeQH7efxrT1JPtHO031JTl3eI1g8SY5K8p0kt7X1k5Lc1Y79xvbLfZIc09YnW/+65ax7MSU5LsnNSb6f5KEkZxzp10qSv2n/7dyf5AtJju3xWjksw/0I/3qD54APVNXJwOnAxe3YLwO2V9V6YHtbh8E5Wt9em4Frl77kJXMp8NDQ+lXA1VX1GuBJ4KLWfhHwZGu/uo3r1TXAV6rqtcAbGJyfI/ZaSbIa+GtgvKpez+CBjwvo8VqpqsPuBZwBfHVo/XLg8uWua5nOxa3AW4CHgVWtbRXwcFv+V+DCofG/HtfTi8FnKbYDZwG3AWHwKcMV+18zDJ7iOqMtr2jjstzHsAjn5OXAI/sf25F8rfDCp+dPaH/2twFv7fFaOSzv3Jn+6w1WL1Mty6b9FfEU4C5gZVXtaV2PAyvb8pFyrv4J+DvgV239FcBTVfVcWx8+7l+fk9b/dBvfm5OAKeDTbbrqU0lewhF8rVTVbuCjwP8Aexj82d9Dh9fK4RruR7wkLwW+CLy/qn423FeD24wj5hnXJO8A9lbVPctdy2+ZFcCpwLVVdQrwv7wwBQMckdfK8Qy+xPAk4JXAS4BzlrWoRXK4hvsR/fUGSY5mEOyfr6pbWvMTSVa1/lXA3tZ+JJyrNwF/lmQncAODqZlrgOOS7Pug3vBx//qctP6XAz9ZyoKXyC5gV1Xd1dZvZhD2R/K18ifAI1U1VVW/BG5hcP10d60cruF+xH69QZIA1wEPVdXHh7q2AZva8iYGc/H72t/TnoQ4HXh66K/kXaiqy6tqTVWtY3AtfL2q/gK4E3hXG7b/Odl3rt7Vxnd391pVjwOPJfnD1nQ2g6/dPmKvFQbTMacn+d3239K+c9LftbLck/4j/GLkXOC/gR8C/7Dc9Szhcb+ZwV+j7wN2tNe5DOYBtwM/AL4GnNDGh8GTRT8EvsfgKYFlP45FPD9nAre15VcBdwOTwL8Dx7T2Y9v6ZOt/1XLXvYjnYwMw0a6X/wCOP9KvFeAjwPeB+4HPAcf0eK349QOS1KHDdVpGknQIhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nq0P8DUKojecC+u+0AAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# plt.hist hands back the per-bin counts, the bin edges and the drawn patches.\n", "counts, bin_edges, _ = plt.hist(text_len_list)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "ExecuteTime": { "end_time": "2022-06-27T13:49:10.416996Z", "start_time": "2022-06-27T13:49:10.413790Z" } }, "outputs": [ { "data": { "text/plain": [ "11" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Edge count: one more than the number of bins.\n", "len(bin_edges)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }