From e0208fc250d4c8ae4d42f44ffddf151237211877 Mon Sep 17 00:00:00 2001
From: zhang <zch921005@126.com>
Date: Sun, 10 Jul 2022 11:00:10 +0800
Subject: wordpiece

---
 fine_tune/bert/tutorials/05_model_outputs.ipynb | 395 ++++++++++++++++++++++++
 fine_tune/bert/tutorials/05_output.py           |  25 ++
 2 files changed, 420 insertions(+)
 create mode 100644 fine_tune/bert/tutorials/05_model_outputs.ipynb
 create mode 100644 fine_tune/bert/tutorials/05_output.py

(limited to 'fine_tune/bert/tutorials')

diff --git a/fine_tune/bert/tutorials/05_model_outputs.ipynb b/fine_tune/bert/tutorials/05_model_outputs.ipynb
new file mode 100644
index 0000000..40155ea
--- /dev/null
+++ b/fine_tune/bert/tutorials/05_model_outputs.ipynb
@@ -0,0 +1,395 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-07-10T02:29:24.616302Z",
+     "start_time": "2022-07-10T02:29:18.081012Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from torch import nn\n",
+    "from transformers import BertModel, BertTokenizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-07-10T02:29:41.979562Z",
+     "start_time": "2022-07-10T02:29:24.618327Z"
+    },
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
+      "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+     ]
+    }
+   ],
+   "source": [
+    "model_name = 'bert-base-uncased'\n",
+    "\n",
+    "tokenizer = BertTokenizer.from_pretrained(model_name)\n",
+    "model = BertModel.from_pretrained(model_name, output_hidden_states=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-07-10T02:23:46.325851Z",
+     "start_time": "2022-07-10T02:23:46.322584Z"
+    }
+   },
+   "source": [
+    "### 1. input "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-07-10T02:43:56.693361Z",
+     "start_time": "2022-07-10T02:43:56.691053Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "text = \"After stealing money from the bank vault, the bank robber was seen \" \\\n",
+    "   \"fishing on the Mississippi river bank.\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-07-10T02:44:09.053456Z",
+     "start_time": "2022-07-10T02:44:09.049462Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "token_input = tokenizer(text, return_tensors='pt')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-07-10T02:44:10.582118Z",
+     "start_time": "2022-07-10T02:44:10.558526Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'input_ids': tensor([[  101,  2044, 11065,  2769,  2013,  1996,  2924, 11632,  1010,  1996,\n",
+       "          2924, 27307,  2001,  2464,  5645,  2006,  1996,  5900,  2314,  2924,\n",
+       "          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "token_input"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-07-10T02:45:01.400030Z",
+     "start_time": "2022-07-10T02:45:01.395465Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(tensor([[  101,  2044, 11065,  2769,  2013,  1996,  2924, 11632,  1010,  1996,\n",
+       "           2924, 27307,  2001,  2464,  5645,  2006,  1996,  5900,  2314,  2924,\n",
+       "           1012,   102]]), torch.Size([1, 22]))"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "token_input['input_ids'], token_input['input_ids'].shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- batch_size = 1， 只有一个句子，序列长度为 22（未 truncate 及 padding）\n",
+    "- "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-07-10T02:23:36.420705Z",
+     "start_time": "2022-07-10T02:23:36.418239Z"
+    }
+   },
+   "source": [
+    "### 2. model forward"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- forward\n",
+    "    - embedding => encoder => pooler"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-07-10T02:46:02.848067Z",
+     "start_time": "2022-07-10T02:46:02.703744Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "model.eval()\n",
+    "with torch.no_grad():\n",
+    "    outputs = model(**token_input)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-07-10T02:23:44.578754Z",
+     "start_time": "2022-07-10T02:23:44.576132Z"
+    }
+   },
+   "source": [
+    "### 3. output"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-07-10T02:40:09.069362Z",
+     "start_time": "2022-07-10T02:40:09.065437Z"
+    }
+   },
+   "source": [
+    "- len(outputs) == 3\n",
+    "- outputs[0]\n",
+    "    - last_hidden_state, shape: batch_size\\*seq_len\\*hidden_size(1\\*22\\*768)\n",
+    "- outputs[1]\n",
+    "    - pooler_output, shape: batch_size\\*hidden_size(1\\*768)\n",
+    "    - Last layer hidden-state of the first token of the sequence (classification token, [CLS])\n",
+    "- outputs[2] (model.config.output_hidden_states = True) \n",
+    "    - type: tuple\n",
+    "    - one for the output of the embeddings(1), if the model has an embedding layer(12), + one for the output of each layer) \n",
+    "        - (1+12)\\*(batch_size\\*seq_len\\*hidden_size) = 13\\*1\\*22\\*768\n",
+    "        \n",
+    "        \n",
+    "        \n",
+    "- outputs[0] == outputs[2][-1]\n",
+    "- outputs[1] == model.pooler(outputs[2][-1])\n",
+    "- outputs[2][0] == model.embeddings(token_input['input_ids'], token_input['token_type_ids'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-07-10T02:46:38.864887Z",
+     "start_time": "2022-07-10T02:46:38.861110Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "3"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(outputs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-07-10T02:48:44.742132Z",
+     "start_time": "2022-07-10T02:48:44.736806Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(tuple, 13)"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "type(outputs[2]), len(outputs[2])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-07-10T02:50:23.068317Z",
+     "start_time": "2022-07-10T02:50:23.059660Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[[True, True, True,  ..., True, True, True],\n",
+       "         [True, True, True,  ..., True, True, True],\n",
+       "         [True, True, True,  ..., True, True, True],\n",
+       "         ...,\n",
+       "         [True, True, True,  ..., True, True, True],\n",
+       "         [True, True, True,  ..., True, True, True],\n",
+       "         [True, True, True,  ..., True, True, True]]])"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "outputs[0] == outputs[2][-1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-07-10T02:54:20.023961Z",
+     "start_time": "2022-07-10T02:54:20.014364Z"
+    },
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[[True, True, True,  ..., True, True, True],\n",
+       "         [True, True, True,  ..., True, True, True],\n",
+       "         [True, True, True,  ..., True, True, True],\n",
+       "         ...,\n",
+       "         [True, True, True,  ..., True, True, True],\n",
+       "         [True, True, True,  ..., True, True, True],\n",
+       "         [True, True, True,  ..., True, True, True]]])"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "outputs[2][0] == model.embeddings(token_input['input_ids'], token_input['token_type_ids'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-07-10T02:56:10.311946Z",
+     "start_time": "2022-07-10T02:56:10.307742Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0 torch.Size([1, 22, 768])\n",
+      "1 torch.Size([1, 22, 768])\n",
+      "2 torch.Size([1, 22, 768])\n",
+      "3 torch.Size([1, 22, 768])\n",
+      "4 torch.Size([1, 22, 768])\n",
+      "5 torch.Size([1, 22, 768])\n",
+      "6 torch.Size([1, 22, 768])\n",
+      "7 torch.Size([1, 22, 768])\n",
+      "8 torch.Size([1, 22, 768])\n",
+      "9 torch.Size([1, 22, 768])\n",
+      "10 torch.Size([1, 22, 768])\n",
+      "11 torch.Size([1, 22, 768])\n",
+      "12 torch.Size([1, 22, 768])\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i in range(len(outputs[2])):\n",
+    "    print(i, outputs[2][i].shape)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/fine_tune/bert/tutorials/05_output.py b/fine_tune/bert/tutorials/05_output.py
new file mode 100644
index 0000000..1911641
--- /dev/null
+++ b/fine_tune/bert/tutorials/05_output.py
@@ -0,0 +1,25 @@
+from transformers import BertModel, BertTokenizer
+from transformers.models.bert import BertModel
+import torch
+from torch import nn
+
+
+if __name__ == '__main__':
+
+    model_name = 'bert-base-uncased'
+
+    tokenizer = BertTokenizer.from_pretrained(model_name)
+    model = BertModel.from_pretrained(model_name, output_hidden_states=True)
+
+    text = "After stealing money from the bank vault, the bank robber was seen " \
+       "fishing on the Mississippi river bank."
+
+
+    token_inputs = tokenizer(text, return_tensors='pt')
+    with torch.no_grad():
+        outputs = model(**token_inputs)
+
+
+
+
+
-- 
cgit v1.2.3