Diffstat (limited to 'learn_torch')
-rw-r--r-- learn_torch/grad/05_torch_variables_grad_inplace_operation.ipynb | 984
1 files changed, 984 insertions, 0 deletions
diff --git a/learn_torch/grad/05_torch_variables_grad_inplace_operation.ipynb b/learn_torch/grad/05_torch_variables_grad_inplace_operation.ipynb
new file mode 100644
index 0000000..df32ca8
--- /dev/null
+++ b/learn_torch/grad/05_torch_variables_grad_inplace_operation.ipynb
@@ -0,0 +1,984 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:45:03.778712Z",
+ "start_time": "2023-02-28T14:45:01.520437Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1.10.2\n",
+ "sys.version_info(major=3, minor=6, micro=8, releaselevel='final', serial=0)\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sys\n",
+ "import torch\n",
+ "from torch import nn\n",
+ "print(torch.__version__)\n",
+ "print(sys.version_info)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "- compute loss\n",
+ " - forward\n",
+ "- loss.backward()(或者任意的 objective.backward())\n",
+ " - backward (compute grad)\n",
+ "- optimizer.step()\n",
+ " - x = x - lr*x.grad"
+ ]
+ },
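+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A minimal sketch of these three steps on a toy quadratic loss (not part of the original experiments): the parameter update is itself an in-place operation, but it runs under `torch.no_grad()`, so autograd neither records it nor complains about it."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# sketch: forward, backward, and a manual SGD step (toy example)\n",
+ "x = torch.randn(3, requires_grad=True)\n",
+ "lr = 0.1\n",
+ "loss = (x ** 2).sum()  # forward: compute the loss\n",
+ "loss.backward()        # backward: populate x.grad\n",
+ "with torch.no_grad():  # the in-place update is hidden from autograd\n",
+ "    x -= lr * x.grad\n",
+ "x.grad.zero_()         # reset the gradient for the next iteration\n",
+ "x"
+ ]
+ },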
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 两种不被允许的 inplace operation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "1. 对于 requires_grad==True 的 叶子张量 (leaf tensor) 不能使用 inplace operation\n",
+ " - all `Parameters` are leaf node and requires grad\n",
+ " - tensor.is_leaf == True\n",
+ "2. 对于在求梯度阶段需要用到的张量不能使用 inplace operation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 叶子节点(leaf node)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:48:30.078573Z",
+ "start_time": "2023-02-28T14:48:30.075850Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "w = torch.FloatTensor(10) # w 是个 leaf tensor\n",
+ "w.requires_grad = True # 将 requires_grad 设置为 True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:48:32.906797Z",
+ "start_time": "2023-02-28T14:48:32.891322Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([4.0725e-36, 1.4013e-45, 4.7817e-33, 1.4013e-45, 4.7732e-33, 1.4013e-45,\n",
+ " 4.7732e-33, 1.4013e-45, 4.7733e-33, 1.4013e-45], requires_grad=True)"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "w"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:48:54.733133Z",
+ "start_time": "2023-02-28T14:48:54.729702Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "w.is_leaf"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:49:13.151338Z",
+ "start_time": "2023-02-28T14:49:12.971843Z"
+ }
+ },
+ "outputs": [
+ {
+ "ename": "RuntimeError",
+ "evalue": "a leaf Variable that requires grad is being used in an in-place operation.",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-5-a7ec7d06d9b7>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# inplace operation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mw\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnormal_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;31mRuntimeError\u001b[0m: a leaf Variable that requires grad is being used in an in-place operation."
+ ]
+ }
+ ],
+ "source": [
+ "# inplace operation\n",
+ "w.normal_()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:49:45.963897Z",
+ "start_time": "2023-02-28T14:49:45.959168Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "w.data.requires_grad"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:50:30.353687Z",
+ "start_time": "2023-02-28T14:50:30.349282Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([-1.6973, -0.1248, -1.1631, 0.6612, 0.2086, -0.1125, -0.5158, 0.2699,\n",
+ " 0.2380, 0.6100])"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "w.data.normal_()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:50:31.778610Z",
+ "start_time": "2023-02-28T14:50:31.773968Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([-1.6973, -0.1248, -1.1631, 0.6612, 0.2086, -0.1125, -0.5158, 0.2699,\n",
+ " 0.2380, 0.6100])"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "w.data"
+ ]
+ },
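+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Going through `.data` works, but a sketch of the alternative: wrapping the same in-place initialization in `torch.no_grad()` also bypasses the leaf-tensor check and makes the intent (no gradient tracking here) explicit."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# the same in-place initialization, but under torch.no_grad()\n",
+ "with torch.no_grad():\n",
+ "    w.normal_()\n",
+ "w"
+ ]
+ },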
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 求梯度阶段(不限于是否是 leaf node/variable/parameters)需要用到的张量"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:51:17.822399Z",
+ "start_time": "2023-02-28T14:51:17.819569Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "x = torch.FloatTensor([[1., 2.]])\n",
+ "w1 = torch.FloatTensor([[2.], [1.]])\n",
+ "w2 = torch.FloatTensor([3.])\n",
+ "w1.requires_grad = True\n",
+ "w2.requires_grad = True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:51:46.091766Z",
+ "start_time": "2023-02-28T14:51:46.087861Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# x.is_leaf\n",
+ "# w1.is_leaf\n",
+ "w2.is_leaf"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "x\n",
+ "w1 -> d\n",
+ " w2 -> f"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:54:03.854647Z",
+ "start_time": "2023-02-28T14:54:03.816440Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "ename": "RuntimeError",
+ "evalue": "one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 1]], which is output 0 of torch::autograd::CopySlices, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-17-1dde630bbd78>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0md\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/_tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 306\u001b[0m inputs=inputs)\n\u001b[0;32m--> 307\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mautograd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 308\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mregister_hook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 154\u001b[0m Variable._execution_engine.run_backward(\n\u001b[1;32m 155\u001b[0m \u001b[0mtensors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad_tensors_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 156\u001b[0;31m allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag\n\u001b[0m\u001b[1;32m 157\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mRuntimeError\u001b[0m: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 1]], which is output 0 of torch::autograd::CopySlices, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True)."
+ ]
+ }
+ ],
+ "source": [
+ "d = torch.matmul(x, w1)\n",
+ "f = torch.matmul(d, w2)\n",
+ "d[:] = 0\n",
+ "\n",
+ "f.backward()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:54:23.808756Z",
+ "start_time": "2023-02-28T14:54:23.804946Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "d = torch.matmul(x, w1)\n",
+ "d[:] = 0\n",
+ "f = torch.matmul(d, w2)\n",
+ "\n",
+ "f.backward()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:54:35.746473Z",
+ "start_time": "2023-02-28T14:54:35.741455Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([0.])"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "w2.grad"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "- 在计算 f 的时候, d 是等于某个值的, f 对于 w2 的导数是和这时候的 d 值相关的\n",
+ "- 但是计算完 f 之后, d 的值变了, 这就会导致 f.backward() 对于 w2 的导数计算出错误, 为了防止这种错误, pytorch 选择了报错的形式.\n",
+ "- 造成这个问题的主要原因是因为 在执行 f = torch.matmul(d, w2) 这句的时候, pytorch 的反向求导机制保存了 d 的引用为了之后的反向求导计算."
+ ]
+ },
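+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The error hint above suggests enabling anomaly detection; a sketch of doing so for the failing case (re-using x, w1, w2 from above). The backward still fails, but the traceback additionally points at the forward operation whose saved tensor was modified in place."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# sketch: locate the offending in-place operation with anomaly detection\n",
+ "with torch.autograd.detect_anomaly():\n",
+ "    d = torch.matmul(x, w1)\n",
+ "    f = torch.matmul(d, w2)\n",
+ "    d[:] = 0\n",
+ "    f.backward()"
+ ]
+ },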
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## `.data`与`.detach`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:20:41.990724Z",
+ "start_time": "2023-02-28T14:20:41.987584Z"
+ }
+ },
+ "source": [
+ "- detach\n",
+ " - Returns a new Tensor, detached from the current graph.\n",
+ " - The result will never require gradient.\n",
+ "- x.data 与 x.detach() 返回的 tensor 有相同的地方, 也有不同的地方,相同点如下\n",
+ " - 都和 x 共享同一块数据\n",
+ " - 都和 x 的 计算历史无关\n",
+ " - requires_grad = False\n",
+ "- x.data 的修改不会导致报错,但其实计算是有问题的(相当于埋了一个bug);\n",
+ " - x.detach() 会直接报错(更加梯度安全);"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "a = torch.tensor([1, 2, 3.], requires_grad=True)\n",
+ "\n",
+ "out = a.sigmoid()\n",
+ "\n",
+ "c = out.data\n",
+ "# c = out.detach()\n",
+ "\n",
+ "print(f'a.requires_grad: {a.requires_grad}, out.requires_grad: {out.requires_grad}, c.requires_grad: {c.requires_grad}')\n",
+ "\n",
+ "print(out)\n",
+ "print(c)\n",
+ "c.zero_()\n",
+ "\n",
+ "print(out)\n",
+ "print(c)\n",
+ "\n",
+ "out.sum().backward()\n",
+ "print(a.grad, a.sigmoid()*(1-a.sigmoid()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:56:40.192452Z",
+ "start_time": "2023-02-28T14:56:40.189472Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "a = torch.tensor([1, 2, 3.], requires_grad=True)\n",
+ "out = a.sigmoid()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:56:42.011455Z",
+ "start_time": "2023-02-28T14:56:42.007338Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([0.7311, 0.8808, 0.9526], grad_fn=<SigmoidBackward0>)"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "out"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:56:51.947340Z",
+ "start_time": "2023-02-28T14:56:51.944252Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "c = out.data"
+ ]
+ },
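+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick check (a sketch, not in the original run) that `out.data` really shares the same underlying storage as `out`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# c is a view onto the same memory as out\n",
+ "out.data_ptr() == c.data_ptr()"
+ ]
+ },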
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:57:05.568157Z",
+ "start_time": "2023-02-28T14:57:05.561002Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "a.requires_grad:True, out.requires_grad: True, c.requires_grad: False\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f'a.requires_grad:{a.requires_grad}, out.requires_grad: {out.requires_grad}, c.requires_grad: {c.requires_grad}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:57:16.654499Z",
+ "start_time": "2023-02-28T14:57:16.648875Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([0.7311, 0.8808, 0.9526], grad_fn=<SigmoidBackward0>)"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "out"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:57:17.920288Z",
+ "start_time": "2023-02-28T14:57:17.915152Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([0.7311, 0.8808, 0.9526])"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "c"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:57:34.971511Z",
+ "start_time": "2023-02-28T14:57:34.965686Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([0., 0., 0.])"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "c.zero_()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:57:41.077909Z",
+ "start_time": "2023-02-28T14:57:41.073708Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([0., 0., 0.], grad_fn=<SigmoidBackward0>)"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "out"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:57:50.771024Z",
+ "start_time": "2023-02-28T14:57:50.767388Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "out.requires_grad"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:57:56.119624Z",
+ "start_time": "2023-02-28T14:57:56.115622Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "c.requires_grad"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:58:20.147508Z",
+ "start_time": "2023-02-28T14:58:20.116942Z"
+ },
+ "collapsed": true
+ },
+ "outputs": [
+ {
+ "ename": "RuntimeError",
+ "evalue": "Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-31-e67dddba67b2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# 应该报错,而未报错\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/_tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 306\u001b[0m inputs=inputs)\n\u001b[0;32m--> 307\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mautograd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 308\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mregister_hook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 154\u001b[0m Variable._execution_engine.run_backward(\n\u001b[1;32m 155\u001b[0m \u001b[0mtensors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad_tensors_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 156\u001b[0;31m allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag\n\u001b[0m\u001b[1;32m 157\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mRuntimeError\u001b[0m: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward."
+ ]
+ }
+ ],
+ "source": [
+ "# 应该报错,而未报错\n",
+ "out.sum().backward()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:59:25.255115Z",
+ "start_time": "2023-02-28T14:59:25.252229Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "a = torch.tensor([1, 2, 3.], requires_grad=True)\n",
+ "\n",
+ "out = a.sigmoid()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:59:35.652693Z",
+ "start_time": "2023-02-28T14:59:35.650328Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "out.sum().backward()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:59:56.719336Z",
+ "start_time": "2023-02-28T14:59:56.714858Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "tensor([0.1966, 0.1050, 0.0452]) tensor([0.1966, 0.1050, 0.0452], grad_fn=<MulBackward0>)\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(a.grad, a.sigmoid()*(1-a.sigmoid()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T14:58:59.767314Z",
+ "start_time": "2023-02-28T14:58:59.757237Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "a.requires_grad: True, out.requires_grad: True, c.requires_grad: False\n",
+ "tensor([0.7311, 0.8808, 0.9526], grad_fn=<SigmoidBackward0>)\n",
+ "tensor([0.7311, 0.8808, 0.9526])\n",
+ "tensor([0., 0., 0.], grad_fn=<SigmoidBackward0>)\n",
+ "tensor([0., 0., 0.])\n",
+ "tensor([0., 0., 0.]) tensor([0.1966, 0.1050, 0.0452], grad_fn=<MulBackward0>)\n"
+ ]
+ }
+ ],
+ "source": [
+ "a = torch.tensor([1, 2, 3.], requires_grad=True)\n",
+ "\n",
+ "out = a.sigmoid()\n",
+ "\n",
+ "c = out.data\n",
+ "# c = out.detach()\n",
+ "\n",
+ "print(f'a.requires_grad: {a.requires_grad}, out.requires_grad: {out.requires_grad}, c.requires_grad: {c.requires_grad}')\n",
+ "\n",
+ "print(out)\n",
+ "print(c)\n",
+ "c.zero_()\n",
+ "\n",
+ "print(out)\n",
+ "print(c)\n",
+ "\n",
+ "out.sum().backward()\n",
+ "print(a.grad, a.sigmoid()*(1-a.sigmoid()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T15:00:24.272031Z",
+ "start_time": "2023-02-28T15:00:24.236261Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "a.requires_grad: True, out.requires_grad: True, c.requires_grad: False\n",
+ "tensor([0.7311, 0.8808, 0.9526], grad_fn=<SigmoidBackward0>)\n",
+ "tensor([0.7311, 0.8808, 0.9526])\n",
+ "tensor([0., 0., 0.], grad_fn=<SigmoidBackward0>)\n",
+ "tensor([0., 0., 0.])\n"
+ ]
+ },
+ {
+ "ename": "RuntimeError",
+ "evalue": "one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [3]], which is output 0 of SigmoidBackward0, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-37-6545a457b224>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgrad\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msigmoid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msigmoid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/_tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 306\u001b[0m inputs=inputs)\n\u001b[0;32m--> 307\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mautograd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 308\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mregister_hook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 154\u001b[0m Variable._execution_engine.run_backward(\n\u001b[1;32m 155\u001b[0m \u001b[0mtensors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad_tensors_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 156\u001b[0;31m allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag\n\u001b[0m\u001b[1;32m 157\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mRuntimeError\u001b[0m: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [3]], which is output 0 of SigmoidBackward0, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True)."
+ ]
+ }
+ ],
+ "source": [
+ "a = torch.tensor([1, 2, 3.], requires_grad=True)\n",
+ "\n",
+ "out = a.sigmoid()\n",
+ "\n",
+ "# c = out.data\n",
+ "c = out.detach()\n",
+ "\n",
+ "print(f'a.requires_grad: {a.requires_grad}, out.requires_grad: {out.requires_grad}, c.requires_grad: {c.requires_grad}')\n",
+ "\n",
+ "print(out)\n",
+ "print(c)\n",
+ "c.zero_()\n",
+ "\n",
+ "print(out)\n",
+ "print(c)\n",
+ "\n",
+ "out.sum().backward()\n",
+ "print(a.grad, a.sigmoid()*(1-a.sigmoid()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## embedding"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "n, d, m = 3, 5, 7\n",
+ "# embedding = nn.Embedding(n, d, max_norm=True)\n",
+ "embedding = nn.Embedding(n, d, max_norm=1)\n",
+ "W = torch.randn((m, d), requires_grad=True)\n",
+ "idx = torch.tensor([1, 2])\n",
+ "a = embedding.weight.clone() @ W.t() # weight must be cloned for this to be differentiable\n",
+ "b = embedding(idx) @ W.t() # modifies weight in-place\n",
+ "out = (a.unsqueeze(0) + b.unsqueeze(1))\n",
+ "loss = out.sigmoid().prod()\n",
+ "loss.backward()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "e\n",
+ "f(w) -> a\n",
+ " out => loss\n",
+ "e\n",
+ "g(w) -> b\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T15:02:18.907853Z",
+ "start_time": "2023-02-28T15:02:18.831171Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "ename": "RuntimeError",
+ "evalue": "one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [3, 5]] is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-38-d6c0a82b2426>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munsqueeze\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munsqueeze\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msigmoid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprod\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/_tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 306\u001b[0m inputs=inputs)\n\u001b[0;32m--> 307\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mautograd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 308\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mregister_hook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 154\u001b[0m Variable._execution_engine.run_backward(\n\u001b[1;32m 155\u001b[0m \u001b[0mtensors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad_tensors_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 156\u001b[0;31m allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag\n\u001b[0m\u001b[1;32m 157\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mRuntimeError\u001b[0m: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [3, 5]] is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True)."
+ ]
+ }
+ ],
+ "source": [
+ "n, d, m = 3, 5, 7\n",
+ "# embedding = nn.Embedding(n, d, max_norm=True)\n",
+ "embedding = nn.Embedding(n, d, max_norm=1)\n",
+ "W = torch.randn((m, d), requires_grad=True)\n",
+ "idx = torch.tensor([1, 2])\n",
+ "a = embedding.weight @ W.t() # weight must be cloned for this to be differentiable\n",
+ "b = embedding(idx) @ W.t() # modifies weight in-place\n",
+ "out = (a.unsqueeze(0) + b.unsqueeze(1))\n",
+ "loss = out.sigmoid().prod()\n",
+ "loss.backward()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-28T15:04:55.594661Z",
+ "start_time": "2023-02-28T15:04:55.589654Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "n, d, m = 3, 5, 7\n",
+ "# embedding = nn.Embedding(n, d, max_norm=True)\n",
+ "embedding = nn.Embedding(n, d, max_norm=1)\n",
+ "W = torch.randn((m, d), requires_grad=True)\n",
+ "idx = torch.tensor([1, 2])\n",
+ "b = embedding(idx) @ W.t() # modifies weight in-place\n",
+ "a = embedding.weight @ W.t() # weight must be cloned for this to be differentiable\n",
+ "out = (a.unsqueeze(0) + b.unsqueeze(1))\n",
+ "loss = out.sigmoid().prod()\n",
+ "loss.backward()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Because W in the line computing a requires gradients, we must save embedding.weight to compute those gradients in the backward pass. However, in the line computing b, executing embedding(idx) will scale embedding.weight by max_norm - in place. So, without cloning it in line a, embedding.weight will be modified when line b is executed - changing what was saved for the backward pass to update W. Hence the requirement to clone embedding.weight - to save it before it gets scaled in line b.\n",
+ "If you don't use embedding.weight outside of the normal forward pass, you don't need to worry about all this.\n",
+ "If you get an error, post it (and your code)."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": true
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}