| -rw-r--r-- | learn_torch/grad/05_torch_variables_grad_inplace_operation.ipynb | 984 |
1 file changed, 984 insertions, 0 deletions
diff --git a/learn_torch/grad/05_torch_variables_grad_inplace_operation.ipynb b/learn_torch/grad/05_torch_variables_grad_inplace_operation.ipynb new file mode 100644 index 0000000..df32ca8 --- /dev/null +++ b/learn_torch/grad/05_torch_variables_grad_inplace_operation.ipynb @@ -0,0 +1,984 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:45:03.778712Z", + "start_time": "2023-02-28T14:45:01.520437Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.10.2\n", + "sys.version_info(major=3, minor=6, micro=8, releaselevel='final', serial=0)\n" + ] + } + ], + "source": [ + "import sys\n", + "import torch\n", + "from torch import nn\n", + "print(torch.__version__)\n", + "print(sys.version_info)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- compute loss\n", + " - forward\n", + "- loss.backward()(或者任意的 objective.backward())\n", + " - backward (compute grad)\n", + "- optimizer.step()\n", + " - x = x - lr*x.grad" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 两种不被允许的 inplace operation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. 对于 requires_grad==True 的 叶子张量 (leaf tensor) 不能使用 inplace operation\n", + " - all `Parameters` are leaf node and requires grad\n", + " - tensor.is_leaf == True\n", + "2. 对于在求梯度阶段需要用到的张量不能使用 inplace operation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 叶子节点(leaf node)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:48:30.078573Z", + "start_time": "2023-02-28T14:48:30.075850Z" + } + }, + "outputs": [], + "source": [ + "w = torch.FloatTensor(10) # w 是个 leaf tensor\n", + "w.requires_grad = True # 将 requires_grad 设置为 True" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:48:32.906797Z", + "start_time": "2023-02-28T14:48:32.891322Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([4.0725e-36, 1.4013e-45, 4.7817e-33, 1.4013e-45, 4.7732e-33, 1.4013e-45,\n", + " 4.7732e-33, 1.4013e-45, 4.7733e-33, 1.4013e-45], requires_grad=True)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:48:54.733133Z", + "start_time": "2023-02-28T14:48:54.729702Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w.is_leaf" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:49:13.151338Z", + "start_time": "2023-02-28T14:49:12.971843Z" + } + }, + "outputs": [ + { + "ename": "RuntimeError", + "evalue": "a leaf Variable that requires grad is being used in an in-place operation.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-5-a7ec7d06d9b7>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# inplace 
operation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mw\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnormal_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m: a leaf Variable that requires grad is being used in an in-place operation." + ] + } + ], + "source": [ + "# inplace operation\n", + "w.normal_()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:49:45.963897Z", + "start_time": "2023-02-28T14:49:45.959168Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w.data.requires_grad" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:50:30.353687Z", + "start_time": "2023-02-28T14:50:30.349282Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([-1.6973, -0.1248, -1.1631, 0.6612, 0.2086, -0.1125, -0.5158, 0.2699,\n", + " 0.2380, 0.6100])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w.data.normal_()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:50:31.778610Z", + "start_time": "2023-02-28T14:50:31.773968Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([-1.6973, -0.1248, -1.1631, 0.6612, 0.2086, -0.1125, -0.5158, 0.2699,\n", + " 0.2380, 0.6100])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w.data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 求梯度阶段(不限于是否是 leaf node/variable/parameters)需要用到的张量" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:51:17.822399Z", + "start_time": "2023-02-28T14:51:17.819569Z" + } + }, + "outputs": [], + "source": [ + "x = torch.FloatTensor([[1., 2.]])\n", + "w1 = torch.FloatTensor([[2.], [1.]])\n", + "w2 = torch.FloatTensor([3.])\n", + "w1.requires_grad = True\n", + "w2.requires_grad = True" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:51:46.091766Z", + "start_time": "2023-02-28T14:51:46.087861Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# x.is_leaf\n", + "# w1.is_leaf\n", + "w2.is_leaf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "x\n", + "w1 -> d\n", + " w2 -> f" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:54:03.854647Z", + "start_time": "2023-02-28T14:54:03.816440Z" + }, + "scrolled": true + }, + "outputs": [ + { + "ename": "RuntimeError", + "evalue": "one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 1]], which is output 0 of torch::autograd::CopySlices, is at version 1; expected version 0 instead. 
Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-17-1dde630bbd78>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0md\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/_tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 306\u001b[0m inputs=inputs)\n\u001b[0;32m--> 307\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mautograd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 308\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mregister_hook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 154\u001b[0m Variable._execution_engine.run_backward(\n\u001b[1;32m 155\u001b[0m \u001b[0mtensors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad_tensors_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 156\u001b[0;31m allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag\n\u001b[0m\u001b[1;32m 157\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mRuntimeError\u001b[0m: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 1]], which is output 0 of torch::autograd::CopySlices, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True)." 
+ ] + } + ], + "source": [ + "d = torch.matmul(x, w1)\n", + "f = torch.matmul(d, w2)\n", + "d[:] = 0\n", + "\n", + "f.backward()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:54:23.808756Z", + "start_time": "2023-02-28T14:54:23.804946Z" + } + }, + "outputs": [], + "source": [ + "d = torch.matmul(x, w1)\n", + "d[:] = 0\n", + "f = torch.matmul(d, w2)\n", + "\n", + "f.backward()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:54:35.746473Z", + "start_time": "2023-02-28T14:54:35.741455Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([0.])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w2.grad" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- 在计算 f 的时候, d 是等于某个值的, f 对于 w2 的导数是和这时候的 d 值相关的\n", + "- 但是计算完 f 之后, d 的值变了, 这就会导致 f.backward() 对于 w2 的导数计算出错误, 为了防止这种错误, pytorch 选择了报错的形式.\n", + "- 造成这个问题的主要原因是因为 在执行 f = torch.matmul(d, w2) 这句的时候, pytorch 的反向求导机制保存了 d 的引用为了之后的反向求导计算." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `.data`与`.detach`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:20:41.990724Z", + "start_time": "2023-02-28T14:20:41.987584Z" + } + }, + "source": [ + "- detach\n", + " - Returns a new Tensor, detached from the current graph.\n", + " - The result will never require gradient.\n", + "- x.data 与 x.detach() 返回的 tensor 有相同的地方, 也有不同的地方,相同点如下\n", + " - 都和 x 共享同一块数据\n", + " - 都和 x 的 计算历史无关\n", + " - requires_grad = False\n", + "- x.data 的修改不会导致报错,但其实计算是有问题的(相当于埋了一个bug);\n", + " - x.detach() 会直接报错(更加梯度安全);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "a = torch.tensor([1, 2, 3.], requires_grad=True)\n", + "\n", + "out = a.sigmoid()\n", + "\n", + "c = out.data\n", + "# c = out.detach()\n", + "\n", + "print(f'a.requires_grad: {a.requires_grad}, out.requires_grad: {out.requires_grad}, c.requires_grad: {c.requires_grad}')\n", + "\n", + "print(out)\n", + "print(c)\n", + "c.zero_()\n", + "\n", + "print(out)\n", + "print(c)\n", + "\n", + "out.sum().backward()\n", + "print(a.grad, a.sigmoid()*(1-a.sigmoid()))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:56:40.192452Z", + "start_time": "2023-02-28T14:56:40.189472Z" + } + }, + "outputs": [], + "source": [ + "a = torch.tensor([1, 2, 3.], requires_grad=True)\n", + "out = a.sigmoid()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:56:42.011455Z", + "start_time": "2023-02-28T14:56:42.007338Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([0.7311, 0.8808, 0.9526], grad_fn=<SigmoidBackward0>)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:56:51.947340Z", + "start_time": "2023-02-28T14:56:51.944252Z" + } + }, + "outputs": [], + "source": [ + "c = out.data" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:57:05.568157Z", + "start_time": "2023-02-28T14:57:05.561002Z" + } + }, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a.requires_grad:True, out.requires_grad: True, c.requires_grad: False\n" + ] + } + ], + "source": [ + "print(f'a.requires_grad:{a.requires_grad}, out.requires_grad: {out.requires_grad}, c.requires_grad: {c.requires_grad}')" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:57:16.654499Z", + "start_time": "2023-02-28T14:57:16.648875Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([0.7311, 0.8808, 0.9526], grad_fn=<SigmoidBackward0>)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:57:17.920288Z", + "start_time": "2023-02-28T14:57:17.915152Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([0.7311, 0.8808, 0.9526])" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:57:34.971511Z", + "start_time": "2023-02-28T14:57:34.965686Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([0., 0., 0.])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c.zero_()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:57:41.077909Z", + "start_time": "2023-02-28T14:57:41.073708Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([0., 0., 0.], grad_fn=<SigmoidBackward0>)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:57:50.771024Z", + "start_time": "2023-02-28T14:57:50.767388Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out.requires_grad" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:57:56.119624Z", + "start_time": "2023-02-28T14:57:56.115622Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c.requires_grad" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:58:20.147508Z", + "start_time": "2023-02-28T14:58:20.116942Z" + }, + "collapsed": true + }, + "outputs": [ + { + "ename": "RuntimeError", + "evalue": "Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). 
Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-31-e67dddba67b2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# 应该报错,而未报错\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/_tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 306\u001b[0m inputs=inputs)\n\u001b[0;32m--> 307\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mautograd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 308\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mregister_hook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 154\u001b[0m Variable._execution_engine.run_backward(\n\u001b[1;32m 155\u001b[0m \u001b[0mtensors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad_tensors_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 156\u001b[0;31m allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag\n\u001b[0m\u001b[1;32m 157\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mRuntimeError\u001b[0m: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward." 
+ ] + } + ], + "source": [ + "# 应该报错,而未报错\n", + "out.sum().backward()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:59:25.255115Z", + "start_time": "2023-02-28T14:59:25.252229Z" + } + }, + "outputs": [], + "source": [ + "a = torch.tensor([1, 2, 3.], requires_grad=True)\n", + "\n", + "out = a.sigmoid()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:59:35.652693Z", + "start_time": "2023-02-28T14:59:35.650328Z" + } + }, + "outputs": [], + "source": [ + "out.sum().backward()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:59:56.719336Z", + "start_time": "2023-02-28T14:59:56.714858Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([0.1966, 0.1050, 0.0452]) tensor([0.1966, 0.1050, 0.0452], grad_fn=<MulBackward0>)\n" + ] + } + ], + "source": [ + "print(a.grad, a.sigmoid()*(1-a.sigmoid()))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:58:59.767314Z", + "start_time": "2023-02-28T14:58:59.757237Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a.requires_grad: True, out.requires_grad: True, c.requires_grad: False\n", + "tensor([0.7311, 0.8808, 0.9526], grad_fn=<SigmoidBackward0>)\n", + "tensor([0.7311, 0.8808, 0.9526])\n", + "tensor([0., 0., 0.], grad_fn=<SigmoidBackward0>)\n", + "tensor([0., 0., 0.])\n", + "tensor([0., 0., 0.]) tensor([0.1966, 0.1050, 0.0452], grad_fn=<MulBackward0>)\n" + ] + } + ], + "source": [ + "a = torch.tensor([1, 2, 3.], requires_grad=True)\n", + "\n", + "out = a.sigmoid()\n", + "\n", + "c = out.data\n", + "# c = out.detach()\n", + "\n", + "print(f'a.requires_grad: {a.requires_grad}, out.requires_grad: {out.requires_grad}, c.requires_grad: {c.requires_grad}')\n", + "\n", + "print(out)\n", + "print(c)\n", + "c.zero_()\n", + "\n", + "print(out)\n", + "print(c)\n", + "\n", + "out.sum().backward()\n", + "print(a.grad, a.sigmoid()*(1-a.sigmoid()))" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T15:00:24.272031Z", + "start_time": "2023-02-28T15:00:24.236261Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a.requires_grad: True, out.requires_grad: True, c.requires_grad: False\n", + "tensor([0.7311, 0.8808, 0.9526], grad_fn=<SigmoidBackward0>)\n", + "tensor([0.7311, 0.8808, 0.9526])\n", + "tensor([0., 0., 0.], grad_fn=<SigmoidBackward0>)\n", + "tensor([0., 0., 0.])\n" + ] + }, + { + "ename": "RuntimeError", + "evalue": "one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [3]], which is output 0 of SigmoidBackward0, is at version 1; expected version 0 instead. 
Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-37-6545a457b224>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgrad\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msigmoid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msigmoid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/_tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 306\u001b[0m inputs=inputs)\n\u001b[0;32m--> 307\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mautograd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 308\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mregister_hook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 154\u001b[0m Variable._execution_engine.run_backward(\n\u001b[1;32m 155\u001b[0m \u001b[0mtensors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad_tensors_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 156\u001b[0;31m allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag\n\u001b[0m\u001b[1;32m 157\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mRuntimeError\u001b[0m: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [3]], which is output 0 of SigmoidBackward0, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True)." + ] + } + ], + "source": [ + "a = torch.tensor([1, 2, 3.], requires_grad=True)\n", + "\n", + "out = a.sigmoid()\n", + "\n", + "# c = out.data\n", + "c = out.detach()\n", + "\n", + "print(f'a.requires_grad: {a.requires_grad}, out.requires_grad: {out.requires_grad}, c.requires_grad: {c.requires_grad}')\n", + "\n", + "print(out)\n", + "print(c)\n", + "c.zero_()\n", + "\n", + "print(out)\n", + "print(c)\n", + "\n", + "out.sum().backward()\n", + "print(a.grad, a.sigmoid()*(1-a.sigmoid()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## embedding" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n, d, m = 3, 5, 7\n", + "# embedding = nn.Embedding(n, d, max_norm=True)\n", + "embedding = nn.Embedding(n, d, max_norm=1)\n", + "W = torch.randn((m, d), requires_grad=True)\n", + "idx = torch.tensor([1, 2])\n", + "a = embedding.weight.clone() @ W.t() # weight must be cloned for this to be differentiable\n", + "b = embedding(idx) @ W.t() # modifies weight in-place\n", + "out = (a.unsqueeze(0) + b.unsqueeze(1))\n", + "loss = out.sigmoid().prod()\n", + "loss.backward()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "e\n", + "f(w) -> a\n", + " out => loss\n", + "e\n", + "g(w) -> b\n" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T15:02:18.907853Z", + "start_time": "2023-02-28T15:02:18.831171Z" + }, + "scrolled": true + }, + "outputs": [ + { + "ename": "RuntimeError", + "evalue": "one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [3, 5]] is at version 2; expected version 1 instead. 
Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-38-d6c0a82b2426>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munsqueeze\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munsqueeze\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msigmoid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprod\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/_tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 306\u001b[0m inputs=inputs)\n\u001b[0;32m--> 307\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mautograd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 308\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mregister_hook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 154\u001b[0m Variable._execution_engine.run_backward(\n\u001b[1;32m 155\u001b[0m \u001b[0mtensors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad_tensors_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 156\u001b[0;31m allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag\n\u001b[0m\u001b[1;32m 157\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mRuntimeError\u001b[0m: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [3, 5]] is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True)." + ] + } + ], + "source": [ + "n, d, m = 3, 5, 7\n", + "# embedding = nn.Embedding(n, d, max_norm=True)\n", + "embedding = nn.Embedding(n, d, max_norm=1)\n", + "W = torch.randn((m, d), requires_grad=True)\n", + "idx = torch.tensor([1, 2])\n", + "a = embedding.weight @ W.t() # weight must be cloned for this to be differentiable\n", + "b = embedding(idx) @ W.t() # modifies weight in-place\n", + "out = (a.unsqueeze(0) + b.unsqueeze(1))\n", + "loss = out.sigmoid().prod()\n", + "loss.backward()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T15:04:55.594661Z", + "start_time": "2023-02-28T15:04:55.589654Z" + } + }, + "outputs": [], + "source": [ + "n, d, m = 3, 5, 7\n", + "# embedding = nn.Embedding(n, d, max_norm=True)\n", + "embedding = nn.Embedding(n, d, max_norm=1)\n", + "W = torch.randn((m, d), requires_grad=True)\n", + "idx = torch.tensor([1, 2])\n", + "b = embedding(idx) @ W.t() # modifies weight in-place\n", + "a = embedding.weight @ W.t() # weight must be cloned for this to be differentiable\n", + "out = (a.unsqueeze(0) + b.unsqueeze(1))\n", + "loss = out.sigmoid().prod()\n", + "loss.backward()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because W in the line computing a requires gradients, we must save embedding.weight to compute those gradients in the backward pass. However, in the line computing b, executing embedding(idx) will scale embedding.weight by max_norm - in place. So, without cloning it in line a, embedding.weight will be modified when line b is executed - changing what was saved for the backward pass to update W. Hence the requirement to clone embedding.weight - to save it before it gets scaled in line b.\n", + "If you don't use embedding.weight outside of the normal forward pass, you don't need to worry about all this.\n", + "If you get an error, post it (and your code)." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} |
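A note on the first error in the notebook above ("a leaf Variable that requires grad is being used in an in-place operation"): the notebook sidesteps it through `w.data`, but the usual way to update a leaf tensor in place is to do it under `torch.no_grad()`, which is effectively what `optimizer.step()` does for the `x = x - lr*x.grad` update mentioned at the top. A minimal sketch, not part of the committed notebook:

```python
import torch

w = torch.randn(10, requires_grad=True)    # leaf tensor, like an nn.Parameter
loss = (w ** 2).sum()
loss.backward()

# w -= 0.1 * w.grad        # RuntimeError: a leaf Variable that requires grad ...
with torch.no_grad():       # suspend gradient tracking, as optimizer.step() does
    w -= 0.1 * w.grad       # the in-place update of the leaf is now allowed
w.grad.zero_()              # clear the accumulated gradient before the next step
```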
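The "is at version 1; expected version 0" errors come from autograd's per-tensor version counter: when an operation saves a tensor for the backward pass it records that tensor's version, every later in-place write bumps the counter, and `backward()` refuses to use the stale saved value. A minimal sketch assuming the internal `_version` attribute (undocumented, but present in recent PyTorch releases):

```python
import torch

x = torch.ones(1, 2)
w1 = torch.randn(2, 1, requires_grad=True)
w2 = torch.randn(1, requires_grad=True)

d = torch.matmul(x, w1)     # d will be saved by autograd for the matmul with w2
f = torch.matmul(d, w2)
print(d._version)           # 0: the version recorded when f was built
d[:] = 0                    # in-place write bumps the version counter
print(d._version)           # 1: no longer matches what backward() expects
# f.backward()              # would raise: "... is at version 1; expected version 0"
```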
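The hint attached to those error messages, `torch.autograd.set_detect_anomaly(True)`, is worth trying when the offending in-place operation is less obvious than in these toy cells: with anomaly detection enabled, PyTorch additionally reports the forward-pass traceback of the operation whose saved tensor was modified. A short sketch under that assumption:

```python
import torch

torch.autograd.set_detect_anomaly(True)    # enable anomaly detection globally

x = torch.ones(1, 2)
w1 = torch.randn(2, 1, requires_grad=True)
w2 = torch.randn(1, requires_grad=True)

d = torch.matmul(x, w1)
f = torch.matmul(d, w2)
d[:] = 0                                   # the in-place write that breaks backward
try:
    f.backward()
except RuntimeError as err:
    print(err)                             # a warning alongside it points at the forward op
finally:
    torch.autograd.set_detect_anomaly(False)
```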
