| -rw-r--r-- | learn_torch/grad/05_torch_variables_grad_inplace_operation.ipynb | 984 |
1 file changed, 984 insertions, 0 deletions
diff --git a/learn_torch/grad/05_torch_variables_grad_inplace_operation.ipynb b/learn_torch/grad/05_torch_variables_grad_inplace_operation.ipynb new file mode 100644 index 0000000..df32ca8 --- /dev/null +++ b/learn_torch/grad/05_torch_variables_grad_inplace_operation.ipynb @@ -0,0 +1,984 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:45:03.778712Z", + "start_time": "2023-02-28T14:45:01.520437Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.10.2\n", + "sys.version_info(major=3, minor=6, micro=8, releaselevel='final', serial=0)\n" + ] + } + ], + "source": [ + "import sys\n", + "import torch\n", + "from torch import nn\n", + "print(torch.__version__)\n", + "print(sys.version_info)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- compute loss\n", + " - forward\n", + "- loss.backward()(或者任意的 objective.backward())\n", + " - backward (compute grad)\n", + "- optimizer.step()\n", + " - x = x - lr*x.grad" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 两种不被允许的 inplace operation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. 对于 requires_grad==True 的 叶子张量 (leaf tensor) 不能使用 inplace operation\n", + " - all `Parameters` are leaf node and requires grad\n", + " - tensor.is_leaf == True\n", + "2. 对于在求梯度阶段需要用到的张量不能使用 inplace operation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 叶子节点(leaf node)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:48:30.078573Z", + "start_time": "2023-02-28T14:48:30.075850Z" + } + }, + "outputs": [], + "source": [ + "w = torch.FloatTensor(10) # w 是个 leaf tensor\n", + "w.requires_grad = True # 将 requires_grad 设置为 True" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:48:32.906797Z", + "start_time": "2023-02-28T14:48:32.891322Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([4.0725e-36, 1.4013e-45, 4.7817e-33, 1.4013e-45, 4.7732e-33, 1.4013e-45,\n", + " 4.7732e-33, 1.4013e-45, 4.7733e-33, 1.4013e-45], requires_grad=True)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:48:54.733133Z", + "start_time": "2023-02-28T14:48:54.729702Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w.is_leaf" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:49:13.151338Z", + "start_time": "2023-02-28T14:49:12.971843Z" + } + }, + "outputs": [ + { + "ename": "RuntimeError", + "evalue": "a leaf Variable that requires grad is being used in an in-place operation.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-5-a7ec7d06d9b7>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# inplace 
operation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mw\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnormal_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m: a leaf Variable that requires grad is being used in an in-place operation." + ] + } + ], + "source": [ + "# inplace operation\n", + "w.normal_()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:49:45.963897Z", + "start_time": "2023-02-28T14:49:45.959168Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w.data.requires_grad" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:50:30.353687Z", + "start_time": "2023-02-28T14:50:30.349282Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([-1.6973, -0.1248, -1.1631, 0.6612, 0.2086, -0.1125, -0.5158, 0.2699,\n", + " 0.2380, 0.6100])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w.data.normal_()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:50:31.778610Z", + "start_time": "2023-02-28T14:50:31.773968Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([-1.6973, -0.1248, -1.1631, 0.6612, 0.2086, -0.1125, -0.5158, 0.2699,\n", + " 0.2380, 0.6100])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w.data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 求梯度阶段(不限于是否是 leaf node/variable/parameters)需要用到的张量" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:51:17.822399Z", + "start_time": "2023-02-28T14:51:17.819569Z" + } + }, + "outputs": [], + "source": [ + "x = torch.FloatTensor([[1., 2.]])\n", + "w1 = torch.FloatTensor([[2.], [1.]])\n", + "w2 = torch.FloatTensor([3.])\n", + "w1.requires_grad = True\n", + "w2.requires_grad = True" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:51:46.091766Z", + "start_time": "2023-02-28T14:51:46.087861Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# x.is_leaf\n", + "# w1.is_leaf\n", + "w2.is_leaf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "x\n", + "w1 -> d\n", + " w2 -> f" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:54:03.854647Z", + "start_time": "2023-02-28T14:54:03.816440Z" + }, + "scrolled": true + }, + "outputs": [ + { + "ename": "RuntimeError", + "evalue": "one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 1]], which is output 0 of torch::autograd::CopySlices, is at version 1; expected version 0 instead. 
Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-17-1dde630bbd78>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0md\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/_tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 306\u001b[0m inputs=inputs)\n\u001b[0;32m--> 307\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mautograd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 308\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mregister_hook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 154\u001b[0m Variable._execution_engine.run_backward(\n\u001b[1;32m 155\u001b[0m \u001b[0mtensors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad_tensors_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 156\u001b[0;31m allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag\n\u001b[0m\u001b[1;32m 157\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mRuntimeError\u001b[0m: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 1]], which is output 0 of torch::autograd::CopySlices, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True)." 
+ ] + } + ], + "source": [ + "d = torch.matmul(x, w1)\n", + "f = torch.matmul(d, w2)\n", + "d[:] = 0\n", + "\n", + "f.backward()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:54:23.808756Z", + "start_time": "2023-02-28T14:54:23.804946Z" + } + }, + "outputs": [], + "source": [ + "d = torch.matmul(x, w1)\n", + "d[:] = 0\n", + "f = torch.matmul(d, w2)\n", + "\n", + "f.backward()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:54:35.746473Z", + "start_time": "2023-02-28T14:54:35.741455Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([0.])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w2.grad" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- 在计算 f 的时候, d 是等于某个值的, f 对于 w2 的导数是和这时候的 d 值相关的\n", + "- 但是计算完 f 之后, d 的值变了, 这就会导致 f.backward() 对于 w2 的导数计算出错误, 为了防止这种错误, pytorch 选择了报错的形式.\n", + "- 造成这个问题的主要原因是因为 在执行 f = torch.matmul(d, w2) 这句的时候, pytorch 的反向求导机制保存了 d 的引用为了之后的反向求导计算." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `.data`与`.detach`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:20:41.990724Z", + "start_time": "2023-02-28T14:20:41.987584Z" + } + }, + "source": [ + "- detach\n", + " - Returns a new Tensor, detached from the current graph.\n", + " - The result will never require gradient.\n", + "- x.data 与 x.detach() 返回的 tensor 有相同的地方, 也有不同的地方,相同点如下\n", + " - 都和 x 共享同一块数据\n", + " - 都和 x 的 计算历史无关\n", + " - requires_grad = False\n", + "- x.data 的修改不会导致报错,但其实计算是有问题的(相当于埋了一个bug);\n", + " - x.detach() 会直接报错(更加梯度安全);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "a = torch.tensor([1, 2, 3.], requires_grad=True)\n", + "\n", + "out = a.sigmoid()\n", + "\n", + "c = out.data\n", + "# c = out.detach()\n", + "\n", + "print(f'a.requires_grad: {a.requires_grad}, out.requires_grad: {out.requires_grad}, c.requires_grad: {c.requires_grad}')\n", + "\n", + "print(out)\n", + "print(c)\n", + "c.zero_()\n", + "\n", + "print(out)\n", + "print(c)\n", + "\n", + "out.sum().backward()\n", + "print(a.grad, a.sigmoid()*(1-a.sigmoid()))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:56:40.192452Z", + "start_time": "2023-02-28T14:56:40.189472Z" + } + }, + "outputs": [], + "source": [ + "a = torch.tensor([1, 2, 3.], requires_grad=True)\n", + "out = a.sigmoid()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:56:42.011455Z", + "start_time": "2023-02-28T14:56:42.007338Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([0.7311, 0.8808, 0.9526], grad_fn=<SigmoidBackward0>)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:56:51.947340Z", + "start_time": "2023-02-28T14:56:51.944252Z" + } + }, + "outputs": [], + "source": [ + "c = out.data" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:57:05.568157Z", + "start_time": "2023-02-28T14:57:05.561002Z" + } + }, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a.requires_grad:True, out.requires_grad: True, c.requires_grad: False\n" + ] + } + ], + "source": [ + "print(f'a.requires_grad:{a.requires_grad}, out.requires_grad: {out.requires_grad}, c.requires_grad: {c.requires_grad}')" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:57:16.654499Z", + "start_time": "2023-02-28T14:57:16.648875Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([0.7311, 0.8808, 0.9526], grad_fn=<SigmoidBackward0>)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:57:17.920288Z", + "start_time": "2023-02-28T14:57:17.915152Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([0.7311, 0.8808, 0.9526])" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:57:34.971511Z", + "start_time": "2023-02-28T14:57:34.965686Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([0., 0., 0.])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c.zero_()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:57:41.077909Z", + "start_time": "2023-02-28T14:57:41.073708Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([0., 0., 0.], grad_fn=<SigmoidBackward0>)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:57:50.771024Z", + "start_time": "2023-02-28T14:57:50.767388Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out.requires_grad" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:57:56.119624Z", + "start_time": "2023-02-28T14:57:56.115622Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c.requires_grad" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:58:20.147508Z", + "start_time": "2023-02-28T14:58:20.116942Z" + }, + "collapsed": true + }, + "outputs": [ + { + "ename": "RuntimeError", + "evalue": "Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). 
Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-31-e67dddba67b2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# 应该报错,而未报错\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/_tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 306\u001b[0m inputs=inputs)\n\u001b[0;32m--> 307\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mautograd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 308\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mregister_hook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 154\u001b[0m Variable._execution_engine.run_backward(\n\u001b[1;32m 155\u001b[0m \u001b[0mtensors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad_tensors_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 156\u001b[0;31m allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag\n\u001b[0m\u001b[1;32m 157\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mRuntimeError\u001b[0m: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward." 
+ ] + } + ], + "source": [ + "# 应该报错,而未报错\n", + "out.sum().backward()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:59:25.255115Z", + "start_time": "2023-02-28T14:59:25.252229Z" + } + }, + "outputs": [], + "source": [ + "a = torch.tensor([1, 2, 3.], requires_grad=True)\n", + "\n", + "out = a.sigmoid()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:59:35.652693Z", + "start_time": "2023-02-28T14:59:35.650328Z" + } + }, + "outputs": [], + "source": [ + "out.sum().backward()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:59:56.719336Z", + "start_time": "2023-02-28T14:59:56.714858Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([0.1966, 0.1050, 0.0452]) tensor([0.1966, 0.1050, 0.0452], grad_fn=<MulBackward0>)\n" + ] + } + ], + "source": [ + "print(a.grad, a.sigmoid()*(1-a.sigmoid()))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T14:58:59.767314Z", + "start_time": "2023-02-28T14:58:59.757237Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a.requires_grad: True, out.requires_grad: True, c.requires_grad: False\n", + "tensor([0.7311, 0.8808, 0.9526], grad_fn=<SigmoidBackward0>)\n", + "tensor([0.7311, 0.8808, 0.9526])\n", + "tensor([0., 0., 0.], grad_fn=<SigmoidBackward0>)\n", + "tensor([0., 0., 0.])\n", + "tensor([0., 0., 0.]) tensor([0.1966, 0.1050, 0.0452], grad_fn=<MulBackward0>)\n" + ] + } + ], + "source": [ + "a = torch.tensor([1, 2, 3.], requires_grad=True)\n", + "\n", + "out = a.sigmoid()\n", + "\n", + "c = out.data\n", + "# c = out.detach()\n", + "\n", + "print(f'a.requires_grad: {a.requires_grad}, out.requires_grad: {out.requires_grad}, c.requires_grad: {c.requires_grad}')\n", + "\n", + "print(out)\n", + "print(c)\n", + "c.zero_()\n", + "\n", + "print(out)\n", + "print(c)\n", + "\n", + "out.sum().backward()\n", + "print(a.grad, a.sigmoid()*(1-a.sigmoid()))" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T15:00:24.272031Z", + "start_time": "2023-02-28T15:00:24.236261Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a.requires_grad: True, out.requires_grad: True, c.requires_grad: False\n", + "tensor([0.7311, 0.8808, 0.9526], grad_fn=<SigmoidBackward0>)\n", + "tensor([0.7311, 0.8808, 0.9526])\n", + "tensor([0., 0., 0.], grad_fn=<SigmoidBackward0>)\n", + "tensor([0., 0., 0.])\n" + ] + }, + { + "ename": "RuntimeError", + "evalue": "one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [3]], which is output 0 of SigmoidBackward0, is at version 1; expected version 0 instead. 
Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-37-6545a457b224>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgrad\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msigmoid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msigmoid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/_tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 306\u001b[0m inputs=inputs)\n\u001b[0;32m--> 307\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mautograd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 308\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mregister_hook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 154\u001b[0m Variable._execution_engine.run_backward(\n\u001b[1;32m 155\u001b[0m \u001b[0mtensors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad_tensors_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 156\u001b[0;31m allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag\n\u001b[0m\u001b[1;32m 157\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mRuntimeError\u001b[0m: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [3]], which is output 0 of SigmoidBackward0, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True)." + ] + } + ], + "source": [ + "a = torch.tensor([1, 2, 3.], requires_grad=True)\n", + "\n", + "out = a.sigmoid()\n", + "\n", + "# c = out.data\n", + "c = out.detach()\n", + "\n", + "print(f'a.requires_grad: {a.requires_grad}, out.requires_grad: {out.requires_grad}, c.requires_grad: {c.requires_grad}')\n", + "\n", + "print(out)\n", + "print(c)\n", + "c.zero_()\n", + "\n", + "print(out)\n", + "print(c)\n", + "\n", + "out.sum().backward()\n", + "print(a.grad, a.sigmoid()*(1-a.sigmoid()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## embedding" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n, d, m = 3, 5, 7\n", + "# embedding = nn.Embedding(n, d, max_norm=True)\n", + "embedding = nn.Embedding(n, d, max_norm=1)\n", + "W = torch.randn((m, d), requires_grad=True)\n", + "idx = torch.tensor([1, 2])\n", + "a = embedding.weight.clone() @ W.t() # weight must be cloned for this to be differentiable\n", + "b = embedding(idx) @ W.t() # modifies weight in-place\n", + "out = (a.unsqueeze(0) + b.unsqueeze(1))\n", + "loss = out.sigmoid().prod()\n", + "loss.backward()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "e\n", + "f(w) -> a\n", + " out => loss\n", + "e\n", + "g(w) -> b\n" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T15:02:18.907853Z", + "start_time": "2023-02-28T15:02:18.831171Z" + }, + "scrolled": true + }, + "outputs": [ + { + "ename": "RuntimeError", + "evalue": "one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [3, 5]] is at version 2; expected version 1 instead. 
Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-38-d6c0a82b2426>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munsqueeze\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munsqueeze\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msigmoid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprod\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/_tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 306\u001b[0m inputs=inputs)\n\u001b[0;32m--> 307\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mautograd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 308\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mregister_hook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 154\u001b[0m Variable._execution_engine.run_backward(\n\u001b[1;32m 155\u001b[0m \u001b[0mtensors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad_tensors_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 156\u001b[0;31m allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag\n\u001b[0m\u001b[1;32m 157\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mRuntimeError\u001b[0m: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [3, 5]] is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True)." + ] + } + ], + "source": [ + "n, d, m = 3, 5, 7\n", + "# embedding = nn.Embedding(n, d, max_norm=True)\n", + "embedding = nn.Embedding(n, d, max_norm=1)\n", + "W = torch.randn((m, d), requires_grad=True)\n", + "idx = torch.tensor([1, 2])\n", + "a = embedding.weight @ W.t() # weight must be cloned for this to be differentiable\n", + "b = embedding(idx) @ W.t() # modifies weight in-place\n", + "out = (a.unsqueeze(0) + b.unsqueeze(1))\n", + "loss = out.sigmoid().prod()\n", + "loss.backward()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "ExecuteTime": { + "end_time": "2023-02-28T15:04:55.594661Z", + "start_time": "2023-02-28T15:04:55.589654Z" + } + }, + "outputs": [], + "source": [ + "n, d, m = 3, 5, 7\n", + "# embedding = nn.Embedding(n, d, max_norm=True)\n", + "embedding = nn.Embedding(n, d, max_norm=1)\n", + "W = torch.randn((m, d), requires_grad=True)\n", + "idx = torch.tensor([1, 2])\n", + "b = embedding(idx) @ W.t() # modifies weight in-place\n", + "a = embedding.weight @ W.t() # weight must be cloned for this to be differentiable\n", + "out = (a.unsqueeze(0) + b.unsqueeze(1))\n", + "loss = out.sigmoid().prod()\n", + "loss.backward()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because W in the line computing a requires gradients, we must save embedding.weight to compute those gradients in the backward pass. However, in the line computing b, executing embedding(idx) will scale embedding.weight by max_norm - in place. So, without cloning it in line a, embedding.weight will be modified when line b is executed - changing what was saved for the backward pass to update W. Hence the requirement to clone embedding.weight - to save it before it gets scaled in line b.\n", + "If you don't use embedding.weight outside of the normal forward pass, you don't need to worry about all this.\n", + "If you get an error, post it (and your code)." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} |
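A note on the first error in the notebook above ("a leaf Variable that requires grad is being used in an in-place operation"): the notebook sidesteps it through `w.data`, but the usual way to update a leaf tensor in place is to do it under `torch.no_grad()`, which is effectively what `optimizer.step()` does for the `x = x - lr*x.grad` update mentioned at the top. A minimal sketch, not part of the committed notebook:

```python
import torch

w = torch.randn(10, requires_grad=True)    # leaf tensor, like an nn.Parameter
loss = (w ** 2).sum()
loss.backward()

# w -= 0.1 * w.grad        # RuntimeError: a leaf Variable that requires grad ...
with torch.no_grad():       # suspend gradient tracking, as optimizer.step() does
    w -= 0.1 * w.grad       # the in-place update of the leaf is now allowed
w.grad.zero_()              # clear the accumulated gradient before the next step
```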
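The "is at version 1; expected version 0" errors come from autograd's per-tensor version counter: when an operation saves a tensor for the backward pass it records that tensor's version, every later in-place write bumps the counter, and `backward()` refuses to use the stale saved value. A minimal sketch assuming the internal `_version` attribute (undocumented, but present in recent PyTorch releases):

```python
import torch

x = torch.ones(1, 2)
w1 = torch.randn(2, 1, requires_grad=True)
w2 = torch.randn(1, requires_grad=True)

d = torch.matmul(x, w1)     # d will be saved by autograd for the matmul with w2
f = torch.matmul(d, w2)
print(d._version)           # 0: the version recorded when f was built
d[:] = 0                    # in-place write bumps the version counter
print(d._version)           # 1: no longer matches what backward() expects
# f.backward()              # would raise: "... is at version 1; expected version 0"
```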
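The hint attached to those error messages, `torch.autograd.set_detect_anomaly(True)`, is worth trying when the offending in-place operation is less obvious than in these toy cells: with anomaly detection enabled, PyTorch additionally reports the forward-pass traceback of the operation whose saved tensor was modified. A short sketch under that assumption:

```python
import torch

torch.autograd.set_detect_anomaly(True)    # enable anomaly detection globally

x = torch.ones(1, 2)
w1 = torch.randn(2, 1, requires_grad=True)
w2 = torch.randn(1, requires_grad=True)

d = torch.matmul(x, w1)
f = torch.matmul(d, w2)
d[:] = 0                                   # the in-place write that breaks backward
try:
    f.backward()
except RuntimeError as err:
    print(err)                             # a warning alongside it points at the forward op
finally:
    torch.autograd.set_detect_anomaly(False)
```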
