├── .ipynb_checkpoints ├── 1.梯度下降-checkpoint.ipynb ├── 2.随机梯度下降-checkpoint.ipynb ├── 3.momentum-checkpoint.ipynb ├── 4.ada_grad-checkpoint.ipynb ├── 5.rms_prop-checkpoint.ipynb ├── 6.ada_delta-checkpoint.ipynb └── 7.adam-checkpoint.ipynb ├── 1.梯度下降.ipynb ├── 2.随机梯度下降.ipynb ├── 3.momentum.ipynb ├── 4.ada_grad.ipynb ├── 5.rms_prop.ipynb ├── 6.ada_delta.ipynb ├── 7.adam.ipynb ├── README.md └── 线性数据.csv /.ipynb_checkpoints/1.梯度下降-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4fcd93aa", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/plain": [ 12 | "((1503, 5), (1503,))" 13 | ] 14 | }, 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "output_type": "execute_result" 18 | } 19 | ], 20 | "source": [ 21 | "import numpy as np\n", 22 | "\n", 23 | "#加载数据\n", 24 | "data = np.loadtxt(fname='./线性数据.csv', delimiter='\\t')\n", 25 | "\n", 26 | "#标准化\n", 27 | "data -= data.mean(axis=0)\n", 28 | "data /= data.std(axis=0)\n", 29 | "\n", 30 | "x = data[:, :-1]\n", 31 | "y = data[:, -1]\n", 32 | "\n", 33 | "x.shape, y.shape" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "cc6c8e3c", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "#常量\n", 44 | "N, M = x.shape\n", 45 | "\n", 46 | "#变量\n", 47 | "w = np.ones(M)\n", 48 | "b = 0" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "id": "92163201", 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "0.6590042695516539" 61 | ] 62 | }, 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "#预测函数\n", 70 | "def predict(x):\n", 71 | " return w.dot(x) + b\n", 72 | "\n", 73 | "\n", 74 | "predict(x[0])" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "id": "a7bb7a80", 81 | 
"metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "7.367867692433937" 87 | ] 88 | }, 89 | "execution_count": 4, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "#求loss,MSELoss\n", 96 | "def get_loss():\n", 97 | " loss = 0\n", 98 | " for i in range(N):\n", 99 | " pred = predict(x[i])\n", 100 | " loss += (pred - y[i])**2\n", 101 | " return loss / N\n", 102 | "\n", 103 | "\n", 104 | "get_loss()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 5, 110 | "id": "8027d213", 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/plain": [ 116 | "(array([2.03668543, 2.38225639, 1.02215384, 2.13526642, 3.22327899]),\n", 117 | " 0.0010000000036924916)" 118 | ] 119 | }, 120 | "execution_count": 5, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "def get_gradient():\n", 127 | " global w\n", 128 | " global b\n", 129 | "\n", 130 | " eps = 1e-3\n", 131 | "\n", 132 | " loss_before = get_loss()\n", 133 | "\n", 134 | " gradient_w = np.empty(M)\n", 135 | " for i in range(M):\n", 136 | " w[i] += eps\n", 137 | " loss_after = get_loss()\n", 138 | " w[i] -= eps\n", 139 | " gradient_w[i] = (loss_after - loss_before) / eps\n", 140 | "\n", 141 | " b += eps\n", 142 | " loss_after = get_loss()\n", 143 | " b -= eps\n", 144 | " gradient_b = (loss_after - loss_before) / eps\n", 145 | "\n", 146 | " return gradient_w, gradient_b\n", 147 | "\n", 148 | "\n", 149 | "get_gradient()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "id": "c371c6a4", 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "0 7.112757670092038\n", 163 | "50 1.7854577366414703\n", 164 | "100 0.8188794143216034\n", 165 | "150 0.5927178198131446\n", 166 | "200 0.5305917673804184\n", 167 | "250 0.5095310094726683\n", 168 | "300 
0.5002280440367376\n", 169 | "350 0.4950505954913211\n", 170 | "400 0.49174823787396005\n", 171 | "450 0.48950848092220356\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "plt_x = []\n", 177 | "plt_y = []\n", 178 | "for i in range(500):\n", 179 | " gradient_w, gradient_b = get_gradient()\n", 180 | " w -= gradient_w * 1e-2\n", 181 | " b -= gradient_b * 1e-2\n", 182 | "\n", 183 | " plt_x.append(i)\n", 184 | " plt_y.append(get_loss())\n", 185 | "\n", 186 | " if i % 50 == 0:\n", 187 | " print(i, get_loss())" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 7, 193 | "id": "0471a70d", 194 | "metadata": { 195 | "scrolled": true 196 | }, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "image/png": "\n", 201 | "text/plain": [ 202 | "
" 203 | ] 204 | }, 205 | "metadata": { 206 | "needs_background": "light" 207 | }, 208 | "output_type": "display_data" 209 | } 210 | ], 211 | "source": [ 212 | "from matplotlib import pyplot as plt\n", 213 | "%matplotlib inline\n", 214 | "\n", 215 | "plt.plot(plt_x, plt_y)\n", 216 | "plt.show()" 217 | ] 218 | } 219 | ], 220 | "metadata": { 221 | "kernelspec": { 222 | "display_name": "Python 3 (ipykernel)", 223 | "language": "python", 224 | "name": "python3" 225 | }, 226 | "language_info": { 227 | "codemirror_mode": { 228 | "name": "ipython", 229 | "version": 3 230 | }, 231 | "file_extension": ".py", 232 | "mimetype": "text/x-python", 233 | "name": "python", 234 | "nbconvert_exporter": "python", 235 | "pygments_lexer": "ipython3", 236 | "version": "3.8.11" 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 5 241 | } 242 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/2.随机梯度下降-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4fcd93aa", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/plain": [ 12 | "((1503, 5), (1503,))" 13 | ] 14 | }, 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "output_type": "execute_result" 18 | } 19 | ], 20 | "source": [ 21 | "import numpy as np\n", 22 | "\n", 23 | "#加载数据\n", 24 | "data = np.loadtxt(fname='./线性数据.csv', delimiter='\\t')\n", 25 | "\n", 26 | "#标准化\n", 27 | "data -= data.mean(axis=0)\n", 28 | "data /= data.std(axis=0)\n", 29 | "\n", 30 | "x = data[:, :-1]\n", 31 | "y = data[:, -1]\n", 32 | "\n", 33 | "x.shape, y.shape" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "cc6c8e3c", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "#常量\n", 44 | "N, M = x.shape\n", 45 | "\n", 46 | "#变量\n", 47 | "w = np.ones(M)\n", 48 | "b = 0" 49 | ] 50 | }, 51 | { 52 | 
"cell_type": "code", 53 | "execution_count": 3, 54 | "id": "92163201", 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "0.6590042695516539" 61 | ] 62 | }, 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "#预测函数\n", 70 | "def predict(x):\n", 71 | " return w.dot(x) + b\n", 72 | "\n", 73 | "\n", 74 | "predict(x[0])" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "id": "a7bb7a80", 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "0.21258140154187247" 87 | ] 88 | }, 89 | "execution_count": 4, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "#求loss,MSELoss\n", 96 | "def get_loss(x, y):\n", 97 | " pred = predict(x)\n", 98 | " loss = (pred - y)**2\n", 99 | " return loss\n", 100 | "\n", 101 | "\n", 102 | "get_loss(x[0], y[0])" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 5, 108 | "id": "8027d213", 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "(array([-0.61003339, -1.05581946, 1.66242713, 1.21242212, -0.59417855]),\n", 115 | " 0.923131013558981)" 116 | ] 117 | }, 118 | "execution_count": 5, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "def get_gradient(x, y):\n", 125 | " global w\n", 126 | " global b\n", 127 | "\n", 128 | " eps = 1e-3\n", 129 | "\n", 130 | " loss_before = get_loss(x, y)\n", 131 | "\n", 132 | " gradient_w = np.empty(M)\n", 133 | " for i in range(M):\n", 134 | " w[i] += eps\n", 135 | " loss_after = get_loss(x, y)\n", 136 | " w[i] -= eps\n", 137 | " gradient_w[i] = (loss_after - loss_before) / eps\n", 138 | "\n", 139 | " b += eps\n", 140 | " loss_after = get_loss(x, y)\n", 141 | " b -= eps\n", 142 | " gradient_b = (loss_after - loss_before) / eps\n", 143 | "\n", 144 | " return gradient_w, 
gradient_b\n", 145 | "\n", 146 | "\n", 147 | "get_gradient(x[0], y[0])" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 6, 153 | "id": "f39e0125", 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "11073.905141728206" 160 | ] 161 | }, 162 | "execution_count": 6, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "def total_loss():\n", 169 | " loss = 0\n", 170 | " for i in range(N):\n", 171 | " loss += get_loss(x[i], y[i])\n", 172 | " return loss\n", 173 | "\n", 174 | "\n", 175 | "total_loss()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 7, 181 | "id": "c371c6a4", 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "0 11038.895201527894\n", 189 | "150 6696.3736283721655\n", 190 | "300 4354.119709336485\n", 191 | "450 3004.4953025777063\n", 192 | "600 2310.68634531403\n", 193 | "750 1839.580107951638\n", 194 | "900 1549.8047186884628\n", 195 | "1050 1278.2079624059054\n", 196 | "1200 1099.1810250366634\n", 197 | "1350 986.6025037947752\n", 198 | "1500 921.4757031198328\n", 199 | "1650 879.159069825457\n", 200 | "1800 853.2767252227716\n", 201 | "1950 835.2496534941863\n", 202 | "2100 812.3758750332744\n", 203 | "2250 794.1165878394305\n", 204 | "2400 786.9647280480957\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "plt_x = []\n", 210 | "plt_y = []\n", 211 | "for epoch in range(2500):\n", 212 | " i = np.random.randint(N)\n", 213 | " gradient_w, gradient_b = get_gradient(x[i], y[i])\n", 214 | " w -= gradient_w * 1e-3\n", 215 | " b -= gradient_b * 1e-3\n", 216 | "\n", 217 | " plt_x.append(epoch)\n", 218 | " plt_y.append(total_loss())\n", 219 | "\n", 220 | " if epoch % 150 == 0:\n", 221 | " print(epoch, total_loss())" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 8, 227 | "id": "0471a70d", 228 | "metadata": { 
229 | "scrolled": true 230 | }, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "image/png": "\n", 235 | "text/plain": [ 236 | "
" 237 | ] 238 | }, 239 | "metadata": { 240 | "needs_background": "light" 241 | }, 242 | "output_type": "display_data" 243 | } 244 | ], 245 | "source": [ 246 | "from matplotlib import pyplot as plt\n", 247 | "%matplotlib inline\n", 248 | "\n", 249 | "plt.plot(plt_x, plt_y)\n", 250 | "plt.show()" 251 | ] 252 | } 253 | ], 254 | "metadata": { 255 | "kernelspec": { 256 | "display_name": "Python 3 (ipykernel)", 257 | "language": "python", 258 | "name": "python3" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": { 262 | "name": "ipython", 263 | "version": 3 264 | }, 265 | "file_extension": ".py", 266 | "mimetype": "text/x-python", 267 | "name": "python", 268 | "nbconvert_exporter": "python", 269 | "pygments_lexer": "ipython3", 270 | "version": "3.8.11" 271 | } 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 5 275 | } 276 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/3.momentum-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4fcd93aa", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/plain": [ 12 | "((1503, 5), (1503,))" 13 | ] 14 | }, 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "output_type": "execute_result" 18 | } 19 | ], 20 | "source": [ 21 | "import numpy as np\n", 22 | "\n", 23 | "#加载数据\n", 24 | "data = np.loadtxt(fname='./线性数据.csv', delimiter='\\t')\n", 25 | "\n", 26 | "#标准化\n", 27 | "data -= data.mean(axis=0)\n", 28 | "data /= data.std(axis=0)\n", 29 | "\n", 30 | "x = data[:, :-1]\n", 31 | "y = data[:, -1]\n", 32 | "\n", 33 | "x.shape, y.shape" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "cc6c8e3c", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "#常量\n", 44 | "N, M = x.shape\n", 45 | "\n", 46 | "#变量\n", 47 | "w = np.ones(M)\n", 48 | "b = 0\n", 49 | "\n", 50 | 
"#动量都初始化为0\n", 51 | "momentum_w = np.zeros(M)\n", 52 | "momentum_b = 0" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "id": "92163201", 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "0.6590042695516539" 65 | ] 66 | }, 67 | "execution_count": 3, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "#预测函数\n", 74 | "def predict(x):\n", 75 | " return w.dot(x) + b\n", 76 | "\n", 77 | "\n", 78 | "predict(x[0])" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "id": "a7bb7a80", 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "0.21258140154187247" 91 | ] 92 | }, 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "#求loss,MSELoss\n", 100 | "def get_loss(x, y):\n", 101 | " pred = predict(x)\n", 102 | " loss = (pred - y)**2\n", 103 | " return loss\n", 104 | "\n", 105 | "\n", 106 | "get_loss(x[0], y[0])" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 5, 112 | "id": "8027d213", 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "(array([-0.61003339, -1.05581946, 1.66242713, 1.21242212, -0.59417855]),\n", 119 | " 0.923131013558981)" 120 | ] 121 | }, 122 | "execution_count": 5, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "def get_gradient(x, y):\n", 129 | " global w\n", 130 | " global b\n", 131 | "\n", 132 | " eps = 1e-3\n", 133 | "\n", 134 | " loss_before = get_loss(x, y)\n", 135 | "\n", 136 | " gradient_w = np.empty(M)\n", 137 | " for i in range(M):\n", 138 | " w[i] += eps\n", 139 | " loss_after = get_loss(x, y)\n", 140 | " w[i] -= eps\n", 141 | " gradient_w[i] = (loss_after - loss_before) / eps\n", 142 | "\n", 143 | " b += eps\n", 144 | " loss_after = get_loss(x, y)\n", 145 | " b -= eps\n", 146 | " 
gradient_b = (loss_after - loss_before) / eps\n", 147 | "\n", 148 | " return gradient_w, gradient_b\n", 149 | "\n", 150 | "\n", 151 | "get_gradient(x[0], y[0])" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 6, 157 | "id": "f39e0125", 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "11073.905141728206" 164 | ] 165 | }, 166 | "execution_count": 6, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "def total_loss():\n", 173 | " loss = 0\n", 174 | " for i in range(N):\n", 175 | " loss += get_loss(x[i], y[i])\n", 176 | " return loss\n", 177 | "\n", 178 | "\n", 179 | "total_loss()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 7, 185 | "id": "c371c6a4", 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "0 [-4.35683357 4.61375434 1.50384418 2.91083043 2.96213566] -4.022402214293841 11044.88570074484\n", 193 | "150 [ 1.26339579 2.13768158 0.49321428 -1.51848376 3.5785042 ] -1.7176170960084935 1967.1697066194426\n", 194 | "300 [ 5.80970624e-03 3.55685572e+00 3.82822053e+00 -2.58323564e-01\n", 195 | " 1.07002133e+01] 2.0649671248152166 947.1121093652372\n", 196 | "450 [-2.26148992 3.8055673 -3.18437755 -3.87469838 -0.61715981] 0.7950908166330994 815.4883875626289\n", 197 | "600 [1.55949644 0.79231053 3.14007815 3.71824435 0.98449926] -0.5781588479929312 783.2820118211715\n", 198 | "750 [ 3.59572027 -0.74530488 0.72892403 -0.15447051 -4.33429161] -2.0781779575632005 753.5637854055667\n", 199 | "900 [ 0.3379927 2.54425898 -4.14378216 0.92339395 0.76440147] 0.8774161483117863 769.3244441314099\n", 200 | "1050 [-1.00006534 -5.83176015 -5.28599293 -5.19888383 -3.62367306] 2.5986887635740827 742.322126662018\n", 201 | "1200 [ 1.32568154 -8.98944316 8.03060836 -1.84855209 -1.93601743] -4.04468557258265 753.4751326083475\n", 202 | "1350 [ 1.82402337 
-1.95342977 1.07956753 2.00744178 -4.75462661] -2.284159092719058 734.9285345602206\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "plt_x = []\n", 208 | "plt_y = []\n", 209 | "for epoch in range(1500):\n", 210 | " i = np.random.randint(N)\n", 211 | "\n", 212 | " gradient_w, gradient_b = get_gradient(x[i], y[i])\n", 213 | "\n", 214 | " #这是更新动量的数学公式,0.8是过去动量的权重\n", 215 | " momentum_w = 0.8 * momentum_w + gradient_w\n", 216 | " momentum_b = 0.8 * momentum_b + gradient_b\n", 217 | "\n", 218 | " #这里更新参数不再使用梯度,而是使用动量\n", 219 | " w -= momentum_w * 1e-3\n", 220 | " b -= momentum_b * 1e-3\n", 221 | "\n", 222 | " #思考一下,在时刻0,动量都是0.此时更新动量,动量就等于梯度.\n", 223 | " #也就是说,在时刻0,其实就是在用梯度下降.\n", 224 | " #时刻1,是上一个时刻的梯度乘以0.8,再加上当前时刻的梯度\n", 225 | " #所以在时刻1,差不多可以认为是梯度乘以了1.8.不过这里面两部分的梯度是在两个不同的点上评估出来的.\n", 226 | " #在时刻2,差不多等同于时刻1.往后都差不多.\n", 227 | "\n", 228 | " plt_x.append(epoch)\n", 229 | " plt_y.append(total_loss())\n", 230 | "\n", 231 | " if epoch % 150 == 0:\n", 232 | " print(epoch, momentum_w, momentum_b, total_loss())" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 8, 238 | "id": "0471a70d", 239 | "metadata": { 240 | "scrolled": true 241 | }, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "image/png": "\n", 246 | "text/plain": [ 247 | "
" 248 | ] 249 | }, 250 | "metadata": { 251 | "needs_background": "light" 252 | }, 253 | "output_type": "display_data" 254 | } 255 | ], 256 | "source": [ 257 | "from matplotlib import pyplot as plt\n", 258 | "%matplotlib inline\n", 259 | "\n", 260 | "plt.plot(plt_x, plt_y)\n", 261 | "plt.show()" 262 | ] 263 | } 264 | ], 265 | "metadata": { 266 | "kernelspec": { 267 | "display_name": "Python 3 (ipykernel)", 268 | "language": "python", 269 | "name": "python3" 270 | }, 271 | "language_info": { 272 | "codemirror_mode": { 273 | "name": "ipython", 274 | "version": 3 275 | }, 276 | "file_extension": ".py", 277 | "mimetype": "text/x-python", 278 | "name": "python", 279 | "nbconvert_exporter": "python", 280 | "pygments_lexer": "ipython3", 281 | "version": "3.8.11" 282 | } 283 | }, 284 | "nbformat": 4, 285 | "nbformat_minor": 5 286 | } 287 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/4.ada_grad-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4fcd93aa", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/plain": [ 12 | "((1503, 5), (1503,))" 13 | ] 14 | }, 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "output_type": "execute_result" 18 | } 19 | ], 20 | "source": [ 21 | "import numpy as np\n", 22 | "\n", 23 | "#加载数据\n", 24 | "data = np.loadtxt(fname='./线性数据.csv', delimiter='\\t')\n", 25 | "\n", 26 | "#标准化\n", 27 | "data -= data.mean(axis=0)\n", 28 | "data /= data.std(axis=0)\n", 29 | "\n", 30 | "x = data[:, :-1]\n", 31 | "y = data[:, -1]\n", 32 | "\n", 33 | "x.shape, y.shape" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "cc6c8e3c", 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "array([0., 0., 0., 0., 0.])" 46 | ] 47 | }, 48 | "execution_count": 2, 49 | "metadata": {}, 50 | 
"output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "#常量\n", 55 | "N, M = x.shape\n", 56 | "\n", 57 | "#变量\n", 58 | "w = np.ones(M)\n", 59 | "b = 0\n", 60 | "\n", 61 | "#初始化S为全0\n", 62 | "S_w = np.zeros(M)\n", 63 | "S_b = 0\n", 64 | "\n", 65 | "S_w" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "id": "92163201", 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "0.6590042695516539" 78 | ] 79 | }, 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "#预测函数\n", 87 | "def predict(x):\n", 88 | " return w.dot(x) + b\n", 89 | "\n", 90 | "\n", 91 | "predict(x[0])" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "id": "a7bb7a80", 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "0.21258140154187247" 104 | ] 105 | }, 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "#求loss,MSELoss\n", 113 | "def get_loss(x, y):\n", 114 | " pred = predict(x)\n", 115 | " loss = (pred - y)**2\n", 116 | " return loss\n", 117 | "\n", 118 | "\n", 119 | "get_loss(x[0], y[0])" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "id": "8027d213", 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "(array([-0.61003339, -1.05581946, 1.66242713, 1.21242212, -0.59417855]),\n", 132 | " 0.923131013558981)" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "def get_gradient(x, y):\n", 142 | " global w\n", 143 | " global b\n", 144 | "\n", 145 | " eps = 1e-3\n", 146 | "\n", 147 | " loss_before = get_loss(x, y)\n", 148 | "\n", 149 | " gradient_w = np.empty(M)\n", 150 | " for i in range(M):\n", 151 | " w[i] += eps\n", 152 | " loss_after 
= get_loss(x, y)\n", 153 | " w[i] -= eps\n", 154 | " gradient_w[i] = (loss_after - loss_before) / eps\n", 155 | "\n", 156 | " b += eps\n", 157 | " loss_after = get_loss(x, y)\n", 158 | " b -= eps\n", 159 | " gradient_b = (loss_after - loss_before) / eps\n", 160 | "\n", 161 | " return gradient_w, gradient_b\n", 162 | "\n", 163 | "\n", 164 | "get_gradient(x[0], y[0])" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 6, 170 | "id": "f39e0125", 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "11073.905141728206" 177 | ] 178 | }, 179 | "execution_count": 6, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "def total_loss():\n", 186 | " loss = 0\n", 187 | " for i in range(N):\n", 188 | " loss += get_loss(x[i], y[i])\n", 189 | " return loss\n", 190 | "\n", 191 | "\n", 192 | "total_loss()" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 7, 198 | "id": "c371c6a4", 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "0 [0.02348118 0.05140723 0.0153264 0.01141825 0.03465209] 0.014058087368505013 10246.765269340094\n", 206 | "150 [0.00215905 0.00213966 0.0025248 0.0023185 0.00215242] 0.0024356945540773096 2658.3900946275558\n", 207 | "300 [0.00159518 0.00180842 0.0019631 0.00191729 0.00173008] 0.0019876188630781272 1600.8100316915763\n", 208 | "450 [0.00148661 0.00166541 0.00178354 0.00173962 0.00157404] 0.0017842433083993126 1186.465991584585\n", 209 | "600 [0.00139184 0.00156629 0.00167125 0.00164534 0.00149001] 0.0016826253755351525 1018.0084411998928\n", 210 | "750 [0.00132517 0.00150278 0.00158741 0.00157468 0.00142465] 0.001594262705724423 927.9759431415143\n", 211 | "900 [0.0012664 0.00144711 0.00152095 0.00150801 0.00135752] 0.0015236251237426357 861.6745940332113\n", 212 | "1050 [0.00123801 0.00141408 0.0014759 0.00146659 0.00133255] 
0.001476298163704934 824.5775032016687\n", 213 | "1200 [0.00121116 0.00136314 0.001434 0.00142441 0.00128068] 0.001429332532480186 807.1329761191264\n", 214 | "1350 [0.00116843 0.0013307 0.00139645 0.00138838 0.00124704] 0.0013899172750459162 791.1949098503449\n", 215 | "1500 [0.00115129 0.00130284 0.0013622 0.00136147 0.00122016] 0.0013561117340229432 788.5470016064282\n", 216 | "1650 [0.00111086 0.00126146 0.00132726 0.00132571 0.00119364] 0.0013234756456931064 783.241050040833\n", 217 | "1800 [0.00105912 0.00119525 0.00128706 0.00128536 0.00116119] 0.0012821836202760327 776.5490903516861\n", 218 | "1950 [0.00101795 0.00115106 0.00125265 0.00124849 0.00112686] 0.0012443142251112867 767.7869135624381\n", 219 | "2100 [0.00099775 0.00112372 0.00122065 0.00122 0.00111091] 0.0012130252246838496 762.3245228610506\n", 220 | "2250 [0.00098704 0.00109155 0.00119765 0.00120011 0.00109044] 0.00119230318143276 757.1151773657198\n", 221 | "2400 [0.00096106 0.00106878 0.00117239 0.00118274 0.00108084] 0.0011689278828959015 754.8130624984661\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "plt_x = []\n", 227 | "plt_y = []\n", 228 | "for epoch in range(2500):\n", 229 | " i = np.random.randint(N)\n", 230 | " gradient_w, gradient_b = get_gradient(x[i], y[i])\n", 231 | "\n", 232 | " #adagrad的特点是每个变量都有属于自己的lr\n", 233 | " #要计算各个变量的lr,先要计算S\n", 234 | " #这是S的计算公式\n", 235 | " S_w = S_w + gradient_w**2\n", 236 | " S_b = S_b + gradient_b**2\n", 237 | "\n", 238 | " #计算lr的公式,其中的1e-1是原本的lr,1e-6是防止除0的\n", 239 | " lr_w = 1e-1 / ((S_w + 1e-6)**0.5)\n", 240 | " lr_b = 1e-1 / ((S_b + 1e-6)**0.5)\n", 241 | "\n", 242 | " #所以在时刻0,lr就等于梯度的倒数\n", 243 | " #梯度大的变量会有小lr,梯度小的变量会有大lr\n", 244 | " #往后的每一个时刻,都是类似动量法,考虑上一步的梯度\n", 245 | "\n", 246 | " w -= gradient_w * lr_w\n", 247 | " b -= gradient_b * lr_b\n", 248 | "\n", 249 | " plt_x.append(epoch)\n", 250 | " plt_y.append(total_loss())\n", 251 | "\n", 252 | " if epoch % 150 == 0:\n", 253 | " print(epoch, lr_w, lr_b, total_loss())" 254 | ] 255 | }, 256 | 
{ 257 | "cell_type": "code", 258 | "execution_count": 8, 259 | "id": "0471a70d", 260 | "metadata": { 261 | "scrolled": true 262 | }, 263 | "outputs": [ 264 | { 265 | "data": { 266 | "image/png": "\n", 267 | "text/plain": [ 268 | "
" 269 | ] 270 | }, 271 | "metadata": { 272 | "needs_background": "light" 273 | }, 274 | "output_type": "display_data" 275 | } 276 | ], 277 | "source": [ 278 | "from matplotlib import pyplot as plt\n", 279 | "%matplotlib inline\n", 280 | "\n", 281 | "plt.plot(plt_x, plt_y)\n", 282 | "plt.show()" 283 | ] 284 | } 285 | ], 286 | "metadata": { 287 | "kernelspec": { 288 | "display_name": "Python 3 (ipykernel)", 289 | "language": "python", 290 | "name": "python3" 291 | }, 292 | "language_info": { 293 | "codemirror_mode": { 294 | "name": "ipython", 295 | "version": 3 296 | }, 297 | "file_extension": ".py", 298 | "mimetype": "text/x-python", 299 | "name": "python", 300 | "nbconvert_exporter": "python", 301 | "pygments_lexer": "ipython3", 302 | "version": "3.8.11" 303 | } 304 | }, 305 | "nbformat": 4, 306 | "nbformat_minor": 5 307 | } 308 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/6.ada_delta-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4fcd93aa", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/plain": [ 12 | "((1503, 5), (1503,))" 13 | ] 14 | }, 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "output_type": "execute_result" 18 | } 19 | ], 20 | "source": [ 21 | "import numpy as np\n", 22 | "\n", 23 | "#加载数据\n", 24 | "data = np.loadtxt(fname='./线性数据.csv', delimiter='\\t')\n", 25 | "\n", 26 | "#标准化\n", 27 | "data -= data.mean(axis=0)\n", 28 | "data /= data.std(axis=0)\n", 29 | "\n", 30 | "x = data[:, :-1]\n", 31 | "y = data[:, -1]\n", 32 | "\n", 33 | "x.shape, y.shape" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "cc6c8e3c", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "#常量\n", 44 | "N, M = x.shape\n", 45 | "\n", 46 | "#变量\n", 47 | "w = np.ones(M)\n", 48 | "b = 0\n", 49 | "\n", 50 | 
"#初始化S为全0\n", 51 | "S_w = np.zeros(M)\n", 52 | "S_b = 0\n", 53 | "\n", 54 | "#初始化delta为全0\n", 55 | "delta_w = np.zeros(M)\n", 56 | "delta_b = 0" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "id": "92163201", 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "0.6590042695516539" 69 | ] 70 | }, 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "#预测函数\n", 78 | "def predict(x):\n", 79 | " return w.dot(x) + b\n", 80 | "\n", 81 | "\n", 82 | "predict(x[0])" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "id": "a7bb7a80", 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/plain": [ 94 | "0.21258140154187247" 95 | ] 96 | }, 97 | "execution_count": 4, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "#求loss,MSELoss\n", 104 | "def get_loss(x, y):\n", 105 | " pred = predict(x)\n", 106 | " loss = (pred - y)**2\n", 107 | " return loss\n", 108 | "\n", 109 | "\n", 110 | "get_loss(x[0], y[0])" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 5, 116 | "id": "8027d213", 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/plain": [ 122 | "(array([-0.61003339, -1.05581946, 1.66242713, 1.21242212, -0.59417855]),\n", 123 | " 0.923131013558981)" 124 | ] 125 | }, 126 | "execution_count": 5, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "def get_gradient(x, y):\n", 133 | " global w\n", 134 | " global b\n", 135 | "\n", 136 | " eps = 1e-3\n", 137 | "\n", 138 | " loss_before = get_loss(x, y)\n", 139 | "\n", 140 | " gradient_w = np.empty(M)\n", 141 | " for i in range(M):\n", 142 | " w[i] += eps\n", 143 | " loss_after = get_loss(x, y)\n", 144 | " w[i] -= eps\n", 145 | " gradient_w[i] = (loss_after - loss_before) / eps\n", 146 | "\n", 147 | " b += 
eps\n", 148 | " loss_after = get_loss(x, y)\n", 149 | " b -= eps\n", 150 | " gradient_b = (loss_after - loss_before) / eps\n", 151 | "\n", 152 | " return gradient_w, gradient_b\n", 153 | "\n", 154 | "\n", 155 | "get_gradient(x[0], y[0])" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 6, 161 | "id": "f39e0125", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "11073.905141728206" 168 | ] 169 | }, 170 | "execution_count": 6, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "def total_loss():\n", 177 | " loss = 0\n", 178 | " for i in range(N):\n", 179 | " loss += get_loss(x[i], y[i])\n", 180 | " return loss\n", 181 | "\n", 182 | "\n", 183 | "total_loss()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 7, 189 | "id": "c371c6a4", 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "0 [9.99999848e-07 9.99999845e-07] 9.999998984693441e-07 11059.213325989294\n", 197 | "500 [2.66451164e-06 2.89372373e-06] 3.595841972186342e-06 7623.402923852171\n", 198 | "1000 [1.49058551e-06 1.77039119e-06] 2.3569819378456833e-06 5116.6673548382705\n", 199 | "1500 [1.54060667e-07 1.52269411e-07] 4.854100489197519e-07 3453.7618700421162\n", 200 | "2000 [4.5399257e-06 3.6700749e-06] 4.182369642302336e-06 2259.06518603289\n", 201 | "2500 [1.65797772e-06 1.05985930e-06] 1.3383495167537935e-06 1438.5906667372406\n", 202 | "3000 [2.96914906e-07 6.56168224e-07] 6.131919613839703e-07 1051.9695231637406\n", 203 | "3500 [1.48431059e-06 1.85664992e-06] 3.2354643896326765e-06 886.751528580343\n", 204 | "4000 [3.06717189e-06 1.53765326e-06] 7.968217077828009e-06 826.4649373166238\n", 205 | "4500 [1.83228209e-06 1.98744616e-06] 3.0032937384269872e-06 794.0081885344441\n", 206 | "5000 [5.05157589e-07 6.08365156e-07] 6.43608532358165e-07 798.5606607279586\n" 207 | ] 208 | } 209 | 
], 210 | "source": [ 211 | "plt_x = []\n", 212 | "plt_y = []\n", 213 | "\n", 214 | "for epoch in range(5500):\n", 215 | " i = np.random.randint(N)\n", 216 | " gradient_w, gradient_b = get_gradient(x[i], y[i])\n", 217 | "\n", 218 | " #ada_delta算法不需要设定超参数lr\n", 219 | " #它需要维持两个变量,delta和S\n", 220 | "\n", 221 | " #S的计算和rmsprop完全一致\n", 222 | " S_w = 0.2 * S_w + 0.8 * gradient_w**2\n", 223 | " S_b = 0.2 * S_b + 0.8 * gradient_b**2\n", 224 | "\n", 225 | " #计算lr的公式,这里的1e-6是为了防止除0\n", 226 | " lr = (delta_w + 1e-6) / (S_w + 1e-6)\n", 227 | " gradient_w = lr**0.5 * gradient_w\n", 228 | "\n", 229 | " lr = (delta_b + 1e-6) / (S_b + 1e-6)\n", 230 | " gradient_b = lr**0.5 * gradient_b\n", 231 | "\n", 232 | " #更新参数\n", 233 | " w -= gradient_w\n", 234 | " b -= gradient_b\n", 235 | "\n", 236 | " #更新delta,这里的两个系数和计算S时用的要一样\n", 237 | " delta_w = 0.2 * delta_w + 0.8 * gradient_w**2\n", 238 | " delta_b = 0.2 * delta_b + 0.8 * gradient_b**2\n", 239 | "\n", 240 | " #思考一下,在时刻0,S就是梯度的平方乘以0.8\n", 241 | " #所以在一开始的时候,S是比较大的.但delta还是0\n", 242 | " #所以一开始的时候lr是比较小的.\n", 243 | " #delta更新为变量更新量的平方*0.8\n", 244 | " #所以delta当中差不多相当于存储了变量更新量的历史信息\n", 245 | " #所以最后的lr,应该是取两者的比值\n", 246 | "\n", 247 | " plt_x.append(epoch)\n", 248 | " plt_y.append(total_loss())\n", 249 | "\n", 250 | " if epoch % 500 == 0:\n", 251 | " print(epoch, delta_w[:2], delta_b, total_loss())" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 8, 257 | "id": "0471a70d", 258 | "metadata": { 259 | "scrolled": true 260 | }, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "image/png": "\n", 265 | "text/plain": [ 266 | "
" 267 | ] 268 | }, 269 | "metadata": { 270 | "needs_background": "light" 271 | }, 272 | "output_type": "display_data" 273 | } 274 | ], 275 | "source": [ 276 | "from matplotlib import pyplot as plt\n", 277 | "%matplotlib inline\n", 278 | "\n", 279 | "plt.plot(plt_x, plt_y)\n", 280 | "plt.show()" 281 | ] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Python 3 (ipykernel)", 287 | "language": "python", 288 | "name": "python3" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.8.11" 301 | } 302 | }, 303 | "nbformat": 4, 304 | "nbformat_minor": 5 305 | } 306 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/7.adam-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4fcd93aa", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/plain": [ 12 | "((1503, 5), (1503,))" 13 | ] 14 | }, 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "output_type": "execute_result" 18 | } 19 | ], 20 | "source": [ 21 | "import numpy as np\n", 22 | "\n", 23 | "#加载数据\n", 24 | "data = np.loadtxt(fname='./线性数据.csv', delimiter='\\t')\n", 25 | "\n", 26 | "#标准化\n", 27 | "data -= data.mean(axis=0)\n", 28 | "data /= data.std(axis=0)\n", 29 | "\n", 30 | "x = data[:, :-1]\n", 31 | "y = data[:, -1]\n", 32 | "\n", 33 | "x.shape, y.shape" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "cc6c8e3c", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "#常量\n", 44 | "N, M = x.shape\n", 45 | "\n", 46 | "#变量\n", 47 | "w = np.ones(M)\n", 48 | "b = 0\n", 49 | "\n", 50 | "#初始化S为全0\n", 51 
| "S_w = np.zeros(M)\n", 52 | "S_b = 0\n", 53 | "\n", 54 | "v_w = np.zeros(M)\n", 55 | "v_b = 0" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "id": "92163201", 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "0.6590042695516539" 68 | ] 69 | }, 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "#预测函数\n", 77 | "def predict(x):\n", 78 | " return w.dot(x) + b\n", 79 | "\n", 80 | "\n", 81 | "predict(x[0])" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "id": "a7bb7a80", 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "0.21258140154187247" 94 | ] 95 | }, 96 | "execution_count": 4, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "#求loss,MSELoss\n", 103 | "def get_loss(x, y):\n", 104 | " pred = predict(x)\n", 105 | " loss = (pred - y)**2\n", 106 | " return loss\n", 107 | "\n", 108 | "\n", 109 | "get_loss(x[0], y[0])" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 5, 115 | "id": "8027d213", 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "(array([-0.61003339, -1.05581946, 1.66242713, 1.21242212, -0.59417855]),\n", 122 | " 0.923131013558981)" 123 | ] 124 | }, 125 | "execution_count": 5, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "def get_gradient(x, y):\n", 132 | " global w\n", 133 | " global b\n", 134 | "\n", 135 | " eps = 1e-3\n", 136 | "\n", 137 | " loss_before = get_loss(x, y)\n", 138 | "\n", 139 | " gradient_w = np.empty(M)\n", 140 | " for i in range(M):\n", 141 | " w[i] += eps\n", 142 | " loss_after = get_loss(x, y)\n", 143 | " w[i] -= eps\n", 144 | " gradient_w[i] = (loss_after - loss_before) / eps\n", 145 | "\n", 146 | " b += eps\n", 147 | " loss_after = get_loss(x, y)\n", 
148 | " b -= eps\n", 149 | " gradient_b = (loss_after - loss_before) / eps\n", 150 | "\n", 151 | " return gradient_w, gradient_b\n", 152 | "\n", 153 | "\n", 154 | "get_gradient(x[0], y[0])" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 6, 160 | "id": "f39e0125", 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "11073.905141728206" 167 | ] 168 | }, 169 | "execution_count": 6, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "def total_loss():\n", 176 | " loss = 0\n", 177 | " for i in range(N):\n", 178 | " loss += get_loss(x[i], y[i])\n", 179 | " return loss\n", 180 | "\n", 181 | "\n", 182 | "total_loss()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 7, 188 | "id": "c371c6a4", 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | "500 [0.08904127 0.71062389] [16.68083241 9.3870837 ] 1261.5118603011992\n", 196 | "1000 [-0.90368761 0.83072632] [8.3751223 4.76494681] 749.9516889645644\n", 197 | "1500 [-0.30210063 0.02406904] [5.98086602 3.792364 ] 745.5930763832396\n", 198 | "2000 [ 0.05851974 -0.52552324] [5.03242601 3.45097686] 734.1425192541457\n", 199 | "2500 [ 0.19785822 -0.16807166] [4.55122551 2.92449296] 763.0359728232618\n", 200 | "3000 [ 0.11518987 -0.08617003] [4.08077263 2.93377002] 746.4590779281906\n", 201 | "3500 [ 0.14053857 -0.11983124] [3.52452827 2.59132747] 783.7272638890388\n", 202 | "4000 [0.03610707 0.36194797] [3.26520532 2.42821712] 766.6407173851062\n", 203 | "4500 [-0.24672945 0.18166294] [3.41205737 3.01430322] 791.864659000636\n", 204 | "5000 [ 0.1558802 -0.19912752] [3.24213769 2.86521236] 788.53176983857\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "plt_x = []\n", 210 | "plt_y = []\n", 211 | "\n", 212 | "for t in range(1, 5500):\n", 213 | " i = np.random.randint(N)\n", 214 | " gradient_w, gradient_b = 
get_gradient(x[i], y[i])\n", 215 | "\n", 216 | " v_w = 0.9 * v_w + 0.1 * gradient_w\n", 217 | " v_b = 0.9 * v_b + 0.1 * gradient_b\n", 218 | "\n", 219 | " #S的计算和rmsprop完全一致\n", 220 | " S_w = 0.999 * S_w + 0.001 * gradient_w**2\n", 221 | " S_b = 0.999 * S_b + 0.001 * gradient_b**2\n", 222 | "\n", 223 | " #根据以上公式,在时刻0\n", 224 | " #v = [0.1 * gradient_0]\n", 225 | "\n", 226 | " #这可能太过于小,为了消除这个影响,需要做偏差修正,也就是除以系数\n", 227 | " #v = 0.1 * sigma[0.9**(t-i) * gradient_i]\n", 228 | " #S = 0.001 * sigma[0.999**(t-i) * gradient_i**2]\n", 229 | " \n", 230 | " #将梯度的系数部分整理得到\n", 231 | " #0.1 * sigma[0.9**(t-i)] = 1-0.9**t\n", 232 | "\n", 233 | " #偏差修正\n", 234 | " v_hat_w = v_w / (1 - 0.9**t)\n", 235 | " v_hat_b = v_b / (1 - 0.9**t)\n", 236 | " S_hat_w = S_w / (1 - 0.999**t)\n", 237 | " S_hat_b = S_b / (1 - 0.999**t)\n", 238 | "\n", 239 | " #下面是adam参数更新的公式\n", 240 | " #这里的1e-2是超参数lr\n", 241 | " gradient_w = (1e-2 * v_hat_w) / (S_hat_w**0.5 + 1e-6)\n", 242 | " gradient_b = (1e-2 * v_hat_b) / (S_hat_b**0.5 + 1e-6)\n", 243 | "\n", 244 | " #更新参数\n", 245 | " w -= gradient_w\n", 246 | " b -= gradient_b\n", 247 | "\n", 248 | " plt_x.append(t)\n", 249 | " plt_y.append(total_loss())\n", 250 | "\n", 251 | " if t % 500 == 0:\n", 252 | " print(t, v_hat_w[:2], S_hat_w[:2], total_loss())" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 8, 258 | "id": "0471a70d", 259 | "metadata": { 260 | "scrolled": true 261 | }, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "image/png": "\n", 266 | "text/plain": [ 267 | "
" 268 | ] 269 | }, 270 | "metadata": { 271 | "needs_background": "light" 272 | }, 273 | "output_type": "display_data" 274 | } 275 | ], 276 | "source": [ 277 | "from matplotlib import pyplot as plt\n", 278 | "%matplotlib inline\n", 279 | "\n", 280 | "plt.plot(plt_x, plt_y)\n", 281 | "plt.show()" 282 | ] 283 | } 284 | ], 285 | "metadata": { 286 | "kernelspec": { 287 | "display_name": "Python 3 (ipykernel)", 288 | "language": "python", 289 | "name": "python3" 290 | }, 291 | "language_info": { 292 | "codemirror_mode": { 293 | "name": "ipython", 294 | "version": 3 295 | }, 296 | "file_extension": ".py", 297 | "mimetype": "text/x-python", 298 | "name": "python", 299 | "nbconvert_exporter": "python", 300 | "pygments_lexer": "ipython3", 301 | "version": "3.8.11" 302 | } 303 | }, 304 | "nbformat": 4, 305 | "nbformat_minor": 5 306 | } 307 | -------------------------------------------------------------------------------- /1.梯度下降.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4fcd93aa", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/plain": [ 12 | "((1503, 5), (1503,))" 13 | ] 14 | }, 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "output_type": "execute_result" 18 | } 19 | ], 20 | "source": [ 21 | "import numpy as np\n", 22 | "\n", 23 | "#加载数据\n", 24 | "data = np.loadtxt(fname='./线性数据.csv', delimiter='\\t')\n", 25 | "\n", 26 | "#标准化\n", 27 | "data -= data.mean(axis=0)\n", 28 | "data /= data.std(axis=0)\n", 29 | "\n", 30 | "x = data[:, :-1]\n", 31 | "y = data[:, -1]\n", 32 | "\n", 33 | "x.shape, y.shape" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "cc6c8e3c", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "#常量\n", 44 | "N, M = x.shape\n", 45 | "\n", 46 | "#变量\n", 47 | "w = np.ones(M)\n", 48 | "b = 0" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | 
"execution_count": 3, 54 | "id": "92163201", 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "0.6590042695516539" 61 | ] 62 | }, 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "#预测函数\n", 70 | "def predict(x):\n", 71 | " return w.dot(x) + b\n", 72 | "\n", 73 | "\n", 74 | "predict(x[0])" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "id": "a7bb7a80", 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "7.367867692433937" 87 | ] 88 | }, 89 | "execution_count": 4, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "#求loss,MSELoss\n", 96 | "def get_loss():\n", 97 | " loss = 0\n", 98 | " for i in range(N):\n", 99 | " pred = predict(x[i])\n", 100 | " loss += (pred - y[i])**2\n", 101 | " return loss / N\n", 102 | "\n", 103 | "\n", 104 | "get_loss()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 5, 110 | "id": "8027d213", 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/plain": [ 116 | "(array([2.03668543, 2.38225639, 1.02215384, 2.13526642, 3.22327899]),\n", 117 | " 0.0010000000036924916)" 118 | ] 119 | }, 120 | "execution_count": 5, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "def get_gradient():\n", 127 | " global w\n", 128 | " global b\n", 129 | "\n", 130 | " eps = 1e-3\n", 131 | "\n", 132 | " loss_before = get_loss()\n", 133 | "\n", 134 | " gradient_w = np.empty(M)\n", 135 | " for i in range(M):\n", 136 | " w[i] += eps\n", 137 | " loss_after = get_loss()\n", 138 | " w[i] -= eps\n", 139 | " gradient_w[i] = (loss_after - loss_before) / eps\n", 140 | "\n", 141 | " b += eps\n", 142 | " loss_after = get_loss()\n", 143 | " b -= eps\n", 144 | " gradient_b = (loss_after - loss_before) / eps\n", 145 | "\n", 146 | " return gradient_w, 
gradient_b\n", 147 | "\n", 148 | "\n", 149 | "get_gradient()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "id": "c371c6a4", 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "0 7.112757670092038\n", 163 | "50 1.7854577366414703\n", 164 | "100 0.8188794143216034\n", 165 | "150 0.5927178198131446\n", 166 | "200 0.5305917673804184\n", 167 | "250 0.5095310094726683\n", 168 | "300 0.5002280440367376\n", 169 | "350 0.4950505954913211\n", 170 | "400 0.49174823787396005\n", 171 | "450 0.48950848092220356\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "plt_x = []\n", 177 | "plt_y = []\n", 178 | "for i in range(500):\n", 179 | " gradient_w, gradient_b = get_gradient()\n", 180 | " w -= gradient_w * 1e-2\n", 181 | " b -= gradient_b * 1e-2\n", 182 | "\n", 183 | " plt_x.append(i)\n", 184 | " plt_y.append(get_loss())\n", 185 | "\n", 186 | " if i % 50 == 0:\n", 187 | " print(i, get_loss())" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 7, 193 | "id": "0471a70d", 194 | "metadata": { 195 | "scrolled": true 196 | }, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "image/png": "\n", 201 | "text/plain": [ 202 | "
" 203 | ] 204 | }, 205 | "metadata": { 206 | "needs_background": "light" 207 | }, 208 | "output_type": "display_data" 209 | } 210 | ], 211 | "source": [ 212 | "from matplotlib import pyplot as plt\n", 213 | "%matplotlib inline\n", 214 | "\n", 215 | "plt.plot(plt_x, plt_y)\n", 216 | "plt.show()" 217 | ] 218 | } 219 | ], 220 | "metadata": { 221 | "kernelspec": { 222 | "display_name": "Python 3 (ipykernel)", 223 | "language": "python", 224 | "name": "python3" 225 | }, 226 | "language_info": { 227 | "codemirror_mode": { 228 | "name": "ipython", 229 | "version": 3 230 | }, 231 | "file_extension": ".py", 232 | "mimetype": "text/x-python", 233 | "name": "python", 234 | "nbconvert_exporter": "python", 235 | "pygments_lexer": "ipython3", 236 | "version": "3.8.11" 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 5 241 | } 242 | -------------------------------------------------------------------------------- /2.随机梯度下降.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4fcd93aa", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/plain": [ 12 | "((1503, 5), (1503,))" 13 | ] 14 | }, 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "output_type": "execute_result" 18 | } 19 | ], 20 | "source": [ 21 | "import numpy as np\n", 22 | "\n", 23 | "#加载数据\n", 24 | "data = np.loadtxt(fname='./线性数据.csv', delimiter='\\t')\n", 25 | "\n", 26 | "#标准化\n", 27 | "data -= data.mean(axis=0)\n", 28 | "data /= data.std(axis=0)\n", 29 | "\n", 30 | "x = data[:, :-1]\n", 31 | "y = data[:, -1]\n", 32 | "\n", 33 | "x.shape, y.shape" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "cc6c8e3c", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "#常量\n", 44 | "N, M = x.shape\n", 45 | "\n", 46 | "#变量\n", 47 | "w = np.ones(M)\n", 48 | "b = 0" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | 
"execution_count": 3, 54 | "id": "92163201", 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "0.6590042695516539" 61 | ] 62 | }, 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "#预测函数\n", 70 | "def predict(x):\n", 71 | " return w.dot(x) + b\n", 72 | "\n", 73 | "\n", 74 | "predict(x[0])" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "id": "a7bb7a80", 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "0.21258140154187247" 87 | ] 88 | }, 89 | "execution_count": 4, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "#求loss,MSELoss\n", 96 | "def get_loss(x, y):\n", 97 | " pred = predict(x)\n", 98 | " loss = (pred - y)**2\n", 99 | " return loss\n", 100 | "\n", 101 | "\n", 102 | "get_loss(x[0], y[0])" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 5, 108 | "id": "8027d213", 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "(array([-0.61003339, -1.05581946, 1.66242713, 1.21242212, -0.59417855]),\n", 115 | " 0.923131013558981)" 116 | ] 117 | }, 118 | "execution_count": 5, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "def get_gradient(x, y):\n", 125 | " global w\n", 126 | " global b\n", 127 | "\n", 128 | " eps = 1e-3\n", 129 | "\n", 130 | " loss_before = get_loss(x, y)\n", 131 | "\n", 132 | " gradient_w = np.empty(M)\n", 133 | " for i in range(M):\n", 134 | " w[i] += eps\n", 135 | " loss_after = get_loss(x, y)\n", 136 | " w[i] -= eps\n", 137 | " gradient_w[i] = (loss_after - loss_before) / eps\n", 138 | "\n", 139 | " b += eps\n", 140 | " loss_after = get_loss(x, y)\n", 141 | " b -= eps\n", 142 | " gradient_b = (loss_after - loss_before) / eps\n", 143 | "\n", 144 | " return gradient_w, gradient_b\n", 145 | "\n", 146 | "\n", 
147 | "get_gradient(x[0], y[0])" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 6, 153 | "id": "f39e0125", 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "11073.905141728206" 160 | ] 161 | }, 162 | "execution_count": 6, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "def total_loss():\n", 169 | " loss = 0\n", 170 | " for i in range(N):\n", 171 | " loss += get_loss(x[i], y[i])\n", 172 | " return loss\n", 173 | "\n", 174 | "\n", 175 | "total_loss()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 7, 181 | "id": "c371c6a4", 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "0 11038.895201527894\n", 189 | "150 6696.3736283721655\n", 190 | "300 4354.119709336485\n", 191 | "450 3004.4953025777063\n", 192 | "600 2310.68634531403\n", 193 | "750 1839.580107951638\n", 194 | "900 1549.8047186884628\n", 195 | "1050 1278.2079624059054\n", 196 | "1200 1099.1810250366634\n", 197 | "1350 986.6025037947752\n", 198 | "1500 921.4757031198328\n", 199 | "1650 879.159069825457\n", 200 | "1800 853.2767252227716\n", 201 | "1950 835.2496534941863\n", 202 | "2100 812.3758750332744\n", 203 | "2250 794.1165878394305\n", 204 | "2400 786.9647280480957\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "plt_x = []\n", 210 | "plt_y = []\n", 211 | "for epoch in range(2500):\n", 212 | " i = np.random.randint(N)\n", 213 | " gradient_w, gradient_b = get_gradient(x[i], y[i])\n", 214 | " w -= gradient_w * 1e-3\n", 215 | " b -= gradient_b * 1e-3\n", 216 | "\n", 217 | " plt_x.append(epoch)\n", 218 | " plt_y.append(total_loss())\n", 219 | "\n", 220 | " if epoch % 150 == 0:\n", 221 | " print(epoch, total_loss())" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 8, 227 | "id": "0471a70d", 228 | "metadata": { 229 | "scrolled": true 230 | }, 231 | 
"outputs": [ 232 | { 233 | "data": { 234 | "image/png": "\n", 235 | "text/plain": [ 236 | "
" 237 | ] 238 | }, 239 | "metadata": { 240 | "needs_background": "light" 241 | }, 242 | "output_type": "display_data" 243 | } 244 | ], 245 | "source": [ 246 | "from matplotlib import pyplot as plt\n", 247 | "%matplotlib inline\n", 248 | "\n", 249 | "plt.plot(plt_x, plt_y)\n", 250 | "plt.show()" 251 | ] 252 | } 253 | ], 254 | "metadata": { 255 | "kernelspec": { 256 | "display_name": "Python 3 (ipykernel)", 257 | "language": "python", 258 | "name": "python3" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": { 262 | "name": "ipython", 263 | "version": 3 264 | }, 265 | "file_extension": ".py", 266 | "mimetype": "text/x-python", 267 | "name": "python", 268 | "nbconvert_exporter": "python", 269 | "pygments_lexer": "ipython3", 270 | "version": "3.8.11" 271 | } 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 5 275 | } 276 | -------------------------------------------------------------------------------- /3.momentum.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4fcd93aa", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/plain": [ 12 | "((1503, 5), (1503,))" 13 | ] 14 | }, 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "output_type": "execute_result" 18 | } 19 | ], 20 | "source": [ 21 | "import numpy as np\n", 22 | "\n", 23 | "#加载数据\n", 24 | "data = np.loadtxt(fname='./线性数据.csv', delimiter='\\t')\n", 25 | "\n", 26 | "#标准化\n", 27 | "data -= data.mean(axis=0)\n", 28 | "data /= data.std(axis=0)\n", 29 | "\n", 30 | "x = data[:, :-1]\n", 31 | "y = data[:, -1]\n", 32 | "\n", 33 | "x.shape, y.shape" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "cc6c8e3c", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "#常量\n", 44 | "N, M = x.shape\n", 45 | "\n", 46 | "#变量\n", 47 | "w = np.ones(M)\n", 48 | "b = 0\n", 49 | "\n", 50 | "#动量都初始化为0\n", 51 | "momentum_w = 
np.zeros(M)\n", 52 | "momentum_b = 0" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "id": "92163201", 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "0.6590042695516539" 65 | ] 66 | }, 67 | "execution_count": 3, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "#预测函数\n", 74 | "def predict(x):\n", 75 | " return w.dot(x) + b\n", 76 | "\n", 77 | "\n", 78 | "predict(x[0])" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "id": "a7bb7a80", 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "0.21258140154187247" 91 | ] 92 | }, 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "#求loss,MSELoss\n", 100 | "def get_loss(x, y):\n", 101 | " pred = predict(x)\n", 102 | " loss = (pred - y)**2\n", 103 | " return loss\n", 104 | "\n", 105 | "\n", 106 | "get_loss(x[0], y[0])" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 5, 112 | "id": "8027d213", 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "(array([-0.61003339, -1.05581946, 1.66242713, 1.21242212, -0.59417855]),\n", 119 | " 0.923131013558981)" 120 | ] 121 | }, 122 | "execution_count": 5, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "def get_gradient(x, y):\n", 129 | " global w\n", 130 | " global b\n", 131 | "\n", 132 | " eps = 1e-3\n", 133 | "\n", 134 | " loss_before = get_loss(x, y)\n", 135 | "\n", 136 | " gradient_w = np.empty(M)\n", 137 | " for i in range(M):\n", 138 | " w[i] += eps\n", 139 | " loss_after = get_loss(x, y)\n", 140 | " w[i] -= eps\n", 141 | " gradient_w[i] = (loss_after - loss_before) / eps\n", 142 | "\n", 143 | " b += eps\n", 144 | " loss_after = get_loss(x, y)\n", 145 | " b -= eps\n", 146 | " gradient_b = (loss_after - 
loss_before) / eps\n", 147 | "\n", 148 | " return gradient_w, gradient_b\n", 149 | "\n", 150 | "\n", 151 | "get_gradient(x[0], y[0])" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 6, 157 | "id": "f39e0125", 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "11073.905141728206" 164 | ] 165 | }, 166 | "execution_count": 6, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "def total_loss():\n", 173 | " loss = 0\n", 174 | " for i in range(N):\n", 175 | " loss += get_loss(x[i], y[i])\n", 176 | " return loss\n", 177 | "\n", 178 | "\n", 179 | "total_loss()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 7, 185 | "id": "c371c6a4", 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "0 [-4.35683357 4.61375434 1.50384418 2.91083043 2.96213566] -4.022402214293841 11044.88570074484\n", 193 | "150 [ 1.26339579 2.13768158 0.49321428 -1.51848376 3.5785042 ] -1.7176170960084935 1967.1697066194426\n", 194 | "300 [ 5.80970624e-03 3.55685572e+00 3.82822053e+00 -2.58323564e-01\n", 195 | " 1.07002133e+01] 2.0649671248152166 947.1121093652372\n", 196 | "450 [-2.26148992 3.8055673 -3.18437755 -3.87469838 -0.61715981] 0.7950908166330994 815.4883875626289\n", 197 | "600 [1.55949644 0.79231053 3.14007815 3.71824435 0.98449926] -0.5781588479929312 783.2820118211715\n", 198 | "750 [ 3.59572027 -0.74530488 0.72892403 -0.15447051 -4.33429161] -2.0781779575632005 753.5637854055667\n", 199 | "900 [ 0.3379927 2.54425898 -4.14378216 0.92339395 0.76440147] 0.8774161483117863 769.3244441314099\n", 200 | "1050 [-1.00006534 -5.83176015 -5.28599293 -5.19888383 -3.62367306] 2.5986887635740827 742.322126662018\n", 201 | "1200 [ 1.32568154 -8.98944316 8.03060836 -1.84855209 -1.93601743] -4.04468557258265 753.4751326083475\n", 202 | "1350 [ 1.82402337 -1.95342977 1.07956753 
2.00744178 -4.75462661] -2.284159092719058 734.9285345602206\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "plt_x = []\n", 208 | "plt_y = []\n", 209 | "for epoch in range(1500):\n", 210 | " i = np.random.randint(N)\n", 211 | "\n", 212 | " gradient_w, gradient_b = get_gradient(x[i], y[i])\n", 213 | "\n", 214 | " #这是更新动量的数学公式,0.8是过去动量的权重\n", 215 | " momentum_w = 0.8 * momentum_w + gradient_w\n", 216 | " momentum_b = 0.8 * momentum_b + gradient_b\n", 217 | "\n", 218 | " #这里更新参数不再使用梯度,而是使用动量\n", 219 | " w -= momentum_w * 1e-3\n", 220 | " b -= momentum_b * 1e-3\n", 221 | "\n", 222 | " #思考一下,在时刻0,动量都是0.此时更新动量,动量就等于梯度.\n", 223 | " #也就是说,在时刻0,其实就是在用梯度下降.\n", 224 | " #时刻1,是上一个时刻的梯度乘以0.8,再加上当前时刻的梯度\n", 225 | " #所以在时刻1,差不多可以认为是梯度乘以了1.8.不过这里面两部分的梯度是在两个不同的点上评估出来的.\n", 226 | " #在时刻2,差不多等同于时刻1.往后都差不多.\n", 227 | "\n", 228 | " plt_x.append(epoch)\n", 229 | " plt_y.append(total_loss())\n", 230 | "\n", 231 | " if epoch % 150 == 0:\n", 232 | " print(epoch, momentum_w, momentum_b, total_loss())" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 8, 238 | "id": "0471a70d", 239 | "metadata": { 240 | "scrolled": true 241 | }, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "image/png": "\n", 246 | "text/plain": [ 247 | "
" 248 | ] 249 | }, 250 | "metadata": { 251 | "needs_background": "light" 252 | }, 253 | "output_type": "display_data" 254 | } 255 | ], 256 | "source": [ 257 | "from matplotlib import pyplot as plt\n", 258 | "%matplotlib inline\n", 259 | "\n", 260 | "plt.plot(plt_x, plt_y)\n", 261 | "plt.show()" 262 | ] 263 | } 264 | ], 265 | "metadata": { 266 | "kernelspec": { 267 | "display_name": "Python 3 (ipykernel)", 268 | "language": "python", 269 | "name": "python3" 270 | }, 271 | "language_info": { 272 | "codemirror_mode": { 273 | "name": "ipython", 274 | "version": 3 275 | }, 276 | "file_extension": ".py", 277 | "mimetype": "text/x-python", 278 | "name": "python", 279 | "nbconvert_exporter": "python", 280 | "pygments_lexer": "ipython3", 281 | "version": "3.8.11" 282 | } 283 | }, 284 | "nbformat": 4, 285 | "nbformat_minor": 5 286 | } 287 | -------------------------------------------------------------------------------- /4.ada_grad.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4fcd93aa", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/plain": [ 12 | "((1503, 5), (1503,))" 13 | ] 14 | }, 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "output_type": "execute_result" 18 | } 19 | ], 20 | "source": [ 21 | "import numpy as np\n", 22 | "\n", 23 | "#加载数据\n", 24 | "data = np.loadtxt(fname='./线性数据.csv', delimiter='\\t')\n", 25 | "\n", 26 | "#标准化\n", 27 | "data -= data.mean(axis=0)\n", 28 | "data /= data.std(axis=0)\n", 29 | "\n", 30 | "x = data[:, :-1]\n", 31 | "y = data[:, -1]\n", 32 | "\n", 33 | "x.shape, y.shape" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "cc6c8e3c", 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "array([0., 0., 0., 0., 0.])" 46 | ] 47 | }, 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | 
} 52 | ], 53 | "source": [ 54 | "#常量\n", 55 | "N, M = x.shape\n", 56 | "\n", 57 | "#变量\n", 58 | "w = np.ones(M)\n", 59 | "b = 0\n", 60 | "\n", 61 | "#初始化S为全0\n", 62 | "S_w = np.zeros(M)\n", 63 | "S_b = 0\n", 64 | "\n", 65 | "S_w" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "id": "92163201", 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "0.6590042695516539" 78 | ] 79 | }, 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "#预测函数\n", 87 | "def predict(x):\n", 88 | " return w.dot(x) + b\n", 89 | "\n", 90 | "\n", 91 | "predict(x[0])" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "id": "a7bb7a80", 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "0.21258140154187247" 104 | ] 105 | }, 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "#求loss,MSELoss\n", 113 | "def get_loss(x, y):\n", 114 | " pred = predict(x)\n", 115 | " loss = (pred - y)**2\n", 116 | " return loss\n", 117 | "\n", 118 | "\n", 119 | "get_loss(x[0], y[0])" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "id": "8027d213", 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "(array([-0.61003339, -1.05581946, 1.66242713, 1.21242212, -0.59417855]),\n", 132 | " 0.923131013558981)" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "def get_gradient(x, y):\n", 142 | " global w\n", 143 | " global b\n", 144 | "\n", 145 | " eps = 1e-3\n", 146 | "\n", 147 | " loss_before = get_loss(x, y)\n", 148 | "\n", 149 | " gradient_w = np.empty(M)\n", 150 | " for i in range(M):\n", 151 | " w[i] += eps\n", 152 | " loss_after = get_loss(x, y)\n", 153 | " w[i] -= 
eps\n", 154 | " gradient_w[i] = (loss_after - loss_before) / eps\n", 155 | "\n", 156 | " b += eps\n", 157 | " loss_after = get_loss(x, y)\n", 158 | " b -= eps\n", 159 | " gradient_b = (loss_after - loss_before) / eps\n", 160 | "\n", 161 | " return gradient_w, gradient_b\n", 162 | "\n", 163 | "\n", 164 | "get_gradient(x[0], y[0])" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 6, 170 | "id": "f39e0125", 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "11073.905141728206" 177 | ] 178 | }, 179 | "execution_count": 6, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "def total_loss():\n", 186 | " loss = 0\n", 187 | " for i in range(N):\n", 188 | " loss += get_loss(x[i], y[i])\n", 189 | " return loss\n", 190 | "\n", 191 | "\n", 192 | "total_loss()" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 7, 198 | "id": "c371c6a4", 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "0 [0.02348118 0.05140723 0.0153264 0.01141825 0.03465209] 0.014058087368505013 10246.765269340094\n", 206 | "150 [0.00215905 0.00213966 0.0025248 0.0023185 0.00215242] 0.0024356945540773096 2658.3900946275558\n", 207 | "300 [0.00159518 0.00180842 0.0019631 0.00191729 0.00173008] 0.0019876188630781272 1600.8100316915763\n", 208 | "450 [0.00148661 0.00166541 0.00178354 0.00173962 0.00157404] 0.0017842433083993126 1186.465991584585\n", 209 | "600 [0.00139184 0.00156629 0.00167125 0.00164534 0.00149001] 0.0016826253755351525 1018.0084411998928\n", 210 | "750 [0.00132517 0.00150278 0.00158741 0.00157468 0.00142465] 0.001594262705724423 927.9759431415143\n", 211 | "900 [0.0012664 0.00144711 0.00152095 0.00150801 0.00135752] 0.0015236251237426357 861.6745940332113\n", 212 | "1050 [0.00123801 0.00141408 0.0014759 0.00146659 0.00133255] 0.001476298163704934 824.5775032016687\n", 213 
| "1200 [0.00121116 0.00136314 0.001434 0.00142441 0.00128068] 0.001429332532480186 807.1329761191264\n", 214 | "1350 [0.00116843 0.0013307 0.00139645 0.00138838 0.00124704] 0.0013899172750459162 791.1949098503449\n", 215 | "1500 [0.00115129 0.00130284 0.0013622 0.00136147 0.00122016] 0.0013561117340229432 788.5470016064282\n", 216 | "1650 [0.00111086 0.00126146 0.00132726 0.00132571 0.00119364] 0.0013234756456931064 783.241050040833\n", 217 | "1800 [0.00105912 0.00119525 0.00128706 0.00128536 0.00116119] 0.0012821836202760327 776.5490903516861\n", 218 | "1950 [0.00101795 0.00115106 0.00125265 0.00124849 0.00112686] 0.0012443142251112867 767.7869135624381\n", 219 | "2100 [0.00099775 0.00112372 0.00122065 0.00122 0.00111091] 0.0012130252246838496 762.3245228610506\n", 220 | "2250 [0.00098704 0.00109155 0.00119765 0.00120011 0.00109044] 0.00119230318143276 757.1151773657198\n", 221 | "2400 [0.00096106 0.00106878 0.00117239 0.00118274 0.00108084] 0.0011689278828959015 754.8130624984661\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "plt_x = []\n", 227 | "plt_y = []\n", 228 | "for epoch in range(2500):\n", 229 | " i = np.random.randint(N)\n", 230 | " gradient_w, gradient_b = get_gradient(x[i], y[i])\n", 231 | "\n", 232 | " #adagrad的特点是每个变量都有属于自己的lr\n", 233 | " #要计算各个变量的lr,先要计算S\n", 234 | " #这是S的计算公式\n", 235 | " S_w = S_w + gradient_w**2\n", 236 | " S_b = S_b + gradient_b**2\n", 237 | "\n", 238 | " #计算lr的公式,其中的1e-1是原本的lr,1e-6是防止除0的\n", 239 | " lr_w = 1e-1 / ((S_w + 1e-6)**0.5)\n", 240 | " lr_b = 1e-1 / ((S_b + 1e-6)**0.5)\n", 241 | "\n", 242 | " #所以在时刻0,lr约等于原本的lr(1e-1)除以梯度的绝对值\n", 243 | " #梯度大的变量会有小lr,梯度小的变量会有大lr\n", 244 | " #往后的每一个时刻,S都会累加历史上所有梯度的平方(不只是上一步),所以lr只会越来越小\n", 245 | "\n", 246 | " w -= gradient_w * lr_w\n", 247 | " b -= gradient_b * lr_b\n", 248 | "\n", 249 | " plt_x.append(epoch)\n", 250 | " plt_y.append(total_loss())\n", 251 | "\n", 252 | " if epoch % 150 == 0:\n", 253 | " print(epoch, lr_w, lr_b, total_loss())" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | 
"execution_count": 8, 259 | "id": "0471a70d", 260 | "metadata": { 261 | "scrolled": true 262 | }, 263 | "outputs": [ 264 | { 265 | "data": { 266 | "image/png": "\n", 267 | "text/plain": [ 268 | "
" 269 | ] 270 | }, 271 | "metadata": { 272 | "needs_background": "light" 273 | }, 274 | "output_type": "display_data" 275 | } 276 | ], 277 | "source": [ 278 | "from matplotlib import pyplot as plt\n", 279 | "%matplotlib inline\n", 280 | "\n", 281 | "plt.plot(plt_x, plt_y)\n", 282 | "plt.show()" 283 | ] 284 | } 285 | ], 286 | "metadata": { 287 | "kernelspec": { 288 | "display_name": "Python 3 (ipykernel)", 289 | "language": "python", 290 | "name": "python3" 291 | }, 292 | "language_info": { 293 | "codemirror_mode": { 294 | "name": "ipython", 295 | "version": 3 296 | }, 297 | "file_extension": ".py", 298 | "mimetype": "text/x-python", 299 | "name": "python", 300 | "nbconvert_exporter": "python", 301 | "pygments_lexer": "ipython3", 302 | "version": "3.8.11" 303 | } 304 | }, 305 | "nbformat": 4, 306 | "nbformat_minor": 5 307 | } 308 | -------------------------------------------------------------------------------- /6.ada_delta.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4fcd93aa", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/plain": [ 12 | "((1503, 5), (1503,))" 13 | ] 14 | }, 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "output_type": "execute_result" 18 | } 19 | ], 20 | "source": [ 21 | "import numpy as np\n", 22 | "\n", 23 | "#加载数据\n", 24 | "data = np.loadtxt(fname='./线性数据.csv', delimiter='\\t')\n", 25 | "\n", 26 | "#标准化\n", 27 | "data -= data.mean(axis=0)\n", 28 | "data /= data.std(axis=0)\n", 29 | "\n", 30 | "x = data[:, :-1]\n", 31 | "y = data[:, -1]\n", 32 | "\n", 33 | "x.shape, y.shape" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "cc6c8e3c", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "#常量\n", 44 | "N, M = x.shape\n", 45 | "\n", 46 | "#变量\n", 47 | "w = np.ones(M)\n", 48 | "b = 0\n", 49 | "\n", 50 | "#初始化S为全0\n", 51 | "S_w = np.zeros(M)\n", 
52 | "S_b = 0\n", 53 | "\n", 54 | "#初始化delta为全0\n", 55 | "delta_w = np.zeros(M)\n", 56 | "delta_b = 0" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "id": "92163201", 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "0.6590042695516539" 69 | ] 70 | }, 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "#预测函数\n", 78 | "def predict(x):\n", 79 | " return w.dot(x) + b\n", 80 | "\n", 81 | "\n", 82 | "predict(x[0])" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "id": "a7bb7a80", 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/plain": [ 94 | "0.21258140154187247" 95 | ] 96 | }, 97 | "execution_count": 4, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "#求loss,MSELoss\n", 104 | "def get_loss(x, y):\n", 105 | " pred = predict(x)\n", 106 | " loss = (pred - y)**2\n", 107 | " return loss\n", 108 | "\n", 109 | "\n", 110 | "get_loss(x[0], y[0])" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 5, 116 | "id": "8027d213", 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/plain": [ 122 | "(array([-0.61003339, -1.05581946, 1.66242713, 1.21242212, -0.59417855]),\n", 123 | " 0.923131013558981)" 124 | ] 125 | }, 126 | "execution_count": 5, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "def get_gradient(x, y):\n", 133 | " global w\n", 134 | " global b\n", 135 | "\n", 136 | " eps = 1e-3\n", 137 | "\n", 138 | " loss_before = get_loss(x, y)\n", 139 | "\n", 140 | " gradient_w = np.empty(M)\n", 141 | " for i in range(M):\n", 142 | " w[i] += eps\n", 143 | " loss_after = get_loss(x, y)\n", 144 | " w[i] -= eps\n", 145 | " gradient_w[i] = (loss_after - loss_before) / eps\n", 146 | "\n", 147 | " b += eps\n", 148 | " loss_after = get_loss(x, 
y)\n", 149 | " b -= eps\n", 150 | " gradient_b = (loss_after - loss_before) / eps\n", 151 | "\n", 152 | " return gradient_w, gradient_b\n", 153 | "\n", 154 | "\n", 155 | "get_gradient(x[0], y[0])" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 6, 161 | "id": "f39e0125", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "11073.905141728206" 168 | ] 169 | }, 170 | "execution_count": 6, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "def total_loss():\n", 177 | " loss = 0\n", 178 | " for i in range(N):\n", 179 | " loss += get_loss(x[i], y[i])\n", 180 | " return loss\n", 181 | "\n", 182 | "\n", 183 | "total_loss()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 7, 189 | "id": "c371c6a4", 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "0 [9.99999848e-07 9.99999845e-07] 9.999998984693441e-07 11059.213325989294\n", 197 | "500 [2.66451164e-06 2.89372373e-06] 3.595841972186342e-06 7623.402923852171\n", 198 | "1000 [1.49058551e-06 1.77039119e-06] 2.3569819378456833e-06 5116.6673548382705\n", 199 | "1500 [1.54060667e-07 1.52269411e-07] 4.854100489197519e-07 3453.7618700421162\n", 200 | "2000 [4.5399257e-06 3.6700749e-06] 4.182369642302336e-06 2259.06518603289\n", 201 | "2500 [1.65797772e-06 1.05985930e-06] 1.3383495167537935e-06 1438.5906667372406\n", 202 | "3000 [2.96914906e-07 6.56168224e-07] 6.131919613839703e-07 1051.9695231637406\n", 203 | "3500 [1.48431059e-06 1.85664992e-06] 3.2354643896326765e-06 886.751528580343\n", 204 | "4000 [3.06717189e-06 1.53765326e-06] 7.968217077828009e-06 826.4649373166238\n", 205 | "4500 [1.83228209e-06 1.98744616e-06] 3.0032937384269872e-06 794.0081885344441\n", 206 | "5000 [5.05157589e-07 6.08365156e-07] 6.43608532358165e-07 798.5606607279586\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "plt_x = 
[]\n", 212 | "plt_y = []\n", 213 | "\n", 214 | "for epoch in range(5500):\n", 215 | " i = np.random.randint(N)\n", 216 | " gradient_w, gradient_b = get_gradient(x[i], y[i])\n", 217 | "\n", 218 | " #ada_delta算法不需要设定超参数lr\n", 219 | " #它需要维持两个变量,delta和S\n", 220 | "\n", 221 | " #S的计算和rmsprop完全一致\n", 222 | " S_w = 0.2 * S_w + 0.8 * gradient_w**2\n", 223 | " S_b = 0.2 * S_b + 0.8 * gradient_b**2\n", 224 | "\n", 225 | " #计算lr的公式,这里的1e-6是为了防止除0\n", 226 | " lr = (delta_w + 1e-6) / (S_w + 1e-6)\n", 227 | " gradient_w = lr**0.5 * gradient_w\n", 228 | "\n", 229 | " lr = (delta_b + 1e-6) / (S_b + 1e-6)\n", 230 | " gradient_b = lr**0.5 * gradient_b\n", 231 | "\n", 232 | " #更新参数\n", 233 | " w -= gradient_w\n", 234 | " b -= gradient_b\n", 235 | "\n", 236 | " #更新delta,这里的两个系数和计算S时用的要一样\n", 237 | " delta_w = 0.2 * delta_w + 0.8 * gradient_w**2\n", 238 | " delta_b = 0.2 * delta_b + 0.8 * gradient_b**2\n", 239 | "\n", 240 | " #思考一下,在时刻0,S就是梯度的平方乘以0.8\n", 241 | " #所以在一开始的时候,S是比较大的.但delta还是0\n", 242 | " #所以一开始的时候lr是比较小的.\n", 243 | " #delta更新为变量更新量的平方*0.8\n", 244 | " #所以delta当中差不多相当于存储了变量更新量的历史信息\n", 245 | " #所以最后的lr,应该是取两者的比值\n", 246 | "\n", 247 | " plt_x.append(epoch)\n", 248 | " plt_y.append(total_loss())\n", 249 | "\n", 250 | " if epoch % 500 == 0:\n", 251 | " print(epoch, delta_w[:2], delta_b, total_loss())" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 8, 257 | "id": "0471a70d", 258 | "metadata": { 259 | "scrolled": true 260 | }, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "image/png": "\n", 265 | "text/plain": [ 266 | "
" 267 | ] 268 | }, 269 | "metadata": { 270 | "needs_background": "light" 271 | }, 272 | "output_type": "display_data" 273 | } 274 | ], 275 | "source": [ 276 | "from matplotlib import pyplot as plt\n", 277 | "%matplotlib inline\n", 278 | "\n", 279 | "plt.plot(plt_x, plt_y)\n", 280 | "plt.show()" 281 | ] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Python 3 (ipykernel)", 287 | "language": "python", 288 | "name": "python3" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.8.11" 301 | } 302 | }, 303 | "nbformat": 4, 304 | "nbformat_minor": 5 305 | } 306 | -------------------------------------------------------------------------------- /7.adam.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4fcd93aa", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/plain": [ 12 | "((1503, 5), (1503,))" 13 | ] 14 | }, 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "output_type": "execute_result" 18 | } 19 | ], 20 | "source": [ 21 | "import numpy as np\n", 22 | "\n", 23 | "#加载数据\n", 24 | "data = np.loadtxt(fname='./线性数据.csv', delimiter='\\t')\n", 25 | "\n", 26 | "#标准化\n", 27 | "data -= data.mean(axis=0)\n", 28 | "data /= data.std(axis=0)\n", 29 | "\n", 30 | "x = data[:, :-1]\n", 31 | "y = data[:, -1]\n", 32 | "\n", 33 | "x.shape, y.shape" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "cc6c8e3c", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "#常量\n", 44 | "N, M = x.shape\n", 45 | "\n", 46 | "#变量\n", 47 | "w = np.ones(M)\n", 48 | "b = 0\n", 49 | "\n", 50 | "#初始化S为全0\n", 51 | "S_w = np.zeros(M)\n", 52 | 
"S_b = 0\n", 53 | "\n", 54 | "v_w = np.zeros(M)\n", 55 | "v_b = 0" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "id": "92163201", 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "0.6590042695516539" 68 | ] 69 | }, 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "#预测函数\n", 77 | "def predict(x):\n", 78 | " return w.dot(x) + b\n", 79 | "\n", 80 | "\n", 81 | "predict(x[0])" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "id": "a7bb7a80", 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "0.21258140154187247" 94 | ] 95 | }, 96 | "execution_count": 4, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "#求loss,MSELoss\n", 103 | "def get_loss(x, y):\n", 104 | " pred = predict(x)\n", 105 | " loss = (pred - y)**2\n", 106 | " return loss\n", 107 | "\n", 108 | "\n", 109 | "get_loss(x[0], y[0])" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 5, 115 | "id": "8027d213", 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "(array([-0.61003339, -1.05581946, 1.66242713, 1.21242212, -0.59417855]),\n", 122 | " 0.923131013558981)" 123 | ] 124 | }, 125 | "execution_count": 5, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "def get_gradient(x, y):\n", 132 | " global w\n", 133 | " global b\n", 134 | "\n", 135 | " eps = 1e-3\n", 136 | "\n", 137 | " loss_before = get_loss(x, y)\n", 138 | "\n", 139 | " gradient_w = np.empty(M)\n", 140 | " for i in range(M):\n", 141 | " w[i] += eps\n", 142 | " loss_after = get_loss(x, y)\n", 143 | " w[i] -= eps\n", 144 | " gradient_w[i] = (loss_after - loss_before) / eps\n", 145 | "\n", 146 | " b += eps\n", 147 | " loss_after = get_loss(x, y)\n", 148 | " b -= eps\n", 149 | " 
gradient_b = (loss_after - loss_before) / eps\n", 150 | "\n", 151 | " return gradient_w, gradient_b\n", 152 | "\n", 153 | "\n", 154 | "get_gradient(x[0], y[0])" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 6, 160 | "id": "f39e0125", 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "11073.905141728206" 167 | ] 168 | }, 169 | "execution_count": 6, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "def total_loss():\n", 176 | " loss = 0\n", 177 | " for i in range(N):\n", 178 | " loss += get_loss(x[i], y[i])\n", 179 | " return loss\n", 180 | "\n", 181 | "\n", 182 | "total_loss()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 7, 188 | "id": "c371c6a4", 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | "500 [0.08904127 0.71062389] [16.68083241 9.3870837 ] 1261.5118603011992\n", 196 | "1000 [-0.90368761 0.83072632] [8.3751223 4.76494681] 749.9516889645644\n", 197 | "1500 [-0.30210063 0.02406904] [5.98086602 3.792364 ] 745.5930763832396\n", 198 | "2000 [ 0.05851974 -0.52552324] [5.03242601 3.45097686] 734.1425192541457\n", 199 | "2500 [ 0.19785822 -0.16807166] [4.55122551 2.92449296] 763.0359728232618\n", 200 | "3000 [ 0.11518987 -0.08617003] [4.08077263 2.93377002] 746.4590779281906\n", 201 | "3500 [ 0.14053857 -0.11983124] [3.52452827 2.59132747] 783.7272638890388\n", 202 | "4000 [0.03610707 0.36194797] [3.26520532 2.42821712] 766.6407173851062\n", 203 | "4500 [-0.24672945 0.18166294] [3.41205737 3.01430322] 791.864659000636\n", 204 | "5000 [ 0.1558802 -0.19912752] [3.24213769 2.86521236] 788.53176983857\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "plt_x = []\n", 210 | "plt_y = []\n", 211 | "\n", 212 | "for t in range(1, 5500):\n", 213 | " i = np.random.randint(N)\n", 214 | " gradient_w, gradient_b = get_gradient(x[i], y[i])\n", 215 
| "\n", 216 | " v_w = 0.9 * v_w + 0.1 * gradient_w\n", 217 | " v_b = 0.9 * v_b + 0.1 * gradient_b\n", 218 | "\n", 219 | " #S的计算和rmsprop完全一致\n", 220 | " S_w = 0.999 * S_w + 0.001 * gradient_w**2\n", 221 | " S_b = 0.999 * S_b + 0.001 * gradient_b**2\n", 222 | "\n", 223 | " #根据以上公式,在第1步(t=1,循环从1开始)\n", 224 | " #v = [0.1 * gradient_1]\n", 225 | "\n", 226 | " #这可能太过于小,为了消除这个影响,需要做偏差修正,也就是除以系数\n", 227 | " #v = 0.1 * sigma[0.9**(t-i) * gradient_i],其中i从1到t\n", 228 | " #S = 0.001 * sigma[0.999**(t-i) * gradient_i**2],其中i从1到t\n", 229 | " \n", 230 | " #将梯度的系数部分整理得到\n", 231 | " #0.1 * sigma[0.9**(t-i)] = 1-0.9**t\n", 232 | "\n", 233 | " #偏差修正\n", 234 | " v_hat_w = v_w / (1 - 0.9**t)\n", 235 | " v_hat_b = v_b / (1 - 0.9**t)\n", 236 | " S_hat_w = S_w / (1 - 0.999**t)\n", 237 | " S_hat_b = S_b / (1 - 0.999**t)\n", 238 | "\n", 239 | " #下面是adam参数更新的公式\n", 240 | " #这里的1e-2是超参数lr\n", 241 | " gradient_w = (1e-2 * v_hat_w) / (S_hat_w**0.5 + 1e-6)\n", 242 | " gradient_b = (1e-2 * v_hat_b) / (S_hat_b**0.5 + 1e-6)\n", 243 | "\n", 244 | " #更新参数\n", 245 | " w -= gradient_w\n", 246 | " b -= gradient_b\n", 247 | "\n", 248 | " plt_x.append(t)\n", 249 | " plt_y.append(total_loss())\n", 250 | "\n", 251 | " if t % 500 == 0:\n", 252 | " print(t, v_hat_w[:2], S_hat_w[:2], total_loss())" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 8, 258 | "id": "0471a70d", 259 | "metadata": { 260 | "scrolled": true 261 | }, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "image/png": "\n", 266 | "text/plain": [ 267 | "
" 268 | ] 269 | }, 270 | "metadata": { 271 | "needs_background": "light" 272 | }, 273 | "output_type": "display_data" 274 | } 275 | ], 276 | "source": [ 277 | "from matplotlib import pyplot as plt\n", 278 | "%matplotlib inline\n", 279 | "\n", 280 | "plt.plot(plt_x, plt_y)\n", 281 | "plt.show()" 282 | ] 283 | } 284 | ], 285 | "metadata": { 286 | "kernelspec": { 287 | "display_name": "Python 3 (ipykernel)", 288 | "language": "python", 289 | "name": "python3" 290 | }, 291 | "language_info": { 292 | "codemirror_mode": { 293 | "name": "ipython", 294 | "version": 3 295 | }, 296 | "file_extension": ".py", 297 | "mimetype": "text/x-python", 298 | "name": "python", 299 | "nbconvert_exporter": "python", 300 | "pygments_lexer": "ipython3", 301 | "version": "3.8.11" 302 | } 303 | }, 304 | "nbformat": 4, 305 | "nbformat_minor": 5 306 | } 307 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 视频课程地址:https://www.bilibili.com/video/BV1M64y187qX/ 2 | --------------------------------------------------------------------------------