├── .gitignore ├── A 60 Minute Blitz ├── .DS_Store ├── 1.tensor_tutorial.ipynb ├── 2.autograd_tutorial.ipynb ├── 3.neural_networks_tutorial.ipynb └── 4.cifar10_tutorial.ipynb ├── Applications └── language_model │ ├── README.md │ ├── data.py │ ├── generate.py │ ├── generated.txt │ ├── main.py │ ├── model.pt │ └── model.py ├── HuggingfaceNLP ├── C1. Start Playing Transformers │ ├── 1. 直接使用pipeline.ipynb │ ├── 2. Transformer家族及基本概念.ipynb │ ├── 3. 端到端的背后.ipynb │ ├── 4. Models & Tokenizers.ipynb │ └── 5. 处理多个序列.ipynb └── C2. Fine-tuning Transformers │ ├── 1. 数据集预处理.ipynb │ ├── 2. 使用Trainer API来fine-tune.ipynb │ ├── 3. 用纯PyTorch来fine-tune.ipynb │ └── runs │ ├── Sep26_15-25-19_PC-201911051016 │ ├── 1632641123.3012567 │ │ └── events.out.tfevents.1632641123.PC-201911051016.50596.1 │ └── events.out.tfevents.1632641123.PC-201911051016.50596.0 │ ├── Sep26_15-36-43_PC-201911051016 │ ├── 1632641809.055524 │ │ └── events.out.tfevents.1632641809.PC-201911051016.50596.3 │ └── events.out.tfevents.1632641808.PC-201911051016.50596.2 │ ├── Sep26_15-37-55_PC-201911051016 │ ├── 1632641879.1103542 │ │ └── events.out.tfevents.1632641879.PC-201911051016.32468.1 │ └── events.out.tfevents.1632641879.PC-201911051016.32468.0 │ ├── Sep26_15-44-26_PC-201911051016 │ ├── 1632642271.2198026 │ │ └── events.out.tfevents.1632642271.PC-201911051016.32468.3 │ └── events.out.tfevents.1632642271.PC-201911051016.32468.2 │ ├── Sep26_15-54-05_PC-201911051016 │ ├── 1632642852.8538904 │ │ └── events.out.tfevents.1632642852.PC-201911051016.3052.1 │ └── events.out.tfevents.1632642852.PC-201911051016.3052.0 │ ├── Sep26_15-54-51_PC-201911051016 │ ├── 1632642898.3413022 │ │ └── events.out.tfevents.1632642898.PC-201911051016.3052.3 │ └── events.out.tfevents.1632642898.PC-201911051016.3052.2 │ └── Sep26_15-55-27_PC-201911051016 │ ├── 1632642935.0711265 │ └── events.out.tfevents.1632642935.PC-201911051016.34932.1 │ └── events.out.tfevents.1632642934.PC-201911051016.34932.0 ├── 使用transformers库.ipynb └── 李沐PyTorch ├── 1. 基础操作.ipynb ├── 2. 自动求导.ipynb └── 3. 线性预测模型.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # basic ignore: 2 | __pycache__/ 3 | .idea/ 4 | .ipynb_checkpoints/ 5 | .DS_Store 6 | 7 | # model weights 8 | weights/ 9 | 10 | 11 | # saved kws 12 | saved_words/ 13 | 14 | # data folder 15 | data/ 16 | datasets/ 17 | dataset/ 18 | 19 | # temp files 20 | temp/ 21 | 22 | 23 | -------------------------------------------------------------------------------- /A 60 Minute Blitz/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/A 60 Minute Blitz/.DS_Store -------------------------------------------------------------------------------- /A 60 Minute Blitz/1.tensor_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false, 8 | "jupyter": { 9 | "outputs_hidden": false 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "\n", 22 | "Tensors\n", 23 | "--------------------------------------------\n", 24 | "\n", 25 | "Tensors are a specialized data structure that are very similar to arrays\n", 26 | "and matrices. 
In PyTorch, we use tensors to encode the inputs and\n", 27 | "outputs of a model, as well as the model’s parameters.\n", 28 | "\n", 29 | "Tensors are similar to NumPy’s ndarrays, except that tensors can run on\n", 30 | "GPUs or other specialized hardware to accelerate computing. If you’re familiar with ndarrays, you’ll\n", 31 | "be right at home with the Tensor API. If not, follow along in this quick\n", 32 | "API walkthrough.\n", 33 | "\n", 34 | "\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": { 41 | "collapsed": false, 42 | "jupyter": { 43 | "outputs_hidden": false 44 | } 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "import torch\n", 49 | "import numpy as np" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "Tensor Initialization\n", 57 | "~~~~~~~~~~~~~~~~~~~~~\n", 58 | "\n", 59 | "Tensors can be initialized in various ways. Take a look at the following examples:\n", 60 | "\n", 61 | "**Directly from data**\n", 62 | "\n", 63 | "Tensors can be created directly from data. The data type is automatically inferred.\n", 64 | "\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "collapsed": false, 72 | "jupyter": { 73 | "outputs_hidden": false 74 | } 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "data = [[1, 2],[3, 4]]\n", 79 | "x_data = torch.tensor(data)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "**From a NumPy array**\n", 87 | "\n", 88 | "Tensors can be created from NumPy arrays (and vice versa - see `bridge-to-np-label`).\n", 89 | "\n" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": false, 97 | "jupyter": { 98 | "outputs_hidden": false 99 | } 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "np_array = np.array(data)\n", 104 | "x_np = torch.from_numpy(np_array)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "**From another tensor:**\n", 112 | "\n", 113 | "The new tensor retains the properties (shape, datatype) of the argument tensor, unless explicitly overridden.\n", 114 | "\n" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "collapsed": false, 122 | "jupyter": { 123 | "outputs_hidden": false 124 | } 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "x_ones = torch.ones_like(x_data) # retains the properties of x_data\n", 129 | "print(f\"Ones Tensor: \\n {x_ones} \\n\")\n", 130 | "\n", 131 | "x_rand = torch.rand_like(x_data, dtype=torch.float) # overrides the datatype of x_data\n", 132 | "print(f\"Random Tensor: \\n {x_rand} \\n\")" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "**With random or constant values:**\n", 140 | "\n", 141 | "``shape`` is a tuple of tensor dimensions. 
In the functions below, it determines the dimensionality of the output tensor.\n", 142 | "\n" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "collapsed": false, 150 | "jupyter": { 151 | "outputs_hidden": false 152 | } 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "shape = (2,3,)\n", 157 | "rand_tensor = torch.rand(shape)\n", 158 | "ones_tensor = torch.ones(shape)\n", 159 | "zeros_tensor = torch.zeros(shape)\n", 160 | "\n", 161 | "print(f\"Random Tensor: \\n {rand_tensor} \\n\")\n", 162 | "print(f\"Ones Tensor: \\n {ones_tensor} \\n\")\n", 163 | "print(f\"Zeros Tensor: \\n {zeros_tensor}\")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "--------------\n", 171 | "\n", 172 | "\n" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "Tensor Attributes\n", 180 | "~~~~~~~~~~~~~~~~~\n", 181 | "\n", 182 | "Tensor attributes describe their shape, datatype, and the device on which they are stored.\n", 183 | "\n" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": false, 191 | "jupyter": { 192 | "outputs_hidden": false 193 | } 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "tensor = torch.rand(3,4)\n", 198 | "\n", 199 | "print(f\"Shape of tensor: {tensor.shape}\")\n", 200 | "print(f\"Datatype of tensor: {tensor.dtype}\")\n", 201 | "print(f\"Device tensor is stored on: {tensor.device}\")" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "--------------\n", 209 | "\n", 210 | "\n" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "Tensor Operations\n", 218 | "~~~~~~~~~~~~~~~~~\n", 219 | "\n", 220 | "Over 100 tensor operations, including transposing, indexing, slicing,\n", 221 | "mathematical operations, linear algebra, random sampling, and more are\n", 222 | "comprehensively described\n", 223 | "`here `__.\n", 224 | "\n", 225 | "Each of them can be run on the GPU (at typically higher speeds than on a\n", 226 | "CPU). 
If you’re using Colab, allocate a GPU by going to Edit > Notebook\n", 227 | "Settings.\n", 228 | "\n", 229 | "\n" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "collapsed": false, 237 | "jupyter": { 238 | "outputs_hidden": false 239 | } 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "# We move our tensor to the GPU if available\n", 244 | "if torch.cuda.is_available():\n", 245 | " tensor = tensor.to('cuda')" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "Try out some of the operations from the list.\n", 253 | "If you're familiar with the NumPy API, you'll find the Tensor API a breeze to use.\n", 254 | "\n", 255 | "\n" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "**Standard numpy-like indexing and slicing:**\n", 263 | "\n" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "collapsed": false, 271 | "jupyter": { 272 | "outputs_hidden": false 273 | } 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "tensor = torch.ones(4, 4)\n", 278 | "tensor[:,1] = 0\n", 279 | "print(tensor)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "**Joining tensors** You can use ``torch.cat`` to concatenate a sequence of tensors along a given dimension.\n", 287 | "See also `torch.stack `__,\n", 288 | "another tensor joining op that is subtly different from ``torch.cat``.\n", 289 | "\n" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": { 296 | "collapsed": false, 297 | "jupyter": { 298 | "outputs_hidden": false 299 | } 300 | }, 301 | "outputs": [], 302 | "source": [ 303 | "t1 = torch.cat([tensor, tensor, tensor], dim=1)\n", 304 | "print(t1)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "**Multiplying tensors**\n", 312 | "\n" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": false, 320 | "jupyter": { 321 | "outputs_hidden": false 322 | } 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "# This computes the element-wise product\n", 327 | "print(f\"tensor.mul(tensor) \\n {tensor.mul(tensor)} \\n\")\n", 328 | "# Alternative syntax:\n", 329 | "print(f\"tensor * tensor \\n {tensor * tensor}\")" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "This computes the matrix multiplication between two tensors\n", 337 | "\n" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "collapsed": false, 345 | "jupyter": { 346 | "outputs_hidden": false 347 | } 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "print(f\"tensor.matmul(tensor.T) \\n {tensor.matmul(tensor.T)} \\n\")\n", 352 | "# Alternative syntax:\n", 353 | "print(f\"tensor @ tensor.T \\n {tensor @ tensor.T}\")" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "**In-place operations**\n", 361 | "Operations that have a ``_`` suffix are in-place. 
For example: ``x.copy_(y)``, ``x.t_()``, will change ``x``.\n", 362 | "\n" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": { 369 | "collapsed": false, 370 | "jupyter": { 371 | "outputs_hidden": false 372 | } 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "print(tensor, \"\\n\")\n", 377 | "tensor.add_(5)\n", 378 | "print(tensor)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "
<div class=\"alert alert-info\"><h4>Note</h4><p>
In-place operations save some memory, but can be problematic when computing derivatives because of an immediate loss\n", 386 | " of history. Hence, their use is discouraged.
</p></div>
\n", 387 | "\n" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "--------------\n", 395 | "\n", 396 | "\n" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "\n", 404 | "Bridge with NumPy\n", 405 | "~~~~~~~~~~~~~~~~~\n", 406 | "Tensors on the CPU and NumPy arrays can share their underlying memory\n", 407 | "locations, and changing one will change\tthe other.\n", 408 | "\n" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": {}, 414 | "source": [ 415 | "Tensor to NumPy array\n", 416 | "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", 417 | "\n" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": { 424 | "collapsed": false, 425 | "jupyter": { 426 | "outputs_hidden": false 427 | } 428 | }, 429 | "outputs": [], 430 | "source": [ 431 | "t = torch.ones(5)\n", 432 | "print(f\"t: {t}\")\n", 433 | "n = t.numpy()\n", 434 | "print(f\"n: {n}\")" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "A change in the tensor reflects in the NumPy array.\n", 442 | "\n" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": { 449 | "collapsed": false, 450 | "jupyter": { 451 | "outputs_hidden": false 452 | } 453 | }, 454 | "outputs": [], 455 | "source": [ 456 | "t.add_(1)\n", 457 | "print(f\"t: {t}\")\n", 458 | "print(f\"n: {n}\")" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "NumPy array to Tensor\n", 466 | "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", 467 | "\n" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": { 474 | "collapsed": false, 475 | "jupyter": { 476 | "outputs_hidden": false 477 | } 478 | }, 479 | "outputs": [], 480 | "source": [ 481 | "n = np.ones(5)\n", 482 | "t = torch.from_numpy(n)" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "Changes in the NumPy array reflects in the tensor.\n", 490 | "\n" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": { 497 | "collapsed": false, 498 | "jupyter": { 499 | "outputs_hidden": false 500 | } 501 | }, 502 | "outputs": [], 503 | "source": [ 504 | "np.add(n, 1, out=n)\n", 505 | "print(f\"t: {t}\")\n", 506 | "print(f\"n: {n}\")" 507 | ] 508 | } 509 | ], 510 | "metadata": { 511 | "kernelspec": { 512 | "display_name": "Python 3 (ipykernel)", 513 | "language": "python", 514 | "name": "python3" 515 | }, 516 | "language_info": { 517 | "codemirror_mode": { 518 | "name": "ipython", 519 | "version": 3 520 | }, 521 | "file_extension": ".py", 522 | "mimetype": "text/x-python", 523 | "name": "python", 524 | "nbconvert_exporter": "python", 525 | "pygments_lexer": "ipython3", 526 | "version": "3.9.2" 527 | } 528 | }, 529 | "nbformat": 4, 530 | "nbformat_minor": 4 531 | } 532 | -------------------------------------------------------------------------------- /A 60 Minute Blitz/3.neural_networks_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "jupyter": { 9 | "outputs_hidden": false 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | 
"source": [ 21 | "\n", 22 | "Neural Networks\n", 23 | "===============\n", 24 | "\n", 25 | "Neural networks can be constructed using the ``torch.nn`` package.\n", 26 | "\n", 27 | "Now that you had a glimpse of ``autograd``, ``nn`` depends on\n", 28 | "``autograd`` to define models and differentiate them.\n", 29 | "An ``nn.Module`` contains layers, and a method ``forward(input)`` that\n", 30 | "returns the ``output``.\n", 31 | "\n", 32 | "For example, look at this network that classifies digit images:\n", 33 | "\n", 34 | ".. figure:: /_static/img/mnist.png\n", 35 | " :alt: convnet\n", 36 | "\n", 37 | " convnet\n", 38 | "\n", 39 | "It is a simple feed-forward network. It takes the input, feeds it\n", 40 | "through several layers one after the other, and then finally gives the\n", 41 | "output.\n", 42 | "\n", 43 | "A typical training procedure for a neural network is as follows:\n", 44 | "\n", 45 | "- Define the neural network that has some learnable parameters (or\n", 46 | " weights)\n", 47 | "- Iterate over a dataset of inputs\n", 48 | "- Process input through the network\n", 49 | "- Compute the loss (how far is the output from being correct)\n", 50 | "- Propagate gradients back into the network’s parameters\n", 51 | "- Update the weights of the network, typically using a simple update rule:\n", 52 | " ``weight = weight - learning_rate * gradient``\n", 53 | "\n", 54 | "Define the network\n", 55 | "------------------\n", 56 | "\n", 57 | "Let’s define this network:\n", 58 | "\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 51, 64 | "metadata": { 65 | "collapsed": false, 66 | "jupyter": { 67 | "outputs_hidden": false 68 | } 69 | }, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "Net(\n", 76 | " (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))\n", 77 | " (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))\n", 78 | " (fc1): Linear(in_features=400, out_features=120, bias=True)\n", 79 | " (fc2): Linear(in_features=120, out_features=84, bias=True)\n", 80 | " (fc3): Linear(in_features=84, out_features=10, bias=True)\n", 81 | ")\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "import torch\n", 87 | "import torch.nn as nn\n", 88 | "import torch.nn.functional as F\n", 89 | "\n", 90 | "\n", 91 | "class Net(nn.Module):\n", 92 | "\n", 93 | " def __init__(self):\n", 94 | " super(Net, self).__init__()\n", 95 | " # 1 input image channel, 6 output channels, 5x5 square convolution\n", 96 | " # kernel\n", 97 | " self.conv1 = nn.Conv2d(1, 6, 5)\n", 98 | " self.conv2 = nn.Conv2d(6, 16, 5)\n", 99 | " # an affine operation: y = Wx + b\n", 100 | " self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension \n", 101 | " self.fc2 = nn.Linear(120, 84)\n", 102 | " self.fc3 = nn.Linear(84, 10)\n", 103 | "\n", 104 | " def forward(self, x):\n", 105 | " # Max pooling over a (2, 2) window\n", 106 | " x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))\n", 107 | " # If the size is a square, you can specify with a single number\n", 108 | " x = F.max_pool2d(F.relu(self.conv2(x)), 2)\n", 109 | " x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension\n", 110 | " x = F.relu(self.fc1(x))\n", 111 | " x = F.relu(self.fc2(x))\n", 112 | " x = self.fc3(x)\n", 113 | " return x\n", 114 | "\n", 115 | "\n", 116 | "net = Net()\n", 117 | "print(net) " 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "You just have to define the ``forward`` function, and the ``backward``\n", 125 | 
"function (where gradients are computed) is automatically defined for you\n", 126 | "using ``autograd``.\n", 127 | "You can use any of the Tensor operations in the ``forward`` function.\n", 128 | "\n", 129 | "The learnable parameters of a model are returned by ``net.parameters()``\n", 130 | "\n" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 56, 136 | "metadata": { 137 | "collapsed": false, 138 | "jupyter": { 139 | "outputs_hidden": false 140 | } 141 | }, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "10\n", 148 | "torch.Size([6, 1, 5, 5])\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "params = list(net.parameters())\n", 154 | "# print(params)\n", 155 | "print(len(params))\n", 156 | "print(params[0].size()) # conv1's .weight" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "Let's try a random 32x32 input.\n", 164 | "Note: expected input size of this net (LeNet) is 32x32. To use this net on\n", 165 | "the MNIST dataset, please resize the images from the dataset to 32x32.\n", 166 | "\n" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "## 呵呵了,这傻逼教程,不写清楚为啥输入一定得是32*32\n", 174 | "\n", 175 | "首先,进过推断,conv默认的stride即步长是1,然后pooling默认的步长是window_size。\n", 176 | "\n", 177 | "根据fc1的输入units个数为16 * 5 * 5,这个 5 * 5 就是输入图像经过conv1,pool1,conv2,pool2后的结果:\n", 178 | "\n", 179 | "(32,32) --conv1(size=5, stride=1)--> (28,28) --pool1(size=2, stride=2)--> (14,14) --conv2(size=5, stride=1)--> (10,10) --pool2(size=2, stride=2)--> (5,5)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 57, 185 | "metadata": { 186 | "collapsed": false, 187 | "jupyter": { 188 | "outputs_hidden": false 189 | } 190 | }, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "tensor([[-0.0608, -0.1087, -0.0833, 0.0029, -0.0998, 0.0340, 0.0646, -0.1200,\n", 197 | " 0.0184, -0.0866]], grad_fn=)\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "input = torch.randn(1, 1, 32, 32) # 下面写了,torch模型只接受batch,所以即使只有一个样本,也需要设置一个batch size 1\n", 203 | "out = net(input)\n", 204 | "print(out)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "Zero the gradient buffers of all parameters and backprops with random\n", 212 | "gradients:\n", 213 | "\n" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 7, 219 | "metadata": { 220 | "collapsed": false, 221 | "jupyter": { 222 | "outputs_hidden": false 223 | } 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "net.zero_grad()\n", 228 | "out.backward(torch.randn(1, 10))" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "
<div class=\"alert alert-info\"><h4>Note</h4><p>
``torch.nn`` only supports mini-batches. The entire ``torch.nn``\n", 236 | " package only supports inputs that are a mini-batch of samples, and not\n", 237 | " a single sample.\n", 238 | "\n", 239 | " For example, ``nn.Conv2d`` will take in a 4D Tensor of\n", 240 | " ``nSamples x nChannels x Height x Width``.\n", 241 | "\n", 242 | " If you have a single sample, just use ``input.unsqueeze(0)`` to add\n", 243 | " a fake batch dimension.
</p></div>
\n", 244 | "\n", 245 | "Before proceeding further, let's recap all the classes you’ve seen so far.\n", 246 | "\n", 247 | "**Recap:**\n", 248 | " - ``torch.Tensor`` - A *multi-dimensional array* with support for autograd\n", 249 | " operations like ``backward()``. Also *holds the gradient* w.r.t. the\n", 250 | " tensor.\n", 251 | " - ``nn.Module`` - Neural network module. *Convenient way of\n", 252 | " encapsulating parameters*, with helpers for moving them to GPU,\n", 253 | " exporting, loading, etc.\n", 254 | " - ``nn.Parameter`` - A kind of Tensor, that is *automatically\n", 255 | " registered as a parameter when assigned as an attribute to a*\n", 256 | " ``Module``.\n", 257 | " - ``autograd.Function`` - Implements *forward and backward definitions\n", 258 | " of an autograd operation*. Every ``Tensor`` operation creates at\n", 259 | " least a single ``Function`` node that connects to functions that\n", 260 | " created a ``Tensor`` and *encodes its history*.\n", 261 | "\n", 262 | "**At this point, we covered:**\n", 263 | " - Defining a neural network\n", 264 | " - Processing inputs and calling backward\n", 265 | "\n", 266 | "**Still Left:**\n", 267 | " - Computing the loss\n", 268 | " - Updating the weights of the network\n", 269 | "\n", 270 | "Loss Function\n", 271 | "-------------\n", 272 | "A loss function takes the (output, target) pair of inputs, and computes a\n", 273 | "value that estimates how far away the output is from the target.\n", 274 | "\n", 275 | "There are several different\n", 276 | "`loss functions `_ under the\n", 277 | "nn package .\n", 278 | "A simple loss is: ``nn.MSELoss`` which computes the mean-squared error\n", 279 | "between the input and the target.\n", 280 | "\n", 281 | "For example:\n", 282 | "\n" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 58, 288 | "metadata": { 289 | "collapsed": false, 290 | "jupyter": { 291 | "outputs_hidden": false 292 | } 293 | }, 294 | "outputs": [ 295 | { 296 | "name": "stdout", 297 | "output_type": "stream", 298 | "text": [ 299 | "tensor(0.8253, grad_fn=)\n", 300 | "tensor(2.3440, grad_fn=)\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "output = net(input)\n", 306 | "target = torch.randn(10) # a dummy target, for example\n", 307 | "target = target.view(1, -1) # make it the same shape as output 前面那个1代表batch size\n", 308 | "criterion = nn.MSELoss()\n", 309 | "loss = criterion(output, target)\n", 310 | "print(loss)\n", 311 | "\n", 312 | "# ==============试试 CE loss\n", 313 | "criterion2 = nn.CrossEntropyLoss()\n", 314 | "target2 = torch.tensor([2],dtype=torch.long) # 这里是因为CrossEntropyLoss中的target对于C分类问题,是直接接受index作为target,不用像keras那样变成one-hot输入\n", 315 | "loss2 = criterion2(output, target2)\n", 316 | "print(loss2)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "Now, if you follow ``loss`` in the backward direction, using its\n", 324 | "``.grad_fn`` attribute, you will see a graph of computations that looks\n", 325 | "like this:\n", 326 | "\n", 327 | "::\n", 328 | "\n", 329 | " input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d\n", 330 | " -> flatten -> linear -> relu -> linear -> relu -> linear\n", 331 | " -> MSELoss\n", 332 | " -> loss\n", 333 | "\n", 334 | "So, when we call ``loss.backward()``, the whole graph is differentiated\n", 335 | "w.r.t. 
the neural net parameters, and all Tensors in the graph that have\n", 336 | "``requires_grad=True`` will have their ``.grad`` Tensor accumulated with the\n", 337 | "gradient.\n", 338 | "\n", 339 | "For illustration, let us follow a few steps backward:\n", 340 | "\n" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 59, 346 | "metadata": { 347 | "collapsed": false, 348 | "jupyter": { 349 | "outputs_hidden": false 350 | } 351 | }, 352 | "outputs": [ 353 | { 354 | "name": "stdout", 355 | "output_type": "stream", 356 | "text": [ 357 | "\n", 358 | "\n", 359 | "\n" 360 | ] 361 | } 362 | ], 363 | "source": [ 364 | "print(loss.grad_fn) # MSELoss\n", 365 | "print(loss.grad_fn.next_functions[0][0]) # Linear\n", 366 | "print(loss.grad_fn.next_functions[0][0].next_functions[0][0]) # ReLU" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "Backprop\n", 374 | "--------\n", 375 | "To backpropagate the error all we have to do is to ``loss.backward()``.\n", 376 | "**You need to clear the existing gradients though, else gradients will be\n", 377 | "accumulated to existing gradients.**\n", 378 | "\n", 379 | "\n", 380 | "Now we shall call ``loss.backward()``, and have a look at conv1's bias\n", 381 | "gradients before and after the backward.\n", 382 | "\n" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 60, 388 | "metadata": { 389 | "collapsed": false, 390 | "jupyter": { 391 | "outputs_hidden": false 392 | } 393 | }, 394 | "outputs": [ 395 | { 396 | "name": "stdout", 397 | "output_type": "stream", 398 | "text": [ 399 | "conv1.bias.grad before backward\n", 400 | "None\n", 401 | "conv1.bias.grad after backward\n", 402 | "tensor([ 0.0062, 0.0048, 0.0059, -0.0036, -0.0046, -0.0134])\n" 403 | ] 404 | } 405 | ], 406 | "source": [ 407 | "net.zero_grad() # zeroes the gradient buffers of all parameters\n", 408 | "\n", 409 | "print('conv1.bias.grad before backward')\n", 410 | "print(net.conv1.bias.grad)\n", 411 | "\n", 412 | "loss.backward()\n", 413 | "\n", 414 | "print('conv1.bias.grad after backward')\n", 415 | "print(net.conv1.bias.grad)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": {}, 421 | "source": [ 422 | "Now, we have seen how to use loss functions.\n", 423 | "\n", 424 | "**Read Later:**\n", 425 | "\n", 426 | " The neural network package contains various modules and loss functions\n", 427 | " that form the building blocks of deep neural networks. A full list with\n", 428 | " documentation is `here `_.\n", 429 | "\n", 430 | "**The only thing left to learn is:**\n", 431 | "\n", 432 | " - Updating the weights of the network\n", 433 | "\n", 434 | "Update the weights\n", 435 | "------------------\n", 436 | "The simplest update rule used in practice is the Stochastic Gradient\n", 437 | "Descent (SGD):\n", 438 | "\n", 439 | " ``weight = weight - learning_rate * gradient``\n", 440 | "\n", 441 | "We can implement this using simple Python code:\n", 442 | "\n", 443 | ".. code:: python\n", 444 | "\n", 445 | " learning_rate = 0.01\n", 446 | " for f in net.parameters():\n", 447 | " f.data.sub_(f.grad.data * learning_rate)\n", 448 | "\n", 449 | "However, as you use neural networks, you want to use various different\n", 450 | "update rules such as SGD, Nesterov-SGD, Adam, RMSProp, etc.\n", 451 | "To enable this, we built a small package: ``torch.optim`` that\n", 452 | "implements all these methods. 
Using it is very simple:\n", 453 | "\n" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 61, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "learning_rate = 0.01\n", 463 | "for f in net.parameters():\n", 464 | " f.data.sub_(f.grad.data * learning_rate)" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 62, 470 | "metadata": { 471 | "collapsed": false, 472 | "jupyter": { 473 | "outputs_hidden": false 474 | } 475 | }, 476 | "outputs": [], 477 | "source": [ 478 | "import torch.optim as optim\n", 479 | "\n", 480 | "# create your optimizer\n", 481 | "optimizer = optim.SGD(net.parameters(), lr=0.01)\n", 482 | "\n", 483 | "# in your training loop:\n", 484 | "optimizer.zero_grad() # zero the gradient buffers\n", 485 | "output = net(input)\n", 486 | "loss = criterion(output, target)\n", 487 | "loss.backward()\n", 488 | "optimizer.step() # Does the update" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": {}, 494 | "source": [ 495 | ".. Note::\n", 496 | "\n", 497 | " Observe how gradient buffers had to be manually set to zero using\n", 498 | " ``optimizer.zero_grad()``. This is because gradients are accumulated\n", 499 | " as explained in the `Backprop`_ section.\n", 500 | "\n" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "## 这里可以看到,nn.Module 和 torch.optim 两个类都有 .zero_grad()这个功能。" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [] 516 | } 517 | ], 518 | "metadata": { 519 | "kernelspec": { 520 | "display_name": "Python 3 (ipykernel)", 521 | "language": "python", 522 | "name": "python3" 523 | }, 524 | "language_info": { 525 | "codemirror_mode": { 526 | "name": "ipython", 527 | "version": 3 528 | }, 529 | "file_extension": ".py", 530 | "mimetype": "text/x-python", 531 | "name": "python", 532 | "nbconvert_exporter": "python", 533 | "pygments_lexer": "ipython3", 534 | "version": "3.9.2" 535 | } 536 | }, 537 | "nbformat": 4, 538 | "nbformat_minor": 4 539 | } 540 | -------------------------------------------------------------------------------- /Applications/language_model/README.md: -------------------------------------------------------------------------------- 1 | # 训练一个简单的语言模型 2 | 3 | 直接运行main.py进行训练,然后运行generate.py即可生成文本。 4 | 5 | 一个使用《三国演义》训练的语言模型,生成的文本例子如下: 6 | 7 | ```python 8 | trigger words: ['诸', '葛'] 9 | 诸葛亮。忠见一人,雪伤士飞,有何不惜!望备取 10 | 盏,举以从此疏事。”关公不听骋言,回顾琮 11 | 曰:“不然。”垕大骂一声,取上牟扑。那将 12 | 寻知中军盛旗,却并不见。袁隗遣人催取吕翔 13 | ,诏权其言曰:“此乃周瑜之计也。须用军士 14 | :“如不用之甚,非困于里下手;今日欲使操 15 | 来,军士俱能收之。若荐蔡瑁、张辽、徐州刺 16 | 史慈,有州二雷,准备而纳,不可乘之。乃下 17 | 马至戢,设于帐前为小舟并致。 18 | ``` 19 | 20 | 21 | 22 | ## TODO: 23 | 1. 在generate的时候,使用beam search 24 | 2. 试试Transformer 25 | 3. 
使用seq2seq来训练 26 | -------------------------------------------------------------------------------- /Applications/language_model/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | class Dictionary: 5 | def __init__(self): 6 | self.word2idx = {} 7 | self.idx2word = [] # idx2word用一个list即可 8 | 9 | def add_word(self, word): 10 | if word not in self.word2idx: 11 | self.idx2word.append(word) 12 | self.word2idx[word] = len(self.idx2word) - 1 13 | return self.word2idx[word] 14 | 15 | def __len__(self): 16 | return len(self.idx2word) 17 | 18 | 19 | class Corpus: 20 | def __init__(self, path, lang='en'): 21 | self.lang = lang 22 | # if lang == 'zh': 23 | # import jieba 24 | self.dictionary = Dictionary() 25 | self.train = self.tokenize(os.path.join(path, 'train.txt')) 26 | self.valid = self.tokenize(os.path.join(path, 'valid.txt')) 27 | self.test = self.tokenize(os.path.join(path, 'test.txt')) 28 | 29 | def tokenize(self, file_path): 30 | assert os.path.exists(file_path) 31 | # Add words to dict: 32 | with open(file_path, 'r', encoding='utf8') as f: 33 | for line in f: 34 | if self.lang == 'zh': 35 | words = [w for w in line] + [''] # 先直接按照字来分 36 | else: # 默认英语 37 | words = line.split(' ') + [''] # 先用空格分词,然后添加 end_of_sentence 符号 38 | for w in words: 39 | self.dictionary.add_word(w) 40 | 41 | with open(file_path, 'r', encoding='utf8') as f: 42 | idss = [] 43 | for line in f: 44 | if self.lang == 'zh': 45 | words = [w for w in line] + [''] # 先直接按照字来分 46 | else: # 默认英语 47 | words = line.split(' ') + [''] # 先用空格分词,然后添加 end_of_sentence 符号 48 | ids = [self.dictionary.word2idx[w] for w in words] 49 | idss.append(torch.tensor(ids, dtype=torch.int64)) 50 | return torch.cat(idss) # 最后是类似这种的东西: tensor([0, 1, 0, ..., 1, 0, 1]) 51 | 52 | 53 | if __name__ == '__main__': 54 | # c = Corpus('../../data/wikitext-2') 55 | c = Corpus('../../data/三国small', lang='zh') 56 | print(c.train.shape) 57 | print(c.valid.shape) 58 | print(c.test.shape) 59 | print(len(c.dictionary.word2idx)) 60 | -------------------------------------------------------------------------------- /Applications/language_model/generate.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Language Modeling on Wikitext-2 3 | # 4 | # This file generates new sentences sampled from the language model 5 | # 6 | ############################################################################### 7 | 8 | import argparse 9 | 10 | import torch 11 | 12 | import data 13 | 14 | parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 Language Model') 15 | 16 | # Model parameters. 
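# Example invocation (assumes main.py has already been run so that ./model.pt exists):
#   python generate.py --checkpoint ./model.pt --words 500 --temperature 0.8
# Each flag corresponds to one of the argparse arguments declared below.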
17 | parser.add_argument('--data', type=str, default='../../data/三国small', 18 | help='location of the data corpus') 19 | parser.add_argument('--lang', type=str, default='zh', 20 | help='language fo the corpus') 21 | parser.add_argument('--checkpoint', type=str, default='./model.pt', 22 | help='model checkpoint to use') 23 | parser.add_argument('--outf', type=str, default='generated.txt', 24 | help='output file for generated text') 25 | parser.add_argument('--words', type=int, default=1000, 26 | help='number of words to generate') 27 | parser.add_argument('--seed', type=int, default=1111, 28 | help='random seed') 29 | parser.add_argument('--cuda', action='store_true', 30 | help='use CUDA') 31 | parser.add_argument('--temperature', type=float, default=1.0, 32 | help='temperature - higher will increase diversity') 33 | parser.add_argument('--log-interval', type=int, default=100, 34 | help='reporting interval') 35 | args = parser.parse_args() 36 | join_token = '' if args.lang == 'zh' else ' ' 37 | # torch.manual_seed(1) 38 | device = torch.device("cuda" if args.cuda else "cpu") 39 | 40 | if args.temperature < 1e-3: 41 | parser.error("--temperature has to be greater or equal 1e-3") 42 | 43 | # 这里需要用到跟训练集相同的词典,用来输出真实的词 44 | corpus = data.Corpus(args.data, lang=args.lang) 45 | ntokens = len(corpus.dictionary) 46 | 47 | # load model 48 | with open(args.checkpoint, 'rb') as f: 49 | model = torch.load(f).to(device) 50 | model.eval() 51 | 52 | is_transformer_model = hasattr(model, 'model_type') and model.model_type == 'Transformer' 53 | if not is_transformer_model: 54 | hidden = model.init_hidden(1) 55 | 56 | # RNN的输入可以是不定长的,所以理论上我用来trigger的可以是一句话 57 | # input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device) 58 | trigger_words = [w for w in '诸葛'] 59 | print("trigger words:", trigger_words) 60 | input_idx_list = [corpus.dictionary.word2idx[w] for w in trigger_words] 61 | input = torch.tensor(input_idx_list, dtype=torch.long).view(len(input_idx_list), 1).to(device) 62 | 63 | 64 | with open(args.outf, 'w') as outf: 65 | print("trigger word:", trigger_words, file=outf) 66 | print(join_token.join(trigger_words), end=join_token) 67 | print(join_token.join(trigger_words), end=join_token, file=outf) 68 | with torch.no_grad(): 69 | for i in range(args.words): # generate how many words 70 | if is_transformer_model: 71 | raise NotImplementedError() 72 | else: 73 | output, hidden = model(input, hidden) 74 | # 这里[-1]是为了取出最后一个time step的输出 75 | # 否则,如果input是多个词,可能生成的就会不连贯,比方输入"诸葛",后面却生成不了"亮"。 76 | word_weights = output[-1].squeeze().div(args.temperature).exp().cpu() # 这里的temperature啥作用? 77 | word_idx = torch.multinomial(word_weights, 1)[0] # randomly sample 78 | # input.fill_(word_idx) # 用于输入只有一个词 79 | # input = word_idx.view(1, 1) # 用于输入为多个词,但是后面的迭代都是只用前一个词 80 | # 这里让每次都读取前面N个词用于生成下个词,是不是更好? 
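# Two notes on the sampling step above and the windowing below:
# - Dividing the log-probabilities by `temperature` before exp() rescales the
#   distribution: temperature > 1 flattens it (more diverse, riskier words),
#   temperature < 1 sharpens it toward the most likely word.
# - The slice on the next line keeps the last 34 token ids and appends the newly
#   sampled one, so the model is always conditioned on at most 35 tokens,
#   matching the --bptt default used during training in main.py.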
81 | input_idx_list = input_idx_list[-34:] + [(int(word_idx.item()))] # 每次都用前35个词来预测 82 | input = torch.tensor(input_idx_list, dtype=torch.long).view(len(input_idx_list), 1).to(device) 83 | word = corpus.dictionary.idx2word[word_idx] 84 | if word == '': 85 | break 86 | print(word + ('\n' if i % 20 == 19 else join_token), end=join_token) 87 | outf.write(word + ('\n' if i % 20 == 19 else join_token)) 88 | 89 | # if i % args.log_interval == 0: 90 | # print('| Generated {}/{} words'.format(i, args.words)) 91 | -------------------------------------------------------------------------------- /Applications/language_model/generated.txt: -------------------------------------------------------------------------------- 1 | trigger word: ['诸', '葛'] 2 | 诸葛亮。忠见一人,雪伤士飞,有何不惜!望备取 3 | 盏,举以从此疏事。”关公不听骋言,回顾琮 4 | 曰:“不然。”垕大骂一声,取上牟扑。那将 5 | 寻知中军盛旗,却并不见。袁隗遣人催取吕翔 6 | ,诏权其言曰:“此乃周瑜之计也。须用军士 7 | :“如不用之甚,非困于里下手;今日欲使操 8 | 来,军士俱能收之。若荐蔡瑁、张辽、徐州刺 9 | 史慈,有州二雷,准备而纳,不可乘之。乃下 10 | 马至戢,设于帐前为小舟并致。 11 | -------------------------------------------------------------------------------- /Applications/language_model/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | import math 4 | import os 5 | import torch 6 | import torch.nn as nn 7 | import torch.onnx as onnx 8 | 9 | from data import Corpus 10 | import model 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--data', type=str, default='../../data/三国small', 14 | help='location of the data corpus') 15 | parser.add_argument('--lang', type=str, default='zh', 16 | help='language fo the corpus') 17 | parser.add_argument('--model', type=str, default='LSTM', 18 | help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)') 19 | parser.add_argument('--emsize', type=int, default=200, 20 | help='size of word embeddings') 21 | parser.add_argument('--nhid', type=int, default=200, 22 | help='number of hidden units per layer') 23 | parser.add_argument('--nlayers', type=int, default=2, 24 | help='number of layers') 25 | parser.add_argument('--lr', type=float, default=20, 26 | help='initial learning rate') 27 | parser.add_argument('--clip', type=float, default=0.25, 28 | help='gradient clipping') 29 | parser.add_argument('--epochs', type=int, default=40, 30 | help='upper epoch limit') 31 | parser.add_argument('--batch_size', type=int, default=20, metavar='N', 32 | help='batch size') 33 | parser.add_argument('--bptt', type=int, default=35, 34 | help='sequence length, backprop through time(bptt)') 35 | parser.add_argument('--dropout', type=float, default=0.2, 36 | help='dropout applied to layers (0 = no dropout)') 37 | parser.add_argument('--tied', action='store_true', 38 | help='tie the word embedding and softmax weights') 39 | parser.add_argument('--seed', type=int, default=1111, 40 | help='random seed') 41 | parser.add_argument('--cuda', action='store_true', 42 | help='use CUDA') 43 | parser.add_argument('--log-interval', type=int, default=200, metavar='N', 44 | help='report interval') 45 | parser.add_argument('--save', type=str, default='model.pt', 46 | help='path to save the final model') 47 | parser.add_argument('--onnx-export', type=str, default='', 48 | help='path to export the final model in onnx format') 49 | 50 | parser.add_argument('--nhead', type=int, default=2, 51 | help='the number of heads in the encoder/decoder of the transformer model') 52 | parser.add_argument('--dry-run', action='store_true', 53 | help='verify the code and the model') 54 | 55 | args = parser.parse_args() 
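# Example invocation (assumes the default corpus folder ../../data/三国small exists):
#   python main.py --cuda --model LSTM --epochs 40 --lang zh
# The checkpoint is written to --save (model.pt by default), which generate.py then loads.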
56 | 57 | # 设置随机种子便于复现 58 | torch.manual_seed(1) 59 | # 设置cuda 60 | if torch.cuda.is_available(): 61 | if not args.cuda: 62 | print("Hey, You have a CUDA device! Why not using it??") 63 | device = torch.device("cuda" if args.cuda else "cpu") 64 | 65 | ############################################################################### 66 | # Load data 67 | ############################################################################### 68 | 69 | corpus = Corpus(args.data, lang=args.lang) 70 | """ 71 | Starting from sequential data, batchify arranges the dataset into columns. 72 | For instance, with the alphabet as the sequence and batch size 4, we'd get 73 | ┌ a g m s ┐ 74 | │ b h n t │ 75 | │ c i o u │ 76 | │ d j p v │ 77 | │ e k q w │ 78 | └ f l r x ┘. 79 | These columns are treated as independent by the model, which means that the 80 | dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient 81 | batch processing. 82 | 解释一下: 83 | 上面那个矩阵为什么batch维在竖着那一维?因为torch中RNN默认的输入中,sequence_length是第一维, 84 | 也就是行,batch在第二维。所以是这么个形状。 85 | 然后按照batch=4,把'abcdefg.....xyz'分成4份,每一份就是一个独立的字符串了,就可以并行处理。 86 | """ 87 | 88 | def batchify(data, bsz): 89 | """按照batch size来分割文本,所以bsz越大,用于训练的每条文本就越短""" 90 | nbatch = data.shape[0] // bsz 91 | data = data.narrow(0, 0, nbatch * bsz) # 剪裁,(dimension, start, length) 92 | data = data.view(bsz, -1).t().contiguous() # 这里的转置是为了满足RNN的输入,把seq_len放在第一维 93 | # 但.contiguous()啥用,还不知道 94 | return data.to(device) 95 | 96 | 97 | eval_batch_size = 20 98 | train_data = batchify(corpus.train, args.batch_size) 99 | val_data = batchify(corpus.valid, args.batch_size) 100 | test_data = batchify(corpus.test, args.batch_size) 101 | 102 | 103 | ############################################################################### 104 | # Build the model 105 | ############################################################################### 106 | 107 | ntokens = len(corpus.dictionary) 108 | model = model.RNN_Model(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout).to(device) 109 | loss_func = nn.NLLLoss() 110 | 111 | 112 | ############################################################################### 113 | # Training code 114 | ############################################################################### 115 | 116 | """ 117 | get_batch subdivides the source data into chunks of length args.bptt. 118 | If source is equal to the example output of the batchify function, 119 | ┌ a g m s ┐ 120 | │ b h n t │ 121 | │ c i o u │ 122 | │ d j p v │ 123 | │ e k q w │ 124 | └ f l r x ┘. 125 | with a bptt-limit of 2, we'd get the following two Variables for i = 0: 126 | ┌ a g m s ┐ ┌ b h n t ┐ 127 | └ b h n t ┘ └ c i o u ┘ 128 | Note that despite the name of the function, the subdivison of data is not 129 | done along the batch dimension (i.e. dimension 1), since that was handled 130 | by the batchify function. The chunks are along dimension 0, corresponding 131 | to the seq_len dimension in the LSTM. 132 | 133 | 就是说,原本在没有seq_len的限制下,就是上面第一个矩阵,然后有了seq_len之后,应该去划分 134 | 一个个的输入呢,就是按照seq_len去纵向滑动,得到一个个chunk. 135 | """ 136 | 137 | def get_batch(source, i): 138 | """ 139 | 从source中第i位置开始取出seq_len长度的数据。 140 | 首先source data已经有了batch维,这里就是按照seq_len做一个切片; 141 | 然后target这里的都往后挪一个index,这实际上就是一个batch的所有target, 142 | 最后需要view(-1)变形成一维的,这样才能直接输入到NLLLoss损失函数中。 143 | """ 144 | seq_len = min(args.bptt, len(source) - 1 - i) 145 | data = source[i:i+seq_len] 146 | target = source[i+1:i+1+seq_len].view(-1) 147 | return data, target 148 | 149 | def repackage_hidden(h): # 这个玩意儿到底干嘛的? 
150 | """Wraps hidden states in new Tensors, to detach them from their history. 151 | 在网上查了查,相关的解释可以参考: 152 | https://discuss.pytorch.org/t/solved-why-we-need-to-detach-variable-which-contains-hidden-representation/1426 153 | """ 154 | if isinstance(h, torch.Tensor): 155 | return h.detach() 156 | else: 157 | return tuple(repackage_hidden(v) for v in h) # 还是个递归函数,更不懂了 158 | 159 | def evaluate(data_source): 160 | # Turn on evaluation mode which disables dropout. 161 | # .eval()是nn.Module的函数,用户转换成evaluation模式,主要针对Dropout,BatchNorm这些组件 162 | model.eval() 163 | total_loss = 0. 164 | ntokens = len(corpus.dictionary) 165 | if args.model != 'Transformer': # 不是Transformer,就有hidden的概念 166 | hidden = model.init_hidden(eval_batch_size) 167 | with torch.no_grad(): 168 | for i in range(0, data_source.size(0) - 1, args.bptt): # 每bptt的 169 | data, targets = get_batch(data_source, i) 170 | if args.model == 'Transformer': 171 | output = model(data) 172 | output = output.view(-1, ntokens) 173 | else: 174 | output, hidden = model(data, hidden) 175 | hidden = repackage_hidden(hidden) 176 | total_loss += len(data) * loss_func(output, targets).item() 177 | return total_loss / (len(data_source) - 1) 178 | 179 | 180 | def train(): 181 | # Turn on training mode which enables dropout. 182 | model.train() 183 | total_loss = 0. # 记录一个epoch的loss 184 | start_time = time.time() 185 | ntokens = len(corpus.dictionary) 186 | if args.model != 'Transformer': 187 | hidden = model.init_hidden(args.batch_size) 188 | for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)): 189 | """ 190 | 这里的设计也是挺"奇特的"。不管bptt多大,这里一个迭代都是batch size大小的数据; 191 | i是一系列间隔seq_len的值, 192 | 所以bptt的作用就是告诉get_batch函数我一个batch中的文本是多长。 193 | """ 194 | data, targets = get_batch(train_data, i) # 以seqlen来取一个个batch 195 | # Starting each batch, we detach the hidden state from how it was previously produced. 196 | # If we didn't, the model would try backpropagating all the way to start of the dataset. 197 | model.zero_grad() 198 | if args.model == 'Transformer': 199 | output = model(data) 200 | output = output.view(-1, ntokens) 201 | else: 202 | hidden = repackage_hidden(hidden) 203 | # 每一次新的反向传播,都得先把hidden给清理一次 204 | output, hidden = model(data, hidden) 205 | loss = loss_func(output, targets) 206 | loss.backward() 207 | 208 | # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. 209 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) 210 | for p in model.parameters(): # 为啥不用optim?? 211 | p.data.add_(p.grad, alpha=-lr) 212 | 213 | total_loss += loss.item() 214 | 215 | if batch % args.log_interval == 0 and batch > 0: 216 | cur_loss = total_loss / args.log_interval 217 | elapsed = time.time() - start_time 218 | print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | ' 219 | 'loss {:5.2f} | ppl {:8.2f}'.format( 220 | epoch, batch, len(train_data) // args.bptt, lr, 221 | elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss))) 222 | total_loss = 0 223 | start_time = time.time() 224 | if args.dry_run: 225 | break 226 | 227 | 228 | def export_onnx(path, batch_size, seq_len): 229 | print('The model is also exported in ONNX format at {}'. 230 | format(os.path.realpath(args.onnx_export))) 231 | model.eval() 232 | dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device) 233 | hidden = model.init_hidden(batch_size) 234 | torch.onnx.export(model, (dummy_input, hidden), path) 235 | 236 | 237 | # Loop over epochs. 
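# Two clarifications for the questions raised in the code above:
# - repackage_hidden() detaches the hidden state between batches, i.e. truncated
#   backpropagation through time: gradients stop at batch boundaries instead of
#   flowing back to the start of the corpus.
# - The manual update in train(), p.data.add_(p.grad, alpha=-lr), is plain SGD
#   written by hand; the script then anneals lr directly (lr /= 4.0) below.
#   A sketch of the torch.optim equivalent (not used by this script) would be:
#       optimizer = torch.optim.SGD(model.parameters(), lr=lr)
#       optimizer.step()   # after loss.backward(), instead of the manual loop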
238 | lr = args.lr 239 | best_val_loss = None 240 | 241 | # At any point you can hit Ctrl + C to break out of training early. 242 | try: 243 | for epoch in range(1, args.epochs+1): 244 | epoch_start_time = time.time() 245 | train() 246 | val_loss = evaluate(val_data) 247 | print('-' * 89) 248 | print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' 249 | 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), 250 | val_loss, math.exp(val_loss))) 251 | print('-' * 89) 252 | # Save the model if the validation loss is the best we've seen so far. 253 | if not best_val_loss or val_loss < best_val_loss: 254 | with open(args.save, 'wb') as f: 255 | torch.save(model, f) 256 | best_val_loss = val_loss 257 | else: 258 | # Anneal the learning rate if no improvement has been seen in the validation dataset. 259 | lr /= 4.0 260 | except KeyboardInterrupt: 261 | print('-' * 89) 262 | print('Exiting from training early') 263 | 264 | # Load the best saved model. 265 | with open(args.save, 'rb') as f: 266 | model = torch.load(f) 267 | # after load the rnn params are not a continuous chunk of memory 268 | # this makes them a continuous chunk, and will speed up forward pass 269 | # Currently, only rnn model supports flatten_parameters function. 270 | if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']: 271 | model.rnn.flatten_parameters() 272 | 273 | # Run on test data. 274 | test_loss = evaluate(test_data) 275 | print('=' * 89) 276 | print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( 277 | test_loss, math.exp(test_loss))) 278 | print('=' * 89) 279 | 280 | if len(args.onnx_export) > 0: 281 | # Export the model in ONNX format. 282 | export_onnx(args.onnx_export, batch_size=1, seq_len=args.bptt) 283 | -------------------------------------------------------------------------------- /Applications/language_model/model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/Applications/language_model/model.pt -------------------------------------------------------------------------------- /Applications/language_model/model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | class RNN_Model(nn.Module): 7 | def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False): 8 | """ 9 | :param rnn_type: 10 | :param ntoken: num of tokens, vocab size 11 | :param ninp: dimension of input tokens 12 | :param nhid: hidden size 13 | :param nlayers: num of layers 14 | :param dropout: 15 | :param tie_weights: 16 | """ 17 | super(RNN_Model, self).__init__() 18 | self.ntoken = ntoken 19 | self.dropout_layer = nn.Dropout(dropout) 20 | self.encoder = nn.Embedding(num_embeddings=ntoken, embedding_dim=ninp) 21 | # choose the type of RNN: 22 | if rnn_type in ['GRU','LSTM']: 23 | # Docs: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM 24 | # `getattr` is quite useful! 
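# getattr(nn, rnn_type) looks the module class up by its string name, so
# rnn_type='LSTM' resolves to nn.LSTM and rnn_type='GRU' to nn.GRU; the call
# below is therefore equivalent to writing nn.LSTM(input_size=ninp,
# hidden_size=nhid, num_layers=nlayers) explicitly.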
25 | self.rnn = getattr(nn, rnn_type)(input_size=ninp, hidden_size=nhid, num_layers=nlayers) 26 | else: 27 | try: 28 | self.nonlinearity = {'RNN_RELU':'relu', 'RNN_TANH':'tanh'}[rnn_type] 29 | except KeyError: 30 | raise ValueError("""only support ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""") 31 | self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=self.nonlinearity, dropout=dropout) 32 | # decoder, a simple linear layer: 33 | # ntoken就是vocab size,所以输出维度要这么定。实际上,训练LM,就是next word prediction 34 | self.decoder = nn.Linear(in_features=nhid, out_features=ntoken) 35 | 36 | self.rnn_type = rnn_type 37 | self.nhid = nhid 38 | self.nlayers = nlayers 39 | 40 | def init_weights(self): 41 | initrange = 0.1 42 | # Q: why we should manually initialize encoder and decoder? 43 | nn.init.uniform_(self.encoder.weight, -initrange, initrange) 44 | nn.init.zeros_(self.decoder.weight) # Why? 45 | nn.init.uniform_(self.decoder.weight, -initrange, initrange) 46 | 47 | def forward(self, input, hidden): 48 | """ 49 | :param input: 50 | :param hidden: init hidden state to RNN (h_0) 51 | :return: 52 | """ 53 | emb = self.dropout_layer(self.encoder(input)) 54 | # output: the hiddens of n tokens; hidden: the last hidden state (h_n) 55 | output, hidden = self.rnn(emb, hidden) 56 | # 注意输入decoder的是n个time steps的hidden,所以整个模型的输入的seq_len是多长,输出就会有多长 57 | # 这也是seq2seq一般的做法,每个timestep的loss加起来组成整体的loss。 58 | # 只不过这里的decoder就是一个简单的Linear,所以长度必须跟输入保持相同。如果单独一个RNN作为decoder,就可以长度不同了。 59 | decoded = self.decoder(output) 60 | decoded = decoded.view(-1, self.ntoken) 61 | return F.log_softmax(decoded, dim=1), hidden # Why log(softmax(x))? 62 | 63 | def init_hidden(self, bsz): 64 | weight = next(self.parameters()) 65 | if self.rnn_type == 'LSTM': 66 | return (weight.new_zeros(self.nlayers, bsz, self.nhid), 67 | weight.new_zeros(self.nlayers, bsz, self.nhid)) 68 | else: 69 | return weight.new_zeros(self.nlayers, bsz, self.nhid) 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /HuggingfaceNLP/C1. Start Playing Transformers/1. 直接使用pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pipeline\n", 8 | "Pipeline是Huggingface的一个基本工具,可以理解为一个端到端(end-to-end)的一键调用Transformer模型的工具。\n", 9 | "\n", 10 | "It connects a model with its necessary preprocessing and postprocessing steps, allowing us to directly input any text and get an intelligible answer.\n", 11 | "\n", 12 | "给定一个任务之后,pipeline会自动调用一个预训练好的模型,然后根据你给的输入执行下面三个步骤:\n", 13 | "1. 预处理输入文本,让它可被模型读取\n", 14 | "2. 模型处理\n", 15 | "3. 
模型输出的后处理,让预测结果可读\n", 16 | "\n", 17 | "一个例子如下:" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "from transformers import pipeline\n", 27 | "\n", 28 | "clf = pipeline('sentiment-analysis')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 4, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/plain": [ 39 | "[{'label': 'POSITIVE', 'score': 0.9998709559440613}]" 40 | ] 41 | }, 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "output_type": "execute_result" 45 | } 46 | ], 47 | "source": [ 48 | "clf('Haha, today is a nice day!')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "还可以直接接受多个句子,一起预测:" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 13, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "[{'label': 'POSITIVE', 'score': 0.9998160600662231},\n", 67 | " {'label': 'POSITIVE', 'score': 0.9998552799224854},\n", 68 | " {'label': 'NEGATIVE', 'score': 0.999782383441925}]" 69 | ] 70 | }, 71 | "execution_count": 13, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "clf(['good','nice','bad'])" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "pipeline支持的task包括:\n", 85 | "\n", 86 | "- \"feature-extraction\": will return a FeatureExtractionPipeline.\n", 87 | "- \"text-classification\": will return a TextClassificationPipeline.\n", 88 | "- \"sentiment-analysis\": (alias of \"text-classification\") will return a TextClassificationPipeline.\n", 89 | "- \"token-classification\": will return a TokenClassificationPipeline.\n", 90 | "- \"ner\" (alias of \"token-classification\"): will return a TokenClassificationPipeline.\n", 91 | "- \"question-answering\": will return a QuestionAnsweringPipeline.\n", 92 | "- \"fill-mask\": will return a FillMaskPipeline.\n", 93 | "- \"summarization\": will return a SummarizationPipeline.\n", 94 | "- \"translation_xx_to_yy\": will return a TranslationPipeline.\n", 95 | "- \"text2text-generation\": will return a Text2TextGenerationPipeline.\n", 96 | "- \"text-generation\": will return a TextGenerationPipeline.\n", 97 | "- \"zero-shot-classification:: will return a ZeroShotClassificationPipeline.\n", 98 | "- \"conversational\": will return a ConversationalPipeline." 
99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "## Have a try: Zero-shot-classification\n", 106 | "零样本学习,就是训练一个可以预测任何标签的模型,这些标签可以不出现在训练集中。\n", 107 | "\n", 108 | "一种零样本学习的方法,就是通过NLI(文本蕴含)任务,训练一个推理模型,比如这个例子:\n", 109 | "```python\n", 110 | "premise = 'Who are you voting for in 2020?'\n", 111 | "hypothesis = 'This text is about politics.'\n", 112 | "```\n", 113 | "上面有一个前提(premise)和一个假设(hypothesis),NLI任务就是去预测,在这个premise下,hypothesis是否成立。\n", 114 | "\n", 115 | "通过这样的训练,我们可以直接把hypothesis中的politics换成其他词儿,就可以实现zero-shot-learning了。\n", 116 | "\n", 117 | "NLI任务的解释:it classifies if two sentences are logically linked across three labels (contradiction, neutral, entailment) — a task also called natural language inference.\n", 118 | "\n", 119 | "参考阅读:\n", 120 | "- 官方 Zero-shot-classification Pipeline文档:https://huggingface.co/transformers/main_classes/pipelines.html#transformers.ZeroShotClassificationPipeline\n", 121 | "- 零样本学习简介:https://mp.weixin.qq.com/s/6aBzR0O3pwA8-btsuDX82g" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "clf = pipeline('zero-shot-classification')" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 21, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/plain": [ 141 | "[{'sequence': 'A helicopter is flying in the sky',\n", 142 | " 'labels': ['machine', 'animal'],\n", 143 | " 'scores': [0.9938627481460571, 0.006137280724942684]},\n", 144 | " {'sequence': 'A bird is flying in the sky',\n", 145 | " 'labels': ['animal', 'machine'],\n", 146 | " 'scores': [0.9987970590591431, 0.0012029369827359915]}]" 147 | ] 148 | }, 149 | "execution_count": 21, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "clf(sequences=[\"A helicopter is flying in the sky\",\n", 156 | " \"A bird is flying in the sky\"],\n", 157 | " candidate_labels=['animal','machine']) # labels可以完全自定义" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## Have a try: Text Generation" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 27, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "application/vnd.jupyter.widget-view+json": { 175 | "model_id": "0d84006ae024439fb571c12e15825b9e", 176 | "version_major": 2, 177 | "version_minor": 0 178 | }, 179 | "text/plain": [ 180 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=357.0, style=ProgressStyle(description_…" 181 | ] 182 | }, 183 | "metadata": {}, 184 | "output_type": "display_data" 185 | }, 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "\n" 191 | ] 192 | }, 193 | { 194 | "data": { 195 | "application/vnd.jupyter.widget-view+json": { 196 | "model_id": "b6e2a89ad3b4447582c1446c10cfd9f0", 197 | "version_major": 2, 198 | "version_minor": 0 199 | }, 200 | "text/plain": [ 201 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=616.0, style=ProgressStyle(description_…" 202 | ] 203 | }, 204 | "metadata": {}, 205 | "output_type": "display_data" 206 | }, 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "generator = pipeline('text-generation', model='liam168/chat-DialoGPT-small-zh') # 默认使用gpt2,也可以指定模型" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | 
"execution_count": 43, 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "name": "stderr", 226 | "output_type": "stream", 227 | "text": [ 228 | "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" 229 | ] 230 | }, 231 | { 232 | "data": { 233 | "text/plain": [ 234 | "[{'generated_text': '上午上班吧'}]" 235 | ] 236 | }, 237 | "execution_count": 43, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "generator('上午')" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "## Have a try: Mask Filling" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 46, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "data": { 260 | "application/vnd.jupyter.widget-view+json": { 261 | "model_id": "03b6c5c4b57c4e76917967705df678cb", 262 | "version_major": 2, 263 | "version_minor": 0 264 | }, 265 | "text/plain": [ 266 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…" 267 | ] 268 | }, 269 | "metadata": {}, 270 | "output_type": "display_data" 271 | }, 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 | "unmasker = pipeline('fill-mask')" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 52, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": [ 292 | "[{'sequence': 'What the heck?',\n", 293 | " 'score': 0.3783760964870453,\n", 294 | " 'token': 17835,\n", 295 | " 'token_str': ' heck'},\n", 296 | " {'sequence': 'What the hell?',\n", 297 | " 'score': 0.32931089401245117,\n", 298 | " 'token': 7105,\n", 299 | " 'token_str': ' hell'},\n", 300 | " {'sequence': 'What the fuck?',\n", 301 | " 'score': 0.14645449817180634,\n", 302 | " 'token': 26536,\n", 303 | " 'token_str': ' fuck'}]" 304 | ] 305 | }, 306 | "execution_count": 52, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "unmasker('What the ?', top_k=3) # 注意不同的模型,MASK token可能不一样,不一定都是 " 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "## 更多的Task,见官方教程\n", 320 | "https://huggingface.co/course/chapter1/3?fw=pt" 321 | ] 322 | } 323 | ], 324 | "metadata": { 325 | "kernelspec": { 326 | "display_name": "Python 3 (ipykernel)", 327 | "language": "python", 328 | "name": "python3" 329 | }, 330 | "language_info": { 331 | "codemirror_mode": { 332 | "name": "ipython", 333 | "version": 3 334 | }, 335 | "file_extension": ".py", 336 | "mimetype": "text/x-python", 337 | "name": "python", 338 | "nbconvert_exporter": "python", 339 | "pygments_lexer": "ipython3", 340 | "version": "3.9.2" 341 | } 342 | }, 343 | "nbformat": 4, 344 | "nbformat_minor": 4 345 | } 346 | -------------------------------------------------------------------------------- /HuggingfaceNLP/C1. Start Playing Transformers/2. Transformer家族及基本概念.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Transformer大家族\n", 8 | "\n", 9 | "## 1. 
Transformer结构\n", 10 | "\n", 11 | "Transformer结构最初就是在大2017年名鼎鼎的《Attention Is All You Need》论文中提出的,最开始是用于机器翻译任务。\n", 12 | "\n", 13 | "这里先简单回顾一下Transformer的基本结构:\n", 14 | "\n", 15 | "\n", 16 | "\n", 17 | "- 左边是encoder,用于对输入的sequence进行表示,得到一个很好特征向量。\n", 18 | "- 右边是decoder,利用encoder得到的特征,以及原始的输入,进行新的sequence的生成。\n", 19 | "\n", 20 | "encoder、decoder既可以单独使用,又可以再一起使用,因此,基于Transformer的模型可以分为三大类:\n", 21 | "\n", 22 | "- Encoder-only\n", 23 | "- Decoder-only\n", 24 | "- Encoder-Decoder\n", 25 | "\n", 26 | "\n", 27 | "## 2. Transformer家族\n", 28 | "\n", 29 | "随后各种基于Transformer结构的模型就如雨后春笋般涌现出来,教程中有一张图展示了一些主要模型的时间轴:\n", 30 | "\n", 31 | "\n", 32 | "\n", 33 | "虽然模型多到四只jio都数不过来,但总体上可以分为三个阵营,分别有三个组长:\n", 34 | "\n", 35 | "- 组长1:**BERT**。组员都是BERT类似的结构,是一类**自编码模型**。\n", 36 | "- 组长2:**GPT**。组员都是类似GPT的结构,是一类**自回归模型**。\n", 37 | "- 组长3:**BART/T5**。组员结构都差不多是**encoder-decoder**模型。\n", 38 | "\n", 39 | "### 不同的架构,不同的预训练方式,不同的特长\n", 40 | "\n", 41 | "对于**Encoder-only**的模型,预训练任务通常是“破坏一个句子,然后让模型去预测或填补”。例如BERT中使用的就是两个预训练任务就是**Masked language modeling**和**Next sentence prediction**。\n", 42 | "因此,这类模型擅长进行文本表示,适用于做**文本的分类、实体识别、关键信息抽取**等任务。\n", 43 | "\n", 44 | "对于**Decoder-only**的模型,预训练任务通常是**Next word prediction**,这种方式又被称为**Causal language modeling**。这个Causal就是“因果”的意思,对于decoder,它在训练时是无法看到全文的,只能看到前面的信息。\n", 45 | "因此这类模型适合做**文本生成**任务。\n", 46 | "\n", 47 | "而**Seq2seq**架构,由于包含了encoder和decoder,所以预训练的目标通常是融合了各自的目标,但通常还会设计一些更加复杂的目标,比如对于T5模型,会把一句话中一片区域的词都mask掉,然后让模型去预测。seq2seq架构的模型,就适合做**翻译、对话**等需要根据给定输入来生成输出的任务,这跟decoder-only的模型还是有很大差别的。\n", 48 | "\n", 49 | "### 总结表如下:" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "|类型|架构|Transformer组件 |\tExamples |\tTasks|\n", 57 | "|--|---|--- |\t--- |\t---|\n", 58 | "|**BERT**-like | auto-encoding models|\tEncoder |\t\tALBERT, BERT, DistilBERT, ELECTRA, RoBERTa | \tSentence classification, named entity recognition, extractive question answering|\n", 59 | "|**GPT**-like | auto-regressive models |\tDecoder |\t\tCTRL, GPT, GPT-2, Transformer XL |\t \tText generation|\n", 60 | "|**BART/T5**-like | sequence-to-sequence models|\tEncoder-decoder |\t\tBART, T5, Marian, mBART |\t \tSummarization, translation, generative question answering|\n" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "---\n", 68 | "\n", 69 | "本部分对应的官方链接:\n", 70 | "https://huggingface.co/course/chapter1/4?fw=pt" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "Python 3 (ipykernel)", 84 | "language": "python", 85 | "name": "python3" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 3 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython3", 97 | "version": "3.9.2" 98 | } 99 | }, 100 | "nbformat": 4, 101 | "nbformat_minor": 4 102 | } 103 | -------------------------------------------------------------------------------- /HuggingfaceNLP/C1. Start Playing Transformers/3. 端到端的背后.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 端到端的背后到底是怎么处理的\n", 8 | "\n", 9 | "Pipeline的背后:\\\n", 10 | "" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## 1. 
Tokenizer\n", 18 | "\n", 19 | "我们使用的tokenizer必须跟对应的模型在预训练时的tokenizer保持一致,也就是词表需要一致。\\\n", 20 | "Huggingface中可以直接指定模型的checkpoint的名字,然后自动下载对应的词表。\\\n", 21 | "具体方式是:\n", 22 | "- 使用`AutoTokenizer`的`from_pretrained`方法\n", 23 | "\n", 24 | "`tokenizer`这个对象可以直接接受参数并输出结果,即它是callable的。具体参数见:\\\n", 25 | "https://huggingface.co/transformers/master/internal/tokenization_utils.html#transformers.tokenization_utils_base.PreTrainedTokenizerBase \\\n", 26 | "主要参数包括:\n", 27 | "- text,可以是单条的string,也可以是一个string的list,还可以是list的list\n", 28 | "- padding,用于填白\n", 29 | "- truncation,用于截断\n", 30 | "- max_length,设置最大句长\n", 31 | "- return_tensors,设置返回数据类型" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "from transformers import AutoTokenizer\n", 41 | "\n", 42 | "checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'\n", 43 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "先看看直接使用tokenizer的结果:" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 13, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/plain": [ 61 | "{'input_ids': [[101, 2651, 2003, 1037, 2204, 2154, 999, 15854, 1066, 1066, 1066, 102], [101, 2129, 2055, 4826, 1029, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}" 62 | ] 63 | }, 64 | "execution_count": 13, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "raw_inputs = ['Today is a good day! Woo~~~',\n", 71 | " 'How about tomorrow?']\n", 72 | "tokenizer(raw_inputs)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "可以加上一个 `padding=Ture` 参数,让得到的序列长度对齐:" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 14, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "{'input_ids': [[101, 2651, 2003, 1037, 2204, 2154, 999, 15854, 1066, 1066, 1066, 102], [101, 2129, 2055, 4826, 1029, 102, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]}" 91 | ] 92 | }, 93 | "execution_count": 14, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "tokenizer(raw_inputs, padding=True)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "tokenizer还有`truncation`和`max_length`属性,用于在max_length处截断:" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 19, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "{'input_ids': [[101, 2651, 2003, 1037, 2204, 2154, 102], [101, 2129, 2055, 4826, 1029, 102, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0]]}" 118 | ] 119 | }, 120 | "execution_count": 19, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "tokenizer(raw_inputs, padding=True, truncation=True, max_length=7) " 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "`return_tensors`属性也很重要,用来指定返回的是什么类型的tensors,`pt`就是pytorch,`tf`就是tensorflow:" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 22, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "{'input_ids': tensor([[ 101, 2651, 2003, 1037, 2204, 2154, 999, 15854, 
1066, 1066,\n", 145 | " 1066, 102],\n", 146 | " [ 101, 2129, 2055, 4826, 1029, 102, 0, 0, 0, 0,\n", 147 | " 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", 148 | " [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])}" 149 | ] 150 | }, 151 | "execution_count": 22, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "tokenizer(raw_inputs, padding=True, truncation=True, return_tensors='pt')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## 2. Model\n", 165 | "也可以通过AutoModel来直接从checkpoint导入模型。\\\n", 166 | "这里导入的模型,是Transformer的基础模型,接受tokenize之后的输入,输出hidden states,即文本的向量表示,是一种上下文表示。\n", 167 | "\n", 168 | "这个向量表示,会有三个维度:\n", 169 | "1. batch size\n", 170 | "2. sequence length\n", 171 | "3. hidden size" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 23, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "from transformers import AutoModel\n", 181 | "model = AutoModel.from_pretrained(checkpoint)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "加载了模型之后,就可以把tokenizer得到的输出,直接输入到model中:" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 40, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "dict_keys(['last_hidden_state', 'hidden_states', 'attentions'])" 200 | ] 201 | }, 202 | "execution_count": 40, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors='pt')\n", 209 | "outputs = model(**inputs) # 这里变量前面的**,代表把inputs这个dictionary给分解成一个个参数单独输进去\n", 210 | "vars(outputs).keys() # 查看一下输出有哪些属性" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | ">这里顺便讲一讲这个函数中`**`的用法:\n", 218 | "\n", 219 | "`**`在函数中的作用就是把后面紧跟着的这个参数,从一个字典的格式,解压成一个个单独的参数。\n", 220 | "\n", 221 | "回顾一下上面tokenizer的输出,我们发现它是一个包含了input_ids和attention_mask两个key的**字典**,因此通过`**`的解压,相当于变成了`intput_ids=..., attention_mask=...`喂给函数。\n", 222 | "\n", 223 | "我们再来查看一下通过AutoModel加载的DistillBertModel模型的输入:\n", 224 | "https://huggingface.co/transformers/master/model_doc/distilbert.html#distilbertmodel\n", 225 | "\n", 226 | "可以看到DistillBertModel的直接call的函数是:\n", 227 | "\n", 228 | "`forward(input_ids=None, attention_mask=None, ...)`\n", 229 | "正好跟`**inputs`后的格式对应上。" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 31, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "name": "stdout", 239 | "output_type": "stream", 240 | "text": [ 241 | "torch.Size([2, 12, 768])\n" 242 | ] 243 | }, 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "tensor([[[ 0.4627, 0.3042, 0.5431, ..., 0.3706, 1.0033, -0.6074],\n", 248 | " [ 0.6100, 0.3093, 0.2038, ..., 0.3788, 0.9370, -0.6439],\n", 249 | " [ 0.6514, 0.3185, 0.3855, ..., 0.4152, 1.0199, -0.4450],\n", 250 | " ...,\n", 251 | " [ 0.3674, 0.1380, 1.1619, ..., 0.4976, 0.4758, -0.5896],\n", 252 | " [ 0.4182, 0.2503, 1.0898, ..., 0.4745, 0.4042, -0.5444],\n", 253 | " [ 1.1614, 0.2516, 0.9561, ..., 0.5742, 0.8437, -0.9604]],\n", 254 | "\n", 255 | " [[ 0.7956, -0.2343, 0.3810, ..., -0.1270, 0.5182, -0.1612],\n", 256 | " [ 0.9337, 0.2074, 0.6202, ..., 0.1874, 0.6584, -0.1899],\n", 257 | " [ 0.6279, -0.3176, 0.1596, ..., -0.2956, 0.2960, -0.1447],\n", 258 | " ...,\n", 259 | " [ 0.3050, 0.0396, 0.6345, ..., 0.4271, 0.3367, -0.3285],\n", 260 | " [ 
0.1773, 0.0111, 0.6275, ..., 0.3831, 0.3543, -0.2919],\n", 261 | " [ 0.2756, 0.0048, 0.9281, ..., 0.2006, 0.4375, -0.3238]]],\n", 262 | " grad_fn=)" 263 | ] 264 | }, 265 | "execution_count": 31, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "print(outputs.last_hidden_state.shape)\n", 272 | "outputs.last_hidden_state" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "可以看到,输出的shape是`torch.Size([2, 12, 768])`,三个维度分别是 batch,seq_len和hidden size。" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 41, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "odict_keys(['last_hidden_state'])" 291 | ] 292 | }, 293 | "execution_count": 41, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "outputs.keys() # 这里查看发现只有 last_hidden_state, 因为其他的值都是None" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "## 3. Model Heads\n", 307 | "模型头,接在基础模型的后面,用于将hidden states文本表示进一步处理,用于具体的任务。\n", 308 | "\n", 309 | "整体框架图:\\\n", 310 | "\n", 311 | "\n", 312 | "Head一般是由若干层的线性层来构成的。\n", 313 | "\n", 314 | "Transformers库中的主要模型架构有:\n", 315 | "- *Model (retrieve the hidden states)\n", 316 | "- *ForCausalLM\n", 317 | "- *ForMaskedLM\n", 318 | "- *ForMultipleChoice\n", 319 | "- *ForQuestionAnswering\n", 320 | "- *ForSequenceClassification\n", 321 | "- *ForTokenClassification\n", 322 | "- ...\n", 323 | "\n", 324 | "单纯的`*Model`,就是不包含 Head 的模型,而有`For*`的则是包含了具体 Head 的模型。\n", 325 | "\n", 326 | "例如,对于前面的那个做在情感分析上pretrain的checkpoint(distilbert-base-uncased-finetuned-sst-2-english),我们可以使用包含 SequenceClassification 的Head的模型去加载,就可以直接得到对应分类问题的logits,而不仅仅是文本向量表示。" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 43, 332 | "metadata": {}, 333 | "outputs": [ 334 | { 335 | "name": "stdout", 336 | "output_type": "stream", 337 | "text": [ 338 | "dict_keys(['loss', 'logits', 'hidden_states', 'attentions'])\n" 339 | ] 340 | }, 341 | { 342 | "data": { 343 | "text/plain": [ 344 | "tensor([[-4.2098, 4.6444],\n", 345 | " [ 0.6367, -0.3753]], grad_fn=)" 346 | ] 347 | }, 348 | "execution_count": 43, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "from transformers import AutoModelForSequenceClassification\n", 355 | "clf = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n", 356 | "inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors='pt')\n", 357 | "outputs = clf(**inputs)\n", 358 | "print(vars(outputs).keys())\n", 359 | "outputs.logits" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "从outputs的属性就可以看出,带有Head的Model,跟不带Head的Model,输出的东西是不一样的。\n", 367 | "\n", 368 | "没有Head的Model,输出的是`'last_hidden_state', 'hidden_states', 'attentions'`这些玩意儿,因为它仅仅是一个表示模型;\n", 369 | "\n", 370 | "而有Head的Model,输出的是`'loss', 'logits', 'hidden_states', 'attentions'`这些玩意儿,有logits,loss这些东西,因为它是一个完整的预测模型了。\n", 371 | "\n", 372 | "可以顺便看看,加了这个 SequenceClassification Head的DistillBertModel的文档,看看其输入和输出:\n", 373 | "\n", 374 | "https://huggingface.co/transformers/master/model_doc/distilbert.html#distilbertforsequenceclassification\n", 375 | "\n", 376 | "可以看到,输入中,我们还可以提供`labels`,这样就可以直接计算loss了。" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "## 4. 
Post-Processing\n", 384 | "后处理主要就是两步:\n", 385 | "- 把logits转化成概率值 (用softmax)\n", 386 | "- 把概率值跟具体的标签对应上 (使用模型的config中的id2label)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 46, 392 | "metadata": {}, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/plain": [ 397 | "tensor([[1.4276e-04, 9.9986e-01],\n", 398 | " [7.3341e-01, 2.6659e-01]], grad_fn=)" 399 | ] 400 | }, 401 | "execution_count": 46, 402 | "metadata": {}, 403 | "output_type": "execute_result" 404 | } 405 | ], 406 | "source": [ 407 | "import torch\n", 408 | "predictions = torch.nn.functional.softmax(outputs.logits, dim=-1) # dim=-1就是沿着最后一维进行操作\n", 409 | "predictions" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "得到了概率分布,还得知道具体是啥标签吧。标签跟id的隐射关系,也已经被保存在每个pretrain model的config中了,\n", 417 | "我们可以去模型的`config`属性中查看`id2label`字段:" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 54, 423 | "metadata": {}, 424 | "outputs": [ 425 | { 426 | "data": { 427 | "text/plain": [ 428 | "{0: 'NEGATIVE', 1: 'POSITIVE'}" 429 | ] 430 | }, 431 | "execution_count": 54, 432 | "metadata": {}, 433 | "output_type": "execute_result" 434 | } 435 | ], 436 | "source": [ 437 | "id2label = clf.config.id2label\n", 438 | "id2label" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 58, 444 | "metadata": {}, 445 | "outputs": [ 446 | { 447 | "name": "stdout", 448 | "output_type": "stream", 449 | "text": [ 450 | "POSITIVE\n", 451 | "NEGATIVE\n" 452 | ] 453 | } 454 | ], 455 | "source": [ 456 | "for i in torch.argmax(predictions, dim=-1):\n", 457 | " print(id2label[i.item()])" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [] 466 | } 467 | ], 468 | "metadata": { 469 | "kernelspec": { 470 | "display_name": "Python 3 (ipykernel)", 471 | "language": "python", 472 | "name": "python3" 473 | }, 474 | "language_info": { 475 | "codemirror_mode": { 476 | "name": "ipython", 477 | "version": 3 478 | }, 479 | "file_extension": ".py", 480 | "mimetype": "text/x-python", 481 | "name": "python", 482 | "nbconvert_exporter": "python", 483 | "pygments_lexer": "ipython3", 484 | "version": "3.9.2" 485 | } 486 | }, 487 | "nbformat": 4, 488 | "nbformat_minor": 4 489 | } 490 | -------------------------------------------------------------------------------- /HuggingfaceNLP/C1. Start Playing Transformers/4. 
Models & Tokenizers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Models\n", 8 | "\n", 9 | "前面都是使用的`AutoModel`,这是一个智能的wrapper,可以根据你给定的checkpoint名字,自动去寻找对应的网络结构,故名Auto。\n", 10 | "\n", 11 | "如果明确知道我们需要的是什么网络架构,就可以直接使用具体的`*Model`,比如`BertModel`,就是使用Bert结构。" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## 随机初始化一个Transformer模型:通过`config`来加载\n", 19 | "\n", 20 | "`*Config`这个类,用于给出某个模型的网络结构,通过config来加载模型,得到的就是一个模型的架子,没有预训练的权重。" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from transformers import BertModel, BertConfig\n", 30 | "\n", 31 | "config = BertConfig()\n", 32 | "model = BertModel(config) # 模型是根据config来构建的,这时构建的模型是参数随机初始化的" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "BertConfig {\n", 45 | " \"attention_probs_dropout_prob\": 0.1,\n", 46 | " \"gradient_checkpointing\": false,\n", 47 | " \"hidden_act\": \"gelu\",\n", 48 | " \"hidden_dropout_prob\": 0.1,\n", 49 | " \"hidden_size\": 768,\n", 50 | " \"initializer_range\": 0.02,\n", 51 | " \"intermediate_size\": 3072,\n", 52 | " \"layer_norm_eps\": 1e-12,\n", 53 | " \"max_position_embeddings\": 512,\n", 54 | " \"model_type\": \"bert\",\n", 55 | " \"num_attention_heads\": 12,\n", 56 | " \"num_hidden_layers\": 12,\n", 57 | " \"pad_token_id\": 0,\n", 58 | " \"position_embedding_type\": \"absolute\",\n", 59 | " \"transformers_version\": \"4.3.3\",\n", 60 | " \"type_vocab_size\": 2,\n", 61 | " \"use_cache\": true,\n", 62 | " \"vocab_size\": 30522\n", 63 | "}\n", 64 | "\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "print(config)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "更常用的做法则是直接加载预训练模型,然后微调。\n", 77 | "\n", 78 | "## 初始化一个预训练的Transformer模型:通过`from_pretrained`来加载" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "application/vnd.jupyter.widget-view+json": { 89 | "model_id": "96595079a4984858b21cff090d86dc71", 90 | "version_major": 2, 91 | "version_minor": 0 92 | }, 93 | "text/plain": [ 94 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…" 95 | ] 96 | }, 97 | "metadata": {}, 98 | "output_type": "display_data" 99 | }, 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "\n" 105 | ] 106 | }, 107 | { 108 | "data": { 109 | "application/vnd.jupyter.widget-view+json": { 110 | "model_id": "bda0b82bdbd8466693dc28840ad24a1a", 111 | "version_major": 2, 112 | "version_minor": 0 113 | }, 114 | "text/plain": [ 115 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…" 116 | ] 117 | }, 118 | "metadata": {}, 119 | "output_type": "display_data" 120 | }, 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "from transformers import BertModel\n", 131 | "\n", 132 | "model = BertModel.from_pretrained('bert-base-cased')" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "模型的保存:" 140 | ] 141 | }, 142 | { 143 | 
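A hedged sketch of the full save-and-reload round trip (the directory name below is only an example); the actual `save_pretrained` call used in this notebook is in the next cell:

```python
# Sketch: with the transformers 4.x version used in this notebook, save_pretrained
# writes config.json and pytorch_model.bin into the target directory, and the same
# directory path can later be passed back to from_pretrained.
# "my_local_bert" is just an example path.
model.save_pretrained("my_local_bert")
reloaded = BertModel.from_pretrained("my_local_bert")

# The tokenizer should be saved alongside the model so the two stay in sync:
# tokenizer.save_pretrained("my_local_bert")
```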
"cell_type": "code", 144 | "execution_count": 5, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "model.save_pretrained(\"directory_on_my_computer\")\n", 149 | "# 会生成两个文件: config.json pytorch_model.bin" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "# Tokenizer\n", 157 | "transformer模型使用的分词方法,往往不是直接的word-level分词或者char-level分词。\n", 158 | "\n", 159 | "前者会让词表过大,后者则表示能力很低。\n", 160 | "\n", 161 | "因此主流的方式是进行 **subword-level** 的分词。例如对 \"tokenization\" 这个词,可能会被分成 \"token\" 和 \"ization\" 两部分。\n", 162 | "\n", 163 | "常见的subword tokenization方法有:\n", 164 | "- BPE\n", 165 | "- WordPiece\n", 166 | "- Unigram\n", 167 | "- SentencePiece\n", 168 | "- ...\n", 169 | "\n", 170 | "\n", 171 | "这里对BPE做一个简单的介绍,让我们对 sub-word tokenization 的原理有一个基本了解:\n", 172 | "\n", 173 | "## Subword tokenization (☆☆☆)\n", 174 | "Subword tokenization的核心思想是:“频繁出现了词不应该被切分成更小的单位,但不常出现的词应该被切分成更小的单位”。\n", 175 | "\n", 176 | "比方\"annoyingly\"这种词,就不是很常见,但是\"annoying\"和\"ly\"都很常见,因此细分成这两个sub-word就更合理。中文也是类似的,比如“仓库管理系统”作为一个单位就明显在语料中不会很多,因此分成“仓库”和“管理系统”就会好很多。\n", 177 | "\n", 178 | "这样分词的好处在于,大大节省了词表空间,还能够解决OOV问题。因为我们很多使用的词语,都是由更简单的词语或者词缀构成的,我们不用去保存那些“小词”各种排列组合形成的千变万化的“大词”,而用较少的词汇,去覆盖各种各样的词语表示。同时,相比与直接使用最基础的“字”作为词表,sub-word的语义表示能力也更强。\n", 179 | "\n", 180 | "那么,用什么样的标准得到sub-word呢?一个著名的算法就是 **Byte-Pair Encoding (BPE)** :\n", 181 | "\n", 182 | "(下面的内容,主要翻译自Huggingface Docs中讲解tokenizer的部分,十分推荐大家直接阅读: https://huggingface.co/transformers/master/tokenizer_summary.html )\n", 183 | "\n", 184 | "### BPE————Byte-Pair Encoding:\n", 185 | "\n", 186 | "#### **Step1**:首先,我们需要对**语料**进行一个**预分词(pre-tokenization)**:\n", 187 | "\n", 188 | "比方对于英文,我可以直接简单地使用空格加一些标点符号来分词;中文可以使用jieba或者直接字来进行分词。\n", 189 | "\n", 190 | "分词之后,我们就得到了一个**原始词集合**,同时,还会记录每个词在训练语料中出现的**频率**。\n", 191 | "\n", 192 | "假设我们的词集合以及词频是:\n", 193 | "\n", 194 | "```python\n", 195 | "(\"hug\", 10), (\"pug\", 5), (\"pun\", 12), (\"bun\", 4), (\"hugs\", 5)\n", 196 | "```\n", 197 | "\n", 198 | "#### **Step2**:构建**基础词表(base vocab)** 并开始学习 **结合规则(merge rules)**:\n", 199 | "\n", 200 | "\n", 201 | "对于英语来说,我们选择字母来构成**基础词表**:\n", 202 | "\n", 203 | "`[\"b\", \"g\", \"h\", \"n\", \"p\", \"s\", \"u\"]`\n", 204 | "\n", 205 | "注:这个基础词表,就是我们最终词表的初始状态,我们会不断构建新词,加进去,直到达到我们理想的词表规模。\n", 206 | "\n", 207 | "根据这个基础词表,我们可以对原始的词集合进行细粒度分词,并看到基础词的词频:\n", 208 | "\n", 209 | "```python\n", 210 | "(\"h\" \"u\" \"g\", 10), (\"p\" \"u\" \"g\", 5), (\"p\" \"u\" \"n\", 12), (\"b\" \"u\" \"n\", 4), (\"h\" \"u\" \"g\" \"s\", 5)\n", 211 | "```\n", 212 | "\n", 213 | "接下来就是BPE的Byte-Pair核心部分————找symbol pair(符号对)并学习结合规则,即,我们从上面这个统计结果中,找出出现次数最多的那个符号对:\n", 214 | "\n", 215 | "统计一下:\n", 216 | "```python\n", 217 | "h+u 出现了 10+5=15 次\n", 218 | "u+g 出现了 10+5+5 = 20 次\n", 219 | "p+u 出现了 12 次\n", 220 | "...\n", 221 | "```\n", 222 | "统计完毕,我们发现`u+g`出现了最多次,因此,第一个结合规则就是:**把`u`跟`g`拼起来,得到`ug`这个新词!**\n", 223 | "\n", 224 | "那么,我们就把`ug`加入到我们的基础词表:\n", 225 | "\n", 226 | "`[\"b\", \"g\", \"h\", \"n\", \"p\", \"s\", \"u\", \"ug\"]`\n", 227 | "\n", 228 | "同时,词频统计表也变成了:\n", 229 | "```\n", 230 | "(\"h\" \"ug\", 10), (\"p\" \"ug\", 5), (\"p\" \"u\" \"n\", 12), (\"b\" \"u\" \"n\", 4), (\"h\" \"ug\" \"s\", 5)\n", 231 | "```\n", 232 | "\n", 233 | "#### **Step3**:反复地执行上一步,直到达到预设的词表规模。\n", 234 | "\n", 235 | "我们接着统计,发现下一个频率最高的symbol pair是`u+n`,出现了12+4=16次,因此词表中增加`un`这个词;再下一个则是`h+ug`,出现了10+5=15次,因此添加`hug`这个词......\n", 236 | "\n", 237 | "如此进行下去,当达到了预设的`vocab_size`的数目时,就停止,咱们的词表就得到啦!\n", 238 | "\n", 239 | "#### **Step4**:如何分词:\n", 240 | "\n", 241 | 
"得到了最终词表,在碰到一个词汇表中没有的词的时候,比如`bug`就会把它分成`b`和`ug`。也可以理解成,我首先把`bug`分解成最基本的字母,然后根据前面的结合规律,把`u`跟`g`结合起来,而`b`单独一个。具体在分词时候是如何做的,有时间去读读源码。\n", 242 | "\n", 243 | "---\n", 244 | "\n", 245 | "除了BPE,还有一些其他的sub-word分词法,可以参考 https://huggingface.co/transformers/master/tokenizer_summary.html 。\n", 246 | "\n", 247 | "下面,我们就直接使用Tokenizer来进行分词:" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 2, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "from transformers import BertTokenizer # 或者 AutoTokenizer\n", 257 | "\n", 258 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 9, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "text/plain": [ 269 | "{'input_ids': [101, 2052, 1110, 170, 1363, 1285, 1106, 3858, 11303, 1468, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}" 270 | ] 271 | }, 272 | "execution_count": 9, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "s = 'today is a good day to learn transformers'\n", 279 | "tokenizer()" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "## 了解一下内部的具体步骤:\n", 287 | "\n", 288 | "1. `tokenize()`" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 9, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "data": { 298 | "text/plain": [ 299 | "['today', 'is', 'a', 'good', 'day', 'to', 'learn', 'transform', '##ers']" 300 | ] 301 | }, 302 | "execution_count": 9, 303 | "metadata": {}, 304 | "output_type": "execute_result" 305 | } 306 | ], 307 | "source": [ 308 | "s = 'today is a good day to learn transformers'\n", 309 | "tokens = tokenizer.tokenize(s)\n", 310 | "tokens" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "注意这里的分词结果,`transformers`被分成了`transform`和`##ers`。这里的##代表这个词应该紧跟在前面的那个词,组成一个完整的词。\n", 318 | "\n", 319 | "这样设计,主要是为了方面我们在还原句子的时候,可以正确得把sub-word组成成原来的词。" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "2. `convert_tokens_to_ids()`" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 13, 332 | "metadata": {}, 333 | "outputs": [ 334 | { 335 | "data": { 336 | "text/plain": [ 337 | "[2052, 1110, 170, 1363, 1285, 1106, 3858, 11303, 1468]" 338 | ] 339 | }, 340 | "execution_count": 13, 341 | "metadata": {}, 342 | "output_type": "execute_result" 343 | } 344 | ], 345 | "source": [ 346 | "ids = tokenizer.convert_tokens_to_ids(tokens)\n", 347 | "ids" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "3. 
`decode`" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 16, 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "name": "stdout", 364 | "output_type": "stream", 365 | "text": [ 366 | "##ers\n", 367 | "today is a good day to learn transformers\n" 368 | ] 369 | } 370 | ], 371 | "source": [ 372 | "print(tokenizer.decode([1468]))\n", 373 | "print(tokenizer.decode(ids)) # 注意这里会把subword自动拼起来" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "## Special Tokens\n", 381 | "\n", 382 | "观察一下上面的结果,直接call tokenizer得到的ids是:\n", 383 | "```\n", 384 | "[101, 2052, 1110, 170, 1363, 1285, 1106, 3858, 11303, 1468, 102]\n", 385 | "```\n", 386 | "而通过`convert_tokens_to_ids`得到的ids是:\n", 387 | "```\n", 388 | "[2052, 1110, 170, 1363, 1285, 1106, 3858, 11303, 1468]\n", 389 | "```\n", 390 | "可以发现,前者在头和尾多了俩token,id分别是 101 和 102。\n", 391 | "\n", 392 | "decode出来瞅瞅:" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 17, 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "data": { 402 | "text/plain": [ 403 | "'[CLS] today is a good day to learn transformers [SEP]'" 404 | ] 405 | }, 406 | "execution_count": 17, 407 | "metadata": {}, 408 | "output_type": "execute_result" 409 | } 410 | ], 411 | "source": [ 412 | "tokenizer.decode([101, 2052, 1110, 170, 1363, 1285, 1106, 3858, 11303, 1468, 102])" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "它们分别是 `[CLS]` 和 `[SEP]`。这两个token的出现,是因为我们调用的模型,在pre-train阶段使用了它们,所以tokenizer也会使用。\n", 420 | "\n", 421 | "不同的模型使用的special tokens不一定相同,所以一定要让tokenizer跟model保持一致!" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [] 430 | } 431 | ], 432 | "metadata": { 433 | "kernelspec": { 434 | "display_name": "Python 3", 435 | "language": "python", 436 | "name": "python3" 437 | }, 438 | "language_info": { 439 | "codemirror_mode": { 440 | "name": "ipython", 441 | "version": 3 442 | }, 443 | "file_extension": ".py", 444 | "mimetype": "text/x-python", 445 | "name": "python", 446 | "nbconvert_exporter": "python", 447 | "pygments_lexer": "ipython3", 448 | "version": "3.7.6" 449 | } 450 | }, 451 | "nbformat": 4, 452 | "nbformat_minor": 4 453 | } 454 | -------------------------------------------------------------------------------- /HuggingfaceNLP/C1. Start Playing Transformers/5. 
处理多个序列.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# `attention_mask`在处理多个序列时的作用\n", 8 | "\n", 9 | "现在我们训练和预测基本都是批量化处理的,而前面展示的例子很多都是单条数据。单条数据跟多条数据有一些需要注意的地方。\n", 10 | "\n", 11 | "## 处理单个序列\n", 12 | "\n", 13 | "我们首先加载一个在情感分类上微调过的模型,来进行我们的实验(注意,这里我们就不能能使用`AutoModel`,而应该使用`AutoModelFor*`这种带Head的model)。" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from pprint import pprint as print # 这个pprint能让打印的格式更好看一点\n", 23 | "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n", 24 | "checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'\n", 25 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", 26 | "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "对一个句子,使用tokenizer进行处理:" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]]),\n", 46 | " 'input_ids': tensor([[ 101, 2651, 2003, 1037, 3835, 2154, 999, 102]])}\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "s = 'Today is a nice day!'\n", 52 | "inputs = tokenizer(s, return_tensors='pt')\n", 53 | "print(inputs)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "可以看到,这里的inputs包含了两个部分:`input_ids`和`attention_mask`.\n", 61 | "\n", 62 | "模型可以直接接受`input_ids`并输出:" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "tensor([[-4.3232, 4.6906]], grad_fn=)" 74 | ] 75 | }, 76 | "execution_count": 3, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "model(inputs.input_ids).logits" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "也可以通过`**inputs`同时接受`inputs`所有的属性:" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "tensor([[-4.3232, 4.6906]], grad_fn=)" 101 | ] 102 | }, 103 | "execution_count": 4, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "model(**inputs).logits" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "上面两种方式的**结果是一样的**。\n", 117 | "\n", 118 | "## 但是当我们需要同时处理**多个序列**时,情况就有变了!" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 5, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],\n", 131 | " [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),\n", 132 | " 'input_ids': tensor([[ 101, 2651, 2003, 1037, 3835, 2154, 999, 102, 0, 0,\n", 133 | " 0],\n", 134 | " [ 101, 2021, 2054, 2055, 4826, 1029, 10047, 2025, 2469, 1012,\n", 135 | " 102]])}\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "ss = ['Today is a nice day!',\n", 141 | " 'But what about tomorrow? 
Im not sure.']\n", 142 | "inputs = tokenizer(ss, padding=True, return_tensors='pt')\n", 143 | "print(inputs)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "然后,我们试着直接把这里的`input_ids`喂给模型" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 6, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "tensor([[-4.1957, 4.5675],\n", 162 | " [ 3.9803, -3.2120]], grad_fn=)" 163 | ] 164 | }, 165 | "execution_count": 6, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "model(inputs.input_ids).logits # 第一个句子原本的logits是 [-4.3232, 4.6906]" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "发现,第一个句子的`logits`变了!\n", 179 | "\n", 180 | "这是**因为在padding之后,第一个句子的encoding变了,多了很多0, 而self-attention会attend到所有的index的值,因此结果就变了**。\n", 181 | "\n", 182 | "这时,就需要我们不仅仅是传入`input_ids`,还需要给出`attention_mask`,这样模型就会在attention的时候,不去attend被mask掉的部分。\n", 183 | "\n", 184 | "\n", 185 | "因此,在处理多个序列的时候,正确的做法是直接把tokenizer处理好的结果,整个输入到模型中,即直接`**inputs`。\n", 186 | "通过`**inputs`,我们实际上就把`attention_mask`也传进去了:" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 7, 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "tensor([[-4.3232, 4.6906],\n", 198 | " [ 3.9803, -3.2120]], grad_fn=)" 199 | ] 200 | }, 201 | "execution_count": 7, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "model(**inputs).logits" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "现在第一个句子的结果,就跟前面单条处理时的一样了。" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [] 223 | } 224 | ], 225 | "metadata": { 226 | "kernelspec": { 227 | "display_name": "Python 3", 228 | "language": "python", 229 | "name": "python3" 230 | }, 231 | "language_info": { 232 | "codemirror_mode": { 233 | "name": "ipython", 234 | "version": 3 235 | }, 236 | "file_extension": ".py", 237 | "mimetype": "text/x-python", 238 | "name": "python", 239 | "nbconvert_exporter": "python", 240 | "pygments_lexer": "ipython3", 241 | "version": "3.7.6" 242 | } 243 | }, 244 | "nbformat": 4, 245 | "nbformat_minor": 4 246 | } 247 | -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. Fine-tuning Transformers/1. 
数据集预处理.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 数据集的预处理,使用dynamic padding构造batch" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 试着训练一两条样本" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/plain": [ 25 | "True" 26 | ] 27 | }, 28 | "execution_count": 1, 29 | "metadata": {}, 30 | "output_type": "execute_result" 31 | } 32 | ], 33 | "source": [ 34 | "import torch\n", 35 | "torch.cuda.is_available()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stderr", 45 | "output_type": "stream", 46 | "text": [ 47 | "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n", 48 | "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", 49 | "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", 50 | "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n", 51 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification\n", 57 | "\n", 58 | "# Same as before\n", 59 | "checkpoint = \"bert-base-uncased\"\n", 60 | "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", 61 | "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n", 62 | "sequences = [\n", 63 | " \"I've been waiting for a HuggingFace course my whole life.\",\n", 64 | " \"This course is amazing!\",\n", 65 | "]\n", 66 | "batch = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"pt\")" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "batch['labels'] = torch.tensor([1, 1]) # tokenizer出来的结果是一个dictionary,所以可以直接加入新的 key-value\n", 76 | "\n", 77 | "optimizer = AdamW(model.parameters())\n", 78 | "loss = model(**batch).loss #这里的 loss 是直接根据 batch 中提供的 labels 来计算的,回忆:前面章节查看 model 的输出的时候,有loss这一项\n", 79 | "loss.backward()\n", 80 | "optimizer.step()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "## 从Huggingface Hub中加载数据集\n", 88 | "\n", 89 | "MRPC (Microsoft Research Paraphrase Corpus) dataset consists of 5,801 pairs of sentences, with a label indicating if they are paraphrases or not (i.e., if both sentences mean the same thing)." 
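As a small hedged sketch (it assumes `raw_datasets = load_dataset('glue', 'mrpc')` from the next cell has already been run, and the usual GLUE/MRPC schema), this is how a single sentence pair and a human-readable label name can be inspected:

```python
# Sketch: assumes raw_datasets was created with load_dataset('glue', 'mrpc')
# as in the cell below. Each example holds two sentences plus an integer label;
# the ClassLabel feature maps that integer back to a name.
example = raw_datasets["train"][0]
print(example["sentence1"])
print(example["sentence2"])

label_feature = raw_datasets["train"].features["label"]
print(label_feature.names)                      # e.g. ['not_equivalent', 'equivalent']
print(label_feature.int2str(example["label"]))  # name of this pair's label
```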
90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stderr", 99 | "output_type": "stream", 100 | "text": [ 101 | "Reusing dataset glue (C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n" 102 | ] 103 | }, 104 | { 105 | "data": { 106 | "application/vnd.jupyter.widget-view+json": { 107 | "model_id": "14286509d57343f3bc94a8e2f7bb3c64", 108 | "version_major": 2, 109 | "version_minor": 0 110 | }, 111 | "text/plain": [ 112 | " 0%| | 0/3 [00:00 'DatasetDict'\n", 437 | "Docstring:\n", 438 | "Apply a function to all the elements in the table (individually or in batches)\n", 439 | "and update the table (if function does updated examples).\n", 440 | "The transformation is applied to all the datasets of the dataset dictionary.\n", 441 | "```\n", 442 | "\n", 443 | "关于这个map,在Huggingface的测试题中有讲解,这里搬运并翻译一下,辅助理解:\n", 444 | "\n", 445 | "What are the benefits of the Dataset.map method?\n", 446 | "- The results of the function are cached, so it won't take any time if we re-execute the code.\n", 447 | "\n", 448 | " (通过这个map,对数据集的处理会被缓存,所以重新执行代码,也不会再费时间。)\n", 449 | "- It can apply multiprocessing to go faster than applying the function on each element of the dataset.\n", 450 | "\n", 451 | " (它可以使用多进程来处理从而提高处理速度。)\n", 452 | "- It does not load the whole dataset into memory, saving the results as soon as one element is processed.\n", 453 | "\n", 454 | " (它不需要把整个数据集都加载到内存里,同时每个元素一经处理就会马上被保存,因此十分节省内存。)" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "观察一下,这里通过map之后,得到的Dataset的features变多了:\n", 462 | "```python\n", 463 | "features: ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids']\n", 464 | "```\n", 465 | "多的几个columns就是tokenizer处理后的结果。" 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "注意到,在这个`tokenize_function`中,我们没有使用`padding`,因为如果使用了padding之后,就会全局统一对一个maxlen进行padding,这样无论在tokenize还是模型的训练上都不够高效。" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": {}, 478 | "source": [ 479 | "## Dynamic Padding 动态padding\n", 480 | "\n", 481 | "实际上,我们是故意先不进行padding的,因为我们想**在划分batch的时候再进行padding**,这样可以避免出现很多有一堆padding的序列,从而可以显著节省我们的训练时间。\n", 482 | "\n", 483 | "这里,我们就需要用到`DataCollatorWithPadding`,来进行**动态padding**:" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 34, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "from transformers import DataCollatorWithPadding\n", 493 | "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)" 494 | ] 495 | }, 496 | { 497 | "cell_type": "markdown", 498 | "metadata": {}, 499 | "source": [ 500 | "注意,我们需要使用tokenizer来初始化这个`DataCollatorWithPadding`,因为需要tokenizer来告知具体的padding token是啥,以及padding的方式是在左边还是右边(不同的预训练模型,使用的padding token以及方式可能不同)。\n" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "下面假设我们要搞一个size=5的batch,看看如何使用`DataCollatorWithPadding`来实现:" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": 61, 513 | "metadata": {}, 514 | "outputs": [ 515 | { 516 | "data": { 517 | "text/plain": [ 518 | "[50, 59, 47, 67, 59]" 519 | ] 520 | }, 521 | "execution_count": 61, 522 | "metadata": {}, 523 | "output_type": "execute_result" 524 | } 525 | ], 526 | "source": [ 527 | "samples = tokenized_datasets['train'][:5]\n", 528 | "samples.keys()\n", 
529 | "# >>> ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids']\n", 530 | "samples = {k:v for k,v in samples.items() if k not in [\"idx\", \"sentence1\", \"sentence2\"]} # 把这里多余的几列去掉\n", 531 | "samples.keys()\n", 532 | "# >>> ['attention_mask', 'input_ids', 'label', 'token_type_ids']\n", 533 | "\n", 534 | "# 打印出每个句子的长度:\n", 535 | "[len(x) for x in samples[\"input_ids\"]]" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": 57, 541 | "metadata": {}, 542 | "outputs": [ 543 | { 544 | "data": { 545 | "text/plain": [ 546 | "[67, 67, 67, 67, 67]" 547 | ] 548 | }, 549 | "execution_count": 57, 550 | "metadata": {}, 551 | "output_type": "execute_result" 552 | } 553 | ], 554 | "source": [ 555 | "batch = data_collator(samples) # samples中必须包含 input_ids 字段,因为这就是collator要处理的对象\n", 556 | "batch.keys()\n", 557 | "# >>> dict_keys(['attention_mask', 'input_ids', 'token_type_ids', 'labels'])\n", 558 | "\n", 559 | "# 再打印长度:\n", 560 | "[len(x) for x in batch['input_ids']]" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": {}, 566 | "source": [ 567 | "可以看到,这个`data_collator`就是一个把给定dataset进行padding的工具,其输入跟输出是完全一样的格式。" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": 64, 573 | "metadata": {}, 574 | "outputs": [ 575 | { 576 | "data": { 577 | "text/plain": [ 578 | "{'attention_mask': torch.Size([5, 67]),\n", 579 | " 'input_ids': torch.Size([5, 67]),\n", 580 | " 'token_type_ids': torch.Size([5, 67]),\n", 581 | " 'labels': torch.Size([5])}" 582 | ] 583 | }, 584 | "execution_count": 64, 585 | "metadata": {}, 586 | "output_type": "execute_result" 587 | } 588 | ], 589 | "source": [ 590 | "{k:v.shape for k,v in batch.items()}" 591 | ] 592 | }, 593 | { 594 | "cell_type": "markdown", 595 | "metadata": {}, 596 | "source": [ 597 | "这个batch,可以形成一个tensor了!接下来就可以用于训练了!" 598 | ] 599 | }, 600 | { 601 | "cell_type": "markdown", 602 | "metadata": {}, 603 | "source": [ 604 | "---\n", 605 | "\n", 606 | "对了,这里多提一句,`collator`这个单词实际上在平时使用英语的时候并不常见,但却在编程中见到多次。\n", 607 | "\n", 608 | "最开始一直以为是`collector`,意为“收集者”等意思,后来查了查,发现不是的。下面是柯林斯词典中对`collate`这个词的解释:\n", 609 | "\n", 610 | "> **collate**: \n", 611 | ">\n", 612 | "> When you collate pieces of information, you **gather** them all together and **examine** them. \n", 613 | "\n", 614 | "就是归纳并整理的意思。所以在我们这个情景下,就是对这些杂乱无章长短不一的序列数据,进行一个个地分组,然后检查并统一长度。\n", 615 | "\n", 616 | "关于DataCollator更多的信息,可以参见文档:\n", 617 | "https://huggingface.co/transformers/master/main_classes/data_collator.html?highlight=datacollatorwithpadding#data-collator" 618 | ] 619 | } 620 | ], 621 | "metadata": { 622 | "kernelspec": { 623 | "display_name": "Python 3", 624 | "language": "python", 625 | "name": "python3" 626 | }, 627 | "language_info": { 628 | "codemirror_mode": { 629 | "name": "ipython", 630 | "version": 3 631 | }, 632 | "file_extension": ".py", 633 | "mimetype": "text/x-python", 634 | "name": "python", 635 | "nbconvert_exporter": "python", 636 | "pygments_lexer": "ipython3", 637 | "version": "3.7.6" 638 | } 639 | }, 640 | "nbformat": 4, 641 | "nbformat_minor": 4 642 | } 643 | -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. Fine-tuning Transformers/2. 
使用Trainer API来fine-tune.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 使用Trainer API来微调模型" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 1. 数据集准备和预处理:\n", 15 | "\n", 16 | "这部分就是回顾上一集的内容:\n", 17 | "- 通过dataset包加载数据集\n", 18 | "- 加载预训练模型和tokenizer\n", 19 | "- 定义Dataset.map要使用的预处理函数\n", 20 | "- 定义DataCollator来用于构造训练batch" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stderr", 30 | "output_type": "stream", 31 | "text": [ 32 | "Reusing dataset glue (C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n" 33 | ] 34 | }, 35 | { 36 | "data": { 37 | "application/vnd.jupyter.widget-view+json": { 38 | "model_id": "b4bdadebec1b4fa681fd5b7370f11abc", 39 | "version_major": 2, 40 | "version_minor": 0 41 | }, 42 | "text/plain": [ 43 | " 0%| | 0/3 [00:00\n", 209 | " \n", 218 | " \n", 219 | " \n", 220 | " [1377/1377 06:20, Epoch 3/3]\n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | "
Step    Training Loss
500     0.539400
1000    0.319400

" 240 | ], 241 | "text/plain": [ 242 | "" 243 | ] 244 | }, 245 | "metadata": {}, 246 | "output_type": "display_data" 247 | }, 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "TrainOutput(global_step=1377, training_loss=0.35569445984728887, metrics={'train_runtime': 383.0158, 'train_samples_per_second': 3.595, 'total_flos': 530185443455520, 'epoch': 3.0})" 252 | ] 253 | }, 254 | "execution_count": 4, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "trainer.train()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "然后我们用`Trainer`来预测:\n", 268 | "\n", 269 | "`trainer.predict()`函数处理的结果是一个named_tuple,类似一个字典,包含三个属性:predictions, label_ids, metrics\n", 270 | "\n", 271 | "注意,这里的三个属性:\n", 272 | "- `predictions`实际上就是logits\n", 273 | "- `label_ids`不是预测出来的id,而是数据集中自带的ground truth的label id,因此如果输入的数据集中没给标签,这里也不会输出\n", 274 | "- `metrics`,也是只有输入的数据集中提供了`label_ids`才会输出metrics,包括loss之类的指标\n", 275 | "\n", 276 | "其中`metrics`中还可以包含我们自定义的字段,我们需要在定义`Trainer`的时候给定`compute_metrics`参数。\n", 277 | "\n", 278 | "文档参考: https://huggingface.co/transformers/master/main_classes/trainer.html#transformers.Trainer.predict" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 5, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/html": [ 289 | "\n", 290 | "

\n", 291 | " \n", 300 | " \n", 301 | " \n", 302 | " [51/51 00:03]\n", 303 | "
\n", 304 | " " 305 | ], 306 | "text/plain": [ 307 | "" 308 | ] 309 | }, 310 | "metadata": {}, 311 | "output_type": "display_data" 312 | }, 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "(408, 2)\n", 318 | "(408,)\n", 319 | "{'eval_loss': 0.7387174963951111, 'eval_runtime': 3.2872, 'eval_samples_per_second': 124.117}\n" 320 | ] 321 | } 322 | ], 323 | "source": [ 324 | "predictions = trainer.predict(tokenized_datasets['validation'])\n", 325 | "print(predictions.predictions.shape) # logits\n", 326 | "# array([[-2.7887206, 3.1986978],\n", 327 | "# [ 2.5258656, -1.832253 ], ...], dtype=float32)\n", 328 | "print(predictions.label_ids.shape) # array([1, 0, 0, 1, 0, 1, 0, 1, 1, 1, ...], dtype=int64)\n", 329 | "print(predictions.metrics)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "然后就可以用preds和labels来计算一些相关的metrics了。\n", 337 | "\n", 338 | "Huggingface `datasets`里面可以直接导入跟数据集相关的metrics:" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 6, 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "data": { 348 | "text/plain": [ 349 | "{'accuracy': 0.8455882352941176, 'f1': 0.8911917098445595}" 350 | ] 351 | }, 352 | "execution_count": 6, 353 | "metadata": {}, 354 | "output_type": "execute_result" 355 | } 356 | ], 357 | "source": [ 358 | "from datasets import load_metric\n", 359 | "\n", 360 | "preds = np.argmax(predictions.predictions, axis=-1)\n", 361 | "\n", 362 | "metric = load_metric('glue', 'mrpc')\n", 363 | "metric.compute(predictions=preds, references=predictions.label_ids)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "metric,glue type的文档:\n", 371 | "```\n", 372 | "Args:\n", 373 | " predictions: list of predictions to score.\n", 374 | " Each translation should be tokenized into a list of tokens.\n", 375 | " references: list of lists of references for each translation.\n", 376 | " Each reference should be tokenized into a list of tokens.\n", 377 | "Returns: depending on the GLUE subset, one or several of:\n", 378 | " \"accuracy\": Accuracy\n", 379 | " \"f1\": F1 score\n", 380 | " \"pearson\": Pearson Correlation\n", 381 | " \"spearmanr\": Spearman Correlation\n", 382 | " \"matthews_correlation\": Matthew Correlation\n", 383 | "```" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "## 4.构建`Trainer`中的`compute_metrics`函数\n", 391 | "\n", 392 | "Let’s see how we can build a useful compute_metrics function and use it the next time we train. The function must take an EvalPrediction object (which is a named tuple with a predictions field and a label_ids field) and will return a dictionary mapping strings to floats (the strings being the names of the metrics returned, and the floats their values). 
\n", 393 | "\n", 394 | "前面我们注意到`Trainer`的参数中,可以提供一个`compute_metrics`函数,用于输出我们希望有的一些指标。\n", 395 | "\n", 396 | "这个`compute_metrics`有一些输入输出的要求:\n", 397 | "- 输入:是一个`EvalPrediction`对象,是一个named tuple,需要有至少`predictions`和`label_ids`两个字段;经过查看源码,这里的predictions,**就是logits**\n", 398 | "- 输出:一个字典,包含各个metrics和对应的数值。\n", 399 | "\n", 400 | "源码地址: https://huggingface.co/transformers/master/_modules/transformers/trainer.html#Trainer" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 4, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "from datasets import load_metric\n", 410 | "def compute_metrics(eval_preds):\n", 411 | " metric = load_metric(\"glue\", \"mrpc\")\n", 412 | " logits, labels = eval_preds.predictions, eval_preds.label_ids\n", 413 | " # 上一行可以直接简写成:\n", 414 | " # logits, labels = eval_preds 因为它相当于一个tuple\n", 415 | " predictions = np.argmax(logits, axis=-1)\n", 416 | " return metric.compute(predictions=predictions, references=labels)" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "总结一下这个过程:\n", 424 | "\n", 425 | "- 首先我们定义了一个`compute_metrics`函数,交给`Trainer`;\n", 426 | "- `Trainer`训练模型,模型会对样本计算,产生 predictions (logits);\n", 427 | "- `Trainer`再把 predictions 和数据集中给定的 label_ids 打包成一个对象,发送给`compute_metrics`函数;\n", 428 | "- `compute_metrics`函数计算好相应的 metrics 然后返回。" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "## 看看带上了 compute_metrics 之后的训练:" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 5, 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "name": "stderr", 445 | "output_type": "stream", 446 | "text": [ 447 | "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n", 448 | "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", 449 | "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", 450 | "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']\n", 451 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" 452 | ] 453 | }, 454 | { 455 | "data": { 456 | "text/html": [ 457 | "\n", 458 | "
\n", 459 | " \n", 468 | " \n", 469 | " \n", 470 | " [1377/1377 06:51, Epoch 3/3]\n", 471 | "
\n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | "
EpochTraining LossValidation LossAccuracyF1RuntimeSamples Per Second
1No log0.3298150.8676470.9035715.87330069.467000
20.4979000.6006490.8455880.89722717.31970023.557000
30.2832000.6050530.8725490.9103459.24430044.135000

" 514 | ], 515 | "text/plain": [ 516 | "" 517 | ] 518 | }, 519 | "metadata": {}, 520 | "output_type": "display_data" 521 | }, 522 | { 523 | "data": { 524 | "text/plain": [ 525 | "TrainOutput(global_step=1377, training_loss=0.32063739751678666, metrics={'train_runtime': 414.1719, 'train_samples_per_second': 3.325, 'total_flos': 530351810395680, 'epoch': 3.0})" 526 | ] 527 | }, 528 | "execution_count": 5, 529 | "metadata": {}, 530 | "output_type": "execute_result" 531 | } 532 | ], 533 | "source": [ 534 | "training_args = TrainingArguments(output_dir='test_trainer', evaluation_strategy='epoch')\n", 535 | "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) # new model\n", 536 | "trainer = Trainer(\n", 537 | " model,\n", 538 | " training_args,\n", 539 | " train_dataset=tokenized_datasets[\"train\"],\n", 540 | " eval_dataset=tokenized_datasets[\"validation\"],\n", 541 | " data_collator=data_collator, # 在定义了tokenizer之后,其实这里的data_collator就不用再写了,会自动根据tokenizer创建\n", 542 | " tokenizer=tokenizer,\n", 543 | " compute_metrics=compute_metrics\n", 544 | ")\n", 545 | "\n", 546 | "trainer.train()" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": {}, 552 | "source": [ 553 | "可见,带上了`compute_metircs`函数之后,在Trainer训练过程中,会把增加的metric也打印出来,方便我们时刻连接训练的进展。" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [] 562 | } 563 | ], 564 | "metadata": { 565 | "kernelspec": { 566 | "display_name": "Python 3", 567 | "language": "python", 568 | "name": "python3" 569 | }, 570 | "language_info": { 571 | "codemirror_mode": { 572 | "name": "ipython", 573 | "version": 3 574 | }, 575 | "file_extension": ".py", 576 | "mimetype": "text/x-python", 577 | "name": "python", 578 | "nbconvert_exporter": "python", 579 | "pygments_lexer": "ipython3", 580 | "version": "3.7.6" 581 | } 582 | }, 583 | "nbformat": 4, 584 | "nbformat_minor": 4 585 | } 586 | -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. Fine-tuning Transformers/3. 用纯PyTorch来fine-tune.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 更加透明的方式\n", 8 | "\n", 9 | "这里我们不使用Trainer这个高级API,而是用pytorch来实现。\n", 10 | "\n", 11 | "\n", 12 | "## 1. 
数据集预处理\n", 13 | "在Huggingface官方教程里提到,在使用pytorch的dataloader之前,我们需要做一些事情:\n", 14 | "- 把dataset中一些不需要的列给去掉了,比如‘sentence1’,‘sentence2’等\n", 15 | "- 把数据转换成pytorch tensors\n", 16 | "- 修改列名 label 为 labels\n", 17 | "\n", 18 | "其他的都好说,但**为啥要修改列名 label 为 labels,好奇怪哦!**\n", 19 | "这里探究一下:\n", 20 | "\n", 21 | "\n", 22 | "首先,Huggingface的这些transformer Model直接call的时候,接受的标签这个参数是叫\"labels\"。\n", 23 | "所以不管你使用Trainer,还是原生pytorch去写,最终模型处理的时候,肯定是使用的名为\"labels\"的标签参数。\n", 24 | "\n", 25 | "\n", 26 | "但在Huggingface的datasets中,数据集的标签一般命名为\"label\"或者\"label_ids\",那为什么在前两集中,我们没有对标签名进行处理呢?\n", 27 | "\n", 28 | "这一点在transformer的源码`trainer.py`里找到了端倪:\n", 29 | "```python\n", 30 | "# 位置在def _remove_unused_columns函数里\n", 31 | "# Labels may be named label or label_ids, the default data collator handles that.\n", 32 | "signature_columns += [\"label\", \"label_ids\"]\n", 33 | "```\n", 34 | "这里提示了, data collator 会负责处理标签问题。然后我又去查看了`data_collator.py`中发现了一下内容:\n", 35 | "```python\n", 36 | "class DataCollatorWithPadding:\n", 37 | " ...\n", 38 | " def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n", 39 | " ...\n", 40 | " if \"label\" in batch:\n", 41 | " batch[\"labels\"] = batch[\"label\"]\n", 42 | " del batch[\"label\"]\n", 43 | " if \"label_ids\" in batch:\n", 44 | " batch[\"labels\"] = batch[\"label_ids\"]\n", 45 | " del batch[\"label_ids\"]\n", 46 | " return batch\n", 47 | "```\n", 48 | "这就真相大白了:不管数据集中提供的标签名叫\"label\",还是\"label_ids\",\n", 49 | "DataCollatorWithPadding 都会帮你转换成\"labels\",装进batch里,再返回。\n", 50 | "\n", 51 | "前面使用Trainer的时候,DataCollatorWithPadding已经帮我们自动转换了,因此我们不需要操心这个问题。\n", 52 | "\n", 53 | "但这就是让我疑惑的地方:我们使用pytorch来写,其实也不用管这个,因为在pytorch的data_loader里面,有一个`collate_fn`参数,我们可以把DataCollatorWithPadding对象传进去,也会帮我们自动把\"label\"转换成\"labels\"。因此实际上,这应该是教程中的一个错误,我们不需要手动设计。" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 1, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stderr", 63 | "output_type": "stream", 64 | "text": [ 65 | "Reusing dataset glue (C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n" 66 | ] 67 | }, 68 | { 69 | "data": { 70 | "application/vnd.jupyter.widget-view+json": { 71 | "model_id": "b8102a966021470aa4688946db23983f", 72 | "version_major": 2, 73 | "version_minor": 0 74 | }, 75 | "text/plain": [ 76 | " 0%| | 0/3 [00:00), logits=tensor([[-0.2171, -0.4416],\n", 263 | " [-0.2248, -0.4694],\n", 264 | " [-0.2440, -0.4664],\n", 265 | " [-0.2421, -0.4510],\n", 266 | " [-0.2273, -0.4545],\n", 267 | " [-0.2339, -0.4515],\n", 268 | " [-0.2334, -0.4387],\n", 269 | " [-0.2362, -0.4601]], grad_fn=), hidden_states=None, attentions=None)" 270 | ] 271 | }, 272 | "execution_count": 8, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "model(**batch) # 这样的batch可以直接丢进模型处理" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "## optimizer 和 learning rate scheduler" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 9, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "name": "stdout", 295 | "output_type": "stream", 296 | "text": [ 297 | "1377\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "from transformers import AdamW, get_scheduler\n", 303 | "\n", 304 | "optimizer = AdamW(model.parameters(), lr=5e-5)\n", 305 | "\n", 306 | "num_epochs = 3\n", 307 | "num_training_steps = num_epochs * len(train_dataloader) # 
num of batches * num of epochs\n", 308 | "lr_scheduler = get_scheduler(\n", 309 | " 'linear',\n", 310 | " optimizer=optimizer, # scheduler是针对optimizer的lr的\n", 311 | " num_warmup_steps=0,\n", 312 | " num_training_steps=num_training_steps)\n", 313 | "print(num_training_steps)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "## 3. Training" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 10, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "device(type='cuda')" 332 | ] 333 | }, 334 | "execution_count": 10, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "import torch\n", 341 | "\n", 342 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", 343 | "model.to(device)\n", 344 | "\n", 345 | "device" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "## training loops:" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 11, 358 | "metadata": {}, 359 | "outputs": [ 360 | { 361 | "name": "stderr", 362 | "output_type": "stream", 363 | "text": [ 364 | "100%|██████████| 459/459 [01:54<00:00, 4.01it/s]\n", 365 | "100%|██████████| 459/459 [01:55<00:00, 3.98it/s]\n", 366 | "100%|██████████| 459/459 [01:55<00:00, 3.96it/s]\n" 367 | ] 368 | } 369 | ], 370 | "source": [ 371 | "from tqdm import tqdm\n", 372 | "\n", 373 | "for epoch in range(num_epochs):\n", 374 | " for batch in tqdm(train_dataloader):\n", 375 | " # 要在GPU上训练,需要把数据集都移动到GPU上:\n", 376 | " batch = {k:v.to(device) for k,v in batch.items()}\n", 377 | " loss = model(**batch).loss\n", 378 | " loss.backward()\n", 379 | " optimizer.step()\n", 380 | " lr_scheduler.step()\n", 381 | " optimizer.zero_grad()" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "## 4. Evaluation" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 12, 394 | "metadata": {}, 395 | "outputs": [ 396 | { 397 | "data": { 398 | "text/plain": [ 399 | "{'accuracy': 0.8651960784313726, 'f1': 0.9050086355785838}" 400 | ] 401 | }, 402 | "execution_count": 12, 403 | "metadata": {}, 404 | "output_type": "execute_result" 405 | } 406 | ], 407 | "source": [ 408 | "from datasets import load_metric\n", 409 | "\n", 410 | "metric= load_metric(\"glue\", \"mrpc\")\n", 411 | "model.eval()\n", 412 | "for batch in eval_dataloader:\n", 413 | " batch = {k: v.to(device) for k, v in batch.items()}\n", 414 | " with torch.no_grad(): # evaluation的时候不需要算梯度\n", 415 | " outputs = model(**batch)\n", 416 | " \n", 417 | " logits = outputs.logits\n", 418 | " predictions = torch.argmax(logits, dim=-1)\n", 419 | " metric.add_batch(predictions=predictions, references=batch[\"labels\"])\n", 420 | "\n", 421 | "metric.compute()" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "## 5. 使用 Accelerate 库进一步加速\n", 429 | "The training loop we defined earlier works fine on a single CPU or GPU. 
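For reference, a minimal sketch of how the training loop above could be adapted to 🤗 Accelerate (illustrative only, not part of the original notebook; it assumes `accelerate` is installed and reuses `model`, `optimizer`, `lr_scheduler`, `train_dataloader` and `num_epochs` defined earlier):

```python
from accelerate import Accelerator
from tqdm import tqdm

accelerator = Accelerator()
# prepare() wraps the objects for whatever hardware is available (CPU, one or more GPUs, TPU)
model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)

for epoch in range(num_epochs):
    for batch in tqdm(train_dataloader):
        # no manual batch.to(device): prepare() already handles device placement
        loss = model(**batch).loss
        accelerator.backward(loss)   # replaces loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
```
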
But using the 🤗 Accelerate library, with just a few adjustments we can enable distributed training on multiple GPUs or TPUs.\n", 430 | "\n", 431 | "日后再说吧~" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [] 440 | } 441 | ], 442 | "metadata": { 443 | "kernelspec": { 444 | "display_name": "Python 3", 445 | "language": "python", 446 | "name": "python3" 447 | }, 448 | "language_info": { 449 | "codemirror_mode": { 450 | "name": "ipython", 451 | "version": 3 452 | }, 453 | "file_extension": ".py", 454 | "mimetype": "text/x-python", 455 | "name": "python", 456 | "nbconvert_exporter": "python", 457 | "pygments_lexer": "ipython3", 458 | "version": "3.7.6" 459 | } 460 | }, 461 | "nbformat": 4, 462 | "nbformat_minor": 4 463 | } 464 | -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-25-19_PC-201911051016/1632641123.3012567/events.out.tfevents.1632641123.PC-201911051016.50596.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-25-19_PC-201911051016/1632641123.3012567/events.out.tfevents.1632641123.PC-201911051016.50596.1 -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-25-19_PC-201911051016/events.out.tfevents.1632641123.PC-201911051016.50596.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-25-19_PC-201911051016/events.out.tfevents.1632641123.PC-201911051016.50596.0 -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-36-43_PC-201911051016/1632641809.055524/events.out.tfevents.1632641809.PC-201911051016.50596.3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-36-43_PC-201911051016/1632641809.055524/events.out.tfevents.1632641809.PC-201911051016.50596.3 -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-36-43_PC-201911051016/events.out.tfevents.1632641808.PC-201911051016.50596.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-36-43_PC-201911051016/events.out.tfevents.1632641808.PC-201911051016.50596.2 -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-37-55_PC-201911051016/1632641879.1103542/events.out.tfevents.1632641879.PC-201911051016.32468.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. 
Fine-tuning Transformers/runs/Sep26_15-37-55_PC-201911051016/1632641879.1103542/events.out.tfevents.1632641879.PC-201911051016.32468.1 -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-37-55_PC-201911051016/events.out.tfevents.1632641879.PC-201911051016.32468.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-37-55_PC-201911051016/events.out.tfevents.1632641879.PC-201911051016.32468.0 -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-44-26_PC-201911051016/1632642271.2198026/events.out.tfevents.1632642271.PC-201911051016.32468.3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-44-26_PC-201911051016/1632642271.2198026/events.out.tfevents.1632642271.PC-201911051016.32468.3 -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-44-26_PC-201911051016/events.out.tfevents.1632642271.PC-201911051016.32468.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-44-26_PC-201911051016/events.out.tfevents.1632642271.PC-201911051016.32468.2 -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-54-05_PC-201911051016/1632642852.8538904/events.out.tfevents.1632642852.PC-201911051016.3052.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-54-05_PC-201911051016/1632642852.8538904/events.out.tfevents.1632642852.PC-201911051016.3052.1 -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-54-05_PC-201911051016/events.out.tfevents.1632642852.PC-201911051016.3052.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-54-05_PC-201911051016/events.out.tfevents.1632642852.PC-201911051016.3052.0 -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-54-51_PC-201911051016/1632642898.3413022/events.out.tfevents.1632642898.PC-201911051016.3052.3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-54-51_PC-201911051016/1632642898.3413022/events.out.tfevents.1632642898.PC-201911051016.3052.3 -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. 
Fine-tuning Transformers/runs/Sep26_15-54-51_PC-201911051016/events.out.tfevents.1632642898.PC-201911051016.3052.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-54-51_PC-201911051016/events.out.tfevents.1632642898.PC-201911051016.3052.2 -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-55-27_PC-201911051016/1632642935.0711265/events.out.tfevents.1632642935.PC-201911051016.34932.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-55-27_PC-201911051016/1632642935.0711265/events.out.tfevents.1632642935.PC-201911051016.34932.1 -------------------------------------------------------------------------------- /HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-55-27_PC-201911051016/events.out.tfevents.1632642934.PC-201911051016.34932.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-55-27_PC-201911051016/events.out.tfevents.1632642934.PC-201911051016.34932.0 -------------------------------------------------------------------------------- /使用transformers库.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "D:\\Anaconda3\\envs\\torch\\lib\\importlib\\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. 
Expected 192 from C header, got 216 from PyObject\n", 13 | " return f(*args, **kwds)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n", 19 | "import torch" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "application/vnd.jupyter.widget-view+json": { 30 | "model_id": "b81cefe174104cf0a816a2acd4f0f4fd", 31 | "version_major": 2, 32 | "version_minor": 0 33 | }, 34 | "text/plain": [ 35 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=811.0, style=ProgressStyle(description_…" 36 | ] 37 | }, 38 | "metadata": {}, 39 | "output_type": "display_data" 40 | }, 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "\n" 46 | ] 47 | }, 48 | { 49 | "data": { 50 | "application/vnd.jupyter.widget-view+json": { 51 | "model_id": "fa26e4665b1949188359fe88264ccabf", 52 | "version_major": 2, 53 | "version_minor": 0 54 | }, 55 | "text/plain": [ 56 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…" 57 | ] 58 | }, 59 | "metadata": {}, 60 | "output_type": "display_data" 61 | }, 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "\n" 67 | ] 68 | }, 69 | { 70 | "data": { 71 | "application/vnd.jupyter.widget-view+json": { 72 | "model_id": "dd9e12f8e20949b99144cec77a5a6ecb", 73 | "version_major": 2, 74 | "version_minor": 0 75 | }, 76 | "text/plain": [ 77 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466197.0, style=ProgressStyle(descripti…" 78 | ] 79 | }, 80 | "metadata": {}, 81 | "output_type": "display_data" 82 | }, 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "\n" 88 | ] 89 | }, 90 | { 91 | "data": { 92 | "application/vnd.jupyter.widget-view+json": { 93 | "model_id": "05c023a6424147f899156566bdf0f2b9", 94 | "version_major": 2, 95 | "version_minor": 0 96 | }, 97 | "text/plain": [ 98 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…" 99 | ] 100 | }, 101 | "metadata": {}, 102 | "output_type": "display_data" 103 | }, 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "\n" 109 | ] 110 | }, 111 | { 112 | "data": { 113 | "application/vnd.jupyter.widget-view+json": { 114 | "model_id": "978e7f21bbdd4c0bab47a5135768e425", 115 | "version_major": 2, 116 | "version_minor": 0 117 | }, 118 | "text/plain": [ 119 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=333.0, style=ProgressStyle(description_…" 120 | ] 121 | }, 122 | "metadata": {}, 123 | "output_type": "display_data" 124 | }, 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "\n" 130 | ] 131 | }, 132 | { 133 | "data": { 134 | "application/vnd.jupyter.widget-view+json": { 135 | "model_id": "ad7bb71ca98d48319dc3bddedbeedcac", 136 | "version_major": 2, 137 | "version_minor": 0 138 | }, 139 | "text/plain": [ 140 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267866225.0, style=ProgressStyle(descri…" 141 | ] 142 | }, 143 | "metadata": {}, 144 | "output_type": "display_data" 145 | }, 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "ptm_name = \"liam168/c4-zh-distilbert-base-uncased\"\n", 156 | "tokenizer = 
AutoTokenizer.from_pretrained(ptm_name)\n", 157 | "model = AutoModelForSequenceClassification.from_pretrained(ptm_name)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 16, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "classes = [\"女性\",\"体育\",\"文学\",\"校园\"]\n", 167 | "s1 = '女生的成绩往往比男生好'\n", 168 | "s2 = '中国奥运军团在东京奥运会上取得了最多的金牌'" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 19, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "{'input_ids': tensor([[ 101, 1746, 1799, 100, 100, 100, 100, 100, 100, 1755, 100, 100,\n", 181 | " 1763, 1742, 100, 100, 100, 100, 100, 1916, 1964, 100, 102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n", 182 | "tensor([[-1.4325, 3.3059, -0.8026, -1.2892]], grad_fn=)\n", 183 | "[[0.008454332128167152, 0.9659157395362854, 0.015873165801167488, 0.00975674670189619]]\n", 184 | "女性: 1%\n", 185 | "体育: 97%\n", 186 | "文学: 2%\n", 187 | "校园: 1%\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "x = tokenizer(s2, return_tensors='pt') # return_tensors='pt' 让返回的格式为torch tensor\n", 193 | "print(x)\n", 194 | "\n", 195 | "logits = model(**x).logits\n", 196 | "print(logits)\n", 197 | "\n", 198 | "result = torch.softmax(logits, dim=1).tolist() # 需要 dim=1\n", 199 | "print(result)\n", 200 | "\n", 201 | "for i in range(len(classes)):\n", 202 | " print(f'{classes[i]}: {int(round(result[0][i]*100))}%')" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [] 211 | } 212 | ], 213 | "metadata": { 214 | "kernelspec": { 215 | "display_name": "Python 3", 216 | "language": "python", 217 | "name": "python3" 218 | }, 219 | "language_info": { 220 | "codemirror_mode": { 221 | "name": "ipython", 222 | "version": 3 223 | }, 224 | "file_extension": ".py", 225 | "mimetype": "text/x-python", 226 | "name": "python", 227 | "nbconvert_exporter": "python", 228 | "pygments_lexer": "ipython3", 229 | "version": "3.7.6" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 4 234 | } 235 | -------------------------------------------------------------------------------- /李沐PyTorch/1. 
基础操作.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "9d086990", 6 | "metadata": {}, 7 | "source": [ 8 | "# PyTorch基础数据操作" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "5702d925", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import torch" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": "4fe33148", 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/plain": [ 30 | "tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])" 31 | ] 32 | }, 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "x = torch.arange(12)\n", 40 | "x" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "id": "cda9bc46", 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/plain": [ 52 | "torch.Size([12])" 53 | ] 54 | }, 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "output_type": "execute_result" 58 | } 59 | ], 60 | "source": [ 61 | "x.shape" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "id": "0810c186", 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "device(type='cpu')" 74 | ] 75 | }, 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "x.device" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "id": "ae9218fb", 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/plain": [ 94 | "12" 95 | ] 96 | }, 97 | "execution_count": 4, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "x.numel() # number of elements" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "id": "bd016a9f", 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "data": { 114 | "text/plain": [ 115 | "tensor([[ 0, 1, 2, 3],\n", 116 | " [ 4, 5, 6, 7],\n", 117 | " [ 8, 9, 10, 11]])" 118 | ] 119 | }, 120 | "execution_count": 5, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "X = x.reshape(3,4)\n", 127 | "X" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 8, 133 | "id": "33ec4f2d", 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "text/plain": [ 139 | "tensor([[[1., 1., 1., 1.],\n", 140 | " [1., 1., 1., 1.],\n", 141 | " [1., 1., 1., 1.]],\n", 142 | "\n", 143 | " [[1., 1., 1., 1.],\n", 144 | " [1., 1., 1., 1.],\n", 145 | " [1., 1., 1., 1.]]])" 146 | ] 147 | }, 148 | "execution_count": 8, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "torch.zeros((2,3,4))\n", 155 | "torch.ones((2,3,4))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 9, 161 | "id": "62b2cb89", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "tensor([[1, 2, 3],\n", 168 | " [4, 5, 6]])" 169 | ] 170 | }, 171 | "execution_count": 9, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "torch.tensor([[1,2,3],[4,5,6]])" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 14, 183 | "id": "c3a6feb8", 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | 
"(tensor([11., 12., 13.]),\n", 190 | " tensor([10., 20., 30.]),\n", 191 | " tensor([0.1000, 0.2000, 0.3000]),\n", 192 | " tensor([ 10., 100., 1000.]))" 193 | ] 194 | }, 195 | "execution_count": 14, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "# 常见标准运算(+ - * / **)都是按元素运算\n", 202 | "x = torch.tensor([1.0,2,3]) ## 在tensor中任意一个数加一个小数点,就可以把tensor类型转化为float浮点型\n", 203 | "y = torch.tensor([10,10,10])\n", 204 | "x + y, x * y, x / y, y ** x" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 17, 210 | "id": "da26fc28", 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/plain": [ 216 | "(tensor([[0., 1., 2., 3., 4.],\n", 217 | " [5., 6., 7., 8., 9.]]),\n", 218 | " tensor([[0., 0., 0., 0., 0.],\n", 219 | " [0., 0., 0., 0., 0.]]))" 220 | ] 221 | }, 222 | "execution_count": 17, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "# tensor的拼接\n", 229 | "x = torch.arange(10,dtype=torch.float32).reshape(2,5)\n", 230 | "y = torch.zeros(10,dtype=torch.float32).reshape(2,5)\n", 231 | "x,y" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 20, 237 | "id": "37d1b7ef", 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/plain": [ 243 | "(tensor([[0., 1., 2., 3., 4.],\n", 244 | " [5., 6., 7., 8., 9.],\n", 245 | " [0., 0., 0., 0., 0.],\n", 246 | " [0., 0., 0., 0., 0.]]),\n", 247 | " tensor([[0., 1., 2., 3., 4., 0., 0., 0., 0., 0.],\n", 248 | " [5., 6., 7., 8., 9., 0., 0., 0., 0., 0.]]))" 249 | ] 250 | }, 251 | "execution_count": 20, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | } 255 | ], 256 | "source": [ 257 | "torch.cat([x,y],dim=0), torch.cat([x,y],dim=1) # dim=0按行拼接,dim=1按列拼接" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 21, 263 | "id": "ba36dc4e", 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "text/plain": [ 269 | "tensor([[ True, False, False, False, False],\n", 270 | " [False, False, False, False, False]])" 271 | ] 272 | }, 273 | "execution_count": 21, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "# 逻辑运算符\n", 280 | "x == y" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 24, 286 | "id": "7ef8cbb3", 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": [ 292 | "(tensor([[0],\n", 293 | " [1],\n", 294 | " [2]]),\n", 295 | " tensor([[0, 1]]))" 296 | ] 297 | }, 298 | "execution_count": 24, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "# 即时形状不同(但维数得相同),也可以通过“广播机制”来进行运算(按元素)\n", 305 | "# 具体做法是,通过复制得到各自的一个大张量,然后再逐元素运算\n", 306 | "# 比如一个(3,1)一个(1,2),则需要先都统一成(3,2)的矩阵,再计算\n", 307 | "a = torch.arange(3).reshape(3,1)\n", 308 | "b = torch.arange(2).reshape(1,2)\n", 309 | "a,b" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 25, 315 | "id": "7924e822", 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/plain": [ 321 | "tensor([[0, 1],\n", 322 | " [1, 2],\n", 323 | " [2, 3]])" 324 | ] 325 | }, 326 | "execution_count": 25, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "a + b" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 82, 338 | "id": "c7b1c5ce", 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "data": { 343 | "text/plain": [ 
344 | "(tensor([[0, 1],\n", 345 | " [2, 3],\n", 346 | " [4, 5]]),\n", 347 | " tensor([0, 1]),\n", 348 | " tensor([[0],\n", 349 | " [1]]),\n", 350 | " tensor([[0, 1]]))" 351 | ] 352 | }, 353 | "execution_count": 82, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | } 357 | ], 358 | "source": [ 359 | "a = torch.arange(6).reshape(3,2)\n", 360 | "b = torch.arange(2)\n", 361 | "c = b.reshape(2,1)\n", 362 | "d = b.reshape(1,2)\n", 363 | "a,b,c,d" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 85, 369 | "id": "682d063d", 370 | "metadata": {}, 371 | "outputs": [ 372 | { 373 | "data": { 374 | "text/plain": [ 375 | "tensor([[1],\n", 376 | " [3],\n", 377 | " [5]])" 378 | ] 379 | }, 380 | "execution_count": 85, 381 | "metadata": {}, 382 | "output_type": "execute_result" 383 | } 384 | ], 385 | "source": [ 386 | "# * torch.dot torch.matmul torch.mm这些乘法都是怎样的:\n", 387 | "\n", 388 | "a*b\n", 389 | "# tensor([[0, 1],\n", 390 | "# [0, 3],\n", 391 | "# [0, 5]])\n", 392 | "\n", 393 | "torch.dot(a,b)\n", 394 | "# RuntimeError: 1D tensors expected, but got 2D and 1D tensors\n", 395 | " \n", 396 | "torch.matmul(a,b)\n", 397 | "# tensor([1, 3, 5])\n", 398 | "\n", 399 | "torch.mm(a,b)\n", 400 | "# RuntimeError: mat2 must be a matrix\n", 401 | "\n", 402 | "torch.matmul(a,c)\n", 403 | "# tensor([[1],\n", 404 | "# [3],\n", 405 | "# [5]])\n", 406 | "\n", 407 | "torch.mm(a,c)\n", 408 | "# tensor([[1],\n", 409 | "# [3],\n", 410 | "# [5]])\n", 411 | "\n", 412 | "torch.matmul(a,d)\n", 413 | "torch.mm(a,d)\n", 414 | "# RuntimeError: mat1 and mat2 shapes cannot be multiplied (3x2 and 1x2)\n", 415 | "\n", 416 | "torch.matmul(a,d.T)\n", 417 | "torch.mm(a,d.T)\n", 418 | "# tensor([[1],\n", 419 | "# [3],\n", 420 | "# [5]])" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "id": "d28f4c36", 426 | "metadata": {}, 427 | "source": [ 428 | "### 总结一下:\n", 429 | "- `*`就是逐元素相乘,采用broadcast机制,还可以使用torch.mul\n", 430 | "- `.dot`只能用于两个1D向量做内积\n", 431 | "- `.mm`只能用于两个2D的矩阵相乘,必须符合矩阵乘法的规则\n", 432 | "- `.matmul`用途最广泛,兼容性最强。可以对1D,2D以及更高维数据进行乘法。当其中有1D时,采用broadcast,当都是2D时使用矩阵乘法规则。还可以使用 `@`符号。" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "id": "1ea2c991", 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "# 赋值,可以对一个点或者一个区域赋值\n", 443 | "x\n", 444 | "# tensor([[0., 1., 2., 3., 4.],\n", 445 | "# [5., 6., 7., 8., 9.]])\n", 446 | "x[1,2] = 100\n", 447 | "x\n", 448 | "# tensor([[ 0., 1., 2., 3., 4.],\n", 449 | "# [ 5., 6., 100., 8., 9.]])\n", 450 | "x[0, :] = 100\n", 451 | "x\n", 452 | "# tensor([[100., 100., 100., 100., 100.],\n", 453 | "# [ 5., 6., 100., 8., 9.]])" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 33, 459 | "id": "0f796d47", 460 | "metadata": {}, 461 | "outputs": [ 462 | { 463 | "name": "stdout", 464 | "output_type": "stream", 465 | "text": [ 466 | "4694868480\n", 467 | "5230995712\n" 468 | ] 469 | } 470 | ], 471 | "source": [ 472 | "p = torch.arange(5)\n", 473 | "orig_id = id(p) # id是python自带函数,查询变量的内存地址,相当于指针\n", 474 | "p = p + 1 # 这样赋值,会分配新的内存地址,导致占用更多内存\n", 475 | "new_id = id(p)\n", 476 | "print(orig_id)\n", 477 | "print(new_id)" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 34, 483 | "id": "f32f639f", 484 | "metadata": {}, 485 | "outputs": [ 486 | { 487 | "name": "stdout", 488 | "output_type": "stream", 489 | "text": [ 490 | "5230929408\n", 491 | "5230929408\n" 492 | ] 493 | } 494 | ], 495 | "source": [ 496 | "p = torch.arange(5)\n", 497 | "orig_id = 
id(p)\n", 498 | "p[:] = p + 1 # 使用:符号来赋值,就可以原地进行赋值,不占用新内存\n", 499 | "new_id = id(p)\n", 500 | "print(orig_id)\n", 501 | "print(new_id)" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 35, 507 | "id": "73774f53", 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "5231238656\n", 515 | "5231238656\n" 516 | ] 517 | } 518 | ], 519 | "source": [ 520 | "p = torch.arange(5)\n", 521 | "orig_id = id(p)\n", 522 | "p += 1 # 使用 += 这样的方式在运算,也可以进行原地赋值\n", 523 | "new_id = id(p)\n", 524 | "print(orig_id)\n", 525 | "print(new_id)" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 36, 531 | "id": "4a23af46", 532 | "metadata": {}, 533 | "outputs": [ 534 | { 535 | "data": { 536 | "text/plain": [ 537 | "(numpy.ndarray, torch.Tensor)" 538 | ] 539 | }, 540 | "execution_count": 36, 541 | "metadata": {}, 542 | "output_type": "execute_result" 543 | } 544 | ], 545 | "source": [ 546 | "# 转换为numpy张量,从numpy张量转化成torch的tensor\n", 547 | "A = x.numpy()\n", 548 | "B = torch.tensor(A)\n", 549 | "type(A), type(B)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 41, 555 | "id": "8afe642f", 556 | "metadata": {}, 557 | "outputs": [ 558 | { 559 | "data": { 560 | "text/plain": [ 561 | "(tensor(3.5000), , 3.5, 3)" 562 | ] 563 | }, 564 | "execution_count": 41, 565 | "metadata": {}, 566 | "output_type": "execute_result" 567 | } 568 | ], 569 | "source": [ 570 | "# 将大小为1的张量,转换为Python标量\n", 571 | "# 两种方式:.item 或者 float()/int()\n", 572 | "a = torch.tensor(3.5)\n", 573 | "a, a.item, float(a), int(a)" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "id": "d3486883", 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "# 当x是一个向量或张量时,就没法使用.item了\n", 584 | "x.item()\n", 585 | "# ValueError: only one element tensors can be converted to Python scalars" 586 | ] 587 | }, 588 | { 589 | "cell_type": "markdown", 590 | "id": "78b3c59d", 591 | "metadata": {}, 592 | "source": [ 593 | "# 数据预处理" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": 56, 599 | "id": "47bd8d73", 600 | "metadata": {}, 601 | "outputs": [ 602 | { 603 | "data": { 604 | "text/html": [ 605 | "

\n", 606 | "\n", 619 | "\n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | "
ABC
01.02.0good
14.05.0None
210.010.0None
\n", 649 | "
" 650 | ], 651 | "text/plain": [ 652 | " A B C\n", 653 | "0 1.0 2.0 good\n", 654 | "1 4.0 5.0 None\n", 655 | "2 10.0 10.0 None" 656 | ] 657 | }, 658 | "execution_count": 56, 659 | "metadata": {}, 660 | "output_type": "execute_result" 661 | } 662 | ], 663 | "source": [ 664 | "import pandas as pd\n", 665 | "columns = ['A','B','C']\n", 666 | "data = [[1.,2.,'good'],[4,5],[10,10]]\n", 667 | "df = pd.DataFrame(data, columns=columns)\n", 668 | "df" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 62, 674 | "id": "91dbcf28", 675 | "metadata": {}, 676 | "outputs": [ 677 | { 678 | "data": { 679 | "text/plain": [ 680 | "array([[1.0, 2.0, 'good'],\n", 681 | " [4.0, 5.0, None],\n", 682 | " [10.0, 10.0, None]], dtype=object)" 683 | ] 684 | }, 685 | "execution_count": 62, 686 | "metadata": {}, 687 | "output_type": "execute_result" 688 | } 689 | ], 690 | "source": [ 691 | "df.values" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": 58, 697 | "id": "42b811df", 698 | "metadata": {}, 699 | "outputs": [ 700 | { 701 | "data": { 702 | "text/html": [ 703 | "
\n", 704 | "\n", 717 | "\n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | "
ABC_goodC_nan
01.02.010
14.05.001
210.010.001
\n", 751 | "
" 752 | ], 753 | "text/plain": [ 754 | " A B C_good C_nan\n", 755 | "0 1.0 2.0 1 0\n", 756 | "1 4.0 5.0 0 1\n", 757 | "2 10.0 10.0 0 1" 758 | ] 759 | }, 760 | "execution_count": 58, 761 | "metadata": {}, 762 | "output_type": "execute_result" 763 | } 764 | ], 765 | "source": [ 766 | "inputs = pd.get_dummies(df,dummy_na=True) # 通过这种方法,可以将缺失值转化成0,1特征(一般是非数值特征这样做,数值化的特征就直接fillna即可)\n", 767 | "inputs" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": 61, 773 | "id": "10608127", 774 | "metadata": {}, 775 | "outputs": [ 776 | { 777 | "data": { 778 | "text/plain": [ 779 | "array([[ 1., 2., 1., 0.],\n", 780 | " [ 4., 5., 0., 1.],\n", 781 | " [10., 10., 0., 1.]])" 782 | ] 783 | }, 784 | "execution_count": 61, 785 | "metadata": {}, 786 | "output_type": "execute_result" 787 | } 788 | ], 789 | "source": [ 790 | "inputs.values" 791 | ] 792 | }, 793 | { 794 | "cell_type": "code", 795 | "execution_count": null, 796 | "id": "0e5a8fdb", 797 | "metadata": {}, 798 | "outputs": [], 799 | "source": [] 800 | } 801 | ], 802 | "metadata": { 803 | "kernelspec": { 804 | "display_name": "Python 3 (ipykernel)", 805 | "language": "python", 806 | "name": "python3" 807 | }, 808 | "language_info": { 809 | "codemirror_mode": { 810 | "name": "ipython", 811 | "version": 3 812 | }, 813 | "file_extension": ".py", 814 | "mimetype": "text/x-python", 815 | "name": "python", 816 | "nbconvert_exporter": "python", 817 | "pygments_lexer": "ipython3", 818 | "version": "3.9.2" 819 | } 820 | }, 821 | "nbformat": 4, 822 | "nbformat_minor": 5 823 | } 824 | -------------------------------------------------------------------------------- /李沐PyTorch/2. 自动求导.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "0c428e76-5ac2-4293-b766-8ae72395df7c", 6 | "metadata": {}, 7 | "source": [ 8 | "# 自动求导" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 17, 14 | "id": "c03e70d3-aee5-4c18-9514-d6924a3a9ec6", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "tensor([1., 2., 3.])\n" 22 | ] 23 | }, 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "tensor([1., 4., 9.])" 28 | ] 29 | }, 30 | "execution_count": 17, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "import torch\n", 37 | "x = torch.tensor([1.0,2.0,3.0])\n", 38 | "print(x)\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 25, 44 | "id": "99063507-e4c8-4677-993b-6cbe0b9ae59e", 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "tensor([[14.]])" 51 | ] 52 | }, 53 | "execution_count": 25, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "a = x.reshape(1,3)\n", 60 | "b = x.reshape(3,1)\n", 61 | "a,b\n", 62 | "\n", 63 | "torch.mm(a,b)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 8, 69 | "id": "5a52e219-6135-43fb-a3d2-3c2c8fc77a4e", 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "ename": "RuntimeError", 74 | "evalue": "only Tensors of floating point dtype can require gradients", 75 | "output_type": "error", 76 | "traceback": [ 77 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 78 | "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", 79 | 
"\u001b[0;32m/var/folders/ts/ft1kkj55399gmd5c5cr535dm0000gn/T/ipykernel_81586/1455190116.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequires_grad_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgrad\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 80 | "\u001b[0;31mRuntimeError\u001b[0m: only Tensors of floating point dtype can require gradients" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "x.requires_grad_(True)\n", 86 | "x.grad" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 3, 92 | "id": "644c3fbc-a1c9-4dd2-a3b8-8b46cb1ddc80", 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "data": { 97 | "text/plain": [ 98 | "0.0" 99 | ] 100 | }, 101 | "execution_count": 3, 102 | "metadata": {}, 103 | "output_type": "execute_result" 104 | } 105 | ], 106 | "source": [ 107 | "y = torch.dot(x,x)\n", 108 | "y.item()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "id": "702ddb8c-203c-48cb-9b89-79af11d56777", 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "tensor(0., grad_fn=)" 121 | ] 122 | }, 123 | "execution_count": 6, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | } 127 | ], 128 | "source": [ 129 | "torch.dot(x,x)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 10, 135 | "id": "0042f697-991a-4bd9-92d7-04b2dadb826a", 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/plain": [ 141 | "tensor([0., 1., 2., 3.], grad_fn=)" 142 | ] 143 | }, 144 | "execution_count": 10, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "x.T" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": "b82ea2f0-0a37-45da-9a92-5e9065c2ace3", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead." 161 | ] 162 | } 163 | ], 164 | "metadata": { 165 | "kernelspec": { 166 | "display_name": "Python 3 (ipykernel)", 167 | "language": "python", 168 | "name": "python3" 169 | }, 170 | "language_info": { 171 | "codemirror_mode": { 172 | "name": "ipython", 173 | "version": 3 174 | }, 175 | "file_extension": ".py", 176 | "mimetype": "text/x-python", 177 | "name": "python", 178 | "nbconvert_exporter": "python", 179 | "pygments_lexer": "ipython3", 180 | "version": "3.9.2" 181 | } 182 | }, 183 | "nbformat": 4, 184 | "nbformat_minor": 5 185 | } 186 | --------------------------------------------------------------------------------