├── .gitignore
├── A 60 Minute Blitz
    ├── .DS_Store
    ├── 1.tensor_tutorial.ipynb
    ├── 2.autograd_tutorial.ipynb
    ├── 3.neural_networks_tutorial.ipynb
    └── 4.cifar10_tutorial.ipynb
├── Applications
    └── language_model
    │   ├── README.md
    │   ├── data.py
    │   ├── generate.py
    │   ├── generated.txt
    │   ├── main.py
    │   ├── model.pt
    │   └── model.py
├── HuggingfaceNLP
    ├── C1. Start Playing Transformers
    │   ├── 1. 直接使用pipeline.ipynb
    │   ├── 2. Transformer家族及基本概念.ipynb
    │   ├── 3. 端到端的背后.ipynb
    │   ├── 4. Models & Tokenizers.ipynb
    │   └── 5. 处理多个序列.ipynb
    └── C2. Fine-tuning Transformers
    │   ├── 1. 数据集预处理.ipynb
    │   ├── 2. 使用Trainer API来fine-tune.ipynb
    │   ├── 3. 用纯PyTorch来fine-tune.ipynb
    │   └── runs
    │       ├── Sep26_15-25-19_PC-201911051016
    │           ├── 1632641123.3012567
    │           │   └── events.out.tfevents.1632641123.PC-201911051016.50596.1
    │           └── events.out.tfevents.1632641123.PC-201911051016.50596.0
    │       ├── Sep26_15-36-43_PC-201911051016
    │           ├── 1632641809.055524
    │           │   └── events.out.tfevents.1632641809.PC-201911051016.50596.3
    │           └── events.out.tfevents.1632641808.PC-201911051016.50596.2
    │       ├── Sep26_15-37-55_PC-201911051016
    │           ├── 1632641879.1103542
    │           │   └── events.out.tfevents.1632641879.PC-201911051016.32468.1
    │           └── events.out.tfevents.1632641879.PC-201911051016.32468.0
    │       ├── Sep26_15-44-26_PC-201911051016
    │           ├── 1632642271.2198026
    │           │   └── events.out.tfevents.1632642271.PC-201911051016.32468.3
    │           └── events.out.tfevents.1632642271.PC-201911051016.32468.2
    │       ├── Sep26_15-54-05_PC-201911051016
    │           ├── 1632642852.8538904
    │           │   └── events.out.tfevents.1632642852.PC-201911051016.3052.1
    │           └── events.out.tfevents.1632642852.PC-201911051016.3052.0
    │       ├── Sep26_15-54-51_PC-201911051016
    │           ├── 1632642898.3413022
    │           │   └── events.out.tfevents.1632642898.PC-201911051016.3052.3
    │           └── events.out.tfevents.1632642898.PC-201911051016.3052.2
    │       └── Sep26_15-55-27_PC-201911051016
    │           ├── 1632642935.0711265
    │               └── events.out.tfevents.1632642935.PC-201911051016.34932.1
    │           └── events.out.tfevents.1632642934.PC-201911051016.34932.0
├── 使用transformers库.ipynb
└── 李沐PyTorch
    ├── 1. 基础操作.ipynb
    ├── 2. 自动求导.ipynb
    └── 3. 线性预测模型.ipynb


/.gitignore:
--------------------------------------------------------------------------------
 1 | # basic ignore:
 2 | __pycache__/
 3 | .idea/
 4 | .ipynb_checkpoints/
 5 | .DS_Store
 6 | 
 7 | # model weights
 8 | weights/
 9 | 
10 | 
11 | # saved kws
12 | saved_words/
13 | 
14 | # data folder
15 | data/
16 | datasets/
17 | dataset/
18 | 
19 | # temp files
20 | temp/
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/A 60 Minute Blitz/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/A 60 Minute Blitz/.DS_Store


--------------------------------------------------------------------------------
/A 60 Minute Blitz/1.tensor_tutorial.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": false,
  8 |     "jupyter": {
  9 |      "outputs_hidden": false
 10 |     }
 11 |    },
 12 |    "outputs": [],
 13 |    "source": [
 14 |     "%matplotlib inline"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "\n",
 22 |     "Tensors\n",
 23 |     "--------------------------------------------\n",
 24 |     "\n",
 25 |     "Tensors are a specialized data structure that are very similar to arrays\n",
 26 |     "and matrices. In PyTorch, we use tensors to encode the inputs and\n",
 27 |     "outputs of a model, as well as the model’s parameters.\n",
 28 |     "\n",
 29 |     "Tensors are similar to NumPy’s ndarrays, except that tensors can run on\n",
 30 |     "GPUs or other specialized hardware to accelerate computing. If you’re familiar with ndarrays, you’ll\n",
 31 |     "be right at home with the Tensor API. If not, follow along in this quick\n",
 32 |     "API walkthrough.\n",
 33 |     "\n",
 34 |     "\n"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {
 41 |     "collapsed": false,
 42 |     "jupyter": {
 43 |      "outputs_hidden": false
 44 |     }
 45 |    },
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "import torch\n",
 49 |     "import numpy as np"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "metadata": {},
 55 |    "source": [
 56 |     "Tensor Initialization\n",
 57 |     "~~~~~~~~~~~~~~~~~~~~~\n",
 58 |     "\n",
 59 |     "Tensors can be initialized in various ways. Take a look at the following examples:\n",
 60 |     "\n",
 61 |     "**Directly from data**\n",
 62 |     "\n",
 63 |     "Tensors can be created directly from data. The data type is automatically inferred.\n",
 64 |     "\n"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": null,
 70 |    "metadata": {
 71 |     "collapsed": false,
 72 |     "jupyter": {
 73 |      "outputs_hidden": false
 74 |     }
 75 |    },
 76 |    "outputs": [],
 77 |    "source": [
 78 |     "data = [[1, 2],[3, 4]]\n",
 79 |     "x_data = torch.tensor(data)"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "markdown",
 84 |    "metadata": {},
 85 |    "source": [
 86 |     "**From a NumPy array**\n",
 87 |     "\n",
 88 |     "Tensors can be created from NumPy arrays (and vice versa - see `bridge-to-np-label`).\n",
 89 |     "\n"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": null,
 95 |    "metadata": {
 96 |     "collapsed": false,
 97 |     "jupyter": {
 98 |      "outputs_hidden": false
 99 |     }
100 |    },
101 |    "outputs": [],
102 |    "source": [
103 |     "np_array = np.array(data)\n",
104 |     "x_np = torch.from_numpy(np_array)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "metadata": {},
110 |    "source": [
111 |     "**From another tensor:**\n",
112 |     "\n",
113 |     "The new tensor retains the properties (shape, datatype) of the argument tensor, unless explicitly overridden.\n",
114 |     "\n"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": null,
120 |    "metadata": {
121 |     "collapsed": false,
122 |     "jupyter": {
123 |      "outputs_hidden": false
124 |     }
125 |    },
126 |    "outputs": [],
127 |    "source": [
128 |     "x_ones = torch.ones_like(x_data) # retains the properties of x_data\n",
129 |     "print(f\"Ones Tensor: \\n {x_ones} \\n\")\n",
130 |     "\n",
131 |     "x_rand = torch.rand_like(x_data, dtype=torch.float) # overrides the datatype of x_data\n",
132 |     "print(f\"Random Tensor: \\n {x_rand} \\n\")"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "markdown",
137 |    "metadata": {},
138 |    "source": [
139 |     "**With random or constant values:**\n",
140 |     "\n",
141 |     "``shape`` is a tuple of tensor dimensions. In the functions below, it determines the dimensionality of the output tensor.\n",
142 |     "\n"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": null,
148 |    "metadata": {
149 |     "collapsed": false,
150 |     "jupyter": {
151 |      "outputs_hidden": false
152 |     }
153 |    },
154 |    "outputs": [],
155 |    "source": [
156 |     "shape = (2,3,)\n",
157 |     "rand_tensor = torch.rand(shape)\n",
158 |     "ones_tensor = torch.ones(shape)\n",
159 |     "zeros_tensor = torch.zeros(shape)\n",
160 |     "\n",
161 |     "print(f\"Random Tensor: \\n {rand_tensor} \\n\")\n",
162 |     "print(f\"Ones Tensor: \\n {ones_tensor} \\n\")\n",
163 |     "print(f\"Zeros Tensor: \\n {zeros_tensor}\")"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "markdown",
168 |    "metadata": {},
169 |    "source": [
170 |     "--------------\n",
171 |     "\n",
172 |     "\n"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "markdown",
177 |    "metadata": {},
178 |    "source": [
179 |     "Tensor Attributes\n",
180 |     "~~~~~~~~~~~~~~~~~\n",
181 |     "\n",
182 |     "Tensor attributes describe their shape, datatype, and the device on which they are stored.\n",
183 |     "\n"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": null,
189 |    "metadata": {
190 |     "collapsed": false,
191 |     "jupyter": {
192 |      "outputs_hidden": false
193 |     }
194 |    },
195 |    "outputs": [],
196 |    "source": [
197 |     "tensor = torch.rand(3,4)\n",
198 |     "\n",
199 |     "print(f\"Shape of tensor: {tensor.shape}\")\n",
200 |     "print(f\"Datatype of tensor: {tensor.dtype}\")\n",
201 |     "print(f\"Device tensor is stored on: {tensor.device}\")"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "markdown",
206 |    "metadata": {},
207 |    "source": [
208 |     "--------------\n",
209 |     "\n",
210 |     "\n"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "markdown",
215 |    "metadata": {},
216 |    "source": [
217 |     "Tensor Operations\n",
218 |     "~~~~~~~~~~~~~~~~~\n",
219 |     "\n",
220 |     "Over 100 tensor operations, including transposing, indexing, slicing,\n",
221 |     "mathematical operations, linear algebra, random sampling, and more are\n",
222 |     "comprehensively described\n",
223 |     "`here <https://pytorch.org/docs/stable/torch.html>`__.\n",
224 |     "\n",
225 |     "Each of them can be run on the GPU (at typically higher speeds than on a\n",
226 |     "CPU). If you’re using Colab, allocate a GPU by going to Edit > Notebook\n",
227 |     "Settings.\n",
228 |     "\n",
229 |     "\n"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "code",
234 |    "execution_count": null,
235 |    "metadata": {
236 |     "collapsed": false,
237 |     "jupyter": {
238 |      "outputs_hidden": false
239 |     }
240 |    },
241 |    "outputs": [],
242 |    "source": [
243 |     "# We move our tensor to the GPU if available\n",
244 |     "if torch.cuda.is_available():\n",
245 |     "  tensor = tensor.to('cuda')"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "markdown",
250 |    "metadata": {},
251 |    "source": [
252 |     "Try out some of the operations from the list.\n",
253 |     "If you're familiar with the NumPy API, you'll find the Tensor API a breeze to use.\n",
254 |     "\n",
255 |     "\n"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "markdown",
260 |    "metadata": {},
261 |    "source": [
262 |     "**Standard numpy-like indexing and slicing:**\n",
263 |     "\n"
264 |    ]
265 |   },
266 |   {
267 |    "cell_type": "code",
268 |    "execution_count": null,
269 |    "metadata": {
270 |     "collapsed": false,
271 |     "jupyter": {
272 |      "outputs_hidden": false
273 |     }
274 |    },
275 |    "outputs": [],
276 |    "source": [
277 |     "tensor = torch.ones(4, 4)\n",
278 |     "tensor[:,1] = 0\n",
279 |     "print(tensor)"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "markdown",
284 |    "metadata": {},
285 |    "source": [
286 |     "**Joining tensors** You can use ``torch.cat`` to concatenate a sequence of tensors along a given dimension.\n",
287 |     "See also `torch.stack <https://pytorch.org/docs/stable/generated/torch.stack.html>`__,\n",
288 |     "another tensor joining op that is subtly different from ``torch.cat``.\n",
289 |     "\n"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "code",
294 |    "execution_count": null,
295 |    "metadata": {
296 |     "collapsed": false,
297 |     "jupyter": {
298 |      "outputs_hidden": false
299 |     }
300 |    },
301 |    "outputs": [],
302 |    "source": [
303 |     "t1 = torch.cat([tensor, tensor, tensor], dim=1)\n",
304 |     "print(t1)"
305 |    ]
306 |   },
307 |   {
308 |    "cell_type": "markdown",
309 |    "metadata": {},
310 |    "source": [
311 |     "**Multiplying tensors**\n",
312 |     "\n"
313 |    ]
314 |   },
315 |   {
316 |    "cell_type": "code",
317 |    "execution_count": null,
318 |    "metadata": {
319 |     "collapsed": false,
320 |     "jupyter": {
321 |      "outputs_hidden": false
322 |     }
323 |    },
324 |    "outputs": [],
325 |    "source": [
326 |     "# This computes the element-wise product\n",
327 |     "print(f\"tensor.mul(tensor) \\n {tensor.mul(tensor)} \\n\")\n",
328 |     "# Alternative syntax:\n",
329 |     "print(f\"tensor * tensor \\n {tensor * tensor}\")"
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "markdown",
334 |    "metadata": {},
335 |    "source": [
336 |     "This computes the matrix multiplication between two tensors\n",
337 |     "\n"
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "code",
342 |    "execution_count": null,
343 |    "metadata": {
344 |     "collapsed": false,
345 |     "jupyter": {
346 |      "outputs_hidden": false
347 |     }
348 |    },
349 |    "outputs": [],
350 |    "source": [
351 |     "print(f\"tensor.matmul(tensor.T) \\n {tensor.matmul(tensor.T)} \\n\")\n",
352 |     "# Alternative syntax:\n",
353 |     "print(f\"tensor @ tensor.T \\n {tensor @ tensor.T}\")"
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "markdown",
358 |    "metadata": {},
359 |    "source": [
360 |     "**In-place operations**\n",
361 |     "Operations that have a ``_`` suffix are in-place. For example: ``x.copy_(y)``, ``x.t_()``, will change ``x``.\n",
362 |     "\n"
363 |    ]
364 |   },
365 |   {
366 |    "cell_type": "code",
367 |    "execution_count": null,
368 |    "metadata": {
369 |     "collapsed": false,
370 |     "jupyter": {
371 |      "outputs_hidden": false
372 |     }
373 |    },
374 |    "outputs": [],
375 |    "source": [
376 |     "print(tensor, \"\\n\")\n",
377 |     "tensor.add_(5)\n",
378 |     "print(tensor)"
379 |    ]
380 |   },
381 |   {
382 |    "cell_type": "markdown",
383 |    "metadata": {},
384 |    "source": [
385 |     "<div class=\"alert alert-info\"><h4>Note</h4><p>In-place operations save some memory, but can be problematic when computing derivatives because of an immediate loss\n",
386 |     "     of history. Hence, their use is discouraged.</p></div>\n",
387 |     "\n"
388 |    ]
389 |   },
390 |   {
391 |    "cell_type": "markdown",
392 |    "metadata": {},
393 |    "source": [
394 |     "--------------\n",
395 |     "\n",
396 |     "\n"
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "markdown",
401 |    "metadata": {},
402 |    "source": [
403 |     "\n",
404 |     "Bridge with NumPy\n",
405 |     "~~~~~~~~~~~~~~~~~\n",
406 |     "Tensors on the CPU and NumPy arrays can share their underlying memory\n",
407 |     "locations, and changing one will change\tthe other.\n",
408 |     "\n"
409 |    ]
410 |   },
411 |   {
412 |    "cell_type": "markdown",
413 |    "metadata": {},
414 |    "source": [
415 |     "Tensor to NumPy array\n",
416 |     "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
417 |     "\n"
418 |    ]
419 |   },
420 |   {
421 |    "cell_type": "code",
422 |    "execution_count": null,
423 |    "metadata": {
424 |     "collapsed": false,
425 |     "jupyter": {
426 |      "outputs_hidden": false
427 |     }
428 |    },
429 |    "outputs": [],
430 |    "source": [
431 |     "t = torch.ones(5)\n",
432 |     "print(f\"t: {t}\")\n",
433 |     "n = t.numpy()\n",
434 |     "print(f\"n: {n}\")"
435 |    ]
436 |   },
437 |   {
438 |    "cell_type": "markdown",
439 |    "metadata": {},
440 |    "source": [
441 |     "A change in the tensor reflects in the NumPy array.\n",
442 |     "\n"
443 |    ]
444 |   },
445 |   {
446 |    "cell_type": "code",
447 |    "execution_count": null,
448 |    "metadata": {
449 |     "collapsed": false,
450 |     "jupyter": {
451 |      "outputs_hidden": false
452 |     }
453 |    },
454 |    "outputs": [],
455 |    "source": [
456 |     "t.add_(1)\n",
457 |     "print(f\"t: {t}\")\n",
458 |     "print(f\"n: {n}\")"
459 |    ]
460 |   },
461 |   {
462 |    "cell_type": "markdown",
463 |    "metadata": {},
464 |    "source": [
465 |     "NumPy array to Tensor\n",
466 |     "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
467 |     "\n"
468 |    ]
469 |   },
470 |   {
471 |    "cell_type": "code",
472 |    "execution_count": null,
473 |    "metadata": {
474 |     "collapsed": false,
475 |     "jupyter": {
476 |      "outputs_hidden": false
477 |     }
478 |    },
479 |    "outputs": [],
480 |    "source": [
481 |     "n = np.ones(5)\n",
482 |     "t = torch.from_numpy(n)"
483 |    ]
484 |   },
485 |   {
486 |    "cell_type": "markdown",
487 |    "metadata": {},
488 |    "source": [
489 |     "Changes in the NumPy array reflects in the tensor.\n",
490 |     "\n"
491 |    ]
492 |   },
493 |   {
494 |    "cell_type": "code",
495 |    "execution_count": null,
496 |    "metadata": {
497 |     "collapsed": false,
498 |     "jupyter": {
499 |      "outputs_hidden": false
500 |     }
501 |    },
502 |    "outputs": [],
503 |    "source": [
504 |     "np.add(n, 1, out=n)\n",
505 |     "print(f\"t: {t}\")\n",
506 |     "print(f\"n: {n}\")"
507 |    ]
508 |   }
509 |  ],
510 |  "metadata": {
511 |   "kernelspec": {
512 |    "display_name": "Python 3 (ipykernel)",
513 |    "language": "python",
514 |    "name": "python3"
515 |   },
516 |   "language_info": {
517 |    "codemirror_mode": {
518 |     "name": "ipython",
519 |     "version": 3
520 |    },
521 |    "file_extension": ".py",
522 |    "mimetype": "text/x-python",
523 |    "name": "python",
524 |    "nbconvert_exporter": "python",
525 |    "pygments_lexer": "ipython3",
526 |    "version": "3.9.2"
527 |   }
528 |  },
529 |  "nbformat": 4,
530 |  "nbformat_minor": 4
531 | }
532 | 


--------------------------------------------------------------------------------
/A 60 Minute Blitz/3.neural_networks_tutorial.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": false,
  8 |     "jupyter": {
  9 |      "outputs_hidden": false
 10 |     }
 11 |    },
 12 |    "outputs": [],
 13 |    "source": [
 14 |     "%matplotlib inline"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "\n",
 22 |     "Neural Networks\n",
 23 |     "===============\n",
 24 |     "\n",
 25 |     "Neural networks can be constructed using the ``torch.nn`` package.\n",
 26 |     "\n",
 27 |     "Now that you had a glimpse of ``autograd``, ``nn`` depends on\n",
 28 |     "``autograd`` to define models and differentiate them.\n",
 29 |     "An ``nn.Module`` contains layers, and a method ``forward(input)`` that\n",
 30 |     "returns the ``output``.\n",
 31 |     "\n",
 32 |     "For example, look at this network that classifies digit images:\n",
 33 |     "\n",
 34 |     ".. figure:: /_static/img/mnist.png\n",
 35 |     "   :alt: convnet\n",
 36 |     "\n",
 37 |     "   convnet\n",
 38 |     "\n",
 39 |     "It is a simple feed-forward network. It takes the input, feeds it\n",
 40 |     "through several layers one after the other, and then finally gives the\n",
 41 |     "output.\n",
 42 |     "\n",
 43 |     "A typical training procedure for a neural network is as follows:\n",
 44 |     "\n",
 45 |     "- Define the neural network that has some learnable parameters (or\n",
 46 |     "  weights)\n",
 47 |     "- Iterate over a dataset of inputs\n",
 48 |     "- Process input through the network\n",
 49 |     "- Compute the loss (how far is the output from being correct)\n",
 50 |     "- Propagate gradients back into the network’s parameters\n",
 51 |     "- Update the weights of the network, typically using a simple update rule:\n",
 52 |     "  ``weight = weight - learning_rate * gradient``\n",
 53 |     "\n",
 54 |     "Define the network\n",
 55 |     "------------------\n",
 56 |     "\n",
 57 |     "Let’s define this network:\n",
 58 |     "\n"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 51,
 64 |    "metadata": {
 65 |     "collapsed": false,
 66 |     "jupyter": {
 67 |      "outputs_hidden": false
 68 |     }
 69 |    },
 70 |    "outputs": [
 71 |     {
 72 |      "name": "stdout",
 73 |      "output_type": "stream",
 74 |      "text": [
 75 |       "Net(\n",
 76 |       "  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))\n",
 77 |       "  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))\n",
 78 |       "  (fc1): Linear(in_features=400, out_features=120, bias=True)\n",
 79 |       "  (fc2): Linear(in_features=120, out_features=84, bias=True)\n",
 80 |       "  (fc3): Linear(in_features=84, out_features=10, bias=True)\n",
 81 |       ")\n"
 82 |      ]
 83 |     }
 84 |    ],
 85 |    "source": [
 86 |     "import torch\n",
 87 |     "import torch.nn as nn\n",
 88 |     "import torch.nn.functional as F\n",
 89 |     "\n",
 90 |     "\n",
 91 |     "class Net(nn.Module):\n",
 92 |     "\n",
 93 |     "    def __init__(self):\n",
 94 |     "        super(Net, self).__init__()\n",
 95 |     "        # 1 input image channel, 6 output channels, 5x5 square convolution\n",
 96 |     "        # kernel\n",
 97 |     "        self.conv1 = nn.Conv2d(1, 6, 5)\n",
 98 |     "        self.conv2 = nn.Conv2d(6, 16, 5)\n",
 99 |     "        # an affine operation: y = Wx + b\n",
100 |     "        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension \n",
101 |     "        self.fc2 = nn.Linear(120, 84)\n",
102 |     "        self.fc3 = nn.Linear(84, 10)\n",
103 |     "\n",
104 |     "    def forward(self, x):\n",
105 |     "        # Max pooling over a (2, 2) window\n",
106 |     "        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))\n",
107 |     "        # If the size is a square, you can specify with a single number\n",
108 |     "        x = F.max_pool2d(F.relu(self.conv2(x)), 2)\n",
109 |     "        x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension\n",
110 |     "        x = F.relu(self.fc1(x))\n",
111 |     "        x = F.relu(self.fc2(x))\n",
112 |     "        x = self.fc3(x)\n",
113 |     "        return x\n",
114 |     "\n",
115 |     "\n",
116 |     "net = Net()\n",
117 |     "print(net) "
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "markdown",
122 |    "metadata": {},
123 |    "source": [
124 |     "You just have to define the ``forward`` function, and the ``backward``\n",
125 |     "function (where gradients are computed) is automatically defined for you\n",
126 |     "using ``autograd``.\n",
127 |     "You can use any of the Tensor operations in the ``forward`` function.\n",
128 |     "\n",
129 |     "The learnable parameters of a model are returned by ``net.parameters()``\n",
130 |     "\n"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": 56,
136 |    "metadata": {
137 |     "collapsed": false,
138 |     "jupyter": {
139 |      "outputs_hidden": false
140 |     }
141 |    },
142 |    "outputs": [
143 |     {
144 |      "name": "stdout",
145 |      "output_type": "stream",
146 |      "text": [
147 |       "10\n",
148 |       "torch.Size([6, 1, 5, 5])\n"
149 |      ]
150 |     }
151 |    ],
152 |    "source": [
153 |     "params = list(net.parameters())\n",
154 |     "# print(params)\n",
155 |     "print(len(params))\n",
156 |     "print(params[0].size())  # conv1's .weight"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "markdown",
161 |    "metadata": {},
162 |    "source": [
163 |     "Let's try a random 32x32 input.\n",
164 |     "Note: expected input size of this net (LeNet) is 32x32. To use this net on\n",
165 |     "the MNIST dataset, please resize the images from the dataset to 32x32.\n",
166 |     "\n"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "markdown",
171 |    "metadata": {},
172 |    "source": [
173 |     "## 呵呵了，这傻逼教程，不写清楚为啥输入一定得是32*32\n",
174 |     "\n",
175 |     "首先，进过推断，conv默认的stride即步长是1，然后pooling默认的步长是window_size。\n",
176 |     "\n",
177 |     "根据fc1的输入units个数为16 * 5 * 5，这个 5 * 5 就是输入图像经过conv1，pool1，conv2，pool2后的结果：\n",
178 |     "\n",
179 |     "(32,32) --conv1(size=5, stride=1)--> (28,28) --pool1(size=2, stride=2)--> (14,14) --conv2(size=5, stride=1)--> (10,10) --pool2(size=2, stride=2)--> (5,5)"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 57,
185 |    "metadata": {
186 |     "collapsed": false,
187 |     "jupyter": {
188 |      "outputs_hidden": false
189 |     }
190 |    },
191 |    "outputs": [
192 |     {
193 |      "name": "stdout",
194 |      "output_type": "stream",
195 |      "text": [
196 |       "tensor([[-0.0608, -0.1087, -0.0833,  0.0029, -0.0998,  0.0340,  0.0646, -0.1200,\n",
197 |       "          0.0184, -0.0866]], grad_fn=<AddmmBackward>)\n"
198 |      ]
199 |     }
200 |    ],
201 |    "source": [
202 |     "input = torch.randn(1, 1, 32, 32)  # 下面写了，torch模型只接受batch，所以即使只有一个样本，也需要设置一个batch size 1\n",
203 |     "out = net(input)\n",
204 |     "print(out)"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "markdown",
209 |    "metadata": {},
210 |    "source": [
211 |     "Zero the gradient buffers of all parameters and backprops with random\n",
212 |     "gradients:\n",
213 |     "\n"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": 7,
219 |    "metadata": {
220 |     "collapsed": false,
221 |     "jupyter": {
222 |      "outputs_hidden": false
223 |     }
224 |    },
225 |    "outputs": [],
226 |    "source": [
227 |     "net.zero_grad()\n",
228 |     "out.backward(torch.randn(1, 10))"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "markdown",
233 |    "metadata": {},
234 |    "source": [
235 |     "<div class=\"alert alert-info\"><h4>Note</h4><p>``torch.nn`` only supports mini-batches. The entire ``torch.nn``\n",
236 |     "    package only supports inputs that are a mini-batch of samples, and not\n",
237 |     "    a single sample.\n",
238 |     "\n",
239 |     "    For example, ``nn.Conv2d`` will take in a 4D Tensor of\n",
240 |     "    ``nSamples x nChannels x Height x Width``.\n",
241 |     "\n",
242 |     "    If you have a single sample, just use ``input.unsqueeze(0)`` to add\n",
243 |     "    a fake batch dimension.</p></div>\n",
244 |     "\n",
245 |     "Before proceeding further, let's recap all the classes you’ve seen so far.\n",
246 |     "\n",
247 |     "**Recap:**\n",
248 |     "  -  ``torch.Tensor`` - A *multi-dimensional array* with support for autograd\n",
249 |     "     operations like ``backward()``. Also *holds the gradient* w.r.t. the\n",
250 |     "     tensor.\n",
251 |     "  -  ``nn.Module`` - Neural network module. *Convenient way of\n",
252 |     "     encapsulating parameters*, with helpers for moving them to GPU,\n",
253 |     "     exporting, loading, etc.\n",
254 |     "  -  ``nn.Parameter`` - A kind of Tensor, that is *automatically\n",
255 |     "     registered as a parameter when assigned as an attribute to a*\n",
256 |     "     ``Module``.\n",
257 |     "  -  ``autograd.Function`` - Implements *forward and backward definitions\n",
258 |     "     of an autograd operation*. Every ``Tensor`` operation creates at\n",
259 |     "     least a single ``Function`` node that connects to functions that\n",
260 |     "     created a ``Tensor`` and *encodes its history*.\n",
261 |     "\n",
262 |     "**At this point, we covered:**\n",
263 |     "  -  Defining a neural network\n",
264 |     "  -  Processing inputs and calling backward\n",
265 |     "\n",
266 |     "**Still Left:**\n",
267 |     "  -  Computing the loss\n",
268 |     "  -  Updating the weights of the network\n",
269 |     "\n",
270 |     "Loss Function\n",
271 |     "-------------\n",
272 |     "A loss function takes the (output, target) pair of inputs, and computes a\n",
273 |     "value that estimates how far away the output is from the target.\n",
274 |     "\n",
275 |     "There are several different\n",
276 |     "`loss functions <https://pytorch.org/docs/nn.html#loss-functions>`_ under the\n",
277 |     "nn package .\n",
278 |     "A simple loss is: ``nn.MSELoss`` which computes the mean-squared error\n",
279 |     "between the input and the target.\n",
280 |     "\n",
281 |     "For example:\n",
282 |     "\n"
283 |    ]
284 |   },
285 |   {
286 |    "cell_type": "code",
287 |    "execution_count": 58,
288 |    "metadata": {
289 |     "collapsed": false,
290 |     "jupyter": {
291 |      "outputs_hidden": false
292 |     }
293 |    },
294 |    "outputs": [
295 |     {
296 |      "name": "stdout",
297 |      "output_type": "stream",
298 |      "text": [
299 |       "tensor(0.8253, grad_fn=<MseLossBackward>)\n",
300 |       "tensor(2.3440, grad_fn=<NllLossBackward>)\n"
301 |      ]
302 |     }
303 |    ],
304 |    "source": [
305 |     "output = net(input)\n",
306 |     "target = torch.randn(10)  # a dummy target, for example\n",
307 |     "target = target.view(1, -1)  # make it the same shape as output  前面那个1代表batch size\n",
308 |     "criterion = nn.MSELoss()\n",
309 |     "loss = criterion(output, target)\n",
310 |     "print(loss)\n",
311 |     "\n",
312 |     "# ==============试试 CE loss\n",
313 |     "criterion2 = nn.CrossEntropyLoss()\n",
314 |     "target2 = torch.tensor([2],dtype=torch.long)  # 这里是因为CrossEntropyLoss中的target对于C分类问题，是直接接受index作为target，不用像keras那样变成one-hot输入\n",
315 |     "loss2 = criterion2(output, target2)\n",
316 |     "print(loss2)"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "markdown",
321 |    "metadata": {},
322 |    "source": [
323 |     "Now, if you follow ``loss`` in the backward direction, using its\n",
324 |     "``.grad_fn`` attribute, you will see a graph of computations that looks\n",
325 |     "like this:\n",
326 |     "\n",
327 |     "::\n",
328 |     "\n",
329 |     "    input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d\n",
330 |     "          -> flatten -> linear -> relu -> linear -> relu -> linear\n",
331 |     "          -> MSELoss\n",
332 |     "          -> loss\n",
333 |     "\n",
334 |     "So, when we call ``loss.backward()``, the whole graph is differentiated\n",
335 |     "w.r.t. the neural net parameters, and all Tensors in the graph that have\n",
336 |     "``requires_grad=True`` will have their ``.grad`` Tensor accumulated with the\n",
337 |     "gradient.\n",
338 |     "\n",
339 |     "For illustration, let us follow a few steps backward:\n",
340 |     "\n"
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "code",
345 |    "execution_count": 59,
346 |    "metadata": {
347 |     "collapsed": false,
348 |     "jupyter": {
349 |      "outputs_hidden": false
350 |     }
351 |    },
352 |    "outputs": [
353 |     {
354 |      "name": "stdout",
355 |      "output_type": "stream",
356 |      "text": [
357 |       "<MseLossBackward object at 0x145e972b0>\n",
358 |       "<AddmmBackward object at 0x145e97790>\n",
359 |       "<AccumulateGrad object at 0x145e972b0>\n"
360 |      ]
361 |     }
362 |    ],
363 |    "source": [
364 |     "print(loss.grad_fn)  # MSELoss\n",
365 |     "print(loss.grad_fn.next_functions[0][0])  # Linear\n",
366 |     "print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # ReLU"
367 |    ]
368 |   },
369 |   {
370 |    "cell_type": "markdown",
371 |    "metadata": {},
372 |    "source": [
373 |     "Backprop\n",
374 |     "--------\n",
375 |     "To backpropagate the error all we have to do is to ``loss.backward()``.\n",
376 |     "**You need to clear the existing gradients though, else gradients will be\n",
377 |     "accumulated to existing gradients.**\n",
378 |     "\n",
379 |     "\n",
380 |     "Now we shall call ``loss.backward()``, and have a look at conv1's bias\n",
381 |     "gradients before and after the backward.\n",
382 |     "\n"
383 |    ]
384 |   },
385 |   {
386 |    "cell_type": "code",
387 |    "execution_count": 60,
388 |    "metadata": {
389 |     "collapsed": false,
390 |     "jupyter": {
391 |      "outputs_hidden": false
392 |     }
393 |    },
394 |    "outputs": [
395 |     {
396 |      "name": "stdout",
397 |      "output_type": "stream",
398 |      "text": [
399 |       "conv1.bias.grad before backward\n",
400 |       "None\n",
401 |       "conv1.bias.grad after backward\n",
402 |       "tensor([ 0.0062,  0.0048,  0.0059, -0.0036, -0.0046, -0.0134])\n"
403 |      ]
404 |     }
405 |    ],
406 |    "source": [
407 |     "net.zero_grad()     # zeroes the gradient buffers of all parameters\n",
408 |     "\n",
409 |     "print('conv1.bias.grad before backward')\n",
410 |     "print(net.conv1.bias.grad)\n",
411 |     "\n",
412 |     "loss.backward()\n",
413 |     "\n",
414 |     "print('conv1.bias.grad after backward')\n",
415 |     "print(net.conv1.bias.grad)"
416 |    ]
417 |   },
418 |   {
419 |    "cell_type": "markdown",
420 |    "metadata": {},
421 |    "source": [
422 |     "Now, we have seen how to use loss functions.\n",
423 |     "\n",
424 |     "**Read Later:**\n",
425 |     "\n",
426 |     "  The neural network package contains various modules and loss functions\n",
427 |     "  that form the building blocks of deep neural networks. A full list with\n",
428 |     "  documentation is `here <https://pytorch.org/docs/nn>`_.\n",
429 |     "\n",
430 |     "**The only thing left to learn is:**\n",
431 |     "\n",
432 |     "  - Updating the weights of the network\n",
433 |     "\n",
434 |     "Update the weights\n",
435 |     "------------------\n",
436 |     "The simplest update rule used in practice is the Stochastic Gradient\n",
437 |     "Descent (SGD):\n",
438 |     "\n",
439 |     "     ``weight = weight - learning_rate * gradient``\n",
440 |     "\n",
441 |     "We can implement this using simple Python code:\n",
442 |     "\n",
443 |     ".. code:: python\n",
444 |     "\n",
445 |     "    learning_rate = 0.01\n",
446 |     "    for f in net.parameters():\n",
447 |     "        f.data.sub_(f.grad.data * learning_rate)\n",
448 |     "\n",
449 |     "However, as you use neural networks, you want to use various different\n",
450 |     "update rules such as SGD, Nesterov-SGD, Adam, RMSProp, etc.\n",
451 |     "To enable this, we built a small package: ``torch.optim`` that\n",
452 |     "implements all these methods. Using it is very simple:\n",
453 |     "\n"
454 |    ]
455 |   },
456 |   {
457 |    "cell_type": "code",
458 |    "execution_count": 61,
459 |    "metadata": {},
460 |    "outputs": [],
461 |    "source": [
462 |     "learning_rate = 0.01\n",
463 |     "for f in net.parameters():\n",
464 |     "    f.data.sub_(f.grad.data * learning_rate)"
465 |    ]
466 |   },
467 |   {
468 |    "cell_type": "code",
469 |    "execution_count": 62,
470 |    "metadata": {
471 |     "collapsed": false,
472 |     "jupyter": {
473 |      "outputs_hidden": false
474 |     }
475 |    },
476 |    "outputs": [],
477 |    "source": [
478 |     "import torch.optim as optim\n",
479 |     "\n",
480 |     "# create your optimizer\n",
481 |     "optimizer = optim.SGD(net.parameters(), lr=0.01)\n",
482 |     "\n",
483 |     "# in your training loop:\n",
484 |     "optimizer.zero_grad()   # zero the gradient buffers\n",
485 |     "output = net(input)\n",
486 |     "loss = criterion(output, target)\n",
487 |     "loss.backward()\n",
488 |     "optimizer.step()    # Does the update"
489 |    ]
490 |   },
491 |   {
492 |    "cell_type": "markdown",
493 |    "metadata": {},
494 |    "source": [
495 |     ".. Note::\n",
496 |     "\n",
497 |     "      Observe how gradient buffers had to be manually set to zero using\n",
498 |     "      ``optimizer.zero_grad()``. This is because gradients are accumulated\n",
499 |     "      as explained in the `Backprop`_ section.\n",
500 |     "\n"
501 |    ]
502 |   },
503 |   {
504 |    "cell_type": "markdown",
505 |    "metadata": {},
506 |    "source": [
507 |     "## 这里可以看到，nn.Module 和 torch.optim 两个类都有 .zero_grad()这个功能。"
508 |    ]
509 |   },
510 |   {
511 |    "cell_type": "code",
512 |    "execution_count": null,
513 |    "metadata": {},
514 |    "outputs": [],
515 |    "source": []
516 |   }
517 |  ],
518 |  "metadata": {
519 |   "kernelspec": {
520 |    "display_name": "Python 3 (ipykernel)",
521 |    "language": "python",
522 |    "name": "python3"
523 |   },
524 |   "language_info": {
525 |    "codemirror_mode": {
526 |     "name": "ipython",
527 |     "version": 3
528 |    },
529 |    "file_extension": ".py",
530 |    "mimetype": "text/x-python",
531 |    "name": "python",
532 |    "nbconvert_exporter": "python",
533 |    "pygments_lexer": "ipython3",
534 |    "version": "3.9.2"
535 |   }
536 |  },
537 |  "nbformat": 4,
538 |  "nbformat_minor": 4
539 | }
540 | 


--------------------------------------------------------------------------------
/Applications/language_model/README.md:
--------------------------------------------------------------------------------
 1 | # 训练一个简单的语言模型
 2 | 
 3 | 直接运行main.py进行训练，然后运行generate.py即可生成文本。
 4 | 
 5 | 一个使用《三国演义》训练的语言模型，生成的文本例子如下：
 6 | 
 7 | ```python
 8 | trigger words: ['诸', '葛']
 9 | 诸葛亮。忠见一人，雪伤士飞，有何不惜！望备取
10 | 盏，举以从此疏事。”关公不听骋言，回顾琮
11 | 曰：“不然。”垕大骂一声，取上牟扑。那将
12 | 寻知中军盛旗，却并不见。袁隗遣人催取吕翔
13 | ，诏权其言曰：“此乃周瑜之计也。须用军士
14 | ：“如不用之甚，非困于里下手；今日欲使操
15 | 来，军士俱能收之。若荐蔡瑁、张辽、徐州刺
16 | 史慈，有州二雷，准备而纳，不可乘之。乃下
17 | 马至戢，设于帐前为小舟并致。
18 | ```
19 | 
20 | 
21 | 
22 | ## TODO:
23 | 1. 在generate的时候，使用beam search
24 | 2. 试试Transformer
25 | 3. 使用seq2seq来训练 
26 | 


--------------------------------------------------------------------------------
/Applications/language_model/data.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | 
 4 | class Dictionary:
 5 |     def __init__(self):
 6 |         self.word2idx = {}
 7 |         self.idx2word = []  # idx2word用一个list即可
 8 | 
 9 |     def add_word(self, word):
10 |         if word not in self.word2idx:
11 |             self.idx2word.append(word)
12 |             self.word2idx[word] = len(self.idx2word) - 1
13 |         return self.word2idx[word]
14 | 
15 |     def __len__(self):
16 |         return len(self.idx2word)
17 | 
18 | 
19 | class Corpus:
20 |     def __init__(self, path, lang='en'):
21 |         self.lang = lang
22 |         # if lang == 'zh':
23 |         #     import jieba
24 |         self.dictionary = Dictionary()
25 |         self.train = self.tokenize(os.path.join(path, 'train.txt'))
26 |         self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
27 |         self.test = self.tokenize(os.path.join(path, 'test.txt'))
28 | 
29 |     def tokenize(self, file_path):
30 |         assert os.path.exists(file_path)
31 |         # Add words to dict:
32 |         with open(file_path, 'r', encoding='utf8') as f:
33 |             for line in f:
34 |                 if self.lang == 'zh':
35 |                     words = [w for w in line] + ['<eos>']  # 先直接按照字来分
36 |                 else:  # 默认英语
37 |                     words = line.split(' ') + ['<eos>']  # 先用空格分词，然后添加 end_of_sentence 符号
38 |                 for w in words:
39 |                     self.dictionary.add_word(w)
40 | 
41 |         with open(file_path, 'r', encoding='utf8') as f:
42 |             idss = []
43 |             for line in f:
44 |                 if self.lang == 'zh':
45 |                     words = [w for w in line] + ['<eos>']  # 先直接按照字来分
46 |                 else:  # 默认英语
47 |                     words = line.split(' ') + ['<eos>']  # 先用空格分词，然后添加 end_of_sentence 符号
48 |                 ids = [self.dictionary.word2idx[w] for w in words]
49 |                 idss.append(torch.tensor(ids, dtype=torch.int64))
50 |             return torch.cat(idss)  # 最后是类似这种的东西： tensor([0, 1, 0,  ..., 1, 0, 1])
51 | 
52 | 
53 | if __name__ == '__main__':
54 |     # c = Corpus('../../data/wikitext-2')
55 |     c = Corpus('../../data/三国small', lang='zh')
56 |     print(c.train.shape)
57 |     print(c.valid.shape)
58 |     print(c.test.shape)
59 |     print(len(c.dictionary.word2idx))
60 | 


--------------------------------------------------------------------------------
/Applications/language_model/generate.py:
--------------------------------------------------------------------------------
 1 | ###############################################################################
 2 | # Language Modeling on Wikitext-2
 3 | #
 4 | # This file generates new sentences sampled from the language model
 5 | #
 6 | ###############################################################################
 7 | 
 8 | import argparse
 9 | 
10 | import torch
11 | 
12 | import data
13 | 
14 | parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 Language Model')
15 | 
16 | # Model parameters.
17 | parser.add_argument('--data', type=str, default='../../data/三国small',
18 |                     help='location of the data corpus')
19 | parser.add_argument('--lang', type=str, default='zh',
20 |                     help='language fo the corpus')
21 | parser.add_argument('--checkpoint', type=str, default='./model.pt',
22 |                     help='model checkpoint to use')
23 | parser.add_argument('--outf', type=str, default='generated.txt',
24 |                     help='output file for generated text')
25 | parser.add_argument('--words', type=int, default=1000,
26 |                     help='number of words to generate')
27 | parser.add_argument('--seed', type=int, default=1111,
28 |                     help='random seed')
29 | parser.add_argument('--cuda', action='store_true',
30 |                     help='use CUDA')
31 | parser.add_argument('--temperature', type=float, default=1.0,
32 |                     help='temperature - higher will increase diversity')
33 | parser.add_argument('--log-interval', type=int, default=100,
34 |                     help='reporting interval')
35 | args = parser.parse_args()
36 | join_token = '' if args.lang == 'zh' else ' '
37 | # torch.manual_seed(1)
38 | device = torch.device("cuda" if args.cuda else "cpu")
39 | 
40 | if args.temperature < 1e-3:
41 |     parser.error("--temperature has to be greater or equal 1e-3")
42 | 
43 | # 这里需要用到跟训练集相同的词典，用来输出真实的词
44 | corpus = data.Corpus(args.data, lang=args.lang)
45 | ntokens = len(corpus.dictionary)
46 | 
47 | # load model
48 | with open(args.checkpoint, 'rb') as f:
49 |     model = torch.load(f).to(device)
50 | model.eval()
51 | 
52 | is_transformer_model = hasattr(model, 'model_type') and model.model_type == 'Transformer'
53 | if not is_transformer_model:
54 |     hidden = model.init_hidden(1)
55 | 
56 | # RNN的输入可以是不定长的，所以理论上我用来trigger的可以是一句话
57 | # input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
58 | trigger_words = [w for w in '诸葛']
59 | print("trigger words:", trigger_words)
60 | input_idx_list = [corpus.dictionary.word2idx[w] for w in trigger_words]
61 | input = torch.tensor(input_idx_list, dtype=torch.long).view(len(input_idx_list), 1).to(device)
62 | 
63 | 
64 | with open(args.outf, 'w') as outf:
65 |     print("trigger word:", trigger_words, file=outf)
66 |     print(join_token.join(trigger_words), end=join_token)
67 |     print(join_token.join(trigger_words), end=join_token, file=outf)
68 |     with torch.no_grad():
69 |         for i in range(args.words):  # generate how many words
70 |             if is_transformer_model:
71 |                 raise NotImplementedError()
72 |             else:
73 |                 output, hidden = model(input, hidden)
74 |                 # 这里[-1]是为了取出最后一个time step的输出
75 |                 # 否则，如果input是多个词，可能生成的就会不连贯，比方输入"诸葛"，后面却生成不了"亮"。
76 |                 word_weights = output[-1].squeeze().div(args.temperature).exp().cpu()  # 这里的temperature啥作用？
77 |                 word_idx = torch.multinomial(word_weights, 1)[0]  # randomly sample
78 |                 # input.fill_(word_idx)  #  用于输入只有一个词
79 |                 # input = word_idx.view(1, 1)   #  用于输入为多个词，但是后面的迭代都是只用前一个词
80 |                 # 这里让每次都读取前面N个词用于生成下个词，是不是更好？
81 |                 input_idx_list = input_idx_list[-34:] + [(int(word_idx.item()))]  # 每次都用前35个词来预测
82 |                 input = torch.tensor(input_idx_list, dtype=torch.long).view(len(input_idx_list), 1).to(device)
83 |             word = corpus.dictionary.idx2word[word_idx]
84 |             if word == '<eos>':
85 |                 break
86 |             print(word + ('\n' if i % 20 == 19 else join_token), end=join_token)
87 |             outf.write(word + ('\n' if i % 20 == 19 else join_token))
88 | 
89 |             # if i % args.log_interval == 0:
90 |             #     print('| Generated {}/{} words'.format(i, args.words))
91 | 


--------------------------------------------------------------------------------
/Applications/language_model/generated.txt:
--------------------------------------------------------------------------------
 1 | trigger word: ['诸', '葛']
 2 | 诸葛亮。忠见一人，雪伤士飞，有何不惜！望备取
 3 | 盏，举以从此疏事。”关公不听骋言，回顾琮
 4 | 曰：“不然。”垕大骂一声，取上牟扑。那将
 5 | 寻知中军盛旗，却并不见。袁隗遣人催取吕翔
 6 | ，诏权其言曰：“此乃周瑜之计也。须用军士
 7 | ：“如不用之甚，非困于里下手；今日欲使操
 8 | 来，军士俱能收之。若荐蔡瑁、张辽、徐州刺
 9 | 史慈，有州二雷，准备而纳，不可乘之。乃下
10 | 马至戢，设于帐前为小舟并致。
11 | 


--------------------------------------------------------------------------------
/Applications/language_model/main.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import time
  3 | import math
  4 | import os
  5 | import torch
  6 | import torch.nn as nn
  7 | import torch.onnx as onnx
  8 | 
  9 | from data import Corpus
 10 | import model
 11 | 
 12 | parser = argparse.ArgumentParser()
 13 | parser.add_argument('--data', type=str, default='../../data/三国small',
 14 |                     help='location of the data corpus')
 15 | parser.add_argument('--lang', type=str, default='zh',
 16 |                     help='language fo the corpus')
 17 | parser.add_argument('--model', type=str, default='LSTM',
 18 |                     help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)')
 19 | parser.add_argument('--emsize', type=int, default=200,
 20 |                     help='size of word embeddings')
 21 | parser.add_argument('--nhid', type=int, default=200,
 22 |                     help='number of hidden units per layer')
 23 | parser.add_argument('--nlayers', type=int, default=2,
 24 |                     help='number of layers')
 25 | parser.add_argument('--lr', type=float, default=20,
 26 |                     help='initial learning rate')
 27 | parser.add_argument('--clip', type=float, default=0.25,
 28 |                     help='gradient clipping')
 29 | parser.add_argument('--epochs', type=int, default=40,
 30 |                     help='upper epoch limit')
 31 | parser.add_argument('--batch_size', type=int, default=20, metavar='N',
 32 |                     help='batch size')
 33 | parser.add_argument('--bptt', type=int, default=35,
 34 |                     help='sequence length, backprop through time(bptt)')
 35 | parser.add_argument('--dropout', type=float, default=0.2,
 36 |                     help='dropout applied to layers (0 = no dropout)')
 37 | parser.add_argument('--tied', action='store_true',
 38 |                     help='tie the word embedding and softmax weights')
 39 | parser.add_argument('--seed', type=int, default=1111,
 40 |                     help='random seed')
 41 | parser.add_argument('--cuda', action='store_true',
 42 |                     help='use CUDA')
 43 | parser.add_argument('--log-interval', type=int, default=200, metavar='N',
 44 |                     help='report interval')
 45 | parser.add_argument('--save', type=str, default='model.pt',
 46 |                     help='path to save the final model')
 47 | parser.add_argument('--onnx-export', type=str, default='',
 48 |                     help='path to export the final model in onnx format')
 49 | 
 50 | parser.add_argument('--nhead', type=int, default=2,
 51 |                     help='the number of heads in the encoder/decoder of the transformer model')
 52 | parser.add_argument('--dry-run', action='store_true',
 53 |                     help='verify the code and the model')
 54 | 
 55 | args = parser.parse_args()
 56 | 
 57 | # 设置随机种子便于复现
 58 | torch.manual_seed(1)
 59 | # 设置cuda
 60 | if torch.cuda.is_available():
 61 |     if not args.cuda:
 62 |         print("Hey, You have a CUDA device! Why not using it??")
 63 | device = torch.device("cuda" if args.cuda else "cpu")
 64 | 
 65 | ###############################################################################
 66 | # Load data
 67 | ###############################################################################
 68 | 
 69 | corpus = Corpus(args.data, lang=args.lang)
 70 | """
 71 | Starting from sequential data, batchify arranges the dataset into columns.
 72 | For instance, with the alphabet as the sequence and batch size 4, we'd get
 73 | ┌ a g m s ┐
 74 | │ b h n t │
 75 | │ c i o u │
 76 | │ d j p v │
 77 | │ e k q w │
 78 | └ f l r x ┘.
 79 | These columns are treated as independent by the model, which means that the
 80 | dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
 81 | batch processing.
 82 | 解释一下：
 83 | 上面那个矩阵为什么batch维在竖着那一维？因为torch中RNN默认的输入中，sequence_length是第一维，
 84 | 也就是行，batch在第二维。所以是这么个形状。
 85 | 然后按照batch=4，把'abcdefg.....xyz'分成4份，每一份就是一个独立的字符串了，就可以并行处理。
 86 | """
 87 | 
 88 | def batchify(data, bsz):
 89 |     """按照batch size来分割文本，所以bsz越大，用于训练的每条文本就越短"""
 90 |     nbatch = data.shape[0] // bsz
 91 |     data = data.narrow(0, 0, nbatch * bsz)  # 剪裁，(dimension, start, length)
 92 |     data = data.view(bsz, -1).t().contiguous()  # 这里的转置是为了满足RNN的输入，把seq_len放在第一维
 93 |     # 但.contiguous()啥用，还不知道
 94 |     return data.to(device)
 95 | 
 96 | 
 97 | eval_batch_size = 20
 98 | train_data = batchify(corpus.train, args.batch_size)
 99 | val_data = batchify(corpus.valid, args.batch_size)
100 | test_data = batchify(corpus.test, args.batch_size)
101 | 
102 | 
103 | ###############################################################################
104 | # Build the model
105 | ###############################################################################
106 | 
107 | ntokens = len(corpus.dictionary)
108 | model = model.RNN_Model(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout).to(device)
109 | loss_func = nn.NLLLoss()
110 | 
111 | 
112 | ###############################################################################
113 | # Training code
114 | ###############################################################################
115 | 
116 | """
117 | get_batch subdivides the source data into chunks of length args.bptt.
118 | If source is equal to the example output of the batchify function, 
119 | ┌ a g m s ┐
120 | │ b h n t │
121 | │ c i o u │
122 | │ d j p v │
123 | │ e k q w │
124 | └ f l r x ┘.
125 | with a bptt-limit of 2, we'd get the following two Variables for i = 0:
126 | ┌ a g m s ┐ ┌ b h n t ┐
127 | └ b h n t ┘ └ c i o u ┘
128 | Note that despite the name of the function, the subdivison of data is not
129 | done along the batch dimension (i.e. dimension 1), since that was handled
130 | by the batchify function. The chunks are along dimension 0, corresponding
131 | to the seq_len dimension in the LSTM.
132 | 
133 | 就是说，原本在没有seq_len的限制下，就是上面第一个矩阵，然后有了seq_len之后，应该去划分
134 | 一个个的输入呢，就是按照seq_len去纵向滑动，得到一个个chunk.
135 | """
136 | 
137 | def get_batch(source, i):
138 |     """
139 |     从source中第i位置开始取出seq_len长度的数据。
140 |     首先source data已经有了batch维，这里就是按照seq_len做一个切片；
141 |     然后target这里的都往后挪一个index，这实际上就是一个batch的所有target，
142 |     最后需要view(-1)变形成一维的，这样才能直接输入到NLLLoss损失函数中。
143 |     """
144 |     seq_len = min(args.bptt, len(source) - 1 - i)
145 |     data = source[i:i+seq_len]
146 |     target = source[i+1:i+1+seq_len].view(-1)
147 |     return data, target
148 | 
149 | def repackage_hidden(h):  # 这个玩意儿到底干嘛的？
150 |     """Wraps hidden states in new Tensors, to detach them from their history.
151 |     在网上查了查，相关的解释可以参考：
152 |     https://discuss.pytorch.org/t/solved-why-we-need-to-detach-variable-which-contains-hidden-representation/1426
153 |     """
154 |     if isinstance(h, torch.Tensor):
155 |         return h.detach()
156 |     else:
157 |         return tuple(repackage_hidden(v) for v in h)  # 还是个递归函数，更不懂了
158 | 
159 | def evaluate(data_source):
160 |     # Turn on evaluation mode which disables dropout.
161 |     # .eval()是nn.Module的函数，用户转换成evaluation模式，主要针对Dropout,BatchNorm这些组件
162 |     model.eval()
163 |     total_loss = 0.
164 |     ntokens = len(corpus.dictionary)
165 |     if args.model != 'Transformer':  # 不是Transformer，就有hidden的概念
166 |         hidden = model.init_hidden(eval_batch_size)
167 |     with torch.no_grad():
168 |         for i in range(0, data_source.size(0) - 1, args.bptt):  # 每bptt的
169 |             data, targets = get_batch(data_source, i)
170 |             if args.model == 'Transformer':
171 |                 output = model(data)
172 |                 output = output.view(-1, ntokens)
173 |             else:
174 |                 output, hidden = model(data, hidden)
175 |                 hidden = repackage_hidden(hidden)
176 |             total_loss += len(data) * loss_func(output, targets).item()
177 |     return total_loss / (len(data_source) - 1)
178 | 
179 | 
180 | def train():
181 |     # Turn on training mode which enables dropout.
182 |     model.train()
183 |     total_loss = 0.  # 记录一个epoch的loss
184 |     start_time = time.time()
185 |     ntokens = len(corpus.dictionary)
186 |     if args.model != 'Transformer':
187 |         hidden = model.init_hidden(args.batch_size)
188 |     for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
189 |         """
190 |         这里的设计也是挺"奇特的"。不管bptt多大，这里一个迭代都是batch size大小的数据；
191 |         i是一系列间隔seq_len的值，
192 |         所以bptt的作用就是告诉get_batch函数我一个batch中的文本是多长。
193 |         """
194 |         data, targets = get_batch(train_data, i)  # 以seqlen来取一个个batch
195 |         # Starting each batch, we detach the hidden state from how it was previously produced.
196 |         # If we didn't, the model would try backpropagating all the way to start of the dataset.
197 |         model.zero_grad()
198 |         if args.model == 'Transformer':
199 |             output = model(data)
200 |             output = output.view(-1, ntokens)
201 |         else:
202 |             hidden = repackage_hidden(hidden)
203 |             # 每一次新的反向传播，都得先把hidden给清理一次
204 |             output, hidden = model(data, hidden)
205 |         loss = loss_func(output, targets)
206 |         loss.backward()
207 | 
208 |         # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
209 |         torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
210 |         for p in model.parameters():  # 为啥不用optim？？
211 |             p.data.add_(p.grad, alpha=-lr)
212 | 
213 |         total_loss += loss.item()
214 | 
215 |         if batch % args.log_interval == 0 and batch > 0:
216 |             cur_loss = total_loss / args.log_interval
217 |             elapsed = time.time() - start_time
218 |             print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
219 |                     'loss {:5.2f} | ppl {:8.2f}'.format(
220 |                 epoch, batch, len(train_data) // args.bptt, lr,
221 |                 elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
222 |             total_loss = 0
223 |             start_time = time.time()
224 |         if args.dry_run:
225 |             break
226 | 
227 | 
228 | def export_onnx(path, batch_size, seq_len):
229 |     print('The model is also exported in ONNX format at {}'.
230 |           format(os.path.realpath(args.onnx_export)))
231 |     model.eval()
232 |     dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device)
233 |     hidden = model.init_hidden(batch_size)
234 |     torch.onnx.export(model, (dummy_input, hidden), path)
235 | 
236 | 
237 | # Loop over epochs.
238 | lr = args.lr
239 | best_val_loss = None
240 | 
241 | # At any point you can hit Ctrl + C to break out of training early.
242 | try:
243 |     for epoch in range(1, args.epochs+1):
244 |         epoch_start_time = time.time()
245 |         train()
246 |         val_loss = evaluate(val_data)
247 |         print('-' * 89)
248 |         print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
249 |                 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
250 |                                            val_loss, math.exp(val_loss)))
251 |         print('-' * 89)
252 |         # Save the model if the validation loss is the best we've seen so far.
253 |         if not best_val_loss or val_loss < best_val_loss:
254 |             with open(args.save, 'wb') as f:
255 |                 torch.save(model, f)
256 |             best_val_loss = val_loss
257 |         else:
258 |             # Anneal the learning rate if no improvement has been seen in the validation dataset.
259 |             lr /= 4.0
260 | except KeyboardInterrupt:
261 |     print('-' * 89)
262 |     print('Exiting from training early')
263 | 
264 | # Load the best saved model.
265 | with open(args.save, 'rb') as f:
266 |     model = torch.load(f)
267 |     # after load the rnn params are not a continuous chunk of memory
268 |     # this makes them a continuous chunk, and will speed up forward pass
269 |     # Currently, only rnn model supports flatten_parameters function.
270 |     if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']:
271 |         model.rnn.flatten_parameters()
272 | 
273 | # Run on test data.
274 | test_loss = evaluate(test_data)
275 | print('=' * 89)
276 | print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
277 |     test_loss, math.exp(test_loss)))
278 | print('=' * 89)
279 | 
280 | if len(args.onnx_export) > 0:
281 |     # Export the model in ONNX format.
282 |     export_onnx(args.onnx_export, batch_size=1, seq_len=args.bptt)
283 | 


--------------------------------------------------------------------------------
/Applications/language_model/model.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/Applications/language_model/model.pt


--------------------------------------------------------------------------------
/Applications/language_model/model.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import torch
 3 | import torch.nn as nn
 4 | import torch.nn.functional as F
 5 | 
 6 | class RNN_Model(nn.Module):
 7 |     def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
 8 |         """
 9 |         :param rnn_type:
10 |         :param ntoken: num of tokens, vocab size
11 |         :param ninp: dimension of input tokens
12 |         :param nhid: hidden size
13 |         :param nlayers: num of layers
14 |         :param dropout:
15 |         :param tie_weights:
16 |         """
17 |         super(RNN_Model, self).__init__()
18 |         self.ntoken = ntoken
19 |         self.dropout_layer = nn.Dropout(dropout)
20 |         self.encoder = nn.Embedding(num_embeddings=ntoken, embedding_dim=ninp)
21 |         # choose the type of RNN:
22 |         if rnn_type in ['GRU','LSTM']:
23 |             # Docs: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM
24 |             # `getattr` is quite useful!
25 |             self.rnn = getattr(nn, rnn_type)(input_size=ninp, hidden_size=nhid, num_layers=nlayers)
26 |         else:
27 |             try:
28 |                  self.nonlinearity = {'RNN_RELU':'relu', 'RNN_TANH':'tanh'}[rnn_type]
29 |             except KeyError:
30 |                 raise ValueError("""only support ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
31 |             self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=self.nonlinearity, dropout=dropout)
32 |         # decoder, a simple linear layer:
33 |         # ntoken就是vocab size，所以输出维度要这么定。实际上，训练LM，就是next word prediction
34 |         self.decoder = nn.Linear(in_features=nhid, out_features=ntoken)
35 | 
36 |         self.rnn_type = rnn_type
37 |         self.nhid = nhid
38 |         self.nlayers = nlayers
39 | 
40 |     def init_weights(self):
41 |         initrange = 0.1
42 |         # Q: why we should manually initialize encoder and decoder?
43 |         nn.init.uniform_(self.encoder.weight, -initrange, initrange)
44 |         nn.init.zeros_(self.decoder.weight)  # Why?
45 |         nn.init.uniform_(self.decoder.weight, -initrange, initrange)
46 | 
47 |     def forward(self, input, hidden):
48 |         """
49 |         :param input:
50 |         :param hidden: init hidden state to RNN (h_0)
51 |         :return:
52 |         """
53 |         emb = self.dropout_layer(self.encoder(input))
54 |         # output: the hiddens of n tokens; hidden: the last hidden state (h_n)
55 |         output, hidden = self.rnn(emb, hidden)
56 |         # 注意输入decoder的是n个time steps的hidden，所以整个模型的输入的seq_len是多长，输出就会有多长
57 |         # 这也是seq2seq一般的做法，每个timestep的loss加起来组成整体的loss。
58 |         # 只不过这里的decoder就是一个简单的Linear，所以长度必须跟输入保持相同。如果单独一个RNN作为decoder，就可以长度不同了。
59 |         decoded = self.decoder(output)
60 |         decoded = decoded.view(-1, self.ntoken)
61 |         return F.log_softmax(decoded, dim=1), hidden  # Why log(softmax(x))?
62 | 
63 |     def init_hidden(self, bsz):
64 |         weight = next(self.parameters())
65 |         if self.rnn_type == 'LSTM':
66 |             return (weight.new_zeros(self.nlayers, bsz, self.nhid),
67 |                     weight.new_zeros(self.nlayers, bsz, self.nhid))
68 |         else:
69 |             return weight.new_zeros(self.nlayers, bsz, self.nhid)
70 | 
71 | 
72 | 
73 | 
74 | 


--------------------------------------------------------------------------------
/HuggingfaceNLP/C1. Start Playing Transformers/1. 直接使用pipeline.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Pipeline\n",
  8 |     "Pipeline是Huggingface的一个基本工具，可以理解为一个端到端(end-to-end)的一键调用Transformer模型的工具。\n",
  9 |     "\n",
 10 |     "It connects a model with its necessary preprocessing and postprocessing steps, allowing us to directly input any text and get an intelligible answer.\n",
 11 |     "\n",
 12 |     "给定一个任务之后，pipeline会自动调用一个预训练好的模型，然后根据你给的输入执行下面三个步骤：\n",
 13 |     "1. 预处理输入文本，让它可被模型读取\n",
 14 |     "2. 模型处理\n",
 15 |     "3. 模型输出的后处理，让预测结果可读\n",
 16 |     "\n",
 17 |     "一个例子如下："
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": 1,
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "from transformers import pipeline\n",
 27 |     "\n",
 28 |     "clf = pipeline('sentiment-analysis')"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 4,
 34 |    "metadata": {},
 35 |    "outputs": [
 36 |     {
 37 |      "data": {
 38 |       "text/plain": [
 39 |        "[{'label': 'POSITIVE', 'score': 0.9998709559440613}]"
 40 |       ]
 41 |      },
 42 |      "execution_count": 4,
 43 |      "metadata": {},
 44 |      "output_type": "execute_result"
 45 |     }
 46 |    ],
 47 |    "source": [
 48 |     "clf('Haha, today is a nice day!')"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "markdown",
 53 |    "metadata": {},
 54 |    "source": [
 55 |     "还可以直接接受多个句子，一起预测："
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": 13,
 61 |    "metadata": {},
 62 |    "outputs": [
 63 |     {
 64 |      "data": {
 65 |       "text/plain": [
 66 |        "[{'label': 'POSITIVE', 'score': 0.9998160600662231},\n",
 67 |        " {'label': 'POSITIVE', 'score': 0.9998552799224854},\n",
 68 |        " {'label': 'NEGATIVE', 'score': 0.999782383441925}]"
 69 |       ]
 70 |      },
 71 |      "execution_count": 13,
 72 |      "metadata": {},
 73 |      "output_type": "execute_result"
 74 |     }
 75 |    ],
 76 |    "source": [
 77 |     "clf(['good','nice','bad'])"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "markdown",
 82 |    "metadata": {},
 83 |    "source": [
 84 |     "pipeline支持的task包括：\n",
 85 |     "\n",
 86 |     "- \"feature-extraction\": will return a FeatureExtractionPipeline.\n",
 87 |     "- \"text-classification\": will return a TextClassificationPipeline.\n",
 88 |     "- \"sentiment-analysis\": (alias of \"text-classification\") will return a TextClassificationPipeline.\n",
 89 |     "- \"token-classification\": will return a TokenClassificationPipeline.\n",
 90 |     "- \"ner\" (alias of \"token-classification\"): will return a TokenClassificationPipeline.\n",
 91 |     "- \"question-answering\": will return a QuestionAnsweringPipeline.\n",
 92 |     "- \"fill-mask\": will return a FillMaskPipeline.\n",
 93 |     "- \"summarization\": will return a SummarizationPipeline.\n",
 94 |     "- \"translation_xx_to_yy\": will return a TranslationPipeline.\n",
 95 |     "- \"text2text-generation\": will return a Text2TextGenerationPipeline.\n",
 96 |     "- \"text-generation\": will return a TextGenerationPipeline.\n",
 97 |     "- \"zero-shot-classification:: will return a ZeroShotClassificationPipeline.\n",
 98 |     "- \"conversational\": will return a ConversationalPipeline."
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "markdown",
103 |    "metadata": {},
104 |    "source": [
105 |     "## Have a try: Zero-shot-classification\n",
106 |     "零样本学习，就是训练一个可以预测任何标签的模型，这些标签可以不出现在训练集中。\n",
107 |     "\n",
108 |     "一种零样本学习的方法，就是通过NLI（文本蕴含）任务，训练一个推理模型，比如这个例子：\n",
109 |     "```python\n",
110 |     "premise = 'Who are you voting for in 2020?'\n",
111 |     "hypothesis = 'This text is about politics.'\n",
112 |     "```\n",
113 |     "上面有一个前提(premise)和一个假设(hypothesis)，NLI任务就是去预测，在这个premise下，hypothesis是否成立。\n",
114 |     "\n",
115 |     "通过这样的训练，我们可以直接把hypothesis中的politics换成其他词儿，就可以实现zero-shot-learning了。\n",
116 |     "\n",
117 |     "NLI任务的解释：it classifies if two sentences are logically linked across three labels (contradiction, neutral, entailment) — a task also called natural language inference.\n",
118 |     "\n",
119 |     "参考阅读：\n",
120 |     "- 官方 Zero-shot-classification Pipeline文档：https://huggingface.co/transformers/main_classes/pipelines.html#transformers.ZeroShotClassificationPipeline\n",
121 |     "- 零样本学习简介：https://mp.weixin.qq.com/s/6aBzR0O3pwA8-btsuDX82g"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "clf = pipeline('zero-shot-classification')"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": 21,
136 |    "metadata": {},
137 |    "outputs": [
138 |     {
139 |      "data": {
140 |       "text/plain": [
141 |        "[{'sequence': 'A helicopter is flying in the sky',\n",
142 |        "  'labels': ['machine', 'animal'],\n",
143 |        "  'scores': [0.9938627481460571, 0.006137280724942684]},\n",
144 |        " {'sequence': 'A bird is flying in the sky',\n",
145 |        "  'labels': ['animal', 'machine'],\n",
146 |        "  'scores': [0.9987970590591431, 0.0012029369827359915]}]"
147 |       ]
148 |      },
149 |      "execution_count": 21,
150 |      "metadata": {},
151 |      "output_type": "execute_result"
152 |     }
153 |    ],
154 |    "source": [
155 |     "clf(sequences=[\"A helicopter is flying in the sky\",\n",
156 |     "               \"A bird is flying in the sky\"],\n",
157 |     "    candidate_labels=['animal','machine'])  # labels可以完全自定义"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "markdown",
162 |    "metadata": {},
163 |    "source": [
164 |     "## Have a try: Text Generation"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": 27,
170 |    "metadata": {},
171 |    "outputs": [
172 |     {
173 |      "data": {
174 |       "application/vnd.jupyter.widget-view+json": {
175 |        "model_id": "0d84006ae024439fb571c12e15825b9e",
176 |        "version_major": 2,
177 |        "version_minor": 0
178 |       },
179 |       "text/plain": [
180 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=357.0, style=ProgressStyle(description_…"
181 |       ]
182 |      },
183 |      "metadata": {},
184 |      "output_type": "display_data"
185 |     },
186 |     {
187 |      "name": "stdout",
188 |      "output_type": "stream",
189 |      "text": [
190 |       "\n"
191 |      ]
192 |     },
193 |     {
194 |      "data": {
195 |       "application/vnd.jupyter.widget-view+json": {
196 |        "model_id": "b6e2a89ad3b4447582c1446c10cfd9f0",
197 |        "version_major": 2,
198 |        "version_minor": 0
199 |       },
200 |       "text/plain": [
201 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=616.0, style=ProgressStyle(description_…"
202 |       ]
203 |      },
204 |      "metadata": {},
205 |      "output_type": "display_data"
206 |     },
207 |     {
208 |      "name": "stdout",
209 |      "output_type": "stream",
210 |      "text": [
211 |       "\n"
212 |      ]
213 |     }
214 |    ],
215 |    "source": [
216 |     "generator = pipeline('text-generation', model='liam168/chat-DialoGPT-small-zh')  # 默认使用gpt2，也可以指定模型"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": 43,
222 |    "metadata": {},
223 |    "outputs": [
224 |     {
225 |      "name": "stderr",
226 |      "output_type": "stream",
227 |      "text": [
228 |       "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
229 |      ]
230 |     },
231 |     {
232 |      "data": {
233 |       "text/plain": [
234 |        "[{'generated_text': '上午上班吧'}]"
235 |       ]
236 |      },
237 |      "execution_count": 43,
238 |      "metadata": {},
239 |      "output_type": "execute_result"
240 |     }
241 |    ],
242 |    "source": [
243 |     "generator('上午')"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "markdown",
248 |    "metadata": {},
249 |    "source": [
250 |     "## Have a try: Mask Filling"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": 46,
256 |    "metadata": {},
257 |    "outputs": [
258 |     {
259 |      "data": {
260 |       "application/vnd.jupyter.widget-view+json": {
261 |        "model_id": "03b6c5c4b57c4e76917967705df678cb",
262 |        "version_major": 2,
263 |        "version_minor": 0
264 |       },
265 |       "text/plain": [
266 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…"
267 |       ]
268 |      },
269 |      "metadata": {},
270 |      "output_type": "display_data"
271 |     },
272 |     {
273 |      "name": "stdout",
274 |      "output_type": "stream",
275 |      "text": [
276 |       "\n"
277 |      ]
278 |     }
279 |    ],
280 |    "source": [
281 |     "unmasker = pipeline('fill-mask')"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "code",
286 |    "execution_count": 52,
287 |    "metadata": {},
288 |    "outputs": [
289 |     {
290 |      "data": {
291 |       "text/plain": [
292 |        "[{'sequence': 'What the heck?',\n",
293 |        "  'score': 0.3783760964870453,\n",
294 |        "  'token': 17835,\n",
295 |        "  'token_str': ' heck'},\n",
296 |        " {'sequence': 'What the hell?',\n",
297 |        "  'score': 0.32931089401245117,\n",
298 |        "  'token': 7105,\n",
299 |        "  'token_str': ' hell'},\n",
300 |        " {'sequence': 'What the fuck?',\n",
301 |        "  'score': 0.14645449817180634,\n",
302 |        "  'token': 26536,\n",
303 |        "  'token_str': ' fuck'}]"
304 |       ]
305 |      },
306 |      "execution_count": 52,
307 |      "metadata": {},
308 |      "output_type": "execute_result"
309 |     }
310 |    ],
311 |    "source": [
312 |     "unmasker('What the <mask>?', top_k=3)  # 注意不同的模型，MASK token可能不一样，不一定都是 <mask>"
313 |    ]
314 |   },
315 |   {
316 |    "cell_type": "markdown",
317 |    "metadata": {},
318 |    "source": [
319 |     "## 更多的Task，见官方教程\n",
320 |     "https://huggingface.co/course/chapter1/3?fw=pt"
321 |    ]
322 |   }
323 |  ],
324 |  "metadata": {
325 |   "kernelspec": {
326 |    "display_name": "Python 3 (ipykernel)",
327 |    "language": "python",
328 |    "name": "python3"
329 |   },
330 |   "language_info": {
331 |    "codemirror_mode": {
332 |     "name": "ipython",
333 |     "version": 3
334 |    },
335 |    "file_extension": ".py",
336 |    "mimetype": "text/x-python",
337 |    "name": "python",
338 |    "nbconvert_exporter": "python",
339 |    "pygments_lexer": "ipython3",
340 |    "version": "3.9.2"
341 |   }
342 |  },
343 |  "nbformat": 4,
344 |  "nbformat_minor": 4
345 | }
346 | 


--------------------------------------------------------------------------------
/HuggingfaceNLP/C1. Start Playing Transformers/2. Transformer家族及基本概念.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Transformer大家族\n",
  8 |     "\n",
  9 |     "## 1. Transformer结构\n",
 10 |     "\n",
 11 |     "Transformer结构最初就是在大2017年名鼎鼎的《Attention Is All You Need》论文中提出的，最开始是用于机器翻译任务。\n",
 12 |     "\n",
 13 |     "这里先简单回顾一下Transformer的基本结构：\n",
 14 |     "\n",
 15 |     "<img src='https://huggingface.co/course/static/chapter1/transformers_blocks.png' width=200 align=\"center\">\n",
 16 |     "\n",
 17 |     "- 左边是encoder，用于对输入的sequence进行表示，得到一个很好特征向量。\n",
 18 |     "- 右边是decoder，利用encoder得到的特征，以及原始的输入，进行新的sequence的生成。\n",
 19 |     "\n",
 20 |     "encoder、decoder既可以单独使用，又可以再一起使用，因此，基于Transformer的模型可以分为三大类：\n",
 21 |     "\n",
 22 |     "- Encoder-only\n",
 23 |     "- Decoder-only\n",
 24 |     "- Encoder-Decoder\n",
 25 |     "\n",
 26 |     "\n",
 27 |     "## 2. Transformer家族\n",
 28 |     "\n",
 29 |     "随后各种基于Transformer结构的模型就如雨后春笋般涌现出来，教程中有一张图展示了一些主要模型的时间轴：\n",
 30 |     "\n",
 31 |     "<img src='https://huggingface.co/course/static/chapter1/transformers_chrono.png' width=1000>\n",
 32 |     "\n",
 33 |     "虽然模型多到四只jio都数不过来，但总体上可以分为三个阵营，分别有三个组长：\n",
 34 |     "\n",
 35 |     "- 组长1：**BERT**。组员都是BERT类似的结构，是一类**自编码模型**。\n",
 36 |     "- 组长2：**GPT**。组员都是类似GPT的结构，是一类**自回归模型**。\n",
 37 |     "- 组长3：**BART/T5**。组员结构都差不多是**encoder-decoder**模型。\n",
 38 |     "\n",
 39 |     "### 不同的架构，不同的预训练方式，不同的特长\n",
 40 |     "\n",
 41 |     "对于**Encoder-only**的模型，预训练任务通常是“破坏一个句子，然后让模型去预测或填补”。例如BERT中使用的就是两个预训练任务就是**Masked language modeling**和**Next sentence prediction**。\n",
 42 |     "因此，这类模型擅长进行文本表示，适用于做**文本的分类、实体识别、关键信息抽取**等任务。\n",
 43 |     "\n",
 44 |     "对于**Decoder-only**的模型，预训练任务通常是**Next word prediction**，这种方式又被称为**Causal language modeling**。这个Causal就是“因果”的意思，对于decoder，它在训练时是无法看到全文的，只能看到前面的信息。\n",
 45 |     "因此这类模型适合做**文本生成**任务。\n",
 46 |     "\n",
 47 |     "而**Seq2seq**架构，由于包含了encoder和decoder，所以预训练的目标通常是融合了各自的目标，但通常还会设计一些更加复杂的目标，比如对于T5模型，会把一句话中一片区域的词都mask掉，然后让模型去预测。seq2seq架构的模型，就适合做**翻译、对话**等需要根据给定输入来生成输出的任务，这跟decoder-only的模型还是有很大差别的。\n",
 48 |     "\n",
 49 |     "### 总结表如下："
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "metadata": {},
 55 |    "source": [
 56 |     "|类型|架构|Transformer组件 |\tExamples |\tTasks|\n",
 57 |     "|--|---|--- |\t--- |\t---|\n",
 58 |     "|**BERT**-like | auto-encoding models|\tEncoder  |\t\tALBERT, BERT, DistilBERT, ELECTRA, RoBERTa | \tSentence classification, named entity recognition, extractive question answering|\n",
 59 |     "|**GPT**-like |  auto-regressive models |\tDecoder  |\t\tCTRL, GPT, GPT-2, Transformer XL |\t \tText generation|\n",
 60 |     "|**BART/T5**-like |  sequence-to-sequence models|\tEncoder-decoder  |\t\tBART, T5, Marian, mBART |\t \tSummarization, translation, generative question answering|\n"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "---\n",
 68 |     "\n",
 69 |     "本部分对应的官方链接：\n",
 70 |     "https://huggingface.co/course/chapter1/4?fw=pt"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |    "metadata": {},
 77 |    "outputs": [],
 78 |    "source": []
 79 |   }
 80 |  ],
 81 |  "metadata": {
 82 |   "kernelspec": {
 83 |    "display_name": "Python 3 (ipykernel)",
 84 |    "language": "python",
 85 |    "name": "python3"
 86 |   },
 87 |   "language_info": {
 88 |    "codemirror_mode": {
 89 |     "name": "ipython",
 90 |     "version": 3
 91 |    },
 92 |    "file_extension": ".py",
 93 |    "mimetype": "text/x-python",
 94 |    "name": "python",
 95 |    "nbconvert_exporter": "python",
 96 |    "pygments_lexer": "ipython3",
 97 |    "version": "3.9.2"
 98 |   }
 99 |  },
100 |  "nbformat": 4,
101 |  "nbformat_minor": 4
102 | }
103 | 


--------------------------------------------------------------------------------
/HuggingfaceNLP/C1. Start Playing Transformers/3. 端到端的背后.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# 端到端的背后到底是怎么处理的\n",
  8 |     "\n",
  9 |     "Pipeline的背后：\\\n",
 10 |     "<img src='https://huggingface.co/course/static/chapter2/full_nlp_pipeline.png' width=1000>"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "## 1. Tokenizer\n",
 18 |     "\n",
 19 |     "我们使用的tokenizer必须跟对应的模型在预训练时的tokenizer保持一致，也就是词表需要一致。\\\n",
 20 |     "Huggingface中可以直接指定模型的checkpoint的名字，然后自动下载对应的词表。\\\n",
 21 |     "具体方式是：\n",
 22 |     "- 使用`AutoTokenizer`的`from_pretrained`方法\n",
 23 |     "\n",
 24 |     "`tokenizer`这个对象可以直接接受参数并输出结果，即它是callable的。具体参数见：\\\n",
 25 |     "https://huggingface.co/transformers/master/internal/tokenization_utils.html#transformers.tokenization_utils_base.PreTrainedTokenizerBase \\\n",
 26 |     "主要参数包括：\n",
 27 |     "- text，可以是单条的string，也可以是一个string的list，还可以是list的list\n",
 28 |     "- padding，用于填白\n",
 29 |     "- truncation，用于截断\n",
 30 |     "- max_length，设置最大句长\n",
 31 |     "- return_tensors，设置返回数据类型"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 3,
 37 |    "metadata": {},
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "from transformers import AutoTokenizer\n",
 41 |     "\n",
 42 |     "checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'\n",
 43 |     "tokenizer = AutoTokenizer.from_pretrained(checkpoint)"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "markdown",
 48 |    "metadata": {},
 49 |    "source": [
 50 |     "先看看直接使用tokenizer的结果："
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 13,
 56 |    "metadata": {},
 57 |    "outputs": [
 58 |     {
 59 |      "data": {
 60 |       "text/plain": [
 61 |        "{'input_ids': [[101, 2651, 2003, 1037, 2204, 2154, 999, 15854, 1066, 1066, 1066, 102], [101, 2129, 2055, 4826, 1029, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}"
 62 |       ]
 63 |      },
 64 |      "execution_count": 13,
 65 |      "metadata": {},
 66 |      "output_type": "execute_result"
 67 |     }
 68 |    ],
 69 |    "source": [
 70 |     "raw_inputs = ['Today is a good day! Woo~~~',\n",
 71 |     "              'How about tomorrow?']\n",
 72 |     "tokenizer(raw_inputs)"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "可以加上一个 `padding=Ture` 参数，让得到的序列长度对齐："
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": 14,
 85 |    "metadata": {},
 86 |    "outputs": [
 87 |     {
 88 |      "data": {
 89 |       "text/plain": [
 90 |        "{'input_ids': [[101, 2651, 2003, 1037, 2204, 2154, 999, 15854, 1066, 1066, 1066, 102], [101, 2129, 2055, 4826, 1029, 102, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]}"
 91 |       ]
 92 |      },
 93 |      "execution_count": 14,
 94 |      "metadata": {},
 95 |      "output_type": "execute_result"
 96 |     }
 97 |    ],
 98 |    "source": [
 99 |     "tokenizer(raw_inputs, padding=True)"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "markdown",
104 |    "metadata": {},
105 |    "source": [
106 |     "tokenizer还有`truncation`和`max_length`属性，用于在max_length处截断："
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": 19,
112 |    "metadata": {},
113 |    "outputs": [
114 |     {
115 |      "data": {
116 |       "text/plain": [
117 |        "{'input_ids': [[101, 2651, 2003, 1037, 2204, 2154, 102], [101, 2129, 2055, 4826, 1029, 102, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0]]}"
118 |       ]
119 |      },
120 |      "execution_count": 19,
121 |      "metadata": {},
122 |      "output_type": "execute_result"
123 |     }
124 |    ],
125 |    "source": [
126 |     "tokenizer(raw_inputs, padding=True, truncation=True, max_length=7) "
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "markdown",
131 |    "metadata": {},
132 |    "source": [
133 |     "`return_tensors`属性也很重要，用来指定返回的是什么类型的tensors，`pt`就是pytorch，`tf`就是tensorflow："
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": 22,
139 |    "metadata": {},
140 |    "outputs": [
141 |     {
142 |      "data": {
143 |       "text/plain": [
144 |        "{'input_ids': tensor([[  101,  2651,  2003,  1037,  2204,  2154,   999, 15854,  1066,  1066,\n",
145 |        "          1066,   102],\n",
146 |        "        [  101,  2129,  2055,  4826,  1029,   102,     0,     0,     0,     0,\n",
147 |        "             0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
148 |        "        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])}"
149 |       ]
150 |      },
151 |      "execution_count": 22,
152 |      "metadata": {},
153 |      "output_type": "execute_result"
154 |     }
155 |    ],
156 |    "source": [
157 |     "tokenizer(raw_inputs, padding=True, truncation=True, return_tensors='pt')"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "markdown",
162 |    "metadata": {},
163 |    "source": [
164 |     "## 2. Model\n",
165 |     "也可以通过AutoModel来直接从checkpoint导入模型。\\\n",
166 |     "这里导入的模型，是Transformer的基础模型，接受tokenize之后的输入，输出hidden states，即文本的向量表示，是一种上下文表示。\n",
167 |     "\n",
168 |     "这个向量表示，会有三个维度：\n",
169 |     "1. batch size\n",
170 |     "2. sequence length\n",
171 |     "3. hidden size"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "code",
176 |    "execution_count": 23,
177 |    "metadata": {},
178 |    "outputs": [],
179 |    "source": [
180 |     "from transformers import AutoModel\n",
181 |     "model = AutoModel.from_pretrained(checkpoint)"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "markdown",
186 |    "metadata": {},
187 |    "source": [
188 |     "加载了模型之后，就可以把tokenizer得到的输出，直接输入到model中："
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": 40,
194 |    "metadata": {},
195 |    "outputs": [
196 |     {
197 |      "data": {
198 |       "text/plain": [
199 |        "dict_keys(['last_hidden_state', 'hidden_states', 'attentions'])"
200 |       ]
201 |      },
202 |      "execution_count": 40,
203 |      "metadata": {},
204 |      "output_type": "execute_result"
205 |     }
206 |    ],
207 |    "source": [
208 |     "inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors='pt')\n",
209 |     "outputs = model(**inputs)  # 这里变量前面的**，代表把inputs这个dictionary给分解成一个个参数单独输进去\n",
210 |     "vars(outputs).keys()  # 查看一下输出有哪些属性"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "markdown",
215 |    "metadata": {},
216 |    "source": [
217 |     ">这里顺便讲一讲这个函数中`**`的用法：\n",
218 |     "\n",
219 |     "`**`在函数中的作用就是把后面紧跟着的这个参数，从一个字典的格式，解压成一个个单独的参数。\n",
220 |     "\n",
221 |     "回顾一下上面tokenizer的输出，我们发现它是一个包含了input_ids和attention_mask两个key的**字典**，因此通过`**`的解压，相当于变成了`intput_ids=..., attention_mask=...`喂给函数。\n",
222 |     "\n",
223 |     "我们再来查看一下通过AutoModel加载的DistillBertModel模型的输入：\n",
224 |     "https://huggingface.co/transformers/master/model_doc/distilbert.html#distilbertmodel\n",
225 |     "\n",
226 |     "可以看到DistillBertModel的直接call的函数是：\n",
227 |     "\n",
228 |     "`forward(input_ids=None, attention_mask=None, ...)`\n",
229 |     "正好跟`**inputs`后的格式对应上。"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "code",
234 |    "execution_count": 31,
235 |    "metadata": {},
236 |    "outputs": [
237 |     {
238 |      "name": "stdout",
239 |      "output_type": "stream",
240 |      "text": [
241 |       "torch.Size([2, 12, 768])\n"
242 |      ]
243 |     },
244 |     {
245 |      "data": {
246 |       "text/plain": [
247 |        "tensor([[[ 0.4627,  0.3042,  0.5431,  ...,  0.3706,  1.0033, -0.6074],\n",
248 |        "         [ 0.6100,  0.3093,  0.2038,  ...,  0.3788,  0.9370, -0.6439],\n",
249 |        "         [ 0.6514,  0.3185,  0.3855,  ...,  0.4152,  1.0199, -0.4450],\n",
250 |        "         ...,\n",
251 |        "         [ 0.3674,  0.1380,  1.1619,  ...,  0.4976,  0.4758, -0.5896],\n",
252 |        "         [ 0.4182,  0.2503,  1.0898,  ...,  0.4745,  0.4042, -0.5444],\n",
253 |        "         [ 1.1614,  0.2516,  0.9561,  ...,  0.5742,  0.8437, -0.9604]],\n",
254 |        "\n",
255 |        "        [[ 0.7956, -0.2343,  0.3810,  ..., -0.1270,  0.5182, -0.1612],\n",
256 |        "         [ 0.9337,  0.2074,  0.6202,  ...,  0.1874,  0.6584, -0.1899],\n",
257 |        "         [ 0.6279, -0.3176,  0.1596,  ..., -0.2956,  0.2960, -0.1447],\n",
258 |        "         ...,\n",
259 |        "         [ 0.3050,  0.0396,  0.6345,  ...,  0.4271,  0.3367, -0.3285],\n",
260 |        "         [ 0.1773,  0.0111,  0.6275,  ...,  0.3831,  0.3543, -0.2919],\n",
261 |        "         [ 0.2756,  0.0048,  0.9281,  ...,  0.2006,  0.4375, -0.3238]]],\n",
262 |        "       grad_fn=<NativeLayerNormBackward>)"
263 |       ]
264 |      },
265 |      "execution_count": 31,
266 |      "metadata": {},
267 |      "output_type": "execute_result"
268 |     }
269 |    ],
270 |    "source": [
271 |     "print(outputs.last_hidden_state.shape)\n",
272 |     "outputs.last_hidden_state"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "markdown",
277 |    "metadata": {},
278 |    "source": [
279 |     "可以看到，输出的shape是`torch.Size([2, 12, 768])`，三个维度分别是 batch，seq_len和hidden size。"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "code",
284 |    "execution_count": 41,
285 |    "metadata": {},
286 |    "outputs": [
287 |     {
288 |      "data": {
289 |       "text/plain": [
290 |        "odict_keys(['last_hidden_state'])"
291 |       ]
292 |      },
293 |      "execution_count": 41,
294 |      "metadata": {},
295 |      "output_type": "execute_result"
296 |     }
297 |    ],
298 |    "source": [
299 |     "outputs.keys()  # 这里查看发现只有 last_hidden_state， 因为其他的值都是None"
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "markdown",
304 |    "metadata": {},
305 |    "source": [
306 |     "## 3. Model Heads\n",
307 |     "模型头，接在基础模型的后面，用于将hidden states文本表示进一步处理，用于具体的任务。\n",
308 |     "\n",
309 |     "整体框架图：\\\n",
310 |     "<img src='https://huggingface.co/course/static/chapter2/transformer_and_head.png' width=1000>\n",
311 |     "\n",
312 |     "Head一般是由若干层的线性层来构成的。\n",
313 |     "\n",
314 |     "Transformers库中的主要模型架构有：\n",
315 |     "- *Model (retrieve the hidden states)\n",
316 |     "- *ForCausalLM\n",
317 |     "- *ForMaskedLM\n",
318 |     "- *ForMultipleChoice\n",
319 |     "- *ForQuestionAnswering\n",
320 |     "- *ForSequenceClassification\n",
321 |     "- *ForTokenClassification\n",
322 |     "- ...\n",
323 |     "\n",
324 |     "单纯的`*Model`，就是不包含 Head 的模型，而有`For*`的则是包含了具体 Head 的模型。\n",
325 |     "\n",
326 |     "例如，对于前面的那个做在情感分析上pretrain的checkpoint(distilbert-base-uncased-finetuned-sst-2-english)，我们可以使用包含 SequenceClassification 的Head的模型去加载，就可以直接得到对应分类问题的logits，而不仅仅是文本向量表示。"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": 43,
332 |    "metadata": {},
333 |    "outputs": [
334 |     {
335 |      "name": "stdout",
336 |      "output_type": "stream",
337 |      "text": [
338 |       "dict_keys(['loss', 'logits', 'hidden_states', 'attentions'])\n"
339 |      ]
340 |     },
341 |     {
342 |      "data": {
343 |       "text/plain": [
344 |        "tensor([[-4.2098,  4.6444],\n",
345 |        "        [ 0.6367, -0.3753]], grad_fn=<AddmmBackward>)"
346 |       ]
347 |      },
348 |      "execution_count": 43,
349 |      "metadata": {},
350 |      "output_type": "execute_result"
351 |     }
352 |    ],
353 |    "source": [
354 |     "from transformers import AutoModelForSequenceClassification\n",
355 |     "clf = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
356 |     "inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors='pt')\n",
357 |     "outputs = clf(**inputs)\n",
358 |     "print(vars(outputs).keys())\n",
359 |     "outputs.logits"
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "markdown",
364 |    "metadata": {},
365 |    "source": [
366 |     "从outputs的属性就可以看出，带有Head的Model，跟不带Head的Model，输出的东西是不一样的。\n",
367 |     "\n",
368 |     "没有Head的Model，输出的是`'last_hidden_state', 'hidden_states', 'attentions'`这些玩意儿，因为它仅仅是一个表示模型；\n",
369 |     "\n",
370 |     "而有Head的Model，输出的是`'loss', 'logits', 'hidden_states', 'attentions'`这些玩意儿，有logits，loss这些东西，因为它是一个完整的预测模型了。\n",
371 |     "\n",
372 |     "可以顺便看看，加了这个 SequenceClassification Head的DistillBertModel的文档，看看其输入和输出：\n",
373 |     "\n",
374 |     "https://huggingface.co/transformers/master/model_doc/distilbert.html#distilbertforsequenceclassification\n",
375 |     "\n",
376 |     "可以看到，输入中，我们还可以提供`labels`，这样就可以直接计算loss了。"
377 |    ]
378 |   },
379 |   {
380 |    "cell_type": "markdown",
381 |    "metadata": {},
382 |    "source": [
383 |     "## 4. Post-Processing\n",
384 |     "后处理主要就是两步：\n",
385 |     "- 把logits转化成概率值 （用softmax）\n",
386 |     "- 把概率值跟具体的标签对应上 （使用模型的config中的id2label）"
387 |    ]
388 |   },
389 |   {
390 |    "cell_type": "code",
391 |    "execution_count": 46,
392 |    "metadata": {},
393 |    "outputs": [
394 |     {
395 |      "data": {
396 |       "text/plain": [
397 |        "tensor([[1.4276e-04, 9.9986e-01],\n",
398 |        "        [7.3341e-01, 2.6659e-01]], grad_fn=<SoftmaxBackward>)"
399 |       ]
400 |      },
401 |      "execution_count": 46,
402 |      "metadata": {},
403 |      "output_type": "execute_result"
404 |     }
405 |    ],
406 |    "source": [
407 |     "import torch\n",
408 |     "predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)  # dim=-1就是沿着最后一维进行操作\n",
409 |     "predictions"
410 |    ]
411 |   },
412 |   {
413 |    "cell_type": "markdown",
414 |    "metadata": {},
415 |    "source": [
416 |     "得到了概率分布，还得知道具体是啥标签吧。标签跟id的隐射关系，也已经被保存在每个pretrain model的config中了，\n",
417 |     "我们可以去模型的`config`属性中查看`id2label`字段："
418 |    ]
419 |   },
420 |   {
421 |    "cell_type": "code",
422 |    "execution_count": 54,
423 |    "metadata": {},
424 |    "outputs": [
425 |     {
426 |      "data": {
427 |       "text/plain": [
428 |        "{0: 'NEGATIVE', 1: 'POSITIVE'}"
429 |       ]
430 |      },
431 |      "execution_count": 54,
432 |      "metadata": {},
433 |      "output_type": "execute_result"
434 |     }
435 |    ],
436 |    "source": [
437 |     "id2label = clf.config.id2label\n",
438 |     "id2label"
439 |    ]
440 |   },
441 |   {
442 |    "cell_type": "code",
443 |    "execution_count": 58,
444 |    "metadata": {},
445 |    "outputs": [
446 |     {
447 |      "name": "stdout",
448 |      "output_type": "stream",
449 |      "text": [
450 |       "POSITIVE\n",
451 |       "NEGATIVE\n"
452 |      ]
453 |     }
454 |    ],
455 |    "source": [
456 |     "for i in torch.argmax(predictions, dim=-1):\n",
457 |     "    print(id2label[i.item()])"
458 |    ]
459 |   },
460 |   {
461 |    "cell_type": "code",
462 |    "execution_count": null,
463 |    "metadata": {},
464 |    "outputs": [],
465 |    "source": []
466 |   }
467 |  ],
468 |  "metadata": {
469 |   "kernelspec": {
470 |    "display_name": "Python 3 (ipykernel)",
471 |    "language": "python",
472 |    "name": "python3"
473 |   },
474 |   "language_info": {
475 |    "codemirror_mode": {
476 |     "name": "ipython",
477 |     "version": 3
478 |    },
479 |    "file_extension": ".py",
480 |    "mimetype": "text/x-python",
481 |    "name": "python",
482 |    "nbconvert_exporter": "python",
483 |    "pygments_lexer": "ipython3",
484 |    "version": "3.9.2"
485 |   }
486 |  },
487 |  "nbformat": 4,
488 |  "nbformat_minor": 4
489 | }
490 | 


--------------------------------------------------------------------------------
/HuggingfaceNLP/C1. Start Playing Transformers/4. Models & Tokenizers.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Models\n",
  8 |     "\n",
  9 |     "前面都是使用的`AutoModel`，这是一个智能的wrapper，可以根据你给定的checkpoint名字，自动去寻找对应的网络结构，故名Auto。\n",
 10 |     "\n",
 11 |     "如果明确知道我们需要的是什么网络架构，就可以直接使用具体的`*Model`，比如`BertModel`，就是使用Bert结构。"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "markdown",
 16 |    "metadata": {},
 17 |    "source": [
 18 |     "## 随机初始化一个Transformer模型：通过`config`来加载\n",
 19 |     "\n",
 20 |     "`*Config`这个类，用于给出某个模型的网络结构，通过config来加载模型，得到的就是一个模型的架子，没有预训练的权重。"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 1,
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "from transformers import BertModel, BertConfig\n",
 30 |     "\n",
 31 |     "config = BertConfig()\n",
 32 |     "model = BertModel(config)  # 模型是根据config来构建的，这时构建的模型是参数随机初始化的"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 3,
 38 |    "metadata": {},
 39 |    "outputs": [
 40 |     {
 41 |      "name": "stdout",
 42 |      "output_type": "stream",
 43 |      "text": [
 44 |       "BertConfig {\n",
 45 |       "  \"attention_probs_dropout_prob\": 0.1,\n",
 46 |       "  \"gradient_checkpointing\": false,\n",
 47 |       "  \"hidden_act\": \"gelu\",\n",
 48 |       "  \"hidden_dropout_prob\": 0.1,\n",
 49 |       "  \"hidden_size\": 768,\n",
 50 |       "  \"initializer_range\": 0.02,\n",
 51 |       "  \"intermediate_size\": 3072,\n",
 52 |       "  \"layer_norm_eps\": 1e-12,\n",
 53 |       "  \"max_position_embeddings\": 512,\n",
 54 |       "  \"model_type\": \"bert\",\n",
 55 |       "  \"num_attention_heads\": 12,\n",
 56 |       "  \"num_hidden_layers\": 12,\n",
 57 |       "  \"pad_token_id\": 0,\n",
 58 |       "  \"position_embedding_type\": \"absolute\",\n",
 59 |       "  \"transformers_version\": \"4.3.3\",\n",
 60 |       "  \"type_vocab_size\": 2,\n",
 61 |       "  \"use_cache\": true,\n",
 62 |       "  \"vocab_size\": 30522\n",
 63 |       "}\n",
 64 |       "\n"
 65 |      ]
 66 |     }
 67 |    ],
 68 |    "source": [
 69 |     "print(config)"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "更常用的做法则是直接加载预训练模型，然后微调。\n",
 77 |     "\n",
 78 |     "## 初始化一个预训练的Transformer模型：通过`from_pretrained`来加载"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": 4,
 84 |    "metadata": {},
 85 |    "outputs": [
 86 |     {
 87 |      "data": {
 88 |       "application/vnd.jupyter.widget-view+json": {
 89 |        "model_id": "96595079a4984858b21cff090d86dc71",
 90 |        "version_major": 2,
 91 |        "version_minor": 0
 92 |       },
 93 |       "text/plain": [
 94 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…"
 95 |       ]
 96 |      },
 97 |      "metadata": {},
 98 |      "output_type": "display_data"
 99 |     },
100 |     {
101 |      "name": "stdout",
102 |      "output_type": "stream",
103 |      "text": [
104 |       "\n"
105 |      ]
106 |     },
107 |     {
108 |      "data": {
109 |       "application/vnd.jupyter.widget-view+json": {
110 |        "model_id": "bda0b82bdbd8466693dc28840ad24a1a",
111 |        "version_major": 2,
112 |        "version_minor": 0
113 |       },
114 |       "text/plain": [
115 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…"
116 |       ]
117 |      },
118 |      "metadata": {},
119 |      "output_type": "display_data"
120 |     },
121 |     {
122 |      "name": "stdout",
123 |      "output_type": "stream",
124 |      "text": [
125 |       "\n"
126 |      ]
127 |     }
128 |    ],
129 |    "source": [
130 |     "from transformers import BertModel\n",
131 |     "\n",
132 |     "model = BertModel.from_pretrained('bert-base-cased')"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "markdown",
137 |    "metadata": {},
138 |    "source": [
139 |     "模型的保存："
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": 5,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "model.save_pretrained(\"directory_on_my_computer\")\n",
149 |     "# 会生成两个文件： config.json pytorch_model.bin"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "markdown",
154 |    "metadata": {},
155 |    "source": [
156 |     "# Tokenizer\n",
157 |     "transformer模型使用的分词方法，往往不是直接的word-level分词或者char-level分词。\n",
158 |     "\n",
159 |     "前者会让词表过大，后者则表示能力很低。\n",
160 |     "\n",
161 |     "因此主流的方式是进行 **subword-level** 的分词。例如对 \"tokenization\" 这个词，可能会被分成 \"token\" 和 \"ization\" 两部分。\n",
162 |     "\n",
163 |     "常见的subword tokenization方法有：\n",
164 |     "- BPE\n",
165 |     "- WordPiece\n",
166 |     "- Unigram\n",
167 |     "- SentencePiece\n",
168 |     "- ...\n",
169 |     "\n",
170 |     "\n",
171 |     "这里对BPE做一个简单的介绍，让我们对 sub-word tokenization 的原理有一个基本了解：\n",
172 |     "\n",
173 |     "## Subword tokenization (☆☆☆)\n",
174 |     "Subword tokenization的核心思想是：“频繁出现了词不应该被切分成更小的单位，但不常出现的词应该被切分成更小的单位”。\n",
175 |     "\n",
176 |     "比方\"annoyingly\"这种词，就不是很常见，但是\"annoying\"和\"ly\"都很常见，因此细分成这两个sub-word就更合理。中文也是类似的，比如“仓库管理系统”作为一个单位就明显在语料中不会很多，因此分成“仓库”和“管理系统”就会好很多。\n",
177 |     "\n",
178 |     "这样分词的好处在于，大大节省了词表空间，还能够解决OOV问题。因为我们很多使用的词语，都是由更简单的词语或者词缀构成的，我们不用去保存那些“小词”各种排列组合形成的千变万化的“大词”，而用较少的词汇，去覆盖各种各样的词语表示。同时，相比与直接使用最基础的“字”作为词表，sub-word的语义表示能力也更强。\n",
179 |     "\n",
180 |     "那么，用什么样的标准得到sub-word呢？一个著名的算法就是 **Byte-Pair Encoding (BPE)** ：\n",
181 |     "\n",
182 |     "（下面的内容，主要翻译自Huggingface Docs中讲解tokenizer的部分，十分推荐大家直接阅读： https://huggingface.co/transformers/master/tokenizer_summary.html ）\n",
183 |     "\n",
184 |     "### BPE————Byte-Pair Encoding：\n",
185 |     "\n",
186 |     "#### **Step1**：首先，我们需要对**语料**进行一个**预分词（pre-tokenization）**：\n",
187 |     "\n",
188 |     "比方对于英文，我可以直接简单地使用空格加一些标点符号来分词；中文可以使用jieba或者直接字来进行分词。\n",
189 |     "\n",
190 |     "分词之后，我们就得到了一个**原始词集合**，同时，还会记录每个词在训练语料中出现的**频率**。\n",
191 |     "\n",
192 |     "假设我们的词集合以及词频是：\n",
193 |     "\n",
194 |     "```python\n",
195 |     "(\"hug\", 10), (\"pug\", 5), (\"pun\", 12), (\"bun\", 4), (\"hugs\", 5)\n",
196 |     "```\n",
197 |     "\n",
198 |     "#### **Step2**：构建**基础词表（base vocab）** 并开始学习 **结合规则（merge rules）**：\n",
199 |     "\n",
200 |     "\n",
201 |     "对于英语来说，我们选择字母来构成**基础词表**：\n",
202 |     "\n",
203 |     "`[\"b\", \"g\", \"h\", \"n\", \"p\", \"s\", \"u\"]`\n",
204 |     "\n",
205 |     "注：这个基础词表，就是我们最终词表的初始状态，我们会不断构建新词，加进去，直到达到我们理想的词表规模。\n",
206 |     "\n",
207 |     "根据这个基础词表，我们可以对原始的词集合进行细粒度分词，并看到基础词的词频：\n",
208 |     "\n",
209 |     "```python\n",
210 |     "(\"h\" \"u\" \"g\", 10), (\"p\" \"u\" \"g\", 5), (\"p\" \"u\" \"n\", 12), (\"b\" \"u\" \"n\", 4), (\"h\" \"u\" \"g\" \"s\", 5)\n",
211 |     "```\n",
212 |     "\n",
213 |     "接下来就是BPE的Byte-Pair核心部分————找symbol pair（符号对）并学习结合规则，即，我们从上面这个统计结果中，找出出现次数最多的那个符号对：\n",
214 |     "\n",
215 |     "统计一下：\n",
216 |     "```python\n",
217 |     "h+u   出现了 10+5=15 次\n",
218 |     "u+g   出现了 10+5+5 = 20 次\n",
219 |     "p+u   出现了 12 次\n",
220 |     "...\n",
221 |     "```\n",
222 |     "统计完毕，我们发现`u+g`出现了最多次，因此，第一个结合规则就是：**把`u`跟`g`拼起来，得到`ug`这个新词！**\n",
223 |     "\n",
224 |     "那么，我们就把`ug`加入到我们的基础词表：\n",
225 |     "\n",
226 |     "`[\"b\", \"g\", \"h\", \"n\", \"p\", \"s\", \"u\", \"ug\"]`\n",
227 |     "\n",
228 |     "同时，词频统计表也变成了：\n",
229 |     "```\n",
230 |     "(\"h\" \"ug\", 10), (\"p\" \"ug\", 5), (\"p\" \"u\" \"n\", 12), (\"b\" \"u\" \"n\", 4), (\"h\" \"ug\" \"s\", 5)\n",
231 |     "```\n",
232 |     "\n",
233 |     "#### **Step3**：反复地执行上一步，直到达到预设的词表规模。\n",
234 |     "\n",
235 |     "我们接着统计，发现下一个频率最高的symbol pair是`u+n`，出现了12+4=16次，因此词表中增加`un`这个词；再下一个则是`h+ug`，出现了10+5=15次，因此添加`hug`这个词......\n",
236 |     "\n",
237 |     "如此进行下去，当达到了预设的`vocab_size`的数目时，就停止，咱们的词表就得到啦！\n",
238 |     "\n",
239 |     "#### **Step4**：如何分词：\n",
240 |     "\n",
241 |     "得到了最终词表，在碰到一个词汇表中没有的词的时候，比如`bug`就会把它分成`b`和`ug`。也可以理解成，我首先把`bug`分解成最基本的字母，然后根据前面的结合规律，把`u`跟`g`结合起来，而`b`单独一个。具体在分词时候是如何做的，有时间去读读源码。\n",
242 |     "\n",
243 |     "---\n",
244 |     "\n",
245 |     "除了BPE，还有一些其他的sub-word分词法，可以参考 https://huggingface.co/transformers/master/tokenizer_summary.html 。\n",
246 |     "\n",
247 |     "下面，我们就直接使用Tokenizer来进行分词："
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "code",
252 |    "execution_count": 2,
253 |    "metadata": {},
254 |    "outputs": [],
255 |    "source": [
256 |     "from transformers import BertTokenizer  # 或者 AutoTokenizer\n",
257 |     "\n",
258 |     "tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "code",
263 |    "execution_count": 9,
264 |    "metadata": {},
265 |    "outputs": [
266 |     {
267 |      "data": {
268 |       "text/plain": [
269 |        "{'input_ids': [101, 2052, 1110, 170, 1363, 1285, 1106, 3858, 11303, 1468, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
270 |       ]
271 |      },
272 |      "execution_count": 9,
273 |      "metadata": {},
274 |      "output_type": "execute_result"
275 |     }
276 |    ],
277 |    "source": [
278 |     "s = 'today is a good day to learn transformers'\n",
279 |     "tokenizer()"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "markdown",
284 |    "metadata": {},
285 |    "source": [
286 |     "## 了解一下内部的具体步骤：\n",
287 |     "\n",
288 |     "1. `tokenize()`"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": 9,
294 |    "metadata": {},
295 |    "outputs": [
296 |     {
297 |      "data": {
298 |       "text/plain": [
299 |        "['today', 'is', 'a', 'good', 'day', 'to', 'learn', 'transform', '##ers']"
300 |       ]
301 |      },
302 |      "execution_count": 9,
303 |      "metadata": {},
304 |      "output_type": "execute_result"
305 |     }
306 |    ],
307 |    "source": [
308 |     "s = 'today is a good day to learn transformers'\n",
309 |     "tokens = tokenizer.tokenize(s)\n",
310 |     "tokens"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "markdown",
315 |    "metadata": {},
316 |    "source": [
317 |     "注意这里的分词结果，`transformers`被分成了`transform`和`##ers`。这里的##代表这个词应该紧跟在前面的那个词，组成一个完整的词。\n",
318 |     "\n",
319 |     "这样设计，主要是为了方面我们在还原句子的时候，可以正确得把sub-word组成成原来的词。"
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "markdown",
324 |    "metadata": {},
325 |    "source": [
326 |     "2. `convert_tokens_to_ids()`"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": 13,
332 |    "metadata": {},
333 |    "outputs": [
334 |     {
335 |      "data": {
336 |       "text/plain": [
337 |        "[2052, 1110, 170, 1363, 1285, 1106, 3858, 11303, 1468]"
338 |       ]
339 |      },
340 |      "execution_count": 13,
341 |      "metadata": {},
342 |      "output_type": "execute_result"
343 |     }
344 |    ],
345 |    "source": [
346 |     "ids = tokenizer.convert_tokens_to_ids(tokens)\n",
347 |     "ids"
348 |    ]
349 |   },
350 |   {
351 |    "cell_type": "markdown",
352 |    "metadata": {},
353 |    "source": [
354 |     "3. `decode`"
355 |    ]
356 |   },
357 |   {
358 |    "cell_type": "code",
359 |    "execution_count": 16,
360 |    "metadata": {},
361 |    "outputs": [
362 |     {
363 |      "name": "stdout",
364 |      "output_type": "stream",
365 |      "text": [
366 |       "##ers\n",
367 |       "today is a good day to learn transformers\n"
368 |      ]
369 |     }
370 |    ],
371 |    "source": [
372 |     "print(tokenizer.decode([1468]))\n",
373 |     "print(tokenizer.decode(ids))  # 注意这里会把subword自动拼起来"
374 |    ]
375 |   },
376 |   {
377 |    "cell_type": "markdown",
378 |    "metadata": {},
379 |    "source": [
380 |     "## Special Tokens\n",
381 |     "\n",
382 |     "观察一下上面的结果，直接call tokenizer得到的ids是：\n",
383 |     "```\n",
384 |     "[101, 2052, 1110, 170, 1363, 1285, 1106, 3858, 11303, 1468, 102]\n",
385 |     "```\n",
386 |     "而通过`convert_tokens_to_ids`得到的ids是：\n",
387 |     "```\n",
388 |     "[2052, 1110, 170, 1363, 1285, 1106, 3858, 11303, 1468]\n",
389 |     "```\n",
390 |     "可以发现，前者在头和尾多了俩token，id分别是 101 和 102。\n",
391 |     "\n",
392 |     "decode出来瞅瞅："
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": 17,
398 |    "metadata": {},
399 |    "outputs": [
400 |     {
401 |      "data": {
402 |       "text/plain": [
403 |        "'[CLS] today is a good day to learn transformers [SEP]'"
404 |       ]
405 |      },
406 |      "execution_count": 17,
407 |      "metadata": {},
408 |      "output_type": "execute_result"
409 |     }
410 |    ],
411 |    "source": [
412 |     "tokenizer.decode([101, 2052, 1110, 170, 1363, 1285, 1106, 3858, 11303, 1468, 102])"
413 |    ]
414 |   },
415 |   {
416 |    "cell_type": "markdown",
417 |    "metadata": {},
418 |    "source": [
419 |     "它们分别是 `[CLS]` 和 `[SEP]`。这两个token的出现，是因为我们调用的模型，在pre-train阶段使用了它们，所以tokenizer也会使用。\n",
420 |     "\n",
421 |     "不同的模型使用的special tokens不一定相同，所以一定要让tokenizer跟model保持一致！"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "code",
426 |    "execution_count": null,
427 |    "metadata": {},
428 |    "outputs": [],
429 |    "source": []
430 |   }
431 |  ],
432 |  "metadata": {
433 |   "kernelspec": {
434 |    "display_name": "Python 3",
435 |    "language": "python",
436 |    "name": "python3"
437 |   },
438 |   "language_info": {
439 |    "codemirror_mode": {
440 |     "name": "ipython",
441 |     "version": 3
442 |    },
443 |    "file_extension": ".py",
444 |    "mimetype": "text/x-python",
445 |    "name": "python",
446 |    "nbconvert_exporter": "python",
447 |    "pygments_lexer": "ipython3",
448 |    "version": "3.7.6"
449 |   }
450 |  },
451 |  "nbformat": 4,
452 |  "nbformat_minor": 4
453 | }
454 | 


--------------------------------------------------------------------------------
/HuggingfaceNLP/C1. Start Playing Transformers/5. 处理多个序列.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# `attention_mask`在处理多个序列时的作用\n",
  8 |     "\n",
  9 |     "现在我们训练和预测基本都是批量化处理的，而前面展示的例子很多都是单条数据。单条数据跟多条数据有一些需要注意的地方。\n",
 10 |     "\n",
 11 |     "## 处理单个序列\n",
 12 |     "\n",
 13 |     "我们首先加载一个在情感分类上微调过的模型，来进行我们的实验（注意，这里我们就不能能使用`AutoModel`，而应该使用`AutoModelFor*`这种带Head的model）。"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": 1,
 19 |    "metadata": {},
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "from pprint import pprint as print  # 这个pprint能让打印的格式更好看一点\n",
 23 |     "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
 24 |     "checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'\n",
 25 |     "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
 26 |     "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "markdown",
 31 |    "metadata": {},
 32 |    "source": [
 33 |     "对一个句子，使用tokenizer进行处理："
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 2,
 39 |    "metadata": {},
 40 |    "outputs": [
 41 |     {
 42 |      "name": "stdout",
 43 |      "output_type": "stream",
 44 |      "text": [
 45 |       "{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]]),\n",
 46 |       " 'input_ids': tensor([[ 101, 2651, 2003, 1037, 3835, 2154,  999,  102]])}\n"
 47 |      ]
 48 |     }
 49 |    ],
 50 |    "source": [
 51 |     "s = 'Today is a nice day!'\n",
 52 |     "inputs = tokenizer(s, return_tensors='pt')\n",
 53 |     "print(inputs)"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "markdown",
 58 |    "metadata": {},
 59 |    "source": [
 60 |     "可以看到，这里的inputs包含了两个部分：`input_ids`和`attention_mask`.\n",
 61 |     "\n",
 62 |     "模型可以直接接受`input_ids`并输出："
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": 3,
 68 |    "metadata": {},
 69 |    "outputs": [
 70 |     {
 71 |      "data": {
 72 |       "text/plain": [
 73 |        "tensor([[-4.3232,  4.6906]], grad_fn=<AddmmBackward>)"
 74 |       ]
 75 |      },
 76 |      "execution_count": 3,
 77 |      "metadata": {},
 78 |      "output_type": "execute_result"
 79 |     }
 80 |    ],
 81 |    "source": [
 82 |     "model(inputs.input_ids).logits"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "metadata": {},
 88 |    "source": [
 89 |     "也可以通过`**inputs`同时接受`inputs`所有的属性："
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": 4,
 95 |    "metadata": {},
 96 |    "outputs": [
 97 |     {
 98 |      "data": {
 99 |       "text/plain": [
100 |        "tensor([[-4.3232,  4.6906]], grad_fn=<AddmmBackward>)"
101 |       ]
102 |      },
103 |      "execution_count": 4,
104 |      "metadata": {},
105 |      "output_type": "execute_result"
106 |     }
107 |    ],
108 |    "source": [
109 |     "model(**inputs).logits"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "markdown",
114 |    "metadata": {},
115 |    "source": [
116 |     "上面两种方式的**结果是一样的**。\n",
117 |     "\n",
118 |     "## 但是当我们需要同时处理**多个序列**时，情况就有变了！"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": 5,
124 |    "metadata": {},
125 |    "outputs": [
126 |     {
127 |      "name": "stdout",
128 |      "output_type": "stream",
129 |      "text": [
130 |       "{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],\n",
131 |       "        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),\n",
132 |       " 'input_ids': tensor([[  101,  2651,  2003,  1037,  3835,  2154,   999,   102,     0,     0,\n",
133 |       "             0],\n",
134 |       "        [  101,  2021,  2054,  2055,  4826,  1029, 10047,  2025,  2469,  1012,\n",
135 |       "           102]])}\n"
136 |      ]
137 |     }
138 |    ],
139 |    "source": [
140 |     "ss = ['Today is a nice day!',\n",
141 |     "      'But what about tomorrow? Im not sure.']\n",
142 |     "inputs = tokenizer(ss, padding=True, return_tensors='pt')\n",
143 |     "print(inputs)"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {},
149 |    "source": [
150 |     "然后，我们试着直接把这里的`input_ids`喂给模型"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": 6,
156 |    "metadata": {},
157 |    "outputs": [
158 |     {
159 |      "data": {
160 |       "text/plain": [
161 |        "tensor([[-4.1957,  4.5675],\n",
162 |        "        [ 3.9803, -3.2120]], grad_fn=<AddmmBackward>)"
163 |       ]
164 |      },
165 |      "execution_count": 6,
166 |      "metadata": {},
167 |      "output_type": "execute_result"
168 |     }
169 |    ],
170 |    "source": [
171 |     "model(inputs.input_ids).logits  # 第一个句子原本的logits是 [-4.3232,  4.6906]"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "markdown",
176 |    "metadata": {},
177 |    "source": [
178 |     "发现，第一个句子的`logits`变了！\n",
179 |     "\n",
180 |     "这是**因为在padding之后，第一个句子的encoding变了，多了很多0， 而self-attention会attend到所有的index的值，因此结果就变了**。\n",
181 |     "\n",
182 |     "这时，就需要我们不仅仅是传入`input_ids`，还需要给出`attention_mask`，这样模型就会在attention的时候，不去attend被mask掉的部分。\n",
183 |     "\n",
184 |     "\n",
185 |     "因此，在处理多个序列的时候，正确的做法是直接把tokenizer处理好的结果，整个输入到模型中，即直接`**inputs`。\n",
186 |     "通过`**inputs`，我们实际上就把`attention_mask`也传进去了:"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": 7,
192 |    "metadata": {},
193 |    "outputs": [
194 |     {
195 |      "data": {
196 |       "text/plain": [
197 |        "tensor([[-4.3232,  4.6906],\n",
198 |        "        [ 3.9803, -3.2120]], grad_fn=<AddmmBackward>)"
199 |       ]
200 |      },
201 |      "execution_count": 7,
202 |      "metadata": {},
203 |      "output_type": "execute_result"
204 |     }
205 |    ],
206 |    "source": [
207 |     "model(**inputs).logits"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "markdown",
212 |    "metadata": {},
213 |    "source": [
214 |     "现在第一个句子的结果，就跟前面单条处理时的一样了。"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": []
223 |   }
224 |  ],
225 |  "metadata": {
226 |   "kernelspec": {
227 |    "display_name": "Python 3",
228 |    "language": "python",
229 |    "name": "python3"
230 |   },
231 |   "language_info": {
232 |    "codemirror_mode": {
233 |     "name": "ipython",
234 |     "version": 3
235 |    },
236 |    "file_extension": ".py",
237 |    "mimetype": "text/x-python",
238 |    "name": "python",
239 |    "nbconvert_exporter": "python",
240 |    "pygments_lexer": "ipython3",
241 |    "version": "3.7.6"
242 |   }
243 |  },
244 |  "nbformat": 4,
245 |  "nbformat_minor": 4
246 | }
247 | 


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/1. 数据集预处理.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# 数据集的预处理，使用dynamic padding构造batch"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "## 试着训练一两条样本"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 1,
 20 |    "metadata": {},
 21 |    "outputs": [
 22 |     {
 23 |      "data": {
 24 |       "text/plain": [
 25 |        "True"
 26 |       ]
 27 |      },
 28 |      "execution_count": 1,
 29 |      "metadata": {},
 30 |      "output_type": "execute_result"
 31 |     }
 32 |    ],
 33 |    "source": [
 34 |     "import torch\n",
 35 |     "torch.cuda.is_available()"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 2,
 41 |    "metadata": {},
 42 |    "outputs": [
 43 |     {
 44 |      "name": "stderr",
 45 |      "output_type": "stream",
 46 |      "text": [
 47 |       "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
 48 |       "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
 49 |       "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
 50 |       "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
 51 |       "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
 52 |      ]
 53 |     }
 54 |    ],
 55 |    "source": [
 56 |     "from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification\n",
 57 |     "\n",
 58 |     "# Same as before\n",
 59 |     "checkpoint = \"bert-base-uncased\"\n",
 60 |     "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
 61 |     "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
 62 |     "sequences = [\n",
 63 |     "    \"I've been waiting for a HuggingFace course my whole life.\",\n",
 64 |     "    \"This course is amazing!\",\n",
 65 |     "]\n",
 66 |     "batch = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"pt\")"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": 3,
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "batch['labels'] = torch.tensor([1, 1])  # tokenizer出来的结果是一个dictionary，所以可以直接加入新的 key-value\n",
 76 |     "\n",
 77 |     "optimizer = AdamW(model.parameters())\n",
 78 |     "loss = model(**batch).loss  #这里的 loss 是直接根据 batch 中提供的 labels 来计算的，回忆：前面章节查看 model 的输出的时候，有loss这一项\n",
 79 |     "loss.backward()\n",
 80 |     "optimizer.step()"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "markdown",
 85 |    "metadata": {},
 86 |    "source": [
 87 |     "## 从Huggingface Hub中加载数据集\n",
 88 |     "\n",
 89 |     "MRPC (Microsoft Research Paraphrase Corpus) dataset consists of 5,801 pairs of sentences, with a label indicating if they are paraphrases or not (i.e., if both sentences mean the same thing)."
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": 4,
 95 |    "metadata": {},
 96 |    "outputs": [
 97 |     {
 98 |      "name": "stderr",
 99 |      "output_type": "stream",
100 |      "text": [
101 |       "Reusing dataset glue (C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n"
102 |      ]
103 |     },
104 |     {
105 |      "data": {
106 |       "application/vnd.jupyter.widget-view+json": {
107 |        "model_id": "14286509d57343f3bc94a8e2f7bb3c64",
108 |        "version_major": 2,
109 |        "version_minor": 0
110 |       },
111 |       "text/plain": [
112 |        "  0%|          | 0/3 [00:00<?, ?it/s]"
113 |       ]
114 |      },
115 |      "metadata": {},
116 |      "output_type": "display_data"
117 |     },
118 |     {
119 |      "data": {
120 |       "text/plain": [
121 |        "DatasetDict({\n",
122 |        "    train: Dataset({\n",
123 |        "        features: ['sentence1', 'sentence2', 'label', 'idx'],\n",
124 |        "        num_rows: 3668\n",
125 |        "    })\n",
126 |        "    validation: Dataset({\n",
127 |        "        features: ['sentence1', 'sentence2', 'label', 'idx'],\n",
128 |        "        num_rows: 408\n",
129 |        "    })\n",
130 |        "    test: Dataset({\n",
131 |        "        features: ['sentence1', 'sentence2', 'label', 'idx'],\n",
132 |        "        num_rows: 1725\n",
133 |        "    })\n",
134 |        "})"
135 |       ]
136 |      },
137 |      "execution_count": 4,
138 |      "metadata": {},
139 |      "output_type": "execute_result"
140 |     }
141 |    ],
142 |    "source": [
143 |     "from datasets import load_dataset\n",
144 |     "\n",
145 |     "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
146 |     "raw_datasets"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "markdown",
151 |    "metadata": {},
152 |    "source": [
153 |     "load_dataset出来的是一个DatasetDict对象，它包含了train，validation，test三个属性。可以通过key来直接查询，得到对应的数据集。\n",
154 |     "\n",
155 |     "这里的train，valid，test都是Dataset类型，有 features和num_rows两个属性。还可以直接通过下标来查询对应的样本。"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 8,
161 |    "metadata": {},
162 |    "outputs": [
163 |     {
164 |      "data": {
165 |       "text/plain": [
166 |        "{'sentence1': 'Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .',\n",
167 |        " 'sentence2': 'Referring to him as only \" the witness \" , Amrozi accused his brother of deliberately distorting his evidence .',\n",
168 |        " 'label': 1,\n",
169 |        " 'idx': 0}"
170 |       ]
171 |      },
172 |      "execution_count": 8,
173 |      "metadata": {},
174 |      "output_type": "execute_result"
175 |     }
176 |    ],
177 |    "source": [
178 |     "raw_train_dataset = raw_datasets['train']\n",
179 |     "raw_train_dataset[0]"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "markdown",
184 |    "metadata": {},
185 |    "source": [
186 |     "Dataset的features可以理解为一张表的columns，Dataset甚至可以看做一个pandas的dataframe，二者的使用很类似。\n",
187 |     "\n",
188 |     "我们可以直接像操作dataframe一样，取出某一列："
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": 9,
194 |    "metadata": {},
195 |    "outputs": [
196 |     {
197 |      "data": {
198 |       "text/plain": [
199 |        "list"
200 |       ]
201 |      },
202 |      "execution_count": 9,
203 |      "metadata": {},
204 |      "output_type": "execute_result"
205 |     }
206 |    ],
207 |    "source": [
208 |     "type(raw_train_dataset['sentence1'])  # 直接取出所有的sentence1，形成一个list"
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "markdown",
213 |    "metadata": {},
214 |    "source": [
215 |     "通过Dataset的features属性，可以详细查看数据集特征，包括labels具体都是啥："
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": 10,
221 |    "metadata": {},
222 |    "outputs": [
223 |     {
224 |      "data": {
225 |       "text/plain": [
226 |        "{'sentence1': Value(dtype='string', id=None),\n",
227 |        " 'sentence2': Value(dtype='string', id=None),\n",
228 |        " 'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None),\n",
229 |        " 'idx': Value(dtype='int32', id=None)}"
230 |       ]
231 |      },
232 |      "execution_count": 10,
233 |      "metadata": {},
234 |      "output_type": "execute_result"
235 |     }
236 |    ],
237 |    "source": [
238 |     "raw_train_dataset.features"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "markdown",
243 |    "metadata": {},
244 |    "source": [
245 |     "## 数据集的预处理"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": 2,
251 |    "metadata": {},
252 |    "outputs": [],
253 |    "source": [
254 |     "from transformers import AutoTokenizer\n",
255 |     "tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "markdown",
260 |    "metadata": {},
261 |    "source": [
262 |     "我们可以直接下面这样处理：\n",
263 |     "```python\n",
264 |     "tokenized_sentences_1 = tokenizer(raw_train_dataset['sentence1'])\n",
265 |     "tokenized_sentences_2 = tokenizer(raw_train_dataset['sentence2'])\n",
266 |     "```\n",
267 |     "但对于MRPC任务，我们不能把两个句子分开输入到模型中，二者应该组成一个pair输进去。\n",
268 |     "\n",
269 |     "tokenizer也可以直接处理sequence pair："
270 |    ]
271 |   },
272 |   {
273 |    "cell_type": "code",
274 |    "execution_count": 20,
275 |    "metadata": {},
276 |    "outputs": [
277 |     {
278 |      "name": "stdout",
279 |      "output_type": "stream",
280 |      "text": [
281 |       "{'attention_mask': [1, 1, 1, 1, 1, 1, 1],\n",
282 |       " 'input_ids': [101, 2034, 6251, 102, 2117, 2028, 102],\n",
283 |       " 'token_type_ids': [0, 0, 0, 0, 1, 1, 1]}\n"
284 |      ]
285 |     }
286 |    ],
287 |    "source": [
288 |     "from pprint import pprint as print\n",
289 |     "inputs = tokenizer(\"first sentence\", \"second one\")\n",
290 |     "print(inputs)"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "code",
295 |    "execution_count": 21,
296 |    "metadata": {},
297 |    "outputs": [
298 |     {
299 |      "data": {
300 |       "text/plain": [
301 |        "'[CLS] first sentence [SEP] second one [SEP]'"
302 |       ]
303 |      },
304 |      "execution_count": 21,
305 |      "metadata": {},
306 |      "output_type": "execute_result"
307 |     }
308 |    ],
309 |    "source": [
310 |     "tokenizer.decode(inputs.input_ids)"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "markdown",
315 |    "metadata": {},
316 |    "source": [
317 |     "可以看到这里inputs里，还有一个`token_type_ids`属性，它在这里的作用就很明显了，指示哪些词是属于第一个句子，哪些词是属于第二个句子\n",
318 |     "\n",
319 |     "这种神奇的做法，其实也是源于bert-base预训练的任务，即**next sentence prediction**。换成其他模型，比如DistilBert，它在预训练的时候没有这个任务，那它的tokenizer的结果就不会有这个`token_type_ids`属性了。"
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "markdown",
324 |    "metadata": {},
325 |    "source": [
326 |     "既然这里的tokenizer可以直接处理pair，我们就可以这么去分词："
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": 32,
332 |    "metadata": {},
333 |    "outputs": [],
334 |    "source": [
335 |     "tokenized_dataset = tokenizer(\n",
336 |     "    raw_datasets[\"train\"][\"sentence1\"],\n",
337 |     "    raw_datasets[\"train\"][\"sentence2\"],\n",
338 |     "    padding=True,\n",
339 |     "    truncation=True,\n",
340 |     ")"
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "markdown",
345 |    "metadata": {},
346 |    "source": [
347 |     "但是这样不一定好，因为先是直接把要处理的整个数据集都读进了内存，又返回一个新的dictionary，会占据很多内存。\n",
348 |     "\n",
349 |     "官方推荐的做法是通过`Dataset.map`方法，来调用一个分词方法，实现批量化的分词："
350 |    ]
351 |   },
352 |   {
353 |    "cell_type": "code",
354 |    "execution_count": 29,
355 |    "metadata": {},
356 |    "outputs": [
357 |     {
358 |      "data": {
359 |       "application/vnd.jupyter.widget-view+json": {
360 |        "model_id": "acc4da4c0f1d4c749535f86832149e6c",
361 |        "version_major": 2,
362 |        "version_minor": 0
363 |       },
364 |       "text/plain": [
365 |        "  0%|          | 0/4 [00:00<?, ?ba/s]"
366 |       ]
367 |      },
368 |      "metadata": {},
369 |      "output_type": "display_data"
370 |     },
371 |     {
372 |      "name": "stderr",
373 |      "output_type": "stream",
374 |      "text": [
375 |       "Loading cached processed dataset at C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\\cache-ef33b0c3c08e7836.arrow\n",
376 |       "Loading cached processed dataset at C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\\cache-aadac9a568777e3c.arrow\n"
377 |      ]
378 |     },
379 |     {
380 |      "data": {
381 |       "text/plain": [
382 |        "DatasetDict({\n",
383 |        "    train: Dataset({\n",
384 |        "        features: ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids'],\n",
385 |        "        num_rows: 3668\n",
386 |        "    })\n",
387 |        "    validation: Dataset({\n",
388 |        "        features: ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids'],\n",
389 |        "        num_rows: 408\n",
390 |        "    })\n",
391 |        "    test: Dataset({\n",
392 |        "        features: ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids'],\n",
393 |        "        num_rows: 1725\n",
394 |        "    })\n",
395 |        "})"
396 |       ]
397 |      },
398 |      "execution_count": 29,
399 |      "metadata": {},
400 |      "output_type": "execute_result"
401 |     }
402 |    ],
403 |    "source": [
404 |     "def tokenize_function(sample):\n",
405 |     "    # 这里可以添加多种操作，不光是tokenize\n",
406 |     "    # 这个函数处理的对象，就是Dataset这种数据类型，通过features中的字段来选择要处理的数据\n",
407 |     "    return tokenizer(sample['sentence1'], sample['sentence2'], truncation=True)\n",
408 |     "\n",
409 |     "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
410 |     "tokenized_datasets"
411 |    ]
412 |   },
413 |   {
414 |    "cell_type": "markdown",
415 |    "metadata": {},
416 |    "source": [
417 |     "看看这个map的一些参数：\n",
418 |     "\n",
419 |     "```shell\n",
420 |     "raw_datasets.map(\n",
421 |     "    function,\n",
422 |     "    with_indices: bool = False,\n",
423 |     "    input_columns: Union[str, List[str], NoneType] = None,\n",
424 |     "    batched: bool = False,\n",
425 |     "    batch_size: Union[int, NoneType] = 1000,\n",
426 |     "    remove_columns: Union[str, List[str], NoneType] = None,\n",
427 |     "    keep_in_memory: bool = False,\n",
428 |     "    load_from_cache_file: bool = True,\n",
429 |     "    cache_file_names: Union[Dict[str, Union[str, NoneType]], NoneType] = None,\n",
430 |     "    writer_batch_size: Union[int, NoneType] = 1000,\n",
431 |     "    features: Union[datasets.features.Features, NoneType] = None,\n",
432 |     "    disable_nullable: bool = False,\n",
433 |     "    fn_kwargs: Union[dict, NoneType] = None,\n",
434 |     "    num_proc: Union[int, NoneType] = None,  # 使用此参数，可以使用多进程处理\n",
435 |     "    desc: Union[str, NoneType] = None,\n",
436 |     ") -> 'DatasetDict'\n",
437 |     "Docstring:\n",
438 |     "Apply a function to all the elements in the table (individually or in batches)\n",
439 |     "and update the table (if function does updated examples).\n",
440 |     "The transformation is applied to all the datasets of the dataset dictionary.\n",
441 |     "```\n",
442 |     "\n",
443 |     "关于这个map，在Huggingface的测试题中有讲解，这里搬运并翻译一下，辅助理解：\n",
444 |     "\n",
445 |     "What are the benefits of the Dataset.map method?\n",
446 |     "- The results of the function are cached, so it won't take any time if we re-execute the code.\n",
447 |     "\n",
448 |     "    （通过这个map，对数据集的处理会被缓存，所以重新执行代码，也不会再费时间。）\n",
449 |     "- It can apply multiprocessing to go faster than applying the function on each element of the dataset.\n",
450 |     "\n",
451 |     "    （它可以使用多进程来处理从而提高处理速度。）\n",
452 |     "- It does not load the whole dataset into memory, saving the results as soon as one element is processed.\n",
453 |     "\n",
454 |     "    （它不需要把整个数据集都加载到内存里，同时每个元素一经处理就会马上被保存，因此十分节省内存。）"
455 |    ]
456 |   },
457 |   {
458 |    "cell_type": "markdown",
459 |    "metadata": {},
460 |    "source": [
461 |     "观察一下，这里通过map之后，得到的Dataset的features变多了：\n",
462 |     "```python\n",
463 |     "features: ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids']\n",
464 |     "```\n",
465 |     "多的几个columns就是tokenizer处理后的结果。"
466 |    ]
467 |   },
468 |   {
469 |    "cell_type": "markdown",
470 |    "metadata": {},
471 |    "source": [
472 |     "注意到，在这个`tokenize_function`中，我们没有使用`padding`，因为如果使用了padding之后，就会全局统一对一个maxlen进行padding，这样无论在tokenize还是模型的训练上都不够高效。"
473 |    ]
474 |   },
475 |   {
476 |    "cell_type": "markdown",
477 |    "metadata": {},
478 |    "source": [
479 |     "## Dynamic Padding 动态padding\n",
480 |     "\n",
481 |     "实际上，我们是故意先不进行padding的，因为我们想**在划分batch的时候再进行padding**，这样可以避免出现很多有一堆padding的序列，从而可以显著节省我们的训练时间。\n",
482 |     "\n",
483 |     "这里，我们就需要用到`DataCollatorWithPadding`，来进行**动态padding**："
484 |    ]
485 |   },
486 |   {
487 |    "cell_type": "code",
488 |    "execution_count": 34,
489 |    "metadata": {},
490 |    "outputs": [],
491 |    "source": [
492 |     "from transformers import DataCollatorWithPadding\n",
493 |     "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
494 |    ]
495 |   },
496 |   {
497 |    "cell_type": "markdown",
498 |    "metadata": {},
499 |    "source": [
500 |     "注意，我们需要使用tokenizer来初始化这个`DataCollatorWithPadding`，因为需要tokenizer来告知具体的padding token是啥，以及padding的方式是在左边还是右边（不同的预训练模型，使用的padding token以及方式可能不同）。\n"
501 |    ]
502 |   },
503 |   {
504 |    "cell_type": "markdown",
505 |    "metadata": {},
506 |    "source": [
507 |     "下面假设我们要搞一个size=5的batch，看看如何使用`DataCollatorWithPadding`来实现："
508 |    ]
509 |   },
510 |   {
511 |    "cell_type": "code",
512 |    "execution_count": 61,
513 |    "metadata": {},
514 |    "outputs": [
515 |     {
516 |      "data": {
517 |       "text/plain": [
518 |        "[50, 59, 47, 67, 59]"
519 |       ]
520 |      },
521 |      "execution_count": 61,
522 |      "metadata": {},
523 |      "output_type": "execute_result"
524 |     }
525 |    ],
526 |    "source": [
527 |     "samples = tokenized_datasets['train'][:5]\n",
528 |     "samples.keys()\n",
529 |     "# >>> ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids']\n",
530 |     "samples = {k:v for k,v in samples.items() if k not in [\"idx\", \"sentence1\", \"sentence2\"]}  # 把这里多余的几列去掉\n",
531 |     "samples.keys()\n",
532 |     "# >>> ['attention_mask', 'input_ids', 'label', 'token_type_ids']\n",
533 |     "\n",
534 |     "# 打印出每个句子的长度：\n",
535 |     "[len(x) for x in samples[\"input_ids\"]]"
536 |    ]
537 |   },
538 |   {
539 |    "cell_type": "code",
540 |    "execution_count": 57,
541 |    "metadata": {},
542 |    "outputs": [
543 |     {
544 |      "data": {
545 |       "text/plain": [
546 |        "[67, 67, 67, 67, 67]"
547 |       ]
548 |      },
549 |      "execution_count": 57,
550 |      "metadata": {},
551 |      "output_type": "execute_result"
552 |     }
553 |    ],
554 |    "source": [
555 |     "batch = data_collator(samples)  # samples中必须包含 input_ids 字段，因为这就是collator要处理的对象\n",
556 |     "batch.keys()\n",
557 |     "# >>> dict_keys(['attention_mask', 'input_ids', 'token_type_ids', 'labels'])\n",
558 |     "\n",
559 |     "# 再打印长度：\n",
560 |     "[len(x) for x in batch['input_ids']]"
561 |    ]
562 |   },
563 |   {
564 |    "cell_type": "markdown",
565 |    "metadata": {},
566 |    "source": [
567 |     "可以看到，这个`data_collator`就是一个把给定dataset进行padding的工具，其输入跟输出是完全一样的格式。"
568 |    ]
569 |   },
570 |   {
571 |    "cell_type": "code",
572 |    "execution_count": 64,
573 |    "metadata": {},
574 |    "outputs": [
575 |     {
576 |      "data": {
577 |       "text/plain": [
578 |        "{'attention_mask': torch.Size([5, 67]),\n",
579 |        " 'input_ids': torch.Size([5, 67]),\n",
580 |        " 'token_type_ids': torch.Size([5, 67]),\n",
581 |        " 'labels': torch.Size([5])}"
582 |       ]
583 |      },
584 |      "execution_count": 64,
585 |      "metadata": {},
586 |      "output_type": "execute_result"
587 |     }
588 |    ],
589 |    "source": [
590 |     "{k:v.shape for k,v in batch.items()}"
591 |    ]
592 |   },
593 |   {
594 |    "cell_type": "markdown",
595 |    "metadata": {},
596 |    "source": [
597 |     "这个batch，可以形成一个tensor了！接下来就可以用于训练了！"
598 |    ]
599 |   },
600 |   {
601 |    "cell_type": "markdown",
602 |    "metadata": {},
603 |    "source": [
604 |     "---\n",
605 |     "\n",
606 |     "对了，这里多提一句，`collator`这个单词实际上在平时使用英语的时候并不常见，但却在编程中见到多次。\n",
607 |     "\n",
608 |     "最开始一直以为是`collector`，意为“收集者”等意思，后来查了查，发现不是的。下面是柯林斯词典中对`collate`这个词的解释：\n",
609 |     "\n",
610 |     "> **collate**: \n",
611 |     ">\n",
612 |     "> When you collate pieces of information, you **gather** them all together and **examine** them. \n",
613 |     "\n",
614 |     "就是归纳并整理的意思。所以在我们这个情景下，就是对这些杂乱无章长短不一的序列数据，进行一个个地分组，然后检查并统一长度。\n",
615 |     "\n",
616 |     "关于DataCollator更多的信息，可以参见文档：\n",
617 |     "https://huggingface.co/transformers/master/main_classes/data_collator.html?highlight=datacollatorwithpadding#data-collator"
618 |    ]
619 |   }
620 |  ],
621 |  "metadata": {
622 |   "kernelspec": {
623 |    "display_name": "Python 3",
624 |    "language": "python",
625 |    "name": "python3"
626 |   },
627 |   "language_info": {
628 |    "codemirror_mode": {
629 |     "name": "ipython",
630 |     "version": 3
631 |    },
632 |    "file_extension": ".py",
633 |    "mimetype": "text/x-python",
634 |    "name": "python",
635 |    "nbconvert_exporter": "python",
636 |    "pygments_lexer": "ipython3",
637 |    "version": "3.7.6"
638 |   }
639 |  },
640 |  "nbformat": 4,
641 |  "nbformat_minor": 4
642 | }
643 | 


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/2. 使用Trainer API来fine-tune.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# 使用Trainer API来微调模型"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "## 1. 数据集准备和预处理：\n",
 15 |     "\n",
 16 |     "这部分就是回顾上一集的内容：\n",
 17 |     "- 通过dataset包加载数据集\n",
 18 |     "- 加载预训练模型和tokenizer\n",
 19 |     "- 定义Dataset.map要使用的预处理函数\n",
 20 |     "- 定义DataCollator来用于构造训练batch"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 1,
 26 |    "metadata": {},
 27 |    "outputs": [
 28 |     {
 29 |      "name": "stderr",
 30 |      "output_type": "stream",
 31 |      "text": [
 32 |       "Reusing dataset glue (C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n"
 33 |      ]
 34 |     },
 35 |     {
 36 |      "data": {
 37 |       "application/vnd.jupyter.widget-view+json": {
 38 |        "model_id": "b4bdadebec1b4fa681fd5b7370f11abc",
 39 |        "version_major": 2,
 40 |        "version_minor": 0
 41 |       },
 42 |       "text/plain": [
 43 |        "  0%|          | 0/3 [00:00<?, ?it/s]"
 44 |       ]
 45 |      },
 46 |      "metadata": {},
 47 |      "output_type": "display_data"
 48 |     },
 49 |     {
 50 |      "name": "stderr",
 51 |      "output_type": "stream",
 52 |      "text": [
 53 |       "Loading cached processed dataset at C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\\cache-d7c1a56b0a079691.arrow\n",
 54 |       "Loading cached processed dataset at C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\\cache-4551ce60e93aa1ca.arrow\n",
 55 |       "Loading cached processed dataset at C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\\cache-8e3dd97f55b2d13b.arrow\n"
 56 |      ]
 57 |     }
 58 |    ],
 59 |    "source": [
 60 |     "import numpy as np\n",
 61 |     "from transformers import AutoTokenizer, DataCollatorWithPadding\n",
 62 |     "import datasets\n",
 63 |     "checkpoint = 'bert-base-cased'\n",
 64 |     "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
 65 |     "raw_datasets = datasets.load_dataset('glue', 'mrpc')\n",
 66 |     "\n",
 67 |     "def tokenize_function(sample):\n",
 68 |     "    return tokenizer(sample['sentence1'], sample['sentence2'], truncation=True)\n",
 69 |     "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
 70 |     "\n",
 71 |     "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "markdown",
 76 |    "metadata": {},
 77 |    "source": [
 78 |     "## 2. 加载我们要fine-tune的模型："
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": 2,
 84 |    "metadata": {},
 85 |    "outputs": [
 86 |     {
 87 |      "name": "stderr",
 88 |      "output_type": "stream",
 89 |      "text": [
 90 |       "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
 91 |       "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
 92 |       "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
 93 |       "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
 94 |       "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
 95 |      ]
 96 |     }
 97 |    ],
 98 |    "source": [
 99 |     "from transformers import AutoModelForSequenceClassification\n",
100 |     "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "markdown",
105 |    "metadata": {},
106 |    "source": [
107 |     "不得不说，这个Huggingface很贴心，这里的warning写的很清楚。这里我们使用的是带`ForSequenceClassification`这个Head的模型，但是我们的`bert-baed-cased`虽然它本身也有自身的Head，但跟我们这里的二分类任务不匹配，所以可以看到，它的Head被移除了，使用了一个随机初始化的`ForSequenceClassification`Head。\n",
108 |     "\n",
109 |     "所以这里提示还说：\"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\"\n",
110 |     "\n",
111 |     "## 3. 使用`Trainer`来训练\n",
112 |     "\n",
113 |     "`Trainer`是Huggingface transformers库的一个高级API"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": 3,
119 |    "metadata": {},
120 |    "outputs": [],
121 |    "source": [
122 |     "from transformers import Trainer, TrainingArguments\n",
123 |     "\n",
124 |     "training_args = TrainingArguments(output_dir='test_trainer')\n",
125 |     "\n",
126 |     "trainer = Trainer(\n",
127 |     "    model,\n",
128 |     "    training_args,\n",
129 |     "    train_dataset=tokenized_datasets[\"train\"],\n",
130 |     "    eval_dataset=tokenized_datasets[\"validation\"],\n",
131 |     "    data_collator=data_collator,  # 在定义了tokenizer之后，其实这里的data_collator就不用再写了，会自动根据tokenizer创建\n",
132 |     "    tokenizer=tokenizer,\n",
133 |     ")"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "markdown",
138 |    "metadata": {},
139 |    "source": [
140 |     "我们看看`TrainingArguments`和`Trainer`的参数都有些啥：\n",
141 |     "\n",
142 |     "- https://huggingface.co/transformers/master/main_classes/trainer.html\n",
143 |     "- https://huggingface.co/transformers/master/main_classes/trainer.html#trainingarguments\n",
144 |     "\n",
145 |     "\n",
146 |     "```python\n",
147 |     "TrainingArguments(\n",
148 |     "    output_dir: Union[str, NoneType] = None,\n",
149 |     "    overwrite_output_dir: bool = False,\n",
150 |     "    do_train: bool = False,\n",
151 |     "    do_eval: bool = None,\n",
152 |     "    do_predict: bool = False,\n",
153 |     "    evaluation_strategy: transformers.trainer_utils.EvaluationStrategy = 'no',\n",
154 |     "    prediction_loss_only: bool = False,\n",
155 |     "    per_device_train_batch_size: int = 8,  # 默认的batch_size=8\n",
156 |     "    per_device_eval_batch_size: int = 8,\n",
157 |     "    per_gpu_train_batch_size: Union[int, NoneType] = None,\n",
158 |     "    per_gpu_eval_batch_size: Union[int, NoneType] = None,\n",
159 |     "    gradient_accumulation_steps: int = 1,\n",
160 |     "    eval_accumulation_steps: Union[int, NoneType] = None,\n",
161 |     "    learning_rate: float = 5e-05,\n",
162 |     "    weight_decay: float = 0.0,\n",
163 |     "    adam_beta1: float = 0.9,\n",
164 |     "    adam_beta2: float = 0.999,\n",
165 |     "    adam_epsilon: float = 1e-08,\n",
166 |     "    max_grad_norm: float = 1.0,\n",
167 |     "    num_train_epochs: float = 3.0,   # 默认跑3轮\n",
168 |     "    ...\n",
169 |     "```\n",
170 |     "\n",
171 |     "```python\n",
172 |     "Trainer(\n",
173 |     "    model: Union[transformers.modeling_utils.PreTrainedModel, torch.nn.modules.module.Module] = None,\n",
174 |     "    args: transformers.training_args.TrainingArguments = None,\n",
175 |     "    data_collator: Union[DataCollator, NoneType] = None,\n",
176 |     "    train_dataset: Union[torch.utils.data.dataset.Dataset, NoneType] = None,\n",
177 |     "    eval_dataset: Union[torch.utils.data.dataset.Dataset, NoneType] = None,\n",
178 |     "    tokenizer: Union[ForwardRef('PreTrainedTokenizerBase'), NoneType] = None,\n",
179 |     "    model_init: Callable[[], transformers.modeling_utils.PreTrainedModel] = None,\n",
180 |     "    compute_metrics: Union[Callable[[transformers.trainer_utils.EvalPrediction], Dict], NoneType] = None,\n",
181 |     "    callbacks: Union[List[transformers.trainer_callback.TrainerCallback], NoneType] = None,\n",
182 |     "    optimizers: Tuple[torch.optim.optimizer.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),  # 默认会使用AdamW\n",
183 |     ")\n",
184 |     "Docstring:     \n",
185 |     "Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers.\n",
186 |     "```\n",
187 |     "\n",
188 |     "可见，这个`Trainer`把所有训练中需要考虑的参数、设计都包括在内了，我们可以在这里指定训练验证集、data_collator、metrics、optimizer，并通过`TrainingArguments`来提供各种超参数。\n",
189 |     "\n",
190 |     "默认情况下，`Trainer`和`TrainingArguments`会使用：\n",
191 |     "- batch size=8\n",
192 |     "- epochs = 3\n",
193 |     "- AdamW优化器\n",
194 |     "\n",
195 |     "\n",
196 |     "定义好之后，直接使用`.train()`来启动训练："
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": 4,
202 |    "metadata": {},
203 |    "outputs": [
204 |     {
205 |      "data": {
206 |       "text/html": [
207 |        "\n",
208 |        "    <div>\n",
209 |        "        <style>\n",
210 |        "            /* Turns off some styling */\n",
211 |        "            progress {\n",
212 |        "                /* gets rid of default border in Firefox and Opera. */\n",
213 |        "                border: none;\n",
214 |        "                /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
215 |        "                background-size: auto;\n",
216 |        "            }\n",
217 |        "        </style>\n",
218 |        "      \n",
219 |        "      <progress value='1377' max='1377' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
220 |        "      [1377/1377 06:20, Epoch 3/3]\n",
221 |        "    </div>\n",
222 |        "    <table border=\"1\" class=\"dataframe\">\n",
223 |        "  <thead>\n",
224 |        "    <tr style=\"text-align: left;\">\n",
225 |        "      <th>Step</th>\n",
226 |        "      <th>Training Loss</th>\n",
227 |        "    </tr>\n",
228 |        "  </thead>\n",
229 |        "  <tbody>\n",
230 |        "    <tr>\n",
231 |        "      <td>500</td>\n",
232 |        "      <td>0.539400</td>\n",
233 |        "    </tr>\n",
234 |        "    <tr>\n",
235 |        "      <td>1000</td>\n",
236 |        "      <td>0.319400</td>\n",
237 |        "    </tr>\n",
238 |        "  </tbody>\n",
239 |        "</table><p>"
240 |       ],
241 |       "text/plain": [
242 |        "<IPython.core.display.HTML object>"
243 |       ]
244 |      },
245 |      "metadata": {},
246 |      "output_type": "display_data"
247 |     },
248 |     {
249 |      "data": {
250 |       "text/plain": [
251 |        "TrainOutput(global_step=1377, training_loss=0.35569445984728887, metrics={'train_runtime': 383.0158, 'train_samples_per_second': 3.595, 'total_flos': 530185443455520, 'epoch': 3.0})"
252 |       ]
253 |      },
254 |      "execution_count": 4,
255 |      "metadata": {},
256 |      "output_type": "execute_result"
257 |     }
258 |    ],
259 |    "source": [
260 |     "trainer.train()"
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "markdown",
265 |    "metadata": {},
266 |    "source": [
267 |     "然后我们用`Trainer`来预测：\n",
268 |     "\n",
269 |     "`trainer.predict()`函数处理的结果是一个named_tuple，类似一个字典，包含三个属性：predictions, label_ids, metrics\n",
270 |     "\n",
271 |     "注意，这里的三个属性：\n",
272 |     "- `predictions`实际上就是logits\n",
273 |     "- `label_ids`不是预测出来的id，而是数据集中自带的ground truth的label id，因此如果输入的数据集中没给标签，这里也不会输出\n",
274 |     "- `metrics`，也是只有输入的数据集中提供了`label_ids`才会输出metrics，包括loss之类的指标\n",
275 |     "\n",
276 |     "其中`metrics`中还可以包含我们自定义的字段，我们需要在定义`Trainer`的时候给定`compute_metrics`参数。\n",
277 |     "\n",
278 |     "文档参考： https://huggingface.co/transformers/master/main_classes/trainer.html#transformers.Trainer.predict"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": 5,
284 |    "metadata": {},
285 |    "outputs": [
286 |     {
287 |      "data": {
288 |       "text/html": [
289 |        "\n",
290 |        "    <div>\n",
291 |        "        <style>\n",
292 |        "            /* Turns off some styling */\n",
293 |        "            progress {\n",
294 |        "                /* gets rid of default border in Firefox and Opera. */\n",
295 |        "                border: none;\n",
296 |        "                /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
297 |        "                background-size: auto;\n",
298 |        "            }\n",
299 |        "        </style>\n",
300 |        "      \n",
301 |        "      <progress value='51' max='51' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
302 |        "      [51/51 00:03]\n",
303 |        "    </div>\n",
304 |        "    "
305 |       ],
306 |       "text/plain": [
307 |        "<IPython.core.display.HTML object>"
308 |       ]
309 |      },
310 |      "metadata": {},
311 |      "output_type": "display_data"
312 |     },
313 |     {
314 |      "name": "stdout",
315 |      "output_type": "stream",
316 |      "text": [
317 |       "(408, 2)\n",
318 |       "(408,)\n",
319 |       "{'eval_loss': 0.7387174963951111, 'eval_runtime': 3.2872, 'eval_samples_per_second': 124.117}\n"
320 |      ]
321 |     }
322 |    ],
323 |    "source": [
324 |     "predictions = trainer.predict(tokenized_datasets['validation'])\n",
325 |     "print(predictions.predictions.shape)  # logits\n",
326 |     "# array([[-2.7887206,  3.1986978],\n",
327 |     "#       [ 2.5258656, -1.832253 ], ...], dtype=float32)\n",
328 |     "print(predictions.label_ids.shape) # array([1, 0, 0, 1, 0, 1, 0, 1, 1, 1, ...], dtype=int64)\n",
329 |     "print(predictions.metrics)"
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "markdown",
334 |    "metadata": {},
335 |    "source": [
336 |     "然后就可以用preds和labels来计算一些相关的metrics了。\n",
337 |     "\n",
338 |     "Huggingface `datasets`里面可以直接导入跟数据集相关的metrics："
339 |    ]
340 |   },
341 |   {
342 |    "cell_type": "code",
343 |    "execution_count": 6,
344 |    "metadata": {},
345 |    "outputs": [
346 |     {
347 |      "data": {
348 |       "text/plain": [
349 |        "{'accuracy': 0.8455882352941176, 'f1': 0.8911917098445595}"
350 |       ]
351 |      },
352 |      "execution_count": 6,
353 |      "metadata": {},
354 |      "output_type": "execute_result"
355 |     }
356 |    ],
357 |    "source": [
358 |     "from datasets import load_metric\n",
359 |     "\n",
360 |     "preds = np.argmax(predictions.predictions, axis=-1)\n",
361 |     "\n",
362 |     "metric = load_metric('glue', 'mrpc')\n",
363 |     "metric.compute(predictions=preds, references=predictions.label_ids)"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "markdown",
368 |    "metadata": {},
369 |    "source": [
370 |     "metric，glue type的文档：\n",
371 |     "```\n",
372 |     "Args:\n",
373 |     "    predictions: list of predictions to score.\n",
374 |     "        Each translation should be tokenized into a list of tokens.\n",
375 |     "    references: list of lists of references for each translation.\n",
376 |     "        Each reference should be tokenized into a list of tokens.\n",
377 |     "Returns: depending on the GLUE subset, one or several of:\n",
378 |     "    \"accuracy\": Accuracy\n",
379 |     "    \"f1\": F1 score\n",
380 |     "    \"pearson\": Pearson Correlation\n",
381 |     "    \"spearmanr\": Spearman Correlation\n",
382 |     "    \"matthews_correlation\": Matthew Correlation\n",
383 |     "```"
384 |    ]
385 |   },
386 |   {
387 |    "cell_type": "markdown",
388 |    "metadata": {},
389 |    "source": [
390 |     "## 4.构建`Trainer`中的`compute_metrics`函数\n",
391 |     "\n",
392 |     "Let’s see how we can build a useful compute_metrics function and use it the next time we train. The function must take an EvalPrediction object (which is a named tuple with a predictions field and a label_ids field) and will return a dictionary mapping strings to floats (the strings being the names of the metrics returned, and the floats their values). \n",
393 |     "\n",
394 |     "前面我们注意到`Trainer`的参数中，可以提供一个`compute_metrics`函数，用于输出我们希望有的一些指标。\n",
395 |     "\n",
396 |     "这个`compute_metrics`有一些输入输出的要求：\n",
397 |     "- 输入：是一个`EvalPrediction`对象，是一个named tuple，需要有至少`predictions`和`label_ids`两个字段；经过查看源码，这里的predictions，**就是logits**\n",
398 |     "- 输出：一个字典，包含各个metrics和对应的数值。\n",
399 |     "\n",
400 |     "源码地址： https://huggingface.co/transformers/master/_modules/transformers/trainer.html#Trainer"
401 |    ]
402 |   },
403 |   {
404 |    "cell_type": "code",
405 |    "execution_count": 4,
406 |    "metadata": {},
407 |    "outputs": [],
408 |    "source": [
409 |     "from datasets import load_metric\n",
410 |     "def compute_metrics(eval_preds):\n",
411 |     "    metric = load_metric(\"glue\", \"mrpc\")\n",
412 |     "    logits, labels = eval_preds.predictions, eval_preds.label_ids\n",
413 |     "    # 上一行可以直接简写成：\n",
414 |     "    # logits, labels = eval_preds  因为它相当于一个tuple\n",
415 |     "    predictions = np.argmax(logits, axis=-1)\n",
416 |     "    return metric.compute(predictions=predictions, references=labels)"
417 |    ]
418 |   },
419 |   {
420 |    "cell_type": "markdown",
421 |    "metadata": {},
422 |    "source": [
423 |     "总结一下这个过程：\n",
424 |     "\n",
425 |     "- 首先我们定义了一个`compute_metrics`函数，交给`Trainer`；\n",
426 |     "- `Trainer`训练模型，模型会对样本计算，产生 predictions (logits)；\n",
427 |     "- `Trainer`再把 predictions 和数据集中给定的 label_ids 打包成一个对象，发送给`compute_metrics`函数；\n",
428 |     "- `compute_metrics`函数计算好相应的 metrics 然后返回。"
429 |    ]
430 |   },
431 |   {
432 |    "cell_type": "markdown",
433 |    "metadata": {},
434 |    "source": [
435 |     "## 看看带上了 compute_metrics 之后的训练："
436 |    ]
437 |   },
438 |   {
439 |    "cell_type": "code",
440 |    "execution_count": 5,
441 |    "metadata": {},
442 |    "outputs": [
443 |     {
444 |      "name": "stderr",
445 |      "output_type": "stream",
446 |      "text": [
447 |       "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
448 |       "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
449 |       "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
450 |       "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
451 |       "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
452 |      ]
453 |     },
454 |     {
455 |      "data": {
456 |       "text/html": [
457 |        "\n",
458 |        "    <div>\n",
459 |        "        <style>\n",
460 |        "            /* Turns off some styling */\n",
461 |        "            progress {\n",
462 |        "                /* gets rid of default border in Firefox and Opera. */\n",
463 |        "                border: none;\n",
464 |        "                /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
465 |        "                background-size: auto;\n",
466 |        "            }\n",
467 |        "        </style>\n",
468 |        "      \n",
469 |        "      <progress value='1377' max='1377' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
470 |        "      [1377/1377 06:51, Epoch 3/3]\n",
471 |        "    </div>\n",
472 |        "    <table border=\"1\" class=\"dataframe\">\n",
473 |        "  <thead>\n",
474 |        "    <tr style=\"text-align: left;\">\n",
475 |        "      <th>Epoch</th>\n",
476 |        "      <th>Training Loss</th>\n",
477 |        "      <th>Validation Loss</th>\n",
478 |        "      <th>Accuracy</th>\n",
479 |        "      <th>F1</th>\n",
480 |        "      <th>Runtime</th>\n",
481 |        "      <th>Samples Per Second</th>\n",
482 |        "    </tr>\n",
483 |        "  </thead>\n",
484 |        "  <tbody>\n",
485 |        "    <tr>\n",
486 |        "      <td>1</td>\n",
487 |        "      <td>No log</td>\n",
488 |        "      <td>0.329815</td>\n",
489 |        "      <td>0.867647</td>\n",
490 |        "      <td>0.903571</td>\n",
491 |        "      <td>5.873300</td>\n",
492 |        "      <td>69.467000</td>\n",
493 |        "    </tr>\n",
494 |        "    <tr>\n",
495 |        "      <td>2</td>\n",
496 |        "      <td>0.497900</td>\n",
497 |        "      <td>0.600649</td>\n",
498 |        "      <td>0.845588</td>\n",
499 |        "      <td>0.897227</td>\n",
500 |        "      <td>17.319700</td>\n",
501 |        "      <td>23.557000</td>\n",
502 |        "    </tr>\n",
503 |        "    <tr>\n",
504 |        "      <td>3</td>\n",
505 |        "      <td>0.283200</td>\n",
506 |        "      <td>0.605053</td>\n",
507 |        "      <td>0.872549</td>\n",
508 |        "      <td>0.910345</td>\n",
509 |        "      <td>9.244300</td>\n",
510 |        "      <td>44.135000</td>\n",
511 |        "    </tr>\n",
512 |        "  </tbody>\n",
513 |        "</table><p>"
514 |       ],
515 |       "text/plain": [
516 |        "<IPython.core.display.HTML object>"
517 |       ]
518 |      },
519 |      "metadata": {},
520 |      "output_type": "display_data"
521 |     },
522 |     {
523 |      "data": {
524 |       "text/plain": [
525 |        "TrainOutput(global_step=1377, training_loss=0.32063739751678666, metrics={'train_runtime': 414.1719, 'train_samples_per_second': 3.325, 'total_flos': 530351810395680, 'epoch': 3.0})"
526 |       ]
527 |      },
528 |      "execution_count": 5,
529 |      "metadata": {},
530 |      "output_type": "execute_result"
531 |     }
532 |    ],
533 |    "source": [
534 |     "training_args = TrainingArguments(output_dir='test_trainer', evaluation_strategy='epoch')\n",
535 |     "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)  # new model\n",
536 |     "trainer = Trainer(\n",
537 |     "    model,\n",
538 |     "    training_args,\n",
539 |     "    train_dataset=tokenized_datasets[\"train\"],\n",
540 |     "    eval_dataset=tokenized_datasets[\"validation\"],\n",
541 |     "    data_collator=data_collator,  # 在定义了tokenizer之后，其实这里的data_collator就不用再写了，会自动根据tokenizer创建\n",
542 |     "    tokenizer=tokenizer,\n",
543 |     "    compute_metrics=compute_metrics\n",
544 |     ")\n",
545 |     "\n",
546 |     "trainer.train()"
547 |    ]
548 |   },
549 |   {
550 |    "cell_type": "markdown",
551 |    "metadata": {},
552 |    "source": [
553 |     "可见，带上了`compute_metircs`函数之后，在Trainer训练过程中，会把增加的metric也打印出来，方便我们时刻连接训练的进展。"
554 |    ]
555 |   },
556 |   {
557 |    "cell_type": "code",
558 |    "execution_count": null,
559 |    "metadata": {},
560 |    "outputs": [],
561 |    "source": []
562 |   }
563 |  ],
564 |  "metadata": {
565 |   "kernelspec": {
566 |    "display_name": "Python 3",
567 |    "language": "python",
568 |    "name": "python3"
569 |   },
570 |   "language_info": {
571 |    "codemirror_mode": {
572 |     "name": "ipython",
573 |     "version": 3
574 |    },
575 |    "file_extension": ".py",
576 |    "mimetype": "text/x-python",
577 |    "name": "python",
578 |    "nbconvert_exporter": "python",
579 |    "pygments_lexer": "ipython3",
580 |    "version": "3.7.6"
581 |   }
582 |  },
583 |  "nbformat": 4,
584 |  "nbformat_minor": 4
585 | }
586 | 


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/3. 用纯PyTorch来fine-tune.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# 更加透明的方式\n",
  8 |     "\n",
  9 |     "这里我们不使用Trainer这个高级API，而是用pytorch来实现。\n",
 10 |     "\n",
 11 |     "\n",
 12 |     "## 1. 数据集预处理\n",
 13 |     "在Huggingface官方教程里提到，在使用pytorch的dataloader之前，我们需要做一些事情：\n",
 14 |     "- 把dataset中一些不需要的列给去掉了，比如‘sentence1’，‘sentence2’等\n",
 15 |     "- 把数据转换成pytorch tensors\n",
 16 |     "- 修改列名 label 为 labels\n",
 17 |     "\n",
 18 |     "其他的都好说，但**为啥要修改列名 label 为 labels，好奇怪哦！**\n",
 19 |     "这里探究一下：\n",
 20 |     "\n",
 21 |     "\n",
 22 |     "首先，Huggingface的这些transformer Model直接call的时候，接受的标签这个参数是叫\"labels\"。\n",
 23 |     "所以不管你使用Trainer，还是原生pytorch去写，最终模型处理的时候，肯定是使用的名为\"labels\"的标签参数。\n",
 24 |     "\n",
 25 |     "\n",
 26 |     "但在Huggingface的datasets中，数据集的标签一般命名为\"label\"或者\"label_ids\"，那为什么在前两集中，我们没有对标签名进行处理呢？\n",
 27 |     "\n",
 28 |     "这一点在transformer的源码`trainer.py`里找到了端倪：\n",
 29 |     "```python\n",
 30 |     "# 位置在def _remove_unused_columns函数里\n",
 31 |     "# Labels may be named label or label_ids, the default data collator handles that.\n",
 32 |     "signature_columns += [\"label\", \"label_ids\"]\n",
 33 |     "```\n",
 34 |     "这里提示了， data collator 会负责处理标签问题。然后我又去查看了`data_collator.py`中发现了一下内容：\n",
 35 |     "```python\n",
 36 |     "class DataCollatorWithPadding:\n",
 37 |     "    ...\n",
 38 |     "    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
 39 |     "        ...\n",
 40 |     "        if \"label\" in batch:\n",
 41 |     "            batch[\"labels\"] = batch[\"label\"]\n",
 42 |     "            del batch[\"label\"]\n",
 43 |     "        if \"label_ids\" in batch:\n",
 44 |     "            batch[\"labels\"] = batch[\"label_ids\"]\n",
 45 |     "            del batch[\"label_ids\"]\n",
 46 |     "        return batch\n",
 47 |     "```\n",
 48 |     "这就真相大白了：不管数据集中提供的标签名叫\"label\"，还是\"label_ids\"，\n",
 49 |     "DataCollatorWithPadding 都会帮你转换成\"labels\"，装进batch里，再返回。\n",
 50 |     "\n",
 51 |     "前面使用Trainer的时候，DataCollatorWithPadding已经帮我们自动转换了，因此我们不需要操心这个问题。\n",
 52 |     "\n",
 53 |     "但这就是让我疑惑的地方：我们使用pytorch来写，其实也不用管这个，因为在pytorch的data_loader里面，有一个`collate_fn`参数，我们可以把DataCollatorWithPadding对象传进去，也会帮我们自动把\"label\"转换成\"labels\"。因此实际上，这应该是教程中的一个错误，我们不需要手动设计。"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 1,
 59 |    "metadata": {},
 60 |    "outputs": [
 61 |     {
 62 |      "name": "stderr",
 63 |      "output_type": "stream",
 64 |      "text": [
 65 |       "Reusing dataset glue (C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n"
 66 |      ]
 67 |     },
 68 |     {
 69 |      "data": {
 70 |       "application/vnd.jupyter.widget-view+json": {
 71 |        "model_id": "b8102a966021470aa4688946db23983f",
 72 |        "version_major": 2,
 73 |        "version_minor": 0
 74 |       },
 75 |       "text/plain": [
 76 |        "  0%|          | 0/3 [00:00<?, ?it/s]"
 77 |       ]
 78 |      },
 79 |      "metadata": {},
 80 |      "output_type": "display_data"
 81 |     },
 82 |     {
 83 |      "name": "stderr",
 84 |      "output_type": "stream",
 85 |      "text": [
 86 |       "Loading cached processed dataset at C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\\cache-f34d74a51064f292.arrow\n",
 87 |       "Loading cached processed dataset at C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\\cache-8114cae97162778f.arrow\n",
 88 |       "Loading cached processed dataset at C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\\cache-4b384cc92726f5c6.arrow\n"
 89 |      ]
 90 |     }
 91 |    ],
 92 |    "source": [
 93 |     "from datasets import load_dataset\n",
 94 |     "from transformers import AutoTokenizer, DataCollatorWithPadding\n",
 95 |     "\n",
 96 |     "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
 97 |     "checkpoint = \"bert-base-uncased\"\n",
 98 |     "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
 99 |     "\n",
100 |     "def tokenize_function(example):\n",
101 |     "    return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)\n",
102 |     "\n",
103 |     "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
104 |     "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": 2,
110 |    "metadata": {},
111 |    "outputs": [
112 |     {
113 |      "name": "stdout",
114 |      "output_type": "stream",
115 |      "text": [
116 |       "['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids']\n"
117 |      ]
118 |     }
119 |    ],
120 |    "source": [
121 |     "print(tokenized_datasets['train'].column_names)"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "markdown",
126 |    "metadata": {},
127 |    "source": [
128 |     "huggingface datasets贴心地准备了三个方法：`remove_columns`, `rename_column`, `set_format`\n",
129 |     "\n",
130 |     "来方便我们为pytorch的dataloader做准备："
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": 3,
136 |    "metadata": {},
137 |    "outputs": [
138 |     {
139 |      "name": "stdout",
140 |      "output_type": "stream",
141 |      "text": [
142 |       "['attention_mask', 'input_ids', 'label', 'token_type_ids']\n"
143 |      ]
144 |     }
145 |    ],
146 |    "source": [
147 |     "tokenized_datasets = tokenized_datasets.remove_columns(['sentence1', 'sentence2','idx'])\n",
148 |     "# tokenized_datasets = tokenized_datasets.rename_column('label','labels')\n",
149 |     "tokenized_datasets.set_format('torch')\n",
150 |     "\n",
151 |     "print(tokenized_datasets['train'].column_names)"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": 4,
157 |    "metadata": {},
158 |    "outputs": [
159 |     {
160 |      "data": {
161 |       "text/plain": [
162 |        "Dataset({\n",
163 |        "    features: ['attention_mask', 'input_ids', 'label', 'token_type_ids'],\n",
164 |        "    num_rows: 3668\n",
165 |        "})"
166 |       ]
167 |      },
168 |      "execution_count": 4,
169 |      "metadata": {},
170 |      "output_type": "execute_result"
171 |     }
172 |    ],
173 |    "source": [
174 |     "tokenized_datasets['train']  # 经过上面的处理，它就可以直接丢进pytorch的Dataloader中了，跟pytorch中的Dataset格式已经一样了"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "markdown",
179 |    "metadata": {},
180 |    "source": [
181 |     "定义我们的pytorch dataloaders：\n",
182 |     "\n",
183 |     "在pytorch的DataLoader里，有一个`collate_fn`参数，其定义是：\"merges a list of samples to form a mini-batch of Tensor(s).  Used when using batched loading from a map-style dataset.\" 我们可以直接把Huggingface的DataCollatorWithPadding对象传进去，用于对数据进行padding等一系列处理："
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": 5,
189 |    "metadata": {},
190 |    "outputs": [],
191 |    "source": [
192 |     "from torch.utils.data import DataLoader, Dataset\n",
193 |     "train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=8, collate_fn=data_collator)  # 通过这里的dataloader，每个batch的seq_len可能不同\n",
194 |     "eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=8, collate_fn=data_collator)"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": 6,
200 |    "metadata": {},
201 |    "outputs": [
202 |     {
203 |      "data": {
204 |       "text/plain": [
205 |        "{'attention_mask': torch.Size([8, 72]),\n",
206 |        " 'input_ids': torch.Size([8, 72]),\n",
207 |        " 'token_type_ids': torch.Size([8, 72]),\n",
208 |        " 'labels': torch.Size([8])}"
209 |       ]
210 |      },
211 |      "execution_count": 6,
212 |      "metadata": {},
213 |      "output_type": "execute_result"
214 |     }
215 |    ],
216 |    "source": [
217 |     "# 查看一下train_dataloader的元素长啥样\n",
218 |     "for batch in train_dataloader:\n",
219 |     "    break\n",
220 |     "{k: v.shape for k, v in batch.items()}\n",
221 |     "# 可见都是长度为72，size=8的batch"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "markdown",
226 |    "metadata": {},
227 |    "source": [
228 |     "## 2. 模型"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": 7,
234 |    "metadata": {},
235 |    "outputs": [
236 |     {
237 |      "name": "stderr",
238 |      "output_type": "stream",
239 |      "text": [
240 |       "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
241 |       "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
242 |       "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
243 |       "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
244 |       "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
245 |      ]
246 |     }
247 |    ],
248 |    "source": [
249 |     "from transformers import AutoModelForSequenceClassification\n",
250 |     "\n",
251 |     "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)"
252 |    ]
253 |   },
254 |   {
255 |    "cell_type": "code",
256 |    "execution_count": 8,
257 |    "metadata": {},
258 |    "outputs": [
259 |     {
260 |      "data": {
261 |       "text/plain": [
262 |        "SequenceClassifierOutput(loss=tensor(0.7563, grad_fn=<NllLossBackward>), logits=tensor([[-0.2171, -0.4416],\n",
263 |        "        [-0.2248, -0.4694],\n",
264 |        "        [-0.2440, -0.4664],\n",
265 |        "        [-0.2421, -0.4510],\n",
266 |        "        [-0.2273, -0.4545],\n",
267 |        "        [-0.2339, -0.4515],\n",
268 |        "        [-0.2334, -0.4387],\n",
269 |        "        [-0.2362, -0.4601]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)"
270 |       ]
271 |      },
272 |      "execution_count": 8,
273 |      "metadata": {},
274 |      "output_type": "execute_result"
275 |     }
276 |    ],
277 |    "source": [
278 |     "model(**batch)  # 这样的batch可以直接丢进模型处理"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "markdown",
283 |    "metadata": {},
284 |    "source": [
285 |     "## optimizer 和 learning rate scheduler"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": 9,
291 |    "metadata": {},
292 |    "outputs": [
293 |     {
294 |      "name": "stdout",
295 |      "output_type": "stream",
296 |      "text": [
297 |       "1377\n"
298 |      ]
299 |     }
300 |    ],
301 |    "source": [
302 |     "from transformers import AdamW, get_scheduler\n",
303 |     "\n",
304 |     "optimizer = AdamW(model.parameters(), lr=5e-5)\n",
305 |     "\n",
306 |     "num_epochs = 3\n",
307 |     "num_training_steps = num_epochs * len(train_dataloader)  # num of batches * num of epochs\n",
308 |     "lr_scheduler = get_scheduler(\n",
309 |     "    'linear',\n",
310 |     "    optimizer=optimizer,  # scheduler是针对optimizer的lr的\n",
311 |     "    num_warmup_steps=0,\n",
312 |     "    num_training_steps=num_training_steps)\n",
313 |     "print(num_training_steps)"
314 |    ]
315 |   },
316 |   {
317 |    "cell_type": "markdown",
318 |    "metadata": {},
319 |    "source": [
320 |     "## 3. Training"
321 |    ]
322 |   },
323 |   {
324 |    "cell_type": "code",
325 |    "execution_count": 10,
326 |    "metadata": {},
327 |    "outputs": [
328 |     {
329 |      "data": {
330 |       "text/plain": [
331 |        "device(type='cuda')"
332 |       ]
333 |      },
334 |      "execution_count": 10,
335 |      "metadata": {},
336 |      "output_type": "execute_result"
337 |     }
338 |    ],
339 |    "source": [
340 |     "import torch\n",
341 |     "\n",
342 |     "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
343 |     "model.to(device)\n",
344 |     "\n",
345 |     "device"
346 |    ]
347 |   },
348 |   {
349 |    "cell_type": "markdown",
350 |    "metadata": {},
351 |    "source": [
352 |     "## training loops:"
353 |    ]
354 |   },
355 |   {
356 |    "cell_type": "code",
357 |    "execution_count": 11,
358 |    "metadata": {},
359 |    "outputs": [
360 |     {
361 |      "name": "stderr",
362 |      "output_type": "stream",
363 |      "text": [
364 |       "100%|██████████| 459/459 [01:54<00:00,  4.01it/s]\n",
365 |       "100%|██████████| 459/459 [01:55<00:00,  3.98it/s]\n",
366 |       "100%|██████████| 459/459 [01:55<00:00,  3.96it/s]\n"
367 |      ]
368 |     }
369 |    ],
370 |    "source": [
371 |     "from tqdm import tqdm\n",
372 |     "\n",
373 |     "for epoch in range(num_epochs):\n",
374 |     "    for batch in tqdm(train_dataloader):\n",
375 |     "        # 要在GPU上训练，需要把数据集都移动到GPU上：\n",
376 |     "        batch = {k:v.to(device) for k,v in batch.items()}\n",
377 |     "        loss = model(**batch).loss\n",
378 |     "        loss.backward()\n",
379 |     "        optimizer.step()\n",
380 |     "        lr_scheduler.step()\n",
381 |     "        optimizer.zero_grad()"
382 |    ]
383 |   },
384 |   {
385 |    "cell_type": "markdown",
386 |    "metadata": {},
387 |    "source": [
388 |     "## 4. Evaluation"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": 12,
394 |    "metadata": {},
395 |    "outputs": [
396 |     {
397 |      "data": {
398 |       "text/plain": [
399 |        "{'accuracy': 0.8651960784313726, 'f1': 0.9050086355785838}"
400 |       ]
401 |      },
402 |      "execution_count": 12,
403 |      "metadata": {},
404 |      "output_type": "execute_result"
405 |     }
406 |    ],
407 |    "source": [
408 |     "from datasets import load_metric\n",
409 |     "\n",
410 |     "metric= load_metric(\"glue\", \"mrpc\")\n",
411 |     "model.eval()\n",
412 |     "for batch in eval_dataloader:\n",
413 |     "    batch = {k: v.to(device) for k, v in batch.items()}\n",
414 |     "    with torch.no_grad():  # evaluation的时候不需要算梯度\n",
415 |     "        outputs = model(**batch)\n",
416 |     "    \n",
417 |     "    logits = outputs.logits\n",
418 |     "    predictions = torch.argmax(logits, dim=-1)\n",
419 |     "    metric.add_batch(predictions=predictions, references=batch[\"labels\"])\n",
420 |     "\n",
421 |     "metric.compute()"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "markdown",
426 |    "metadata": {},
427 |    "source": [
428 |     "## 5. 使用 Accelerate 库进一步加速\n",
429 |     "The training loop we defined earlier works fine on a single CPU or GPU. But using the 🤗 Accelerate library, with just a few adjustments we can enable distributed training on multiple GPUs or TPUs.\n",
430 |     "\n",
431 |     "日后再说吧~"
432 |    ]
433 |   },
434 |   {
435 |    "cell_type": "code",
436 |    "execution_count": null,
437 |    "metadata": {},
438 |    "outputs": [],
439 |    "source": []
440 |   }
441 |  ],
442 |  "metadata": {
443 |   "kernelspec": {
444 |    "display_name": "Python 3",
445 |    "language": "python",
446 |    "name": "python3"
447 |   },
448 |   "language_info": {
449 |    "codemirror_mode": {
450 |     "name": "ipython",
451 |     "version": 3
452 |    },
453 |    "file_extension": ".py",
454 |    "mimetype": "text/x-python",
455 |    "name": "python",
456 |    "nbconvert_exporter": "python",
457 |    "pygments_lexer": "ipython3",
458 |    "version": "3.7.6"
459 |   }
460 |  },
461 |  "nbformat": 4,
462 |  "nbformat_minor": 4
463 | }
464 | 


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-25-19_PC-201911051016/1632641123.3012567/events.out.tfevents.1632641123.PC-201911051016.50596.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-25-19_PC-201911051016/1632641123.3012567/events.out.tfevents.1632641123.PC-201911051016.50596.1


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-25-19_PC-201911051016/events.out.tfevents.1632641123.PC-201911051016.50596.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-25-19_PC-201911051016/events.out.tfevents.1632641123.PC-201911051016.50596.0


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-36-43_PC-201911051016/1632641809.055524/events.out.tfevents.1632641809.PC-201911051016.50596.3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-36-43_PC-201911051016/1632641809.055524/events.out.tfevents.1632641809.PC-201911051016.50596.3


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-36-43_PC-201911051016/events.out.tfevents.1632641808.PC-201911051016.50596.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-36-43_PC-201911051016/events.out.tfevents.1632641808.PC-201911051016.50596.2


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-37-55_PC-201911051016/1632641879.1103542/events.out.tfevents.1632641879.PC-201911051016.32468.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-37-55_PC-201911051016/1632641879.1103542/events.out.tfevents.1632641879.PC-201911051016.32468.1


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-37-55_PC-201911051016/events.out.tfevents.1632641879.PC-201911051016.32468.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-37-55_PC-201911051016/events.out.tfevents.1632641879.PC-201911051016.32468.0


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-44-26_PC-201911051016/1632642271.2198026/events.out.tfevents.1632642271.PC-201911051016.32468.3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-44-26_PC-201911051016/1632642271.2198026/events.out.tfevents.1632642271.PC-201911051016.32468.3


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-44-26_PC-201911051016/events.out.tfevents.1632642271.PC-201911051016.32468.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-44-26_PC-201911051016/events.out.tfevents.1632642271.PC-201911051016.32468.2


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-54-05_PC-201911051016/1632642852.8538904/events.out.tfevents.1632642852.PC-201911051016.3052.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-54-05_PC-201911051016/1632642852.8538904/events.out.tfevents.1632642852.PC-201911051016.3052.1


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-54-05_PC-201911051016/events.out.tfevents.1632642852.PC-201911051016.3052.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-54-05_PC-201911051016/events.out.tfevents.1632642852.PC-201911051016.3052.0


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-54-51_PC-201911051016/1632642898.3413022/events.out.tfevents.1632642898.PC-201911051016.3052.3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-54-51_PC-201911051016/1632642898.3413022/events.out.tfevents.1632642898.PC-201911051016.3052.3


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-54-51_PC-201911051016/events.out.tfevents.1632642898.PC-201911051016.3052.2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-54-51_PC-201911051016/events.out.tfevents.1632642898.PC-201911051016.3052.2


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-55-27_PC-201911051016/1632642935.0711265/events.out.tfevents.1632642935.PC-201911051016.34932.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-55-27_PC-201911051016/1632642935.0711265/events.out.tfevents.1632642935.PC-201911051016.34932.1


--------------------------------------------------------------------------------
/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-55-27_PC-201911051016/events.out.tfevents.1632642934.PC-201911051016.34932.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/beyondguo/Learn_PyTorch/ec5815df04a323e44eb5c0f1693efedeffad5e1c/HuggingfaceNLP/C2. Fine-tuning Transformers/runs/Sep26_15-55-27_PC-201911051016/events.out.tfevents.1632642934.PC-201911051016.34932.0


--------------------------------------------------------------------------------
/使用transformers库.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stderr",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "D:\\Anaconda3\\envs\\torch\\lib\\importlib\\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n",
 13 |       "  return f(*args, **kwds)\n"
 14 |      ]
 15 |     }
 16 |    ],
 17 |    "source": [
 18 |     "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
 19 |     "import torch"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 2,
 25 |    "metadata": {},
 26 |    "outputs": [
 27 |     {
 28 |      "data": {
 29 |       "application/vnd.jupyter.widget-view+json": {
 30 |        "model_id": "b81cefe174104cf0a816a2acd4f0f4fd",
 31 |        "version_major": 2,
 32 |        "version_minor": 0
 33 |       },
 34 |       "text/plain": [
 35 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=811.0, style=ProgressStyle(description_…"
 36 |       ]
 37 |      },
 38 |      "metadata": {},
 39 |      "output_type": "display_data"
 40 |     },
 41 |     {
 42 |      "name": "stdout",
 43 |      "output_type": "stream",
 44 |      "text": [
 45 |       "\n"
 46 |      ]
 47 |     },
 48 |     {
 49 |      "data": {
 50 |       "application/vnd.jupyter.widget-view+json": {
 51 |        "model_id": "fa26e4665b1949188359fe88264ccabf",
 52 |        "version_major": 2,
 53 |        "version_minor": 0
 54 |       },
 55 |       "text/plain": [
 56 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…"
 57 |       ]
 58 |      },
 59 |      "metadata": {},
 60 |      "output_type": "display_data"
 61 |     },
 62 |     {
 63 |      "name": "stdout",
 64 |      "output_type": "stream",
 65 |      "text": [
 66 |       "\n"
 67 |      ]
 68 |     },
 69 |     {
 70 |      "data": {
 71 |       "application/vnd.jupyter.widget-view+json": {
 72 |        "model_id": "dd9e12f8e20949b99144cec77a5a6ecb",
 73 |        "version_major": 2,
 74 |        "version_minor": 0
 75 |       },
 76 |       "text/plain": [
 77 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466197.0, style=ProgressStyle(descripti…"
 78 |       ]
 79 |      },
 80 |      "metadata": {},
 81 |      "output_type": "display_data"
 82 |     },
 83 |     {
 84 |      "name": "stdout",
 85 |      "output_type": "stream",
 86 |      "text": [
 87 |       "\n"
 88 |      ]
 89 |     },
 90 |     {
 91 |      "data": {
 92 |       "application/vnd.jupyter.widget-view+json": {
 93 |        "model_id": "05c023a6424147f899156566bdf0f2b9",
 94 |        "version_major": 2,
 95 |        "version_minor": 0
 96 |       },
 97 |       "text/plain": [
 98 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…"
 99 |       ]
100 |      },
101 |      "metadata": {},
102 |      "output_type": "display_data"
103 |     },
104 |     {
105 |      "name": "stdout",
106 |      "output_type": "stream",
107 |      "text": [
108 |       "\n"
109 |      ]
110 |     },
111 |     {
112 |      "data": {
113 |       "application/vnd.jupyter.widget-view+json": {
114 |        "model_id": "978e7f21bbdd4c0bab47a5135768e425",
115 |        "version_major": 2,
116 |        "version_minor": 0
117 |       },
118 |       "text/plain": [
119 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=333.0, style=ProgressStyle(description_…"
120 |       ]
121 |      },
122 |      "metadata": {},
123 |      "output_type": "display_data"
124 |     },
125 |     {
126 |      "name": "stdout",
127 |      "output_type": "stream",
128 |      "text": [
129 |       "\n"
130 |      ]
131 |     },
132 |     {
133 |      "data": {
134 |       "application/vnd.jupyter.widget-view+json": {
135 |        "model_id": "ad7bb71ca98d48319dc3bddedbeedcac",
136 |        "version_major": 2,
137 |        "version_minor": 0
138 |       },
139 |       "text/plain": [
140 |        "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267866225.0, style=ProgressStyle(descri…"
141 |       ]
142 |      },
143 |      "metadata": {},
144 |      "output_type": "display_data"
145 |     },
146 |     {
147 |      "name": "stdout",
148 |      "output_type": "stream",
149 |      "text": [
150 |       "\n"
151 |      ]
152 |     }
153 |    ],
154 |    "source": [
155 |     "ptm_name = \"liam168/c4-zh-distilbert-base-uncased\"\n",
156 |     "tokenizer = AutoTokenizer.from_pretrained(ptm_name)\n",
157 |     "model = AutoModelForSequenceClassification.from_pretrained(ptm_name)"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 16,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "classes = [\"女性\",\"体育\",\"文学\",\"校园\"]\n",
167 |     "s1 = '女生的成绩往往比男生好'\n",
168 |     "s2 = '中国奥运军团在东京奥运会上取得了最多的金牌'"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": 19,
174 |    "metadata": {},
175 |    "outputs": [
176 |     {
177 |      "name": "stdout",
178 |      "output_type": "stream",
179 |      "text": [
180 |       "{'input_ids': tensor([[ 101, 1746, 1799,  100,  100,  100,  100,  100,  100, 1755,  100,  100,\n",
181 |       "         1763, 1742,  100,  100,  100,  100,  100, 1916, 1964,  100,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n",
182 |       "tensor([[-1.4325,  3.3059, -0.8026, -1.2892]], grad_fn=<AddmmBackward>)\n",
183 |       "[[0.008454332128167152, 0.9659157395362854, 0.015873165801167488, 0.00975674670189619]]\n",
184 |       "女性: 1%\n",
185 |       "体育: 97%\n",
186 |       "文学: 2%\n",
187 |       "校园: 1%\n"
188 |      ]
189 |     }
190 |    ],
191 |    "source": [
192 |     "x = tokenizer(s2, return_tensors='pt')  # return_tensors='pt' 让返回的格式为torch tensor\n",
193 |     "print(x)\n",
194 |     "\n",
195 |     "logits = model(**x).logits\n",
196 |     "print(logits)\n",
197 |     "\n",
198 |     "result = torch.softmax(logits, dim=1).tolist()  # 需要 dim=1\n",
199 |     "print(result)\n",
200 |     "\n",
201 |     "for i in range(len(classes)):\n",
202 |     "    print(f'{classes[i]}: {int(round(result[0][i]*100))}%')"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": null,
208 |    "metadata": {},
209 |    "outputs": [],
210 |    "source": []
211 |   }
212 |  ],
213 |  "metadata": {
214 |   "kernelspec": {
215 |    "display_name": "Python 3",
216 |    "language": "python",
217 |    "name": "python3"
218 |   },
219 |   "language_info": {
220 |    "codemirror_mode": {
221 |     "name": "ipython",
222 |     "version": 3
223 |    },
224 |    "file_extension": ".py",
225 |    "mimetype": "text/x-python",
226 |    "name": "python",
227 |    "nbconvert_exporter": "python",
228 |    "pygments_lexer": "ipython3",
229 |    "version": "3.7.6"
230 |   }
231 |  },
232 |  "nbformat": 4,
233 |  "nbformat_minor": 4
234 | }
235 | 


--------------------------------------------------------------------------------
/李沐PyTorch/1. 基础操作.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "9d086990",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# PyTorch基础数据操作"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": 1,
 14 |    "id": "5702d925",
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import torch"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 2,
 24 |    "id": "4fe33148",
 25 |    "metadata": {},
 26 |    "outputs": [
 27 |     {
 28 |      "data": {
 29 |       "text/plain": [
 30 |        "tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])"
 31 |       ]
 32 |      },
 33 |      "execution_count": 2,
 34 |      "metadata": {},
 35 |      "output_type": "execute_result"
 36 |     }
 37 |    ],
 38 |    "source": [
 39 |     "x = torch.arange(12)\n",
 40 |     "x"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 3,
 46 |    "id": "cda9bc46",
 47 |    "metadata": {},
 48 |    "outputs": [
 49 |     {
 50 |      "data": {
 51 |       "text/plain": [
 52 |        "torch.Size([12])"
 53 |       ]
 54 |      },
 55 |      "execution_count": 3,
 56 |      "metadata": {},
 57 |      "output_type": "execute_result"
 58 |     }
 59 |    ],
 60 |    "source": [
 61 |     "x.shape"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 4,
 67 |    "id": "0810c186",
 68 |    "metadata": {},
 69 |    "outputs": [
 70 |     {
 71 |      "data": {
 72 |       "text/plain": [
 73 |        "device(type='cpu')"
 74 |       ]
 75 |      },
 76 |      "execution_count": 4,
 77 |      "metadata": {},
 78 |      "output_type": "execute_result"
 79 |     }
 80 |    ],
 81 |    "source": [
 82 |     "x.device"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": 4,
 88 |    "id": "ae9218fb",
 89 |    "metadata": {},
 90 |    "outputs": [
 91 |     {
 92 |      "data": {
 93 |       "text/plain": [
 94 |        "12"
 95 |       ]
 96 |      },
 97 |      "execution_count": 4,
 98 |      "metadata": {},
 99 |      "output_type": "execute_result"
100 |     }
101 |    ],
102 |    "source": [
103 |     "x.numel()  # number of elements"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": 5,
109 |    "id": "bd016a9f",
110 |    "metadata": {},
111 |    "outputs": [
112 |     {
113 |      "data": {
114 |       "text/plain": [
115 |        "tensor([[ 0,  1,  2,  3],\n",
116 |        "        [ 4,  5,  6,  7],\n",
117 |        "        [ 8,  9, 10, 11]])"
118 |       ]
119 |      },
120 |      "execution_count": 5,
121 |      "metadata": {},
122 |      "output_type": "execute_result"
123 |     }
124 |    ],
125 |    "source": [
126 |     "X = x.reshape(3,4)\n",
127 |     "X"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 8,
133 |    "id": "33ec4f2d",
134 |    "metadata": {},
135 |    "outputs": [
136 |     {
137 |      "data": {
138 |       "text/plain": [
139 |        "tensor([[[1., 1., 1., 1.],\n",
140 |        "         [1., 1., 1., 1.],\n",
141 |        "         [1., 1., 1., 1.]],\n",
142 |        "\n",
143 |        "        [[1., 1., 1., 1.],\n",
144 |        "         [1., 1., 1., 1.],\n",
145 |        "         [1., 1., 1., 1.]]])"
146 |       ]
147 |      },
148 |      "execution_count": 8,
149 |      "metadata": {},
150 |      "output_type": "execute_result"
151 |     }
152 |    ],
153 |    "source": [
154 |     "torch.zeros((2,3,4))\n",
155 |     "torch.ones((2,3,4))"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 9,
161 |    "id": "62b2cb89",
162 |    "metadata": {},
163 |    "outputs": [
164 |     {
165 |      "data": {
166 |       "text/plain": [
167 |        "tensor([[1, 2, 3],\n",
168 |        "        [4, 5, 6]])"
169 |       ]
170 |      },
171 |      "execution_count": 9,
172 |      "metadata": {},
173 |      "output_type": "execute_result"
174 |     }
175 |    ],
176 |    "source": [
177 |     "torch.tensor([[1,2,3],[4,5,6]])"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": 14,
183 |    "id": "c3a6feb8",
184 |    "metadata": {},
185 |    "outputs": [
186 |     {
187 |      "data": {
188 |       "text/plain": [
189 |        "(tensor([11., 12., 13.]),\n",
190 |        " tensor([10., 20., 30.]),\n",
191 |        " tensor([0.1000, 0.2000, 0.3000]),\n",
192 |        " tensor([  10.,  100., 1000.]))"
193 |       ]
194 |      },
195 |      "execution_count": 14,
196 |      "metadata": {},
197 |      "output_type": "execute_result"
198 |     }
199 |    ],
200 |    "source": [
201 |     "# 常见标准运算（+ - * / **）都是按元素运算\n",
202 |     "x = torch.tensor([1.0,2,3])  ## 在tensor中任意一个数加一个小数点，就可以把tensor类型转化为float浮点型\n",
203 |     "y = torch.tensor([10,10,10])\n",
204 |     "x + y, x * y, x / y, y ** x"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": 17,
210 |    "id": "da26fc28",
211 |    "metadata": {},
212 |    "outputs": [
213 |     {
214 |      "data": {
215 |       "text/plain": [
216 |        "(tensor([[0., 1., 2., 3., 4.],\n",
217 |        "         [5., 6., 7., 8., 9.]]),\n",
218 |        " tensor([[0., 0., 0., 0., 0.],\n",
219 |        "         [0., 0., 0., 0., 0.]]))"
220 |       ]
221 |      },
222 |      "execution_count": 17,
223 |      "metadata": {},
224 |      "output_type": "execute_result"
225 |     }
226 |    ],
227 |    "source": [
228 |     "# tensor的拼接\n",
229 |     "x = torch.arange(10,dtype=torch.float32).reshape(2,5)\n",
230 |     "y = torch.zeros(10,dtype=torch.float32).reshape(2,5)\n",
231 |     "x,y"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": 20,
237 |    "id": "37d1b7ef",
238 |    "metadata": {},
239 |    "outputs": [
240 |     {
241 |      "data": {
242 |       "text/plain": [
243 |        "(tensor([[0., 1., 2., 3., 4.],\n",
244 |        "         [5., 6., 7., 8., 9.],\n",
245 |        "         [0., 0., 0., 0., 0.],\n",
246 |        "         [0., 0., 0., 0., 0.]]),\n",
247 |        " tensor([[0., 1., 2., 3., 4., 0., 0., 0., 0., 0.],\n",
248 |        "         [5., 6., 7., 8., 9., 0., 0., 0., 0., 0.]]))"
249 |       ]
250 |      },
251 |      "execution_count": 20,
252 |      "metadata": {},
253 |      "output_type": "execute_result"
254 |     }
255 |    ],
256 |    "source": [
257 |     "torch.cat([x,y],dim=0), torch.cat([x,y],dim=1) # dim=0按行拼接，dim=1按列拼接"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": 21,
263 |    "id": "ba36dc4e",
264 |    "metadata": {},
265 |    "outputs": [
266 |     {
267 |      "data": {
268 |       "text/plain": [
269 |        "tensor([[ True, False, False, False, False],\n",
270 |        "        [False, False, False, False, False]])"
271 |       ]
272 |      },
273 |      "execution_count": 21,
274 |      "metadata": {},
275 |      "output_type": "execute_result"
276 |     }
277 |    ],
278 |    "source": [
279 |     "# 逻辑运算符\n",
280 |     "x == y"
281 |    ]
282 |   },
283 |   {
284 |    "cell_type": "code",
285 |    "execution_count": 24,
286 |    "id": "7ef8cbb3",
287 |    "metadata": {},
288 |    "outputs": [
289 |     {
290 |      "data": {
291 |       "text/plain": [
292 |        "(tensor([[0],\n",
293 |        "         [1],\n",
294 |        "         [2]]),\n",
295 |        " tensor([[0, 1]]))"
296 |       ]
297 |      },
298 |      "execution_count": 24,
299 |      "metadata": {},
300 |      "output_type": "execute_result"
301 |     }
302 |    ],
303 |    "source": [
304 |     "# 即时形状不同（但维数得相同），也可以通过“广播机制”来进行运算（按元素）\n",
305 |     "# 具体做法是，通过复制得到各自的一个大张量，然后再逐元素运算\n",
306 |     "# 比如一个(3,1)一个(1,2)，则需要先都统一成(3,2)的矩阵，再计算\n",
307 |     "a = torch.arange(3).reshape(3,1)\n",
308 |     "b = torch.arange(2).reshape(1,2)\n",
309 |     "a,b"
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "code",
314 |    "execution_count": 25,
315 |    "id": "7924e822",
316 |    "metadata": {},
317 |    "outputs": [
318 |     {
319 |      "data": {
320 |       "text/plain": [
321 |        "tensor([[0, 1],\n",
322 |        "        [1, 2],\n",
323 |        "        [2, 3]])"
324 |       ]
325 |      },
326 |      "execution_count": 25,
327 |      "metadata": {},
328 |      "output_type": "execute_result"
329 |     }
330 |    ],
331 |    "source": [
332 |     "a + b"
333 |    ]
334 |   },
335 |   {
336 |    "cell_type": "code",
337 |    "execution_count": 82,
338 |    "id": "c7b1c5ce",
339 |    "metadata": {},
340 |    "outputs": [
341 |     {
342 |      "data": {
343 |       "text/plain": [
344 |        "(tensor([[0, 1],\n",
345 |        "         [2, 3],\n",
346 |        "         [4, 5]]),\n",
347 |        " tensor([0, 1]),\n",
348 |        " tensor([[0],\n",
349 |        "         [1]]),\n",
350 |        " tensor([[0, 1]]))"
351 |       ]
352 |      },
353 |      "execution_count": 82,
354 |      "metadata": {},
355 |      "output_type": "execute_result"
356 |     }
357 |    ],
358 |    "source": [
359 |     "a = torch.arange(6).reshape(3,2)\n",
360 |     "b = torch.arange(2)\n",
361 |     "c = b.reshape(2,1)\n",
362 |     "d = b.reshape(1,2)\n",
363 |     "a,b,c,d"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": 85,
369 |    "id": "682d063d",
370 |    "metadata": {},
371 |    "outputs": [
372 |     {
373 |      "data": {
374 |       "text/plain": [
375 |        "tensor([[1],\n",
376 |        "        [3],\n",
377 |        "        [5]])"
378 |       ]
379 |      },
380 |      "execution_count": 85,
381 |      "metadata": {},
382 |      "output_type": "execute_result"
383 |     }
384 |    ],
385 |    "source": [
386 |     "# * torch.dot torch.matmul torch.mm这些乘法都是怎样的：\n",
387 |     "\n",
388 |     "a*b\n",
389 |     "# tensor([[0, 1],\n",
390 |     "#         [0, 3],\n",
391 |     "#         [0, 5]])\n",
392 |     "\n",
393 |     "torch.dot(a,b)\n",
394 |     "# RuntimeError: 1D tensors expected, but got 2D and 1D tensors\n",
395 |     "    \n",
396 |     "torch.matmul(a,b)\n",
397 |     "# tensor([1, 3, 5])\n",
398 |     "\n",
399 |     "torch.mm(a,b)\n",
400 |     "# RuntimeError: mat2 must be a matrix\n",
401 |     "\n",
402 |     "torch.matmul(a,c)\n",
403 |     "# tensor([[1],\n",
404 |     "#         [3],\n",
405 |     "#         [5]])\n",
406 |     "\n",
407 |     "torch.mm(a,c)\n",
408 |     "# tensor([[1],\n",
409 |     "#         [3],\n",
410 |     "#         [5]])\n",
411 |     "\n",
412 |     "torch.matmul(a,d)\n",
413 |     "torch.mm(a,d)\n",
414 |     "# RuntimeError: mat1 and mat2 shapes cannot be multiplied (3x2 and 1x2)\n",
415 |     "\n",
416 |     "torch.matmul(a,d.T)\n",
417 |     "torch.mm(a,d.T)\n",
418 |     "# tensor([[1],\n",
419 |     "#         [3],\n",
420 |     "#         [5]])"
421 |    ]
422 |   },
423 |   {
424 |    "cell_type": "markdown",
425 |    "id": "d28f4c36",
426 |    "metadata": {},
427 |    "source": [
428 |     "### 总结一下：\n",
429 |     "- `*`就是逐元素相乘，采用broadcast机制，还可以使用torch.mul\n",
430 |     "- `.dot`只能用于两个1D向量做内积\n",
431 |     "- `.mm`只能用于两个2D的矩阵相乘，必须符合矩阵乘法的规则\n",
432 |     "- `.matmul`用途最广泛，兼容性最强。可以对1D，2D以及更高维数据进行乘法。当其中有1D时，采用broadcast，当都是2D时使用矩阵乘法规则。还可以使用 `@`符号。"
433 |    ]
434 |   },
435 |   {
436 |    "cell_type": "code",
437 |    "execution_count": null,
438 |    "id": "1ea2c991",
439 |    "metadata": {},
440 |    "outputs": [],
441 |    "source": [
442 |     "# 赋值，可以对一个点或者一个区域赋值\n",
443 |     "x\n",
444 |     "# tensor([[0., 1., 2., 3., 4.],\n",
445 |     "#         [5., 6., 7., 8., 9.]])\n",
446 |     "x[1,2] = 100\n",
447 |     "x\n",
448 |     "# tensor([[  0.,   1.,   2.,   3.,   4.],\n",
449 |     "#         [  5.,   6., 100.,   8.,   9.]])\n",
450 |     "x[0, :] = 100\n",
451 |     "x\n",
452 |     "# tensor([[100., 100., 100., 100., 100.],\n",
453 |     "#         [  5.,   6., 100.,   8.,   9.]])"
454 |    ]
455 |   },
456 |   {
457 |    "cell_type": "code",
458 |    "execution_count": 33,
459 |    "id": "0f796d47",
460 |    "metadata": {},
461 |    "outputs": [
462 |     {
463 |      "name": "stdout",
464 |      "output_type": "stream",
465 |      "text": [
466 |       "4694868480\n",
467 |       "5230995712\n"
468 |      ]
469 |     }
470 |    ],
471 |    "source": [
472 |     "p = torch.arange(5)\n",
473 |     "orig_id = id(p)  # id是python自带函数，查询变量的内存地址，相当于指针\n",
474 |     "p = p + 1  # 这样赋值，会分配新的内存地址，导致占用更多内存\n",
475 |     "new_id = id(p)\n",
476 |     "print(orig_id)\n",
477 |     "print(new_id)"
478 |    ]
479 |   },
480 |   {
481 |    "cell_type": "code",
482 |    "execution_count": 34,
483 |    "id": "f32f639f",
484 |    "metadata": {},
485 |    "outputs": [
486 |     {
487 |      "name": "stdout",
488 |      "output_type": "stream",
489 |      "text": [
490 |       "5230929408\n",
491 |       "5230929408\n"
492 |      ]
493 |     }
494 |    ],
495 |    "source": [
496 |     "p = torch.arange(5)\n",
497 |     "orig_id = id(p)\n",
498 |     "p[:] = p + 1  # 使用:符号来赋值，就可以原地进行赋值，不占用新内存\n",
499 |     "new_id = id(p)\n",
500 |     "print(orig_id)\n",
501 |     "print(new_id)"
502 |    ]
503 |   },
504 |   {
505 |    "cell_type": "code",
506 |    "execution_count": 35,
507 |    "id": "73774f53",
508 |    "metadata": {},
509 |    "outputs": [
510 |     {
511 |      "name": "stdout",
512 |      "output_type": "stream",
513 |      "text": [
514 |       "5231238656\n",
515 |       "5231238656\n"
516 |      ]
517 |     }
518 |    ],
519 |    "source": [
520 |     "p = torch.arange(5)\n",
521 |     "orig_id = id(p)\n",
522 |     "p += 1  # 使用 += 这样的方式在运算，也可以进行原地赋值\n",
523 |     "new_id = id(p)\n",
524 |     "print(orig_id)\n",
525 |     "print(new_id)"
526 |    ]
527 |   },
528 |   {
529 |    "cell_type": "code",
530 |    "execution_count": 36,
531 |    "id": "4a23af46",
532 |    "metadata": {},
533 |    "outputs": [
534 |     {
535 |      "data": {
536 |       "text/plain": [
537 |        "(numpy.ndarray, torch.Tensor)"
538 |       ]
539 |      },
540 |      "execution_count": 36,
541 |      "metadata": {},
542 |      "output_type": "execute_result"
543 |     }
544 |    ],
545 |    "source": [
546 |     "# 转换为numpy张量，从numpy张量转化成torch的tensor\n",
547 |     "A = x.numpy()\n",
548 |     "B = torch.tensor(A)\n",
549 |     "type(A), type(B)"
550 |    ]
551 |   },
552 |   {
553 |    "cell_type": "code",
554 |    "execution_count": 41,
555 |    "id": "8afe642f",
556 |    "metadata": {},
557 |    "outputs": [
558 |     {
559 |      "data": {
560 |       "text/plain": [
561 |        "(tensor(3.5000), <function Tensor.item>, 3.5, 3)"
562 |       ]
563 |      },
564 |      "execution_count": 41,
565 |      "metadata": {},
566 |      "output_type": "execute_result"
567 |     }
568 |    ],
569 |    "source": [
570 |     "# 将大小为1的张量，转换为Python标量\n",
571 |     "# 两种方式：.item 或者 float()/int()\n",
572 |     "a = torch.tensor(3.5)\n",
573 |     "a, a.item, float(a), int(a)"
574 |    ]
575 |   },
576 |   {
577 |    "cell_type": "code",
578 |    "execution_count": null,
579 |    "id": "d3486883",
580 |    "metadata": {},
581 |    "outputs": [],
582 |    "source": [
583 |     "# 当x是一个向量或张量时，就没法使用.item了\n",
584 |     "x.item()\n",
585 |     "# ValueError: only one element tensors can be converted to Python scalars"
586 |    ]
587 |   },
588 |   {
589 |    "cell_type": "markdown",
590 |    "id": "78b3c59d",
591 |    "metadata": {},
592 |    "source": [
593 |     "# 数据预处理"
594 |    ]
595 |   },
596 |   {
597 |    "cell_type": "code",
598 |    "execution_count": 56,
599 |    "id": "47bd8d73",
600 |    "metadata": {},
601 |    "outputs": [
602 |     {
603 |      "data": {
604 |       "text/html": [
605 |        "<div>\n",
606 |        "<style scoped>\n",
607 |        "    .dataframe tbody tr th:only-of-type {\n",
608 |        "        vertical-align: middle;\n",
609 |        "    }\n",
610 |        "\n",
611 |        "    .dataframe tbody tr th {\n",
612 |        "        vertical-align: top;\n",
613 |        "    }\n",
614 |        "\n",
615 |        "    .dataframe thead th {\n",
616 |        "        text-align: right;\n",
617 |        "    }\n",
618 |        "</style>\n",
619 |        "<table border=\"1\" class=\"dataframe\">\n",
620 |        "  <thead>\n",
621 |        "    <tr style=\"text-align: right;\">\n",
622 |        "      <th></th>\n",
623 |        "      <th>A</th>\n",
624 |        "      <th>B</th>\n",
625 |        "      <th>C</th>\n",
626 |        "    </tr>\n",
627 |        "  </thead>\n",
628 |        "  <tbody>\n",
629 |        "    <tr>\n",
630 |        "      <th>0</th>\n",
631 |        "      <td>1.0</td>\n",
632 |        "      <td>2.0</td>\n",
633 |        "      <td>good</td>\n",
634 |        "    </tr>\n",
635 |        "    <tr>\n",
636 |        "      <th>1</th>\n",
637 |        "      <td>4.0</td>\n",
638 |        "      <td>5.0</td>\n",
639 |        "      <td>None</td>\n",
640 |        "    </tr>\n",
641 |        "    <tr>\n",
642 |        "      <th>2</th>\n",
643 |        "      <td>10.0</td>\n",
644 |        "      <td>10.0</td>\n",
645 |        "      <td>None</td>\n",
646 |        "    </tr>\n",
647 |        "  </tbody>\n",
648 |        "</table>\n",
649 |        "</div>"
650 |       ],
651 |       "text/plain": [
652 |        "      A     B     C\n",
653 |        "0   1.0   2.0  good\n",
654 |        "1   4.0   5.0  None\n",
655 |        "2  10.0  10.0  None"
656 |       ]
657 |      },
658 |      "execution_count": 56,
659 |      "metadata": {},
660 |      "output_type": "execute_result"
661 |     }
662 |    ],
663 |    "source": [
664 |     "import pandas as pd\n",
665 |     "columns = ['A','B','C']\n",
666 |     "data = [[1.,2.,'good'],[4,5],[10,10]]\n",
667 |     "df = pd.DataFrame(data, columns=columns)\n",
668 |     "df"
669 |    ]
670 |   },
671 |   {
672 |    "cell_type": "code",
673 |    "execution_count": 62,
674 |    "id": "91dbcf28",
675 |    "metadata": {},
676 |    "outputs": [
677 |     {
678 |      "data": {
679 |       "text/plain": [
680 |        "array([[1.0, 2.0, 'good'],\n",
681 |        "       [4.0, 5.0, None],\n",
682 |        "       [10.0, 10.0, None]], dtype=object)"
683 |       ]
684 |      },
685 |      "execution_count": 62,
686 |      "metadata": {},
687 |      "output_type": "execute_result"
688 |     }
689 |    ],
690 |    "source": [
691 |     "df.values"
692 |    ]
693 |   },
694 |   {
695 |    "cell_type": "code",
696 |    "execution_count": 58,
697 |    "id": "42b811df",
698 |    "metadata": {},
699 |    "outputs": [
700 |     {
701 |      "data": {
702 |       "text/html": [
703 |        "<div>\n",
704 |        "<style scoped>\n",
705 |        "    .dataframe tbody tr th:only-of-type {\n",
706 |        "        vertical-align: middle;\n",
707 |        "    }\n",
708 |        "\n",
709 |        "    .dataframe tbody tr th {\n",
710 |        "        vertical-align: top;\n",
711 |        "    }\n",
712 |        "\n",
713 |        "    .dataframe thead th {\n",
714 |        "        text-align: right;\n",
715 |        "    }\n",
716 |        "</style>\n",
717 |        "<table border=\"1\" class=\"dataframe\">\n",
718 |        "  <thead>\n",
719 |        "    <tr style=\"text-align: right;\">\n",
720 |        "      <th></th>\n",
721 |        "      <th>A</th>\n",
722 |        "      <th>B</th>\n",
723 |        "      <th>C_good</th>\n",
724 |        "      <th>C_nan</th>\n",
725 |        "    </tr>\n",
726 |        "  </thead>\n",
727 |        "  <tbody>\n",
728 |        "    <tr>\n",
729 |        "      <th>0</th>\n",
730 |        "      <td>1.0</td>\n",
731 |        "      <td>2.0</td>\n",
732 |        "      <td>1</td>\n",
733 |        "      <td>0</td>\n",
734 |        "    </tr>\n",
735 |        "    <tr>\n",
736 |        "      <th>1</th>\n",
737 |        "      <td>4.0</td>\n",
738 |        "      <td>5.0</td>\n",
739 |        "      <td>0</td>\n",
740 |        "      <td>1</td>\n",
741 |        "    </tr>\n",
742 |        "    <tr>\n",
743 |        "      <th>2</th>\n",
744 |        "      <td>10.0</td>\n",
745 |        "      <td>10.0</td>\n",
746 |        "      <td>0</td>\n",
747 |        "      <td>1</td>\n",
748 |        "    </tr>\n",
749 |        "  </tbody>\n",
750 |        "</table>\n",
751 |        "</div>"
752 |       ],
753 |       "text/plain": [
754 |        "      A     B  C_good  C_nan\n",
755 |        "0   1.0   2.0       1      0\n",
756 |        "1   4.0   5.0       0      1\n",
757 |        "2  10.0  10.0       0      1"
758 |       ]
759 |      },
760 |      "execution_count": 58,
761 |      "metadata": {},
762 |      "output_type": "execute_result"
763 |     }
764 |    ],
765 |    "source": [
766 |     "inputs = pd.get_dummies(df,dummy_na=True)  # 通过这种方法，可以将缺失值转化成0，1特征（一般是非数值特征这样做，数值化的特征就直接fillna即可）\n",
767 |     "inputs"
768 |    ]
769 |   },
770 |   {
771 |    "cell_type": "code",
772 |    "execution_count": 61,
773 |    "id": "10608127",
774 |    "metadata": {},
775 |    "outputs": [
776 |     {
777 |      "data": {
778 |       "text/plain": [
779 |        "array([[ 1.,  2.,  1.,  0.],\n",
780 |        "       [ 4.,  5.,  0.,  1.],\n",
781 |        "       [10., 10.,  0.,  1.]])"
782 |       ]
783 |      },
784 |      "execution_count": 61,
785 |      "metadata": {},
786 |      "output_type": "execute_result"
787 |     }
788 |    ],
789 |    "source": [
790 |     "inputs.values"
791 |    ]
792 |   },
793 |   {
794 |    "cell_type": "code",
795 |    "execution_count": null,
796 |    "id": "0e5a8fdb",
797 |    "metadata": {},
798 |    "outputs": [],
799 |    "source": []
800 |   }
801 |  ],
802 |  "metadata": {
803 |   "kernelspec": {
804 |    "display_name": "Python 3 (ipykernel)",
805 |    "language": "python",
806 |    "name": "python3"
807 |   },
808 |   "language_info": {
809 |    "codemirror_mode": {
810 |     "name": "ipython",
811 |     "version": 3
812 |    },
813 |    "file_extension": ".py",
814 |    "mimetype": "text/x-python",
815 |    "name": "python",
816 |    "nbconvert_exporter": "python",
817 |    "pygments_lexer": "ipython3",
818 |    "version": "3.9.2"
819 |   }
820 |  },
821 |  "nbformat": 4,
822 |  "nbformat_minor": 5
823 | }
824 | 


--------------------------------------------------------------------------------
/李沐PyTorch/2. 自动求导.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "0c428e76-5ac2-4293-b766-8ae72395df7c",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# 自动求导"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": 17,
 14 |    "id": "c03e70d3-aee5-4c18-9514-d6924a3a9ec6",
 15 |    "metadata": {},
 16 |    "outputs": [
 17 |     {
 18 |      "name": "stdout",
 19 |      "output_type": "stream",
 20 |      "text": [
 21 |       "tensor([1., 2., 3.])\n"
 22 |      ]
 23 |     },
 24 |     {
 25 |      "data": {
 26 |       "text/plain": [
 27 |        "tensor([1., 4., 9.])"
 28 |       ]
 29 |      },
 30 |      "execution_count": 17,
 31 |      "metadata": {},
 32 |      "output_type": "execute_result"
 33 |     }
 34 |    ],
 35 |    "source": [
 36 |     "import torch\n",
 37 |     "x = torch.tensor([1.0,2.0,3.0])\n",
 38 |     "print(x)\n"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": 25,
 44 |    "id": "99063507-e4c8-4677-993b-6cbe0b9ae59e",
 45 |    "metadata": {},
 46 |    "outputs": [
 47 |     {
 48 |      "data": {
 49 |       "text/plain": [
 50 |        "tensor([[14.]])"
 51 |       ]
 52 |      },
 53 |      "execution_count": 25,
 54 |      "metadata": {},
 55 |      "output_type": "execute_result"
 56 |     }
 57 |    ],
 58 |    "source": [
 59 |     "a = x.reshape(1,3)\n",
 60 |     "b = x.reshape(3,1)\n",
 61 |     "a,b\n",
 62 |     "\n",
 63 |     "torch.mm(a,b)"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 8,
 69 |    "id": "5a52e219-6135-43fb-a3d2-3c2c8fc77a4e",
 70 |    "metadata": {},
 71 |    "outputs": [
 72 |     {
 73 |      "ename": "RuntimeError",
 74 |      "evalue": "only Tensors of floating point dtype can require gradients",
 75 |      "output_type": "error",
 76 |      "traceback": [
 77 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
 78 |       "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
 79 |       "\u001b[0;32m/var/folders/ts/ft1kkj55399gmd5c5cr535dm0000gn/T/ipykernel_81586/1455190116.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequires_grad_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgrad\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 80 |       "\u001b[0;31mRuntimeError\u001b[0m: only Tensors of floating point dtype can require gradients"
 81 |      ]
 82 |     }
 83 |    ],
 84 |    "source": [
 85 |     "x.requires_grad_(True)\n",
 86 |     "x.grad"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": 3,
 92 |    "id": "644c3fbc-a1c9-4dd2-a3b8-8b46cb1ddc80",
 93 |    "metadata": {},
 94 |    "outputs": [
 95 |     {
 96 |      "data": {
 97 |       "text/plain": [
 98 |        "0.0"
 99 |       ]
100 |      },
101 |      "execution_count": 3,
102 |      "metadata": {},
103 |      "output_type": "execute_result"
104 |     }
105 |    ],
106 |    "source": [
107 |     "y = torch.dot(x,x)\n",
108 |     "y.item()"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 6,
114 |    "id": "702ddb8c-203c-48cb-9b89-79af11d56777",
115 |    "metadata": {},
116 |    "outputs": [
117 |     {
118 |      "data": {
119 |       "text/plain": [
120 |        "tensor(0., grad_fn=<DotBackward>)"
121 |       ]
122 |      },
123 |      "execution_count": 6,
124 |      "metadata": {},
125 |      "output_type": "execute_result"
126 |     }
127 |    ],
128 |    "source": [
129 |     "torch.dot(x,x)"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 10,
135 |    "id": "0042f697-991a-4bd9-92d7-04b2dadb826a",
136 |    "metadata": {},
137 |    "outputs": [
138 |     {
139 |      "data": {
140 |       "text/plain": [
141 |        "tensor([0., 1., 2., 3.], grad_fn=<PermuteBackward>)"
142 |       ]
143 |      },
144 |      "execution_count": 10,
145 |      "metadata": {},
146 |      "output_type": "execute_result"
147 |     }
148 |    ],
149 |    "source": [
150 |     "x.T"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": null,
156 |    "id": "b82ea2f0-0a37-45da-9a92-5e9065c2ace3",
157 |    "metadata": {},
158 |    "outputs": [],
159 |    "source": [
160 |     "RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead."
161 |    ]
162 |   }
163 |  ],
164 |  "metadata": {
165 |   "kernelspec": {
166 |    "display_name": "Python 3 (ipykernel)",
167 |    "language": "python",
168 |    "name": "python3"
169 |   },
170 |   "language_info": {
171 |    "codemirror_mode": {
172 |     "name": "ipython",
173 |     "version": 3
174 |    },
175 |    "file_extension": ".py",
176 |    "mimetype": "text/x-python",
177 |    "name": "python",
178 |    "nbconvert_exporter": "python",
179 |    "pygments_lexer": "ipython3",
180 |    "version": "3.9.2"
181 |   }
182 |  },
183 |  "nbformat": 4,
184 |  "nbformat_minor": 5
185 | }
186 | 


--------------------------------------------------------------------------------