├── .gitignore
├── README.md
├── chapter_appendix
├── buy-gpu.ipynb
├── d2ltorch.ipynb
├── index.md
├── jupyter.ipynb
├── math.ipynb
└── notation.ipynb
├── chapter_convolutional-neural-networks
├── alexnet.ipynb
├── batch-norm.ipynb
├── channels.ipynb
├── conv-layer.ipynb
├── densenet.ipynb
├── googlenet.ipynb
├── index.md
├── lenet.ipynb
├── nin.ipynb
├── padding-and-strides.ipynb
├── pooling.ipynb
├── resnet.ipynb
└── vgg.ipynb
├── chapter_deep-learning-basics
├── backprop.ipynb
├── dropout.ipynb
├── fashion-mnist.ipynb
├── index.md
├── kaggle-house-price.ipynb
├── linear-regression-nn.ipynb
├── linear-regression-scratch.ipynb
├── linear-regression.ipynb
├── mlp-nn.ipynb
├── mlp-scratch.ipynb
├── mlp.ipynb
├── numerical-stability-and-init.ipynb
├── softmax-regression-nn.ipynb
├── softmax-regression-scratch.ipynb
├── softmax-regression.ipynb
├── underfit-overfit.ipynb
└── weight-decay.ipynb
├── chapter_deep-learning-computation
├── custom-layer.ipynb
├── deferred-init.ipynb
├── index.md
├── model-construction.ipynb
├── parameters.ipynb
├── read-write.ipynb
└── use-gpu.ipynb
├── chapter_how-to-use
└── how-to-use.ipynb
├── chapter_introduction
└── deep-learning-intro.ipynb
├── chapter_natural-language-processing
├── approx-training.ipynb
├── attention.ipynb
├── beam-search.ipynb
├── fasttext.ipynb
├── glove.ipynb
├── index.md
├── machine-translation.ipynb
├── sentiment-analysis-cnn.ipynb
├── sentiment-analysis-rnn.ipynb
├── seq2seq.ipynb
├── similarity-analogy.ipynb
├── word2vec-nn.ipynb
└── word2vec.ipynb
├── chapter_optimization
├── adadelta.ipynb
├── adagrad.ipynb
├── adam.ipynb
├── gd-sgd.ipynb
├── index.md
├── minibatch-sgd.ipynb
├── momentum.ipynb
├── optimization-intro.ipynb
└── rmsprop.ipynb
├── chapter_preface
└── preface.ipynb
├── chapter_prerequisite
├── autograd.ipynb
├── index.md
├── install.ipynb
├── lookup-api.ipynb
└── tensor.ipynb
├── chapter_recurrent-neural-networks
├── bi-rnn.ipynb
├── bptt.ipynb
├── deep-rnn.ipynb
├── gru.ipynb
├── index.md
├── lang-model-dataset.ipynb
├── lang-model.ipynb
├── lstm.ipynb
├── rnn-nn.ipynb
├── rnn-scratch.ipynb
└── rnn.ipynb
├── d2ltorch
├── __init__.py
└── utils.py
├── data
├── airfoil_self_noise.dat
├── fr-en-small.txt
├── jaychou_lyrics.txt.zip
├── kaggle_cifar10
│ ├── test_tiny.zip
│ ├── trainLabels.csv.zip
│ └── train_tiny.zip
├── kaggle_dog
│ └── train_valid_test_tiny.zip
├── kaggle_house_pred_test.csv
├── kaggle_house_pred_train.csv
└── ptb.zip
├── environment.yml
├── img
├── 404.jpg
├── anchor-label.svg
├── attention.svg
├── autumn_oak.jpg
├── aws.png
├── beam_search.svg
├── birnn.svg
├── book-org.svg
├── capacity_vs_error.svg
├── cat1.jpg
├── catdog.jpg
├── cbow.svg
├── cifar10.png
├── comp-comm.svg
├── connect.png
├── contrib01.png
├── contrib02.png
├── contrib03.png
├── contrib04.png
├── contrib05.png
├── contrib06.png
├── conv1d-2d.svg
├── conv1d-channel.svg
├── conv1d.svg
├── conv_1x1.svg
├── conv_multi_in.svg
├── conv_pad.svg
├── conv_stride.svg
├── correlation.svg
├── cuda.png
├── data-parallel.svg
├── deep-rnn.svg
├── densenet.svg
├── disk.png
├── dropout.svg
├── ec2.png
├── fast-rcnn.svg
├── faster-rcnn.svg
├── fcn.svg
├── finetune.svg
├── forward.svg
├── gru_1.svg
├── gru_2.svg
├── gru_3.svg
├── gtx.png
├── hi-softmax.svg
├── house_pricing.png
├── inception.svg
├── install_gpu.png
├── iou.svg
├── jupyter.png
├── jupyter00.png
├── jupyter01.png
├── jupyter02.png
├── jupyter03.png
├── jupyter04.png
├── jupyter05.png
├── jupyter06.png
├── kaggle-dog.png
├── kaggle.png
├── kaggle_cifar10.png
├── kaggle_submit2.png
├── keypair.png
├── koebel.jpg
├── launching.png
├── limits.png
├── linreg.svg
├── lstm_0.svg
├── lstm_1.svg
├── lstm_2.svg
├── lstm_3.svg
├── mask-rcnn.svg
├── mlp.svg
├── neural-style-1.png
├── neural-style-2.png
├── neural-style.svg
├── nin.svg
├── ones_like.png
├── os.png
├── p2x.png
├── pikachu.jpg
├── pooling.svg
├── pytorch-website.png
├── qq.png
├── qr_adadelta.svg
├── qr_adagrad.svg
├── qr_adam.svg
├── qr_alexnet.svg
├── qr_anchor.svg
├── qr_async-computation.svg
├── qr_attention.svg
├── qr_auto-parallelism.svg
├── qr_autograd.svg
├── qr_aws.svg
├── qr_backprop.svg
├── qr_batch-norm.svg
├── qr_beam-search.svg
├── qr_bi-rnn.svg
├── qr_bounding-box.svg
├── qr_bptt.svg
├── qr_buy-gpu.svg
├── qr_channels.svg
├── qr_conv-layer.svg
├── qr_custom-layer.svg
├── qr_deep-learning-intro.svg
├── qr_deep-rnn.svg
├── qr_deferred-init.svg
├── qr_densenet.svg
├── qr_dropout.svg
├── qr_fashion-mnist.svg
├── qr_fasttext.svg
├── qr_fcn.svg
├── qr_fine-tuning.svg
├── qr_gd-sgd.svg
├── qr_glove.svg
├── qr_googlenet.svg
├── qr_gru.svg
├── qr_how-to-contribute.svg
├── qr_how-to-use.svg
├── qr_hybridize.svg
├── qr_image-augmentation.svg
├── qr_install.svg
├── qr_jupyter.svg
├── qr_kaggle-gluon-cifar10.svg
├── qr_kaggle-gluon-dog.svg
├── qr_kaggle-house-price.svg
├── qr_lang-model-dataset.svg
├── qr_lang-model.svg
├── qr_lenet.svg
├── qr_linear-regression-gluon.svg
├── qr_linear-regression-scratch.svg
├── qr_linear-regression.svg
├── qr_lookup-api.svg
├── qr_lstm.svg
├── qr_machine-translation.svg
├── qr_math.svg
├── qr_minibatch-sgd.svg
├── qr_mlp-gluon.svg
├── qr_mlp-scratch.svg
├── qr_mlp.svg
├── qr_model-construction.svg
├── qr_momentum.svg
├── qr_multiple-gpus-gluon.svg
├── qr_multiple-gpus.svg
├── qr_multiscale-object-detection.svg
├── qr_ndarray.svg
├── qr_neural-style.svg
├── qr_nin.svg
├── qr_numerical-stability-and-init.svg
├── qr_object-detection-dataset.svg
├── qr_optimization-intro.svg
├── qr_optimization-summary.svg
├── qr_padding-and-strides.svg
├── qr_parameters.svg
├── qr_pooling.svg
├── qr_rcnn.svg
├── qr_read-write.svg
├── qr_resnet.svg
├── qr_rmsprop.svg
├── qr_rnn-gluon.svg
├── qr_rnn-scratch.svg
├── qr_rnn.svg
├── qr_semantic-segmentation-and-dataset.svg
├── qr_sentiment-analysis-cnn.svg
├── qr_sentiment-analysis.svg
├── qr_seq2seq.svg
├── qr_similarity-analogy.svg
├── qr_softmax-regression-gluon.svg
├── qr_softmax-regression-scratch.svg
├── qr_softmax-regression.svg
├── qr_ssd.svg
├── qr_underfit-overfit.svg
├── qr_use-gpu.svg
├── qr_vgg.svg
├── qr_weight-decay.svg
├── qr_word2vec-approx-train.svg
├── qr_word2vec-gluon.svg
├── qr_word2vec.svg
├── r-cnn.svg
├── rainier.jpg
├── residual-block.svg
├── rnn-bptt.svg
├── rnn-train.svg
├── rnn.svg
├── roi.svg
├── s2s_prob1.svg
├── s2s_prob2.svg
├── segmentation.svg
├── seq2seq.svg
├── skip-gram.svg
├── softmaxreg.svg
├── ssd.svg
├── ssh.png
├── style-transfer.svg
└── textcnn.svg
├── toc.ipynb
└── todo
├── chapter_computational-performance
├── async-computation.ipynb
├── auto-parallelism.ipynb
├── hybridize.ipynb
├── index.md
├── multiple-gpus-gluon.ipynb
└── multiple-gpus.ipynb
└── chapter_computer-vision
├── anchor.ipynb
├── bounding-box.ipynb
├── fcn.ipynb
├── fine-tuning.ipynb
├── image-augmentation.ipynb
├── index.md
├── kaggle-gluon-cifar10.ipynb
├── kaggle-gluon-dog.ipynb
├── multiscale-object-detection.ipynb
├── neural-style.ipynb
├── object-detection-dataset.ipynb
├── rcnn.ipynb
├── semantic-segmentation-and-dataset.ipynb
└── ssd.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | debug.py
6 | sentiment/
7 | .vector_cache/
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # pyenv
79 | .python-version
80 |
81 | # celery beat schedule file
82 | celerybeat-schedule
83 |
84 | # SageMath parsed files
85 | *.sage.py
86 |
87 | # Environments
88 | .env
89 | .venv
90 | env/
91 | venv/
92 | ENV/
93 | env.bak/
94 | venv.bak/
95 |
96 | # Spyder project settings
97 | .spyderproject
98 | .spyproject
99 |
100 | # Rope project settings
101 | .ropeproject
102 |
103 | # mkdocs documentation
104 | /site
105 |
106 | # mypy
107 | .mypy_cache/
108 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # d2l-torch
2 |
3 | **First of all, we thank the original authors and contributors of *Dive into Deep Learning* (《动手学深度学习》) for giving us such an excellent book. If you have no particular preference for or requirement on a framework, consider trying MXNet, an excellent deep learning framework.
4 | Original book address: , original video tutorials: [Bilibili](https://space.bilibili.com/209599371/channel/detail?cid=23541), [YouTube](https://www.youtube.com/playlist?list=PLLbeS1kM6teJqdFzw1ICHfa4a1y0hg8Ax)**
5 |
6 | Building on the original book (the version of May 20, 2019), this project reimplements all of its code in PyTorch and explains and extends parts of the content in the form of **notes**. Because PyTorch and MXNet differ in design, some of the original content has been modified or removed.
7 |
8 | **Please read the book in the order given in the table of contents, [toc.ipynb](https://nbviewer.jupyter.org/github/sangyx/d2l-torch/blob/master/toc.ipynb).**
9 |
10 | If you like this book, please star this project and buy a paper copy of the original book to support its authors and contributors.
11 |
12 | This project will not be updated in the near future. If you need the chapters on computational performance and computer vision, you can use the project [Dive-into-DL-PyTorch](https://github.com/ShusenTang/Dive-into-DL-PyTorch). If you have questions, feel free to open an issue.
--------------------------------------------------------------------------------
/chapter_appendix/buy-gpu.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# GPU购买指南\n",
8 | "\n",
9 | "深度学习训练通常需要大量的计算资源。GPU目前是深度学习最常使用的计算加速硬件。相对于CPU来说,GPU更便宜且计算更加密集。一方面,相同计算能力的GPU的价格一般是CPU价格的十分之一;另一方面,一台服务器通常可以搭载8块或者16块GPU。因此,GPU数量可以看作是衡量一台服务器的深度学习计算能力的一个指标。\n",
10 | "\n",
11 | "## 选择GPU\n",
12 | "\n",
13 | "目前独立显卡主要有AMD和NVIDIA两家厂商。其中NVIDIA在深度学习布局较早,对深度学习框架支持更好。因此,目前大家主要会选择NVIDIA的GPU。\n",
14 | "\n",
15 | "NVIDIA有面向个人用户(如GTX系列)和企业用户(如Tesla系列)的两类GPU。这两类GPU的计算能力相当。然而,面向企业用户的GPU通常使用被动散热并增加了显存校验,从而更适合数据中心,并通常要比面向个人用户的GPU贵上10倍。\n",
16 | "\n",
17 | "如果是拥有100台机器以上的大公司用户,通常可以考虑针对企业用户的NVIDIA Tesla系列。如果是拥有10~100台机器的实验室和中小公司用户,预算充足的情况下可以考虑NVIDIA DGX系列,否则可以考虑购买如Supermicro之类的性价比比较高的服务器,然后再购买安装GTX系列的GPU。\n",
18 | "\n",
19 | "NVIDIA一般每一两年发布一次新版本的GPU,例如2016年发布的GTX 1000系列以及2018年发布的RTX 2000系列。每个系列中会有数个不同的型号,分别对应不同的性能。\n",
20 | "\n",
21 | "GPU的性能主要由以下3个参数构成。\n",
22 | "\n",
23 | "1. 计算能力。通常我们关心的是32位浮点计算能力。16位浮点训练也开始流行,如果只做预测的话也可以用8位整数。\n",
24 | "2. 显存大小。当模型越大或者训练时的批量越大时,所需要的显存就越多。\n",
25 | "3. 显存带宽。只有当显存带宽足够时才能充分发挥计算能力。\n",
26 | "\n",
27 | "对大部分用户来说,只要考虑计算能力就可以了。显存尽量不小于4 GB。但如果GPU要同时显示图形界面,那么推荐的显存大小至少为6 GB。显存带宽通常相对固定,选择空间较小。\n",
28 | "\n",
29 | "图11.19描绘了GTX 900和GTX 1000系列里各个型号的32位浮点计算能力和价格的对比(其中的价格为Wikipedia的建议价格)。\n",
30 | "\n",
31 | "\n",
32 | "\n",
33 | "我们可以从图11.19中读出以下两点信息。\n",
34 | "\n",
35 | "1. 在同一个系列里面,价格和性能大体上成正比。但后发布的型号性价比更高,如980 Ti和1080 Ti。\n",
36 | "2. GTX 1000系列比900系列在性价比上高出2倍左右。\n",
37 | "\n",
38 | "如果大家继续比较NVIDIA的一些其他系列,也可以发现类似的规律。据此,我们推荐大家在能力范围内尽可能买较新的GPU。\n",
39 | "\n",
40 | "\n",
41 | "## 整机配置\n",
42 | "\n",
43 | "通常,我们主要用GPU做深度学习训练。因此,不需要购买高端的CPU。至于整机配置,尽量参考网上推荐的中高档的配置就好。不过,考虑到GPU的功耗、散热和体积,在整机配置上也需要考虑以下3个额外因素。\n",
44 | "\n",
45 | "1. 机箱体积。显卡尺寸较大,通常考虑较大且自带风扇的机箱。\n",
46 | "2. 电源。购买GPU时需要查一下GPU的功耗,如50 W到300 W不等。购买电源要确保功率足够,且不会造成机房供电过载。\n",
47 | "3. 主板的PCIe卡槽。推荐使用PCIe 3.0 16x来保证充足的GPU到内存的带宽。如果搭载多块GPU,要仔细阅读主板说明,以确保多块GPU一起使用时仍然是16倍带宽。注意,有些主板搭载4块GPU时会降到8倍甚至4倍带宽。\n",
48 | "\n",
49 | "\n",
50 | "## 小结\n",
51 | "\n",
52 | "* 在预算范围内,尽可能买较新的GPU。\n",
53 | "* 整机配置需要考虑到GPU的功耗、散热和体积。\n",
54 | "\n",
55 | "## 练习\n",
56 | "\n",
57 | "* 浏览本节讨论区中大家有关机器配置方面的交流。\n",
58 | "\n",
59 | "\n",
60 | "\n",
61 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/1177)\n",
62 | "\n",
63 | ""
64 | ]
65 | }
66 | ],
67 | "metadata": {
68 | "kernelspec": {
69 | "display_name": "Python [conda env:pytorch]",
70 | "language": "python",
71 | "name": "conda-env-pytorch-py"
72 | },
73 | "language_info": {
74 | "codemirror_mode": {
75 | "name": "ipython",
76 | "version": 3
77 | },
78 | "file_extension": ".py",
79 | "mimetype": "text/x-python",
80 | "name": "python",
81 | "nbconvert_exporter": "python",
82 | "pygments_lexer": "ipython3",
83 | "version": "3.6.9"
84 | }
85 | },
86 | "nbformat": 4,
87 | "nbformat_minor": 4
88 | }
89 |
--------------------------------------------------------------------------------
/chapter_appendix/d2ltorch.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# `d2ltorch`包索引\n",
8 | "\n",
9 | "\n",
10 | "函数、类等名称:定义所在章节\n",
11 | "\n",
12 | "**注:暂未实现的函数用删除线标注**\n",
13 | "\n",
14 | "* ~~`bbox_to_rect`:[物体检测和边界框](../chapter_computer-vision/bounding-box.ipynb)~~\n",
15 | "* ~~`Benchmark`:[异步计算](../chapter_computational-performance/async-computation.ipynb)~~\n",
16 | "* `corr2d`:[二维卷积层](../chapter_convolutional-neural-networks/conv-layer.ipynb)\n",
17 | "* `count_tokens`:[文本情感分类:使用循环神经网络](../chapter_natural-language-processing/sentiment-analysis-rnn.ipynb)\n",
18 | "* `data_iter`:[线性回归的从零开始实现](../chapter_deep-learning-basics/linear-regression-scratch.ipynb)\n",
19 | "* `data_iter_consecutive`:[语言模型数据集(周杰伦专辑歌词)](../chapter_recurrent-neural-networks/lang-model-dataset.ipynb)\n",
20 | "* `data_iter_random`:[语言模型数据集(周杰伦专辑歌词)](../chapter_recurrent-neural-networks/lang-model-dataset.ipynb)\n",
21 | "* `download_imdb`:[文本情感分类:使用循环神经网络](../chapter_natural-language-processing/sentiment-analysis-rnn.ipynb)\n",
22 | "* ~~`download_voc_pascal`:[语义分割和数据集](../chapter_computer-vision/semantic-segmentation-and-dataset.ipynb)~~\n",
23 | "* ~~`evaluate_accuracy`:[图像增广](../chapter_computer-vision/image-augmentation.ipynb)~~\n",
24 | "* `get_data_ch7`:[小批量随机梯度下降](../chapter_optimization/minibatch-sgd.ipynb)\n",
25 | "* `get_fashion_mnist_labels`:[图像分类数据集(Fashion-MNIST)](../chapter_deep-learning-basics/fashion-mnist.ipynb)\n",
26 | "* `get_tokenized_imdb`:[文本情感分类:使用循环神经网络](../chapter_natural-language-processing/sentiment-analysis-rnn.ipynb)\n",
27 | "* `get_vocab_imdb`:[文本情感分类:使用循环神经网络](../chapter_natural-language-processing/sentiment-analysis-rnn.ipynb)\n",
28 | "* `grad_clipping`:[循环神经网络的从零开始实现](../chapter_recurrent-neural-networks/rnn-scratch.ipynb)\n",
29 | "* `linreg`:[线性回归的从零开始实现](../chapter_deep-learning-basics/linear-regression-scratch.ipynb)\n",
30 | "* `load_data_fashion_mnist`:[深度卷积神经网络(AlexNet)](../chapter_convolutional-neural-networks/alexnet.ipynb)\n",
31 | "* `load_data_jay_lyrics`:[语言模型数据集(周杰伦专辑歌词)](../chapter_recurrent-neural-networks/lang-model-dataset.ipynb)\n",
32 | "* ~~`load_data_pikachu`:[物体检测数据集(皮卡丘)](../chapter_computer-vision/object-detection-dataset.ipynb)~~\n",
33 | "* ~~`mkdir_if_not_exist`:[实战Kaggle比赛:图像分类(CIFAR-10)](../chapter_computer-vision/kaggle-gluon-cifar10.ipynb)~~\n",
34 | "* `params_init`: [模型参数的访问、初始化和共享](../chapter_deep-learning-computation/parameters.ipynb)\n",
35 | "* `plt`:[线性回归的从零开始实现](../chapter_deep-learning-basics/linear-regression-scratch.ipynb)\n",
36 | "* `predict_rnn`:[循环神经网络的从零开始实现](../chapter_recurrent-neural-networks/rnn-scratch.ipynb)\n",
37 | "* `predict_rnn_nn`:[循环神经网络的简洁实现](../chapter_recurrent-neural-networks/rnn-nn.ipynb)\n",
38 | "* `predict_sentiment`:[文本情感分类:使用循环神经网络](../chapter_natural-language-processing/sentiment-analysis-rnn.ipynb)\n",
39 | "* `preprocess_imdb`:[文本情感分类:使用循环神经网络](../chapter_natural-language-processing/sentiment-analysis-rnn.ipynb)\n",
40 | "* `read_imdb`:[文本情感分类:使用循环神经网络](../chapter_natural-language-processing/sentiment-analysis-rnn.ipynb)\n",
41 | "* ~~`read_voc_images`:[语义分割和数据集](../chapter_computer-vision/semantic-segmentation-and-dataset.ipynb)~~\n",
42 | "* `Residual`:[残差网络(ResNet)](../chapter_convolutional-neural-networks/resnet.ipynb)\n",
43 | "* ~~`resnet18`:[多GPU计算的简洁实现](../chapter_computational-performance/multiple-gpus-gluon.ipynb)~~\n",
44 | "* `RNNModel`:[循环神经网络的简洁实现](../chapter_recurrent-neural-networks/rnn-nn.ipynb)\n",
45 | "* `semilogy`:[模型选择、欠拟合和过拟合](../chapter_deep-learning-basics/underfit-overfit.ipynb)\n",
46 | "* `set_figsize`:[线性回归的从零开始实现](../chapter_deep-learning-basics/linear-regression-scratch.ipynb)\n",
47 | "* `sgd`:[线性回归的从零开始实现](../chapter_deep-learning-basics/linear-regression-scratch.ipynb)\n",
48 | "* ~~`show_bboxes`:[锚框](../chapter_computer-vision/anchor.ipynb)~~\n",
49 | "* `show_fashion_mnist`:[图像分类数据集(Fashion-MNIST)](../chapter_deep-learning-basics/fashion-mnist.ipynb)\n",
50 | "* ~~`show_images`:[图像增广](../chapter_computer-vision/image-augmentation.ipynb)~~\n",
51 | "* `show_trace_2d`:[梯度下降和随机梯度下降](../chapter_optimization/gd-sgd.ipynb)\n",
52 | "* `squared_loss`:[线性回归的从零开始实现](../chapter_deep-learning-basics/linear-regression-scratch.ipynb)\n",
53 | "* `to_onehot`:[循环神经网络的从零开始实现](../chapter_recurrent-neural-networks/rnn-scratch.ipynb)\n",
54 | "* ~~`train`:[图像增广](../chapter_computer-vision/image-augmentation.ipynb)~~\n",
55 | "* `train_2d`:[梯度下降和随机梯度下降](../chapter_optimization/gd-sgd.ipynb)\n",
56 | "* `train_and_predict_rnn`:[循环神经网络的从零开始实现](../chapter_recurrent-neural-networks/rnn-scratch.ipynb)\n",
57 | "* `train_and_predict_rnn_nn `:[循环神经网络的简洁实现](../chapter_recurrent-neural-networks/rnn-nn.ipynb)\n",
58 | "* `train_ch3`:[softmax回归的从零开始实现](../chapter_deep-learning-basics/softmax-regression-scratch.ipynb)\n",
59 | "* `train_ch5`:[卷积神经网络(LeNet)](../chapter_convolutional-neural-networks/lenet.ipynb)\n",
60 | "* `train_ch7`:[小批量随机梯度下降](../chapter_optimization/minibatch-sgd.ipynb)\n",
61 | "* `train_nn_ch7`:[小批量随机梯度下降](../chapter_optimization/minibatch-sgd.ipynb)\n",
62 | "* ~~`try_all_gpus`:[图像增广](../chapter_computer-vision/image-augmentation.ipynb)~~\n",
63 | "* `try_gpu`:[卷积神经网络(LeNet)](../chapter_convolutional-neural-networks/lenet.ipynb)\n",
64 | "* `use_svg_display`:[线性回归的从零开始实现](../chapter_deep-learning-basics/linear-regression-scratch.ipynb)\n",
65 | "* ~~`VOC_CLASSES`:[语义分割和数据集](../chapter_computer-vision/semantic-segmentation-and-dataset.ipynb)~~\n",
66 | "* ~~`VOC_COLORMAP`:[语义分割和数据集](../chapter_computer-vision/semantic-segmentation-and-dataset.ipynb)~~\n",
67 | "* ~~`voc_label_indices`:[语义分割和数据集](../chapter_computer-vision/semantic-segmentation-and-dataset.ipynb)~~\n",
68 | "* ~~`voc_rand_crop`:[语义分割和数据集](../chapter_computer-vision/semantic-segmentation-and-dataset.ipynb)~~\n",
69 | "* ~~`VOCSegDataset`:[语义分割和数据集](../chapter_computer-vision/semantic-segmentation-and-dataset.ipynb)~~"
70 | ]
71 | }
72 | ],
73 | "metadata": {
74 | "kernelspec": {
75 | "display_name": "Python [conda env:pytorch]",
76 | "language": "python",
77 | "name": "conda-env-pytorch-py"
78 | },
79 | "language_info": {
80 | "codemirror_mode": {
81 | "name": "ipython",
82 | "version": 3
83 | },
84 | "file_extension": ".py",
85 | "mimetype": "text/x-python",
86 | "name": "python",
87 | "nbconvert_exporter": "python",
88 | "pygments_lexer": "ipython3",
89 | "version": "3.6.9"
90 | }
91 | },
92 | "nbformat": 4,
93 | "nbformat_minor": 4
94 | }
95 |
--------------------------------------------------------------------------------
/chapter_appendix/index.md:
--------------------------------------------------------------------------------
1 | # Appendix
2 |
3 | ```eval_rst
4 |
5 | .. toctree::
6 | :maxdepth: 2
7 |
8 | notation
9 | math
10 | jupyter
11 | buy-gpu
12 | d2ltorch
13 |
14 |
15 | ```
16 |
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/chapter_appendix/jupyter.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 使用Jupyter记事本\n",
8 | "\n",
9 | "本节介绍如何使用Jupyter记事本编辑和运行本书的代码。请确保你已按照[“获取和运行本书的代码”](../chapter_prerequisite/install.ipynb)一节中的步骤安装好Jupyter记事本并获取了本书的代码。\n",
10 | "\n",
11 | "\n",
12 | "## 在本地编辑和运行本书的代码\n",
13 | "\n",
14 | "下面我们介绍如何在本地使用Jupyter记事本来编辑和运行本书的代码。假设本书的代码所在的本地路径为`xx/yy/d2l-zh/`。在命令行模式下进入该路径(`cd xx/yy/d2l-zh`),然后运行命令`jupyter notebook`。这时在浏览器打开 http://localhost:8888 (通常会自动打开)就可以看到Jupyter记事本的界面和本书的代码所在的各个文件夹,如图11.1所示。\n",
15 | "\n",
16 | "\n",
17 | "\n",
18 | "\n",
19 | "我们可以通过点击网页上显示的文件夹访问其中的记事本文件。它们的后缀通常是“ipynb”。\n",
20 | "简洁起见,我们创建一个临时的test.ipynb文件,点击后所显示的内容如图11.2所示。该记事本包括了格式化文本单元(markdown cell)和代码单元(code cell),其中格式化文本单元中的内容包括“这是标题”和“这是一段正文。”,代码单元中包括两行Python代码。\n",
21 | "\n",
22 | "\n",
23 | "\n",
24 | "\n",
25 | "双击格式化文本单元,进入编辑模式。在该单元的末尾添加一段新文本“你好世界。”,如图11.3所示。\n",
26 | "\n",
27 | "\n",
28 | "\n",
29 | "\n",
30 | "如图11.4所示,点击菜单栏的“Cell” $\\rightarrow$ “Run Cells”,运行编辑好的单元。\n",
31 | "\n",
32 | "\n",
33 | "\n",
34 | "\n",
35 | "运行完以后,图11.5展示了编辑后的格式化文本单元。\n",
36 | "\n",
37 | "\n",
38 | "\n",
39 | "\n",
40 | "接下来,点击代码单元。在最后一行代码后添加乘以2的操作 `* 2`,如图11.6所示。\n",
41 | "\n",
42 | "\n",
43 | "\n",
44 | "\n",
45 | "我们也可以用快捷键运行单元(默认Ctrl + Enter),并得到图11.7所示的输出结果。\n",
46 | "\n",
47 | "\n",
48 | "\n",
49 | "\n",
50 | "当一个记事本包含的单元较多时,我们可以点击菜单栏的“Kernel” $\\rightarrow$ “Restart & Run All”,以运行整个记事本中的所有单元。点击菜单栏的“Help” $\\rightarrow$ “Edit Keyboard Shortcuts”后可以根据自己的偏好编辑快捷键。\n",
51 | "\n",
52 | "\n",
53 | "## 高级选项\n",
54 | "\n",
55 | "下面介绍有关使用Jupyter记事本的一些高级选项。你可以根据自己的兴趣参考其中的内容。\n",
56 | "\n",
57 | "### 用Jupyter记事本读写GitHub源文件\n",
58 | "\n",
59 | "如果想为本书内容做贡献,需要修改在GitHub上markdown格式的源文件(后缀为.md)。通过notedown插件,就可以使用Jupyter记事本修改并运行markdown格式的源代码。Linux/macOS用户可以执行以下命令获得GitHub源文件并激活运行环境:"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "```\n",
67 | "git clone https://github.com/d2l-ai/d2l-zh.git\n",
68 | "cd d2l-zh\n",
69 | "conda env create -f environment.yml\n",
70 | "# 若conda版本低于4.4,运行source activate gluon;Windows用户则运行activate gluon\n",
71 | "conda activate gluon\n",
72 | "```\n"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "下面安装notedown插件,运行Jupyter记事本并加载插件:"
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "```\n",
87 | "pip install https://github.com/mli/notedown/tarball/master\n",
88 | "jupyter notebook --NotebookApp.contents_manager_class='notedown.NotedownContentsManager'\n",
89 | "```\n"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "如果想每次运行Jupyter记事本时默认开启notedown插件,可以参考下面的步骤。\n",
97 | "\n",
98 | "首先,执行下面的命令生成Jupyter记事本配置文件(如果已经生成,可以跳过):"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "```\n",
106 | "jupyter notebook --generate-config\n",
107 | "```\n"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "然后,将下面这一行加入到Jupyter记事本配置文件(一般在用户主目录下的隐藏文件夹`.jupyter`中的`jupyter_notebook_config.py`)的末尾"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "```\n",
122 | "c.NotebookApp.contents_manager_class = 'notedown.NotedownContentsManager'\n",
123 | "```\n"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "之后,只需要运行`jupyter notebook`命令即可默认开启notedown插件。\n",
131 | "\n",
132 | "\n",
133 | "### 在远端服务器上运行Jupyter记事本\n",
134 | "\n",
135 | "有时候,我们希望在远端服务器上运行Jupyter记事本,并通过本地计算机上的浏览器访问。如果本地计算机上安装了Linux或者macOS(Windows通过putty等第三方软件也能支持),那么可以使用端口映射:"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "```\n",
143 | "ssh myserver -L 8888:localhost:8888\n",
144 | "```\n"
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "metadata": {},
150 | "source": [
151 | "以上`myserver`是远端服务器地址。然后我们可以使用 http://localhost:8888 打开运行Jupyter记事本的远端服务器`myserver`。我们将在下一节详细介绍如何在AWS实例上运行Jupyter记事本。\n",
152 | "\n",
153 | "### 运行计时\n",
154 | "\n",
155 | "我们可以通过ExecutionTime插件来对Jupyter记事本的每个代码单元的运行计时。下面是安装该插件的命令:"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {},
161 | "source": [
162 | "```\n",
163 | "pip install jupyter_contrib_nbextensions\n",
164 | "jupyter contrib nbextension install --user\n",
165 | "jupyter nbextension enable execute_time/ExecuteTime\n",
166 | "```\n"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "## 小结\n",
174 | "\n",
175 | "* 可以使用Jupyter记事本编辑和运行本书的代码。\n",
176 | "\n",
177 | "## 练习\n",
178 | "\n",
179 | "* 尝试在本地编辑和运行本书的代码。\n",
180 | "\n",
181 | "\n",
182 | "\n",
183 | "\n",
184 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/6965)\n",
185 | "\n",
186 | ""
187 | ]
188 | }
189 | ],
190 | "metadata": {
191 | "kernelspec": {
192 | "display_name": "Python [conda env:pytorch]",
193 | "language": "python",
194 | "name": "conda-env-pytorch-py"
195 | },
196 | "language_info": {
197 | "codemirror_mode": {
198 | "name": "ipython",
199 | "version": 3
200 | },
201 | "file_extension": ".py",
202 | "mimetype": "text/x-python",
203 | "name": "python",
204 | "nbconvert_exporter": "python",
205 | "pygments_lexer": "ipython3",
206 | "version": "3.6.9"
207 | }
208 | },
209 | "nbformat": 4,
210 | "nbformat_minor": 4
211 | }
212 |
--------------------------------------------------------------------------------
/chapter_appendix/notation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 主要符号一览\n",
8 | "\n",
9 | "下面列举了本书中使用的主要符号。\n",
10 | "\n",
11 | "## 数\n",
12 | "\n",
13 | "* $x$:标量\n",
14 | "* $\\boldsymbol{x}$:向量\n",
15 | "* $\\boldsymbol{X}$:矩阵\n",
16 | "* $\\mathsf{X}$:张量\n",
17 | "\n",
18 | "\n",
19 | "## 集合\n",
20 | "\n",
21 | "* $\\mathcal{X}$:集合\n",
22 | "* $\\mathbb{R}$:实数集合\n",
23 | "* $\\mathbb{R}^n$:$n$维的实数向量集合\n",
24 | "* $\\mathbb{R}^{x\\times y}$:$x$行$y$列的实数矩阵集合\n",
25 | "\n",
26 | "\n",
27 | "## 操作符\n",
28 | "\n",
29 | "* $\\boldsymbol{(\\cdot)}^\\top$:向量或矩阵的转置\n",
30 | "* $\\odot$:按元素相乘,即阿达马(Hadamard)积\n",
31 | "* $\\lvert\\mathcal{X}\\rvert$:集合$\\mathcal{X}$中元素个数\n",
32 | "* $\\|\\cdot\\|_p$:$L_p$范数\n",
33 | "* $\\|\\cdot\\|$:$L_2$范数\n",
34 | "* $\\sum$:连加\n",
35 | "* $\\prod$:连乘\n",
36 | "\n",
37 | "\n",
38 | "## 函数\n",
39 | "\n",
40 | "* $f(\\cdot)$:函数\n",
41 | "* $\\log(\\cdot)$:自然对数函数\n",
42 | "* $\\exp(\\cdot)$:指数函数\n",
43 | "\n",
44 | "\n",
45 | "## 导数和梯度\n",
46 | "\n",
47 | "* $\\frac{dy}{dx}$:$y$关于$x$的导数\n",
48 | "* $\\frac{\\partial y}{\\partial x}$:$y$关于$x$的偏导数\n",
49 | "* $\\nabla_{\\cdot} y$:$y$关于$\\cdot$的梯度\n",
50 | "\n",
51 | "\n",
52 | "## 概率和统计\n",
53 | "\n",
54 | "* $P(\\cdot)$:概率分布\n",
55 | "* $\\cdot\\sim P$:随机变量$\\cdot$的概率分布是$P$\n",
56 | "* $P(\\cdot \\mid\\cdot)$:条件概率分布\n",
57 | "* $E_{\\cdot}\\left(f(\\cdot)\\right)$:函数$f(\\cdot)$对$\\cdot$的数学期望\n",
58 | "\n",
59 | "## 复杂度\n",
60 | "\n",
61 | "* $\\mathcal{O}$:大O符号(渐进符号)"
62 | ]
63 | }
64 | ],
65 | "metadata": {
66 | "kernelspec": {
67 | "display_name": "Python [conda env:pytorch]",
68 | "language": "python",
69 | "name": "conda-env-pytorch-py"
70 | },
71 | "language_info": {
72 | "codemirror_mode": {
73 | "name": "ipython",
74 | "version": 3
75 | },
76 | "file_extension": ".py",
77 | "mimetype": "text/x-python",
78 | "name": "python",
79 | "nbconvert_exporter": "python",
80 | "pygments_lexer": "ipython3",
81 | "version": "3.6.9"
82 | }
83 | },
84 | "nbformat": 4,
85 | "nbformat_minor": 4
86 | }
87 |
--------------------------------------------------------------------------------
/chapter_convolutional-neural-networks/index.md:
--------------------------------------------------------------------------------
1 | # Convolutional Neural Networks
2 |
3 | This chapter introduces convolutional neural networks. They are the cornerstone of the breakthroughs deep learning has achieved in computer vision in recent years, and they are also gradually coming into wide use in other fields such as natural language processing, recommender systems, and speech recognition. We first describe how the convolutional and pooling layers in a convolutional neural network work, and explain the meaning of padding, strides, input channels, and output channels. Having mastered these basics, we then explore the design ideas of several representative deep convolutional neural networks: the earliest proposed AlexNet, and the later networks using repeating elements (VGG), networks in networks (NiN), networks with parallel concatenations (GoogLeNet), residual networks (ResNet), and densely connected networks (DenseNet). Many of them have shone in the ImageNet competition, a well-known computer vision contest, over the past few years. Although deep models may look like nothing more than neural networks with many layers, obtaining an effective deep model is not easy. Fortunately, batch normalization and residual networks, both presented in this chapter, provide two important lines of thought for training and designing deep models.
4 |
5 | ```eval_rst
6 |
7 | .. toctree::
8 | :maxdepth: 2
9 |
10 | conv-layer
11 | padding-and-strides
12 | channels
13 | pooling
14 | lenet
15 | alexnet
16 | vgg
17 | nin
18 | googlenet
19 | batch-norm
20 | resnet
21 | densenet
22 | ```
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/chapter_convolutional-neural-networks/padding-and-strides.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 填充和步幅\n",
8 | "\n",
9 | "在上一节的例子里,我们使用高和宽为3的输入与高和宽为2的卷积核得到高和宽为2的输出。一般来说,假设输入形状是$n_h\\times n_w$,卷积核窗口形状是$k_h\\times k_w$,那么输出形状将会是\n",
10 | "\n",
11 | "$$(n_h-k_h+1) \\times (n_w-k_w+1).$$\n",
12 | "\n",
13 | "所以卷积层的输出形状由输入形状和卷积核窗口形状决定。本节我们将介绍卷积层的两个超参数,即填充和步幅。它们可以对给定形状的输入和卷积核改变输出形状。\n",
14 | "\n",
15 | "## 填充\n",
16 | "\n",
17 | "填充(padding)是指在输入高和宽的两侧填充元素(通常是0元素)。图5.2里我们在原输入高和宽的两侧分别添加了值为0的元素,使得输入高和宽从3变成了5,并导致输出高和宽由2增加到4。图5.2中的阴影部分为第一个输出元素及其计算所使用的输入和核数组元素:$0\\times0+0\\times1+0\\times2+0\\times3=0$。\n",
18 | "\n",
19 | "\n",
20 | "\n",
21 | "一般来说,如果在高的两侧一共填充$p_h$行,在宽的两侧一共填充$p_w$列,那么输出形状将会是\n",
22 | "\n",
23 | "$$(n_h-k_h+p_h+1)\\times(n_w-k_w+p_w+1),$$\n",
24 | "\n",
25 | "也就是说,输出的高和宽会分别增加$p_h$和$p_w$。\n",
26 | "\n",
27 | "在很多情况下,我们会设置$p_h=k_h-1$和$p_w=k_w-1$来使输入和输出具有相同的高和宽。这样会方便在构造网络时推测每个层的输出形状。假设这里$k_h$是奇数,我们会在高的两侧分别填充$p_h/2$行。如果$k_h$是偶数,一种可能是在输入的顶端一侧填充$\\lceil p_h/2\\rceil$行,而在底端一侧填充$\\lfloor p_h/2\\rfloor$行。在宽的两侧填充同理。\n",
28 | "\n",
29 | "卷积神经网络经常使用奇数高宽的卷积核,如1、3、5和7,所以两端上的填充个数相等。对任意的二维数组`X`,设它的第`i`行第`j`列的元素为`X[i,j]`。当两端上的填充个数相等,并使输入和输出具有相同的高和宽时,我们就知道输出`Y[i,j]`是由输入以`X[i,j]`为中心的窗口同卷积核进行互相关计算得到的。\n",
30 | "\n",
31 | "下面的例子里我们创建一个高和宽为3的二维卷积层,然后设输入高和宽两侧的填充数分别为1。给定一个高和宽为8的输入,我们发现输出的高和宽也是8。"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 1,
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "data": {
41 | "text/plain": [
42 | "torch.Size([8, 8])"
43 | ]
44 | },
45 | "execution_count": 1,
46 | "metadata": {},
47 | "output_type": "execute_result"
48 | }
49 | ],
50 | "source": [
51 | "import torch\n",
52 | "from torch import nn\n",
53 | "\n",
54 | "# 定义一个函数计算卷积层。它初始化卷积层权重,并对输入和输出做相应的升维和降维\n",
55 | "def comp_conv2d(conv2d, X):\n",
56 | " # (1, 1)代表批量大小和通道数(“多输入通道和多输出通道”一节将介绍)均为1\n",
57 | " X = X.reshape((1, 1) + X.shape)\n",
58 | " Y = conv2d(X)\n",
59 | " return Y.reshape(Y.shape[2:]) # 排除不关心的前两维:批量和通道\n",
60 | "\n",
61 | "# 注意这里是两侧分别填充1行或列,所以在两侧一共填充2行或列\n",
62 | "conv2d = nn.Conv2d(1, 1, kernel_size=3, padding=1)\n",
63 | "X = torch.rand(8, 8)\n",
64 | "comp_conv2d(conv2d, X).shape"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "当卷积核的高和宽不同时,我们也可以通过设置高和宽上不同的填充数使输出和输入具有相同的高和宽。"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 2,
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "data": {
81 | "text/plain": [
82 | "torch.Size([8, 8])"
83 | ]
84 | },
85 | "execution_count": 2,
86 | "metadata": {},
87 | "output_type": "execute_result"
88 | }
89 | ],
90 | "source": [
91 | "# 使用高为5、宽为3的卷积核。在高和宽两侧的填充数分别为2和1\n",
92 | "conv2d = nn.Conv2d(1, 1, kernel_size=(5, 3), padding=(2, 1))\n",
93 | "comp_conv2d(conv2d, X).shape"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "## 步幅\n",
101 | "\n",
102 | "在上一节里我们介绍了二维互相关运算。卷积窗口从输入数组的最左上方开始,按从左往右、从上往下的顺序,依次在输入数组上滑动。我们将每次滑动的行数和列数称为步幅(stride)。\n",
103 | "\n",
104 | "目前我们看到的例子里,在高和宽两个方向上步幅均为1。我们也可以使用更大步幅。图5.3展示了在高上步幅为3、在宽上步幅为2的二维互相关运算。可以看到,输出第一列第二个元素时,卷积窗口向下滑动了3行,而在输出第一行第二个元素时卷积窗口向右滑动了2列。当卷积窗口在输入上再向右滑动2列时,由于输入元素无法填满窗口,无结果输出。图5.3中的阴影部分为输出元素及其计算所使用的输入和核数组元素:$0\\times0+0\\times1+1\\times2+2\\times3=8$、$0\\times0+6\\times1+0\\times2+0\\times3=6$。\n",
105 | "\n",
106 | "\n",
107 | "\n",
108 | "一般来说,当高上步幅为$s_h$,宽上步幅为$s_w$时,输出形状为\n",
109 | "\n",
110 | "$$\\lfloor(n_h-k_h+p_h+s_h)/s_h\\rfloor \\times \\lfloor(n_w-k_w+p_w+s_w)/s_w\\rfloor.$$\n",
111 | "\n",
112 | "如果设置$p_h=k_h-1$和$p_w=k_w-1$,那么输出形状将简化为$\\lfloor(n_h+s_h-1)/s_h\\rfloor \\times \\lfloor(n_w+s_w-1)/s_w\\rfloor$。更进一步,如果输入的高和宽能分别被高和宽上的步幅整除,那么输出形状将是$(n_h/s_h) \\times (n_w/s_w)$。\n",
113 | "\n",
114 | "下面我们令高和宽上的步幅均为2,从而使输入的高和宽减半。"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 3,
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "data": {
124 | "text/plain": [
125 | "torch.Size([4, 4])"
126 | ]
127 | },
128 | "execution_count": 3,
129 | "metadata": {},
130 | "output_type": "execute_result"
131 | }
132 | ],
133 | "source": [
134 | "conv2d = nn.Conv2d(1, 1, kernel_size=3, padding=1, stride=2)\n",
135 | "comp_conv2d(conv2d, X).shape"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "接下来是一个稍微复杂点儿的例子。"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 4,
148 | "metadata": {},
149 | "outputs": [
150 | {
151 | "data": {
152 | "text/plain": [
153 | "torch.Size([2, 2])"
154 | ]
155 | },
156 | "execution_count": 4,
157 | "metadata": {},
158 | "output_type": "execute_result"
159 | }
160 | ],
161 | "source": [
162 | "conv2d = nn.Conv2d(1, 1, kernel_size=(3, 5), padding=(0, 1), stride=(3, 4))\n",
163 | "comp_conv2d(conv2d, X).shape"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "为了表述简洁,当输入的高和宽两侧的填充数分别为$p_h$和$p_w$时,我们称填充为$(p_h, p_w)$。特别地,当$p_h = p_w = p$时,填充为$p$。当在高和宽上的步幅分别为$s_h$和$s_w$时,我们称步幅为$(s_h, s_w)$。特别地,当$s_h = s_w = s$时,步幅为$s$。在默认情况下,填充为0,步幅为1。\n",
171 | "\n",
172 | "\n",
173 | "\n",
174 | "## 小结\n",
175 | "\n",
176 | "* 填充可以增加输出的高和宽。这常用来使输出与输入具有相同的高和宽。\n",
177 | "* 步幅可以减小输出的高和宽,例如输出的高和宽仅为输入的高和宽的$1/n$($n$为大于1的整数)。\n",
178 | "\n",
179 | "## 练习\n",
180 | "\n",
181 | "* 对本节最后一个例子通过形状计算公式来计算输出形状,看看是否和实验结果一致。\n",
182 | "* 在本节实验中,试一试其他的填充和步幅组合。\n",
183 | "\n",
184 | "\n",
185 | "\n",
186 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/6404)\n",
187 | "\n",
188 | ""
189 | ]
190 | }
191 | ],
192 | "metadata": {
193 | "kernelspec": {
194 | "display_name": "Python [conda env:pytorch]",
195 | "language": "python",
196 | "name": "conda-env-pytorch-py"
197 | },
198 | "language_info": {
199 | "codemirror_mode": {
200 | "name": "ipython",
201 | "version": 3
202 | },
203 | "file_extension": ".py",
204 | "mimetype": "text/x-python",
205 | "name": "python",
206 | "nbconvert_exporter": "python",
207 | "pygments_lexer": "ipython3",
208 | "version": "3.6.9"
209 | },
210 | "toc": {
211 | "base_numbering": 1,
212 | "nav_menu": {},
213 | "number_sections": true,
214 | "sideBar": true,
215 | "skip_h1_title": false,
216 | "title_cell": "Table of Contents",
217 | "title_sidebar": "Contents",
218 | "toc_cell": false,
219 | "toc_position": {},
220 | "toc_section_display": true,
221 | "toc_window_display": true
222 | }
223 | },
224 | "nbformat": 4,
225 | "nbformat_minor": 4
226 | }
227 |
--------------------------------------------------------------------------------
/chapter_deep-learning-basics/backprop.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 正向传播、反向传播和计算图\n",
8 | "\n",
9 | "前面几节里我们使用了小批量随机梯度下降的优化算法来训练模型。在实现中,我们只提供了模型的正向传播(forward propagation)的计算,即对输入计算模型输出,然后通过`autograd`模块来调用系统自动生成的`backward`函数计算梯度。基于反向传播(back-propagation)算法的自动求梯度极大简化了深度学习模型训练算法的实现。本节我们将使用数学和计算图(computational graph)两个方式来描述正向传播和反向传播。具体来说,我们将以带$L_2$范数正则化的含单隐藏层的多层感知机为样例模型解释正向传播和反向传播。\n",
10 | "\n",
11 | "## 正向传播\n",
12 | "\n",
13 | "正向传播是指对神经网络沿着从输入层到输出层的顺序,依次计算并存储模型的中间变量(包括输出)。为简单起见,假设输入是一个特征为$\\boldsymbol{x} \\in \\mathbb{R}^d$的样本,且不考虑偏差项,那么中间变量\n",
14 | "\n",
15 | "$$\\boldsymbol{z} = \\boldsymbol{W}^{(1)} \\boldsymbol{x},$$\n",
16 | "\n",
17 | "其中$\\boldsymbol{W}^{(1)} \\in \\mathbb{R}^{h \\times d}$是隐藏层的权重参数。把中间变量$\\boldsymbol{z} \\in \\mathbb{R}^h$输入按元素运算的激活函数$\\phi$后,将得到向量长度为$h$的隐藏层变量\n",
18 | "\n",
19 | "$$\\boldsymbol{h} = \\phi (\\boldsymbol{z}).$$\n",
20 | "\n",
21 | "隐藏层变量$\\boldsymbol{h}$也是一个中间变量。假设输出层参数只有权重$\\boldsymbol{W}^{(2)} \\in \\mathbb{R}^{q \\times h}$,可以得到向量长度为$q$的输出层变量\n",
22 | "\n",
23 | "$$\\boldsymbol{o} = \\boldsymbol{W}^{(2)} \\boldsymbol{h}.$$\n",
24 | "\n",
25 | "假设损失函数为$\\ell$,且样本标签为$y$,可以计算出单个数据样本的损失项\n",
26 | "\n",
27 | "$$L = \\ell(\\boldsymbol{o}, y).$$\n",
28 | "\n",
29 | "根据$L_2$范数正则化的定义,给定超参数$\\lambda$,正则化项即\n",
30 | "\n",
31 | "$$s = \\frac{\\lambda}{2} \\left(\\|\\boldsymbol{W}^{(1)}\\|_F^2 + \\|\\boldsymbol{W}^{(2)}\\|_F^2\\right),$$\n",
32 | "\n",
33 | "其中矩阵的Frobenius范数等价于将矩阵变平为向量后计算$L_2$范数。最终,模型在给定的数据样本上带正则化的损失为\n",
34 | "\n",
35 | "$$J = L + s.$$\n",
36 | "\n",
37 | "我们将$J$称为有关给定数据样本的目标函数,并在以下的讨论中简称目标函数。\n",
38 | "\n",
39 | "\n",
40 | "## 正向传播的计算图\n",
41 | "\n",
42 | "我们通常绘制计算图来可视化运算符和变量在计算中的依赖关系。图3.6绘制了本节中样例模型正向传播的计算图,其中左下角是输入,右上角是输出。可以看到,图中箭头方向大多是向右和向上,其中方框代表变量,圆圈代表运算符,箭头表示从输入到输出之间的依赖关系。\n",
43 | "\n",
44 | "\n",
45 | "\n",
46 | "\n",
47 | "## 反向传播\n",
48 | "\n",
49 | "反向传播指的是计算神经网络参数梯度的方法。总的来说,反向传播依据微积分中的链式法则,沿着从输出层到输入层的顺序,依次计算并存储目标函数有关神经网络各层的中间变量以及参数的梯度。对输入或输出$\\mathsf{X}, \\mathsf{Y}, \\mathsf{Z}$为任意形状张量的函数$\\mathsf{Y}=f(\\mathsf{X})$和$\\mathsf{Z}=g(\\mathsf{Y})$,通过链式法则,我们有\n",
50 | "\n",
51 | "$$\\frac{\\partial \\mathsf{Z}}{\\partial \\mathsf{X}} = \\text{prod}\\left(\\frac{\\partial \\mathsf{Z}}{\\partial \\mathsf{Y}}, \\frac{\\partial \\mathsf{Y}}{\\partial \\mathsf{X}}\\right),$$\n",
52 | "\n",
53 | "其中$\\text{prod}$运算符将根据两个输入的形状,在必要的操作(如转置和互换输入位置)后对两个输入做乘法。\n",
54 | "\n",
55 | "回顾一下本节中样例模型,它的参数是$\\boldsymbol{W}^{(1)}$和$\\boldsymbol{W}^{(2)}$,因此反向传播的目标是计算$\\partial J/\\partial \\boldsymbol{W}^{(1)}$和$\\partial J/\\partial \\boldsymbol{W}^{(2)}$。我们将应用链式法则依次计算各中间变量和参数的梯度,其计算次序与前向传播中相应中间变量的计算次序恰恰相反。首先,分别计算目标函数$J=L+s$有关损失项$L$和正则项$s$的梯度\n",
56 | "\n",
57 | "$$\\frac{\\partial J}{\\partial L} = 1, \\quad \\frac{\\partial J}{\\partial s} = 1.$$\n",
58 | "\n",
59 | "其次,依据链式法则计算目标函数有关输出层变量的梯度$\\partial J/\\partial \\boldsymbol{o} \\in \\mathbb{R}^q$:\n",
60 | "\n",
61 | "$$\n",
62 | "\\frac{\\partial J}{\\partial \\boldsymbol{o}}\n",
63 | "= \\text{prod}\\left(\\frac{\\partial J}{\\partial L}, \\frac{\\partial L}{\\partial \\boldsymbol{o}}\\right)\n",
64 | "= \\frac{\\partial L}{\\partial \\boldsymbol{o}}.\n",
65 | "$$\n",
66 | "\n",
67 | "\n",
68 | "接下来,计算正则项有关两个参数的梯度:\n",
69 | "\n",
70 | "$$\\frac{\\partial s}{\\partial \\boldsymbol{W}^{(1)}} = \\lambda \\boldsymbol{W}^{(1)},\\quad\\frac{\\partial s}{\\partial \\boldsymbol{W}^{(2)}} = \\lambda \\boldsymbol{W}^{(2)}.$$\n",
71 | "\n",
72 | "\n",
73 | "现在,我们可以计算最靠近输出层的模型参数的梯度$\\partial J/\\partial \\boldsymbol{W}^{(2)} \\in \\mathbb{R}^{q \\times h}$。依据链式法则,得到\n",
74 | "\n",
75 | "$$\n",
76 | "\\frac{\\partial J}{\\partial \\boldsymbol{W}^{(2)}}\n",
77 | "= \\text{prod}\\left(\\frac{\\partial J}{\\partial \\boldsymbol{o}}, \\frac{\\partial \\boldsymbol{o}}{\\partial \\boldsymbol{W}^{(2)}}\\right) + \\text{prod}\\left(\\frac{\\partial J}{\\partial s}, \\frac{\\partial s}{\\partial \\boldsymbol{W}^{(2)}}\\right)\n",
78 | "= \\frac{\\partial J}{\\partial \\boldsymbol{o}} \\boldsymbol{h}^\\top + \\lambda \\boldsymbol{W}^{(2)}.\n",
79 | "$$\n",
80 | "\n",
81 | "\n",
82 | "沿着输出层向隐藏层继续反向传播,隐藏层变量的梯度$\\partial J/\\partial \\boldsymbol{h} \\in \\mathbb{R}^h$可以这样计算:\n",
83 | "\n",
84 | "$$\n",
85 | "\\frac{\\partial J}{\\partial \\boldsymbol{h}}\n",
86 | "= \\text{prod}\\left(\\frac{\\partial J}{\\partial \\boldsymbol{o}}, \\frac{\\partial \\boldsymbol{o}}{\\partial \\boldsymbol{h}}\\right)\n",
87 | "= {\\boldsymbol{W}^{(2)}}^\\top \\frac{\\partial J}{\\partial \\boldsymbol{o}}.\n",
88 | "$$\n",
89 | "\n",
90 | "\n",
91 | "由于激活函数$\\phi$是按元素运算的,中间变量$\\boldsymbol{z}$的梯度$\\partial J/\\partial \\boldsymbol{z} \\in \\mathbb{R}^h$的计算需要使用按元素乘法符$\\odot$:\n",
92 | "\n",
93 | "$$\n",
94 | "\\frac{\\partial J}{\\partial \\boldsymbol{z}}\n",
95 | "= \\text{prod}\\left(\\frac{\\partial J}{\\partial \\boldsymbol{h}}, \\frac{\\partial \\boldsymbol{h}}{\\partial \\boldsymbol{z}}\\right)\n",
96 | "= \\frac{\\partial J}{\\partial \\boldsymbol{h}} \\odot \\phi'\\left(\\boldsymbol{z}\\right).\n",
97 | "$$\n",
98 | "\n",
99 | "最终,我们可以得到最靠近输入层的模型参数的梯度$\\partial J/\\partial \\boldsymbol{W}^{(1)} \\in \\mathbb{R}^{h \\times d}$。依据链式法则,得到\n",
100 | "\n",
101 | "$$\n",
102 | "\\frac{\\partial J}{\\partial \\boldsymbol{W}^{(1)}}\n",
103 | "= \\text{prod}\\left(\\frac{\\partial J}{\\partial \\boldsymbol{z}}, \\frac{\\partial \\boldsymbol{z}}{\\partial \\boldsymbol{W}^{(1)}}\\right) + \\text{prod}\\left(\\frac{\\partial J}{\\partial s}, \\frac{\\partial s}{\\partial \\boldsymbol{W}^{(1)}}\\right)\n",
104 | "= \\frac{\\partial J}{\\partial \\boldsymbol{z}} \\boldsymbol{x}^\\top + \\lambda \\boldsymbol{W}^{(1)}.\n",
105 | "$$\n",
106 | "\n",
107 | "## 训练深度学习模型\n",
108 | "\n",
109 | "在训练深度学习模型时,正向传播和反向传播之间相互依赖。下面我们仍然以本节中的样例模型分别阐述它们之间的依赖关系。\n",
110 | "\n",
111 | "一方面,正向传播的计算可能依赖于模型参数的当前值,而这些模型参数是在反向传播的梯度计算后通过优化算法迭代的。例如,计算正则化项$s = (\\lambda/2) \\left(\\|\\boldsymbol{W}^{(1)}\\|_F^2 + \\|\\boldsymbol{W}^{(2)}\\|_F^2\\right)$依赖模型参数$\\boldsymbol{W}^{(1)}$和$\\boldsymbol{W}^{(2)}$的当前值,而这些当前值是优化算法最近一次根据反向传播算出梯度后迭代得到的。\n",
112 | "\n",
113 | "另一方面,反向传播的梯度计算可能依赖于各变量的当前值,而这些变量的当前值是通过正向传播计算得到的。举例来说,参数梯度$\\partial J/\\partial \\boldsymbol{W}^{(2)} = (\\partial J / \\partial \\boldsymbol{o}) \\boldsymbol{h}^\\top + \\lambda \\boldsymbol{W}^{(2)}$的计算需要依赖隐藏层变量的当前值$\\boldsymbol{h}$。这个当前值是通过从输入层到输出层的正向传播计算并存储得到的。\n",
114 | "\n",
115 | "因此,在模型参数初始化完成后,我们交替地进行正向传播和反向传播,并根据反向传播计算的梯度迭代模型参数。既然我们在反向传播中使用了正向传播中计算得到的中间变量来避免重复计算,那么这个复用也导致正向传播结束后不能立即释放中间变量内存。这也是训练要比预测占用更多内存的一个重要原因。另外需要指出的是,这些中间变量的个数大体上与网络层数线性相关,每个变量的大小跟批量大小和输入个数也是线性相关的,它们是导致较深的神经网络使用较大批量训练时更容易超内存的主要原因。\n",
116 | "\n",
117 | "\n",
118 | "## 小结\n",
119 | "\n",
120 | "* 正向传播沿着从输入层到输出层的顺序,依次计算并存储神经网络的中间变量。\n",
121 | "* 反向传播沿着从输出层到输入层的顺序,依次计算并存储神经网络中间变量和参数的梯度。\n",
122 | "* 在训练深度学习模型时,正向传播和反向传播相互依赖。\n",
123 | "\n",
124 | "\n",
125 | "## 练习\n",
126 | "\n",
127 | "* 在本节样例模型的隐藏层和输出层中添加偏差参数,修改计算图以及正向传播和反向传播的数学表达式。\n",
128 | "\n",
129 | "\n",
130 | "\n",
131 | "\n",
132 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/3710)\n",
133 | "\n",
134 | ""
135 | ]
136 | }
137 | ],
138 | "metadata": {
139 | "kernelspec": {
140 | "display_name": "Python [conda env:pytorch]",
141 | "language": "python",
142 | "name": "conda-env-pytorch-py"
143 | },
144 | "language_info": {
145 | "codemirror_mode": {
146 | "name": "ipython",
147 | "version": 3
148 | },
149 | "file_extension": ".py",
150 | "mimetype": "text/x-python",
151 | "name": "python",
152 | "nbconvert_exporter": "python",
153 | "pygments_lexer": "ipython3",
154 | "version": "3.7.4"
155 | },
156 | "toc": {
157 | "base_numbering": 1,
158 | "nav_menu": {},
159 | "number_sections": true,
160 | "sideBar": true,
161 | "skip_h1_title": false,
162 | "title_cell": "Table of Contents",
163 | "title_sidebar": "Contents",
164 | "toc_cell": false,
165 | "toc_position": {},
166 | "toc_section_display": true,
167 | "toc_window_display": false
168 | }
169 | },
170 | "nbformat": 4,
171 | "nbformat_minor": 4
172 | }
173 |
--------------------------------------------------------------------------------
/chapter_deep-learning-basics/index.md:
--------------------------------------------------------------------------------
1 | # Deep Learning Basics
2 |
3 | Starting with this chapter, we explore the workings of deep learning. As a class of machine learning methods, deep learning typically uses neural network models to represent increasingly abstract concepts or patterns level by level. We start from linear regression and softmax regression, two kinds of single-layer neural networks, to briefly introduce basic machine learning concepts. We then extend from single-layer to multilayer neural networks and introduce deep learning models via the multilayer perceptron. After observing and understanding the phenomenon of model overfitting, we introduce two methods commonly used against it in deep learning: weight decay and dropout. Next, to understand the essence of deep learning model training more deeply, we explain forward propagation and back propagation in detail. Having mastered these two concepts, we can better understand some issues of numerical stability and initialization in deep learning. Finally, we put the chapter's content to use in a deep learning application case.
4 |
5 | In the first few sections of this chapter, we introduce the single-layer neural networks: linear regression and softmax regression.
6 |
7 | ```eval_rst
8 |
9 | .. toctree::
10 | :maxdepth: 2
11 |
12 | linear-regression
13 | linear-regression-scratch
14 | linear-regression-nn
15 | softmax-regression
16 | fashion-mnist
17 | softmax-regression-scratch
18 | softmax-regression-nn
19 | mlp
20 | mlp-scratch
21 | mlp-nn
22 | underfit-overfit
23 | weight-decay
24 | dropout
25 | backprop
26 | numerical-stability-and-init
27 | kaggle-house-price
28 |
29 | ```
30 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/chapter_deep-learning-basics/mlp-nn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 多层感知机的简洁实现\n",
8 | "\n",
9 | "下面我们使用nn来实现上一节中的多层感知机。首先导入所需的包或模块。"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import d2ltorch as d2lt\n",
19 | "import torch\n",
20 | "from torch import nn, optim"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "## 定义模型\n",
28 | "\n",
29 | "和softmax回归唯一的不同在于,我们多加了一个全连接层作为隐藏层。它的隐藏单元个数为256,并使用ReLU函数作为激活函数。"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 2,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "class MyMLP(nn.Module):\n",
39 | " def __init__(self, **kwargs):\n",
40 | " super(MyMLP, self).__init__(**kwargs)\n",
41 | " self.mlp = nn.Sequential(\n",
42 | " nn.Linear(28 * 28, 256),\n",
43 | " nn.ReLU(),\n",
44 | " nn.Linear(256, 10)\n",
45 | " )\n",
46 | " \n",
47 | " def forward(self, x):\n",
48 | " return self.mlp(x.reshape(-1, 28*28))\n",
49 | "\n",
50 | "net = MyMLP()\n",
51 | "d2lt.params_init(net, nn.init.normal_, std=0.01)"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "## 读取数据并训练模型\n",
59 | "\n",
60 | "我们使用与[“softmax回归的简洁实现”](softmax-regression-gluon.ipynb)一节中训练softmax回归几乎相同的步骤来读取数据并训练模型。"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 3,
66 | "metadata": {},
67 | "outputs": [
68 | {
69 | "name": "stdout",
70 | "output_type": "stream",
71 | "text": [
72 | "epoch 1, loss 0.0031, train acc 0.704, test acc 0.795\n",
73 | "epoch 2, loss 0.0019, train acc 0.823, test acc 0.834\n",
74 | "epoch 3, loss 0.0016, train acc 0.844, test acc 0.836\n",
75 | "epoch 4, loss 0.0015, train acc 0.855, test acc 0.744\n",
76 | "epoch 5, loss 0.0014, train acc 0.864, test acc 0.855\n"
77 | ]
78 | }
79 | ],
80 | "source": [
81 | "root = '~/dataset/'\n",
82 | "batch_size = 256\n",
83 | "train_iter, test_iter = d2lt.load_data_fashion_mnist(root, batch_size=batch_size)\n",
84 | "\n",
85 | "loss = nn.CrossEntropyLoss()\n",
86 | "optimizer = optim.SGD(net.parameters(), lr=0.5)\n",
87 | "num_epochs = 5\n",
88 | "d2lt.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, \n",
89 | " None, optimizer)"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "## 小结\n",
97 | "\n",
98 | "* 通过PyTorch可以更简洁地实现多层感知机。\n",
99 | "\n",
100 | "## 练习\n",
101 | "\n",
102 | "* 尝试多加入几个隐藏层,对比上一节中从零开始的实现。\n",
103 | "* 使用其他的激活函数,看看对结果的影响。\n",
104 | "\n",
105 | "\n",
106 | "\n",
107 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/738)\n",
108 | "\n",
109 | ""
110 | ]
111 | }
112 | ],
113 | "metadata": {
114 | "kernelspec": {
115 | "display_name": "Python [conda env:pytorch]",
116 | "language": "python",
117 | "name": "conda-env-pytorch-py"
118 | },
119 | "language_info": {
120 | "codemirror_mode": {
121 | "name": "ipython",
122 | "version": 3
123 | },
124 | "file_extension": ".py",
125 | "mimetype": "text/x-python",
126 | "name": "python",
127 | "nbconvert_exporter": "python",
128 | "pygments_lexer": "ipython3",
129 | "version": "3.6.9"
130 | },
131 | "toc": {
132 | "base_numbering": 1,
133 | "nav_menu": {},
134 | "number_sections": true,
135 | "sideBar": true,
136 | "skip_h1_title": false,
137 | "title_cell": "Table of Contents",
138 | "title_sidebar": "Contents",
139 | "toc_cell": false,
140 | "toc_position": {},
141 | "toc_section_display": true,
142 | "toc_window_display": false
143 | }
144 | },
145 | "nbformat": 4,
146 | "nbformat_minor": 4
147 | }
148 |
--------------------------------------------------------------------------------
/chapter_deep-learning-basics/mlp-scratch.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 多层感知机的从零开始实现\n",
8 | "\n",
9 | "我们已经从上一节里了解了多层感知机的原理。下面,我们一起来动手实现一个多层感知机。首先导入实现所需的包或模块。"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "%matplotlib inline\n",
19 | "import d2ltorch as d2lt\n",
20 | "import torch\n",
21 | "from torch import nn"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "## 获取和读取数据\n",
29 | "\n",
30 | "这里继续使用Fashion-MNIST数据集。我们将使用多层感知机对图像进行分类。"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "root = '~/dataset'\n",
40 | "batch_size = 256\n",
41 | "train_iter, test_iter = d2lt.load_data_fashion_mnist(root, batch_size=batch_size)"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "## 定义模型参数\n",
49 | "\n",
50 | "我们在[“softmax回归的从零开始实现”](softmax-regression-scratch.ipynb)一节里已经介绍了,Fashion-MNIST数据集中图像形状为$28 \\times 28$,类别数为10。本节中我们依然使用长度为$28 \\times 28 = 784$的向量表示每一张图像。因此,输入个数为784,输出个数为10。实验中,我们设超参数隐藏单元个数为256。"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 3,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "num_inputs, num_outputs, num_hiddens = 784, 10, 256\n",
60 | "\n",
61 | "W1 = torch.normal(torch.zeros(num_inputs, num_hiddens), std=0.01)\n",
62 | "b1 = torch.zeros(num_hiddens)\n",
63 | "W2 = torch.normal(torch.zeros(num_hiddens, num_outputs), std=0.01)\n",
64 | "b2 = torch.zeros(num_outputs)\n",
65 | "params = [W1, b1, W2, b2]\n",
66 | "\n",
67 | "for param in params:\n",
68 | " param.requires_grad_()"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "## 定义激活函数\n",
76 | "\n",
77 | "这里我们使用基础的`clamp`函数来实现ReLU,而非直接调用`relu`函数。"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 4,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "def relu(X):\n",
87 | " return torch.clamp(X, min=0)"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "## 定义模型\n",
95 | "\n",
96 | "同softmax回归一样,我们通过`reshape`函数将每张原始图像改成长度为`num_inputs`的向量。然后我们实现上一节中多层感知机的计算表达式。"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 5,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "def net(X):\n",
106 | " X = X.reshape(-1, num_inputs)\n",
107 | " H = relu(torch.mm(X, W1) + b1)\n",
108 | " return torch.mm(H, W2) + b2"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "## 定义损失函数\n",
116 | "\n",
117 | "为了得到更好的数值稳定性,我们直接使用nn提供的包括softmax运算和交叉熵损失计算的函数。"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 6,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "loss = nn.CrossEntropyLoss()"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 | "## 训练模型\n",
134 | "\n",
135 | "训练多层感知机的步骤和[“softmax回归的从零开始实现”](softmax-regression-scratch.ipynb)一节中训练softmax回归的步骤没什么区别。我们直接调用`d2lzh`包中的`train_ch3`函数,它的实现已经在[“softmax回归的从零开始实现”](softmax-regression-scratch.ipynb)一节里介绍过。我们在这里设超参数迭代周期数为5,学习率为0.5。"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 7,
141 | "metadata": {},
142 | "outputs": [
143 | {
144 | "name": "stdout",
145 | "output_type": "stream",
146 | "text": [
147 | "epoch 1, loss 0.0033, train acc 0.688, test acc 0.754\n",
148 | "epoch 2, loss 0.0019, train acc 0.817, test acc 0.804\n",
149 | "epoch 3, loss 0.0017, train acc 0.844, test acc 0.797\n",
150 | "epoch 4, loss 0.0015, train acc 0.855, test acc 0.850\n",
151 | "epoch 5, loss 0.0015, train acc 0.864, test acc 0.854\n"
152 | ]
153 | }
154 | ],
155 | "source": [
156 | "num_epochs, lr = 5, 0.5\n",
157 | "d2lt.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, \n",
158 | " params, lr)"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "## 小结\n",
166 | "\n",
167 | "* 可以通过手动定义模型及其参数来实现简单的多层感知机。\n",
168 | "* 当多层感知机的层数较多时,本节的实现方法会显得较烦琐,例如在定义模型参数的时候。\n",
169 | "\n",
170 | "## 练习\n",
171 | "\n",
172 | "* 改变超参数`num_hiddens`的值,看看对实验结果有什么影响。\n",
173 | "* 试着加入一个新的隐藏层,看看对实验结果有什么影响。\n",
174 | "\n",
175 | "\n",
176 | "\n",
177 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/739)\n",
178 | "\n",
179 | ""
180 | ]
181 | }
182 | ],
183 | "metadata": {
184 | "kernelspec": {
185 | "display_name": "Python [conda env:pytorch]",
186 | "language": "python",
187 | "name": "conda-env-pytorch-py"
188 | },
189 | "language_info": {
190 | "codemirror_mode": {
191 | "name": "ipython",
192 | "version": 3
193 | },
194 | "file_extension": ".py",
195 | "mimetype": "text/x-python",
196 | "name": "python",
197 | "nbconvert_exporter": "python",
198 | "pygments_lexer": "ipython3",
199 | "version": "3.6.9"
200 | },
201 | "toc": {
202 | "base_numbering": 1,
203 | "nav_menu": {},
204 | "number_sections": true,
205 | "sideBar": true,
206 | "skip_h1_title": false,
207 | "title_cell": "Table of Contents",
208 | "title_sidebar": "Contents",
209 | "toc_cell": false,
210 | "toc_position": {},
211 | "toc_section_display": true,
212 | "toc_window_display": false
213 | }
214 | },
215 | "nbformat": 4,
216 | "nbformat_minor": 4
217 | }
218 |
--------------------------------------------------------------------------------
/chapter_deep-learning-basics/numerical-stability-and-init.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 数值稳定性和模型初始化\n",
8 | "\n",
9 | "理解了正向传播与反向传播以后,我们来讨论一下深度学习模型的数值稳定性问题以及模型参数的初始化方法。深度模型有关数值稳定性的典型问题是衰减(vanishing)和爆炸(explosion)。\n",
10 | "\n",
11 | "\n",
12 | "## 衰减和爆炸\n",
13 | "\n",
14 | "当神经网络的层数较多时,模型的数值稳定性容易变差。假设一个层数为$L$的多层感知机的第$l$层$\\boldsymbol{H}^{(l)}$的权重参数为$\\boldsymbol{W}^{(l)}$,输出层$\\boldsymbol{H}^{(L)}$的权重参数为$\\boldsymbol{W}^{(L)}$。为了便于讨论,不考虑偏差参数,且设所有隐藏层的激活函数为恒等映射(identity mapping)$\\phi(x) = x$。给定输入$\\boldsymbol{X}$,多层感知机的第$l$层的输出$\\boldsymbol{H}^{(l)} = \\boldsymbol{X} \\boldsymbol{W}^{(1)} \\boldsymbol{W}^{(2)} \\ldots \\boldsymbol{W}^{(l)}$。此时,如果层数$l$较大,$\\boldsymbol{H}^{(l)}$的计算可能会出现衰减或爆炸。举个例子,假设输入和所有层的权重参数都是标量,如权重参数为0.2和5,多层感知机的第30层输出为输入$\\boldsymbol{X}$分别与$0.2^{30} \\approx 1 \\times 10^{-21}$(衰减)和$5^{30} \\approx 9 \\times 10^{20}$(爆炸)的乘积。类似地,当层数较多时,梯度的计算也更容易出现衰减或爆炸。\n",
15 | "\n",
16 | "随着内容的不断深入,我们会在后面的章节进一步介绍深度学习的数值稳定性问题以及解决方法。\n",
17 | "\n",
18 | "\n",
19 | "## 随机初始化模型参数\n",
20 | "\n",
21 | "在神经网络中,通常需要随机初始化模型参数。下面我们来解释这样做的原因。\n",
22 | "\n",
23 | "回顾[“多层感知机”](mlp.ipynb)一节图3.3描述的多层感知机。为了方便解释,假设输出层只保留一个输出单元$o_1$(删去$o_2$和$o_3$以及指向它们的箭头),且隐藏层使用相同的激活函数。如果将每个隐藏单元的参数都初始化为相等的值,那么在正向传播时每个隐藏单元将根据相同的输入计算出相同的值,并传递至输出层。在反向传播中,每个隐藏单元的参数梯度值相等。因此,这些参数在使用基于梯度的优化算法迭代后值依然相等。之后的迭代也是如此。在这种情况下,无论隐藏单元有多少,隐藏层本质上只有1个隐藏单元在发挥作用。因此,正如在前面的实验中所做的那样,我们通常将神经网络的模型参数,特别是权重参数,进行随机初始化。\n",
24 | "\n",
25 | "\n",
26 | "### PyTorch的默认随机初始化\n",
27 | "\n",
28 | "随机初始化模型参数的方法有很多。在[“线性回归的简洁实现”](linear-regression-nn.ipynb)一节中,我们使用`nn.init.normal_(tensor=model.weight.data, std=0.01)`使模型`net`的权重参数采用正态分布的随机初始化方式。如果不指定初始化方法,PyTorch将使用默认的初始化方法初始化Linear层的权重参数:[init.kaiming_uniform_(self.weight, a=math.sqrt(5))](https://pytorch.org/docs/stable/nn.html#torch.nn.init.kaiming_uniform_)。\n",
29 | "\n",
30 | "**注:在PyTorch中不同的层,不同的参数会采用不同的初始化方法,具体的初始化方法可以查看相应层源码中的 `reset_parameters` 函数**\n",
31 | "\n",
32 | "\n",
33 | "### Xavier随机初始化\n",
34 | "\n",
35 | "还有一种比较常用的随机初始化方法叫作Xavier随机初始化 [1]。\n",
36 | "假设某全连接层的输入个数为$a$,输出个数为$b$,Xavier随机初始化将使该层中权重参数的每个元素都随机采样于均匀分布\n",
37 | "\n",
38 | "$$U\\left(-\\sqrt{\\frac{6}{a+b}}, \\sqrt{\\frac{6}{a+b}}\\right).$$\n",
39 | "\n",
40 | "它的设计主要考虑到,模型参数初始化后,每层输出的方差不该受该层输入个数影响,且每层梯度的方差也不该受该层输出个数影响。\n",
41 | "\n",
42 | "## 小结\n",
43 | "\n",
44 | "* 深度模型有关数值稳定性的典型问题是衰减和爆炸。当神经网络的层数较多时,模型的数值稳定性容易变差。\n",
45 | "* 我们通常需要随机初始化神经网络的模型参数,如权重参数。\n",
46 | "\n",
47 | "\n",
48 | "## 练习\n",
49 | "\n",
50 | "* 有人说随机初始化模型参数是为了“打破对称性”。这里的“对称”应如何理解?\n",
51 | "* 是否可以将线性回归或softmax回归中所有的权重参数都初始化为相同值?\n",
52 | "\n",
53 | "\n",
54 | "\n",
55 | "\n",
56 | "## 参考文献\n",
57 | "\n",
58 | "[1] Glorot, X., & Bengio, Y. (2010, March). Understanding the difficulty of training deep feedforward neural networks. In Proceedings of the thirteenth international conference on artificial intelligence and statistics (pp. 249-256).\n",
59 | "\n",
60 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/8052)\n",
61 | "\n",
62 | ""
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "## 参考\n",
70 | "2. 权重相同,隐藏层的神经元相当于单个神经元,此时神经网络相当于线性分类器。所以线性回归可以设相同值,而softmax是非线性分类器,不能设相同的值。"
71 | ]
72 | }
73 | ],
74 | "metadata": {
75 | "kernelspec": {
76 | "display_name": "Python [conda env:pytorch]",
77 | "language": "python",
78 | "name": "conda-env-pytorch-py"
79 | },
80 | "language_info": {
81 | "codemirror_mode": {
82 | "name": "ipython",
83 | "version": 3
84 | },
85 | "file_extension": ".py",
86 | "mimetype": "text/x-python",
87 | "name": "python",
88 | "nbconvert_exporter": "python",
89 | "pygments_lexer": "ipython3",
90 | "version": "3.6.9"
91 | },
92 | "toc": {
93 | "base_numbering": 1,
94 | "nav_menu": {},
95 | "number_sections": true,
96 | "sideBar": true,
97 | "skip_h1_title": false,
98 | "title_cell": "Table of Contents",
99 | "title_sidebar": "Contents",
100 | "toc_cell": false,
101 | "toc_position": {},
102 | "toc_section_display": true,
103 | "toc_window_display": false
104 | }
105 | },
106 | "nbformat": 4,
107 | "nbformat_minor": 4
108 | }
109 |
--------------------------------------------------------------------------------
/chapter_deep-learning-basics/softmax-regression-nn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# softmax回归的简洁实现\n",
8 | "\n",
9 | "我们在[“线性回归的简洁实现”](linear-regression-gluon.ipynb)一节中已经了解了使用PyTorch实现模型的便利。下面,让我们再次使用PyTorch来实现一个softmax回归模型。首先导入所需的包或模块。"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "%matplotlib inline\n",
19 | "import d2ltorch as d2lt\n",
20 | "import torch\n",
21 | "from torch import nn, optim"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "## 获取和读取数据\n",
29 | "\n",
30 | "我们仍然使用Fashion-MNIST数据集和上一节中设置的批量大小。"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "root = '~/dataset/'\n",
40 | "batch_size = 256\n",
41 | "train_iter, test_iter = d2lt.load_data_fashion_mnist(root, batch_size=batch_size)"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "## 定义和初始化模型\n",
49 | "\n",
50 | "在[“softmax回归”](softmax-regression.ipynb)一节中提到,softmax回归的输出层是一个全连接层。因此,我们添加一个输出个数为10的全连接层。我们使用均值为0、标准差为0.01的正态分布随机初始化模型的权重参数。\n",
51 | "\n",
52 | "*这里我们提前使用 [“4.2. 模型参数的访问、初始化和共享”](../chapter_deep-learning-computation/parameters.ipynb) 一节中定义的初始化方法 `params_init(model, init, **kwargs)`*"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 3,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "class MySoftMax(nn.Module):\n",
62 | " def __init__(self, **kwargs):\n",
63 | " super(MySoftMax, self).__init__(**kwargs)\n",
64 | " self.linear = nn.Linear(28 * 28, 10)\n",
65 | " \n",
66 | " def forward(self, x):\n",
67 | " x = x.reshape(-1, 28 * 28)\n",
68 | " return self.linear(x)\n",
69 | "\n",
70 | "net = MySoftMax()\n",
71 | "d2lt.params_init(model=net, init=nn.init.normal_, std=0.01)"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "## softmax和交叉熵损失函数\n",
79 | "\n",
80 | "如果做了上一节的练习,那么你可能意识到了分开定义softmax运算和交叉熵损失函数可能会造成数值不稳定。因此,PyTorch提供了一个包括softmax运算和交叉熵损失计算的函数。它的数值稳定性更好。"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 4,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "loss = nn.CrossEntropyLoss()"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "## 定义优化算法\n",
97 | "\n",
98 | "我们使用学习率为0.1的小批量随机梯度下降作为优化算法。"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 5,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "optimizer = optim.SGD(net.parameters(), lr=0.1)"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "## 训练模型\n",
115 | "\n",
116 | "接下来,我们使用上一节中定义的训练函数来训练模型。"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 6,
122 | "metadata": {},
123 | "outputs": [
124 | {
125 | "name": "stdout",
126 | "output_type": "stream",
127 | "text": [
128 | "epoch 1, loss 0.0031, train acc 0.751, test acc 0.775\n",
129 | "epoch 2, loss 0.0022, train acc 0.813, test acc 0.795\n",
130 | "epoch 3, loss 0.0021, train acc 0.825, test acc 0.810\n",
131 | "epoch 4, loss 0.0020, train acc 0.832, test acc 0.811\n",
132 | "epoch 5, loss 0.0019, train acc 0.836, test acc 0.823\n"
133 | ]
134 | }
135 | ],
136 | "source": [
137 | "num_epochs = 5\n",
138 | "d2lt.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, optimizer)"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "## 小结\n",
146 | "\n",
147 | "* PyTorch提供的函数往往具有更好的数值稳定性。\n",
148 | "* 可以使用PyTorch更简洁地实现softmax回归。\n",
149 | "\n",
150 | "## 练习\n",
151 | "\n",
152 | "* 尝试调一调超参数,如批量大小、迭代周期和学习率,看看结果会怎样。\n",
153 | "\n",
154 | "\n",
155 | "\n",
156 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/740)\n",
157 | "\n",
158 | ""
159 | ]
160 | }
161 | ],
162 | "metadata": {
163 | "kernelspec": {
164 | "display_name": "Python [conda env:pytorch]",
165 | "language": "python",
166 | "name": "conda-env-pytorch-py"
167 | },
168 | "language_info": {
169 | "codemirror_mode": {
170 | "name": "ipython",
171 | "version": 3
172 | },
173 | "file_extension": ".py",
174 | "mimetype": "text/x-python",
175 | "name": "python",
176 | "nbconvert_exporter": "python",
177 | "pygments_lexer": "ipython3",
178 | "version": "3.6.9"
179 | },
180 | "toc": {
181 | "base_numbering": 1,
182 | "nav_menu": {},
183 | "number_sections": true,
184 | "sideBar": true,
185 | "skip_h1_title": false,
186 | "title_cell": "Table of Contents",
187 | "title_sidebar": "Contents",
188 | "toc_cell": false,
189 | "toc_position": {},
190 | "toc_section_display": true,
191 | "toc_window_display": false
192 | }
193 | },
194 | "nbformat": 4,
195 | "nbformat_minor": 4
196 | }
197 |
--------------------------------------------------------------------------------
/chapter_deep-learning-basics/softmax-regression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# softmax回归\n",
8 | "\n",
9 | "前几节介绍的线性回归模型适用于输出为连续值的情景。在另一类情景中,模型输出可以是一个像图像类别这样的离散值。对于这样的离散值预测问题,我们可以使用诸如softmax回归在内的分类模型。和线性回归不同,softmax回归的输出单元从一个变成了多个,且引入了softmax运算使输出更适合离散值的预测和训练。本节以softmax回归模型为例,介绍神经网络中的分类模型。\n",
10 | "\n",
11 | "\n",
12 | "## 分类问题\n",
13 | "\n",
14 | "让我们考虑一个简单的图像分类问题,其输入图像的高和宽均为2像素,且色彩为灰度。这样每个像素值都可以用一个标量表示。我们将图像中的4像素分别记为$x_1, x_2, x_3, x_4$。假设训练数据集中图像的真实标签为狗、猫或鸡(假设可以用4像素表示出这3种动物),这些标签分别对应离散值$y_1, y_2, y_3$。\n",
15 | "\n",
16 | "我们通常使用离散的数值来表示类别,例如$y_1=1, y_2=2, y_3=3$。如此,一张图像的标签为1、2和3这3个数值中的一个。虽然我们仍然可以使用回归模型来进行建模,并将预测值就近定点化到1、2和3这3个离散值之一,但这种连续值到离散值的转化通常会影响到分类质量。因此我们一般使用更加适合离散值输出的模型来解决分类问题。\n",
17 | "\n",
18 | "## softmax回归模型\n",
19 | "\n",
20 | "softmax回归跟线性回归一样将输入特征与权重做线性叠加。与线性回归的一个主要不同在于,softmax回归的输出值个数等于标签里的类别数。因为一共有4种特征和3种输出动物类别,所以权重包含12个标量(带下标的$w$)、偏差包含3个标量(带下标的$b$),且对每个输入计算$o_1, o_2, o_3$这3个输出:\n",
21 | "\n",
22 | "$$\n",
23 | "\\begin{aligned}\n",
24 | "o_1 &= x_1 w_{11} + x_2 w_{21} + x_3 w_{31} + x_4 w_{41} + b_1,\\\\\n",
25 | "o_2 &= x_1 w_{12} + x_2 w_{22} + x_3 w_{32} + x_4 w_{42} + b_2,\\\\\n",
26 | "o_3 &= x_1 w_{13} + x_2 w_{23} + x_3 w_{33} + x_4 w_{43} + b_3.\n",
27 | "\\end{aligned}\n",
28 | "$$\n",
29 | "\n",
30 | "\n",
31 | "图3.2用神经网络图描绘了上面的计算。softmax回归同线性回归一样,也是一个单层神经网络。由于每个输出$o_1, o_2, o_3$的计算都要依赖于所有的输入$x_1, x_2, x_3, x_4$,softmax回归的输出层也是一个全连接层。\n",
32 | "\n",
33 | "\n",
34 | "\n",
35 | "### softmax运算\n",
36 | "\n",
37 | "既然分类问题需要得到离散的预测输出,一个简单的办法是将输出值$o_i$当作预测类别是$i$的置信度,并将值最大的输出所对应的类作为预测输出,即输出$\\operatorname*{argmax}_i o_i$。例如,如果$o_1,o_2,o_3$分别为$0.1,10,0.1$,由于$o_2$最大,那么预测类别为2,其代表猫。\n",
38 | "\n",
39 | "然而,直接使用输出层的输出有两个问题。一方面,由于输出层的输出值的范围不确定,我们难以直观上判断这些值的意义。例如,刚才举的例子中的输出值10表示“很置信”图像类别为猫,因为该输出值是其他两类的输出值的100倍。但如果$o_1=o_3=10^3$,那么输出值10却又表示图像类别为猫的概率很低。另一方面,由于真实标签是离散值,这些离散值与不确定范围的输出值之间的误差难以衡量。\n",
40 | "\n",
41 | "softmax运算符(softmax operator)解决了以上两个问题。它通过下式将输出值变换成值为正且和为1的概率分布:\n",
42 | "\n",
43 | "$$\\hat{y}_1, \\hat{y}_2, \\hat{y}_3 = \\text{softmax}(o_1, o_2, o_3),$$\n",
44 | "\n",
45 | "其中\n",
46 | "\n",
47 | "$$\n",
48 | "\\hat{y}_1 = \\frac{ \\exp(o_1)}{\\sum_{i=1}^3 \\exp(o_i)},\\quad\n",
49 | "\\hat{y}_2 = \\frac{ \\exp(o_2)}{\\sum_{i=1}^3 \\exp(o_i)},\\quad\n",
50 | "\\hat{y}_3 = \\frac{ \\exp(o_3)}{\\sum_{i=1}^3 \\exp(o_i)}.\n",
51 | "$$\n",
52 | "\n",
53 | "容易看出$\\hat{y}_1 + \\hat{y}_2 + \\hat{y}_3 = 1$且$0 \\leq \\hat{y}_1, \\hat{y}_2, \\hat{y}_3 \\leq 1$,因此$\\hat{y}_1, \\hat{y}_2, \\hat{y}_3$是一个合法的概率分布。这时候,如果$\\hat{y}_2=0.8$,不管$\\hat{y}_1$和$\\hat{y}_3$的值是多少,我们都知道图像类别为猫的概率是80%。此外,我们注意到\n",
54 | "\n",
55 | "$$\\operatorname*{argmax}_i o_i = \\operatorname*{argmax}_i \\hat y_i,$$\n",
56 | "\n",
57 | "因此softmax运算不改变预测类别输出。\n",
58 | "\n",
59 | "## 单样本分类的矢量计算表达式\n",
60 | "\n",
61 | "为了提高计算效率,我们可以将单样本分类通过矢量计算来表达。在上面的图像分类问题中,假设softmax回归的权重和偏差参数分别为\n",
62 | "\n",
63 | "$$\n",
64 | "\\boldsymbol{W} = \n",
65 | "\\begin{bmatrix}\n",
66 | " w_{11} & w_{12} & w_{13} \\\\\n",
67 | " w_{21} & w_{22} & w_{23} \\\\\n",
68 | " w_{31} & w_{32} & w_{33} \\\\\n",
69 | " w_{41} & w_{42} & w_{43}\n",
70 | "\\end{bmatrix},\\quad\n",
71 | "\\boldsymbol{b} = \n",
72 | "\\begin{bmatrix}\n",
73 | " b_1 & b_2 & b_3\n",
74 | "\\end{bmatrix},\n",
75 | "$$\n",
76 | "\n",
77 | "\n",
78 | "\n",
79 | "设高和宽分别为2个像素的图像样本$i$的特征为\n",
80 | "\n",
81 | "$$\\boldsymbol{x}^{(i)} = \\begin{bmatrix}x_1^{(i)} & x_2^{(i)} & x_3^{(i)} & x_4^{(i)}\\end{bmatrix},$$\n",
82 | "\n",
83 | "输出层的输出为\n",
84 | "\n",
85 | "$$\\boldsymbol{o}^{(i)} = \\begin{bmatrix}o_1^{(i)} & o_2^{(i)} & o_3^{(i)}\\end{bmatrix},$$\n",
86 | "\n",
87 | "预测为狗、猫或鸡的概率分布为\n",
88 | "\n",
89 | "$$\\boldsymbol{\\hat{y}}^{(i)} = \\begin{bmatrix}\\hat{y}_1^{(i)} & \\hat{y}_2^{(i)} & \\hat{y}_3^{(i)}\\end{bmatrix}.$$\n",
90 | "\n",
91 | "\n",
92 | "softmax回归对样本$i$分类的矢量计算表达式为\n",
93 | "\n",
94 | "$$\n",
95 | "\\begin{aligned}\n",
96 | "\\boldsymbol{o}^{(i)} &= \\boldsymbol{x}^{(i)} \\boldsymbol{W} + \\boldsymbol{b},\\\\\n",
97 | "\\boldsymbol{\\hat{y}}^{(i)} &= \\text{softmax}(\\boldsymbol{o}^{(i)}).\n",
98 | "\\end{aligned}\n",
99 | "$$\n",
100 | "\n",
101 | "## 小批量样本分类的矢量计算表达式\n",
102 | "\n",
103 | "\n",
104 | "为了进一步提升计算效率,我们通常对小批量数据做矢量计算。广义上讲,给定一个小批量样本,其批量大小为$n$,输入个数(特征数)为$d$,输出个数(类别数)为$q$。设批量特征为$\\boldsymbol{X} \\in \\mathbb{R}^{n \\times d}$。假设softmax回归的权重和偏差参数分别为$\\boldsymbol{W} \\in \\mathbb{R}^{d \\times q}$和$\\boldsymbol{b} \\in \\mathbb{R}^{1 \\times q}$。softmax回归的矢量计算表达式为\n",
105 | "\n",
106 | "$$\n",
107 | "\\begin{aligned}\n",
108 | "\\boldsymbol{O} &= \\boldsymbol{X} \\boldsymbol{W} + \\boldsymbol{b},\\\\\n",
109 | "\\boldsymbol{\\hat{Y}} &= \\text{softmax}(\\boldsymbol{O}),\n",
110 | "\\end{aligned}\n",
111 | "$$\n",
112 | "\n",
113 | "其中的加法运算使用了广播机制,$\\boldsymbol{O}, \\boldsymbol{\\hat{Y}} \\in \\mathbb{R}^{n \\times q}$且这两个矩阵的第$i$行分别为样本$i$的输出$\\boldsymbol{o}^{(i)}$和概率分布$\\boldsymbol{\\hat{y}}^{(i)}$。\n",
114 | "\n",
115 | "\n",
116 | "## 交叉熵损失函数\n",
117 | "\n",
118 | "前面提到,使用softmax运算后可以更方便地与离散标签计算误差。我们已经知道,softmax运算将输出变换成一个合法的类别预测分布。实际上,真实标签也可以用类别分布表达:对于样本$i$,我们构造向量$\\boldsymbol{y}^{(i)}\\in \\mathbb{R}^{q}$ ,使其第$y^{(i)}$(样本$i$类别的离散数值)个元素为1,其余为0。这样我们的训练目标可以设为使预测概率分布$\\boldsymbol{\\hat y}^{(i)}$尽可能接近真实的标签概率分布$\\boldsymbol{y}^{(i)}$。\n",
119 | "\n",
120 | "我们可以像线性回归那样使用平方损失函数$\\|\\boldsymbol{\\hat y}^{(i)}-\\boldsymbol{y}^{(i)}\\|^2/2$。然而,想要预测分类结果正确,我们其实并不需要预测概率完全等于标签概率。例如,在图像分类的例子里,如果$y^{(i)}=3$,那么我们只需要$\\hat{y}^{(i)}_3$比其他两个预测值$\\hat{y}^{(i)}_1$和$\\hat{y}^{(i)}_2$大就行了。即使$\\hat{y}^{(i)}_3$值为0.6,不管其他两个预测值为多少,类别预测均正确。而平方损失则过于严格,例如$\\hat y^{(i)}_1=\\hat y^{(i)}_2=0.2$比$\\hat y^{(i)}_1=0, \\hat y^{(i)}_2=0.4$的损失要小很多,虽然两者都有同样正确的分类预测结果。\n",
121 | "\n",
122 | "改善上述问题的一个方法是使用更适合衡量两个概率分布差异的测量函数。其中,交叉熵(cross entropy)是一个常用的衡量方法:\n",
123 | "\n",
124 | "$$H\\left(\\boldsymbol y^{(i)}, \\boldsymbol {\\hat y}^{(i)}\\right ) = -\\sum_{j=1}^q y_j^{(i)} \\log \\hat y_j^{(i)},$$\n",
125 | "\n",
126 | "其中带下标的$y_j^{(i)}$是向量$\\boldsymbol y^{(i)}$中非0即1的元素,需要注意将它与样本$i$类别的离散数值,即不带下标的$y^{(i)}$区分。在上式中,我们知道向量$\\boldsymbol y^{(i)}$中只有第$y^{(i)}$个元素$y^{(i)}_{y^{(i)}}$为1,其余全为0,于是$H(\\boldsymbol y^{(i)}, \\boldsymbol {\\hat y}^{(i)}) = -\\log \\hat y_{y^{(i)}}^{(i)}$。也就是说,交叉熵只关心对正确类别的预测概率,因为只要其值足够大,就可以确保分类结果正确。当然,遇到一个样本有多个标签时,例如图像里含有不止一个物体时,我们并不能做这一步简化。但即便对于这种情况,交叉熵同样只关心对图像中出现的物体类别的预测概率。\n",
127 | "\n",
128 | "\n",
129 | "假设训练数据集的样本数为$n$,交叉熵损失函数定义为\n",
130 | "$$\\ell(\\boldsymbol{\\Theta}) = \\frac{1}{n} \\sum_{i=1}^n H\\left(\\boldsymbol y^{(i)}, \\boldsymbol {\\hat y}^{(i)}\\right ),$$\n",
131 | "\n",
132 | "其中$\\boldsymbol{\\Theta}$代表模型参数。同样地,如果每个样本只有一个标签,那么交叉熵损失可以简写成$\\ell(\\boldsymbol{\\Theta}) = -(1/n) \\sum_{i=1}^n \\log \\hat y_{y^{(i)}}^{(i)}$。从另一个角度来看,我们知道最小化$\\ell(\\boldsymbol{\\Theta})$等价于最大化$\\exp(-n\\ell(\\boldsymbol{\\Theta}))=\\prod_{i=1}^n \\hat y_{y^{(i)}}^{(i)}$,即最小化交叉熵损失函数等价于最大化训练数据集所有标签类别的联合预测概率。\n",
133 | "\n",
134 | "\n",
135 | "## 模型预测及评价\n",
136 | "\n",
137 | "在训练好softmax回归模型后,给定任一样本特征,就可以预测每个输出类别的概率。通常,我们把预测概率最大的类别作为输出类别。如果它与真实类别(标签)一致,说明这次预测是正确的。在之后[“softmax回归的从零开始实现”](softmax-regression-scratch.ipynb)一节的实验中,我们将使用准确率(accuracy)来评价模型的表现。它等于正确预测数量与总预测数量之比。\n",
138 | "\n",
139 | "## 小结\n",
140 | "\n",
141 | "* softmax回归适用于分类问题。它使用softmax运算输出类别的概率分布。\n",
142 | "* softmax回归是一个单层神经网络,输出个数等于分类问题中的类别个数。\n",
143 | "* 交叉熵适合衡量两个概率分布的差异。\n",
144 | "\n",
145 | "\n",
146 | "## 练习\n",
147 | "\n",
148 | "* 查阅资料,了解最大似然估计。它与最小化交叉熵损失函数有哪些异曲同工之妙?\n",
149 | "\n",
150 | "\n",
151 | "\n",
152 | "\n",
153 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/6403)\n",
154 | "\n",
155 | ""
156 | ]
157 | }
158 | ],
159 | "metadata": {
160 | "kernelspec": {
161 | "display_name": "Python [conda env:pytorch]",
162 | "language": "python",
163 | "name": "conda-env-pytorch-py"
164 | },
165 | "language_info": {
166 | "codemirror_mode": {
167 | "name": "ipython",
168 | "version": 3
169 | },
170 | "file_extension": ".py",
171 | "mimetype": "text/x-python",
172 | "name": "python",
173 | "nbconvert_exporter": "python",
174 | "pygments_lexer": "ipython3",
175 | "version": "3.7.4"
176 | },
177 | "toc": {
178 | "base_numbering": 1,
179 | "nav_menu": {},
180 | "number_sections": true,
181 | "sideBar": true,
182 | "skip_h1_title": false,
183 | "title_cell": "Table of Contents",
184 | "title_sidebar": "Contents",
185 | "toc_cell": false,
186 | "toc_position": {},
187 | "toc_section_display": true,
188 | "toc_window_display": false
189 | }
190 | },
191 | "nbformat": 4,
192 | "nbformat_minor": 4
193 | }
194 |
--------------------------------------------------------------------------------
/chapter_deep-learning-computation/custom-layer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 自定义层\n",
8 | "\n",
9 | "深度学习的一个魅力在于神经网络中各式各样的层,例如全连接层和后面章节中将要介绍的卷积层、池化层与循环层。虽然`torch.nn`提供了大量常用的层,但有时候我们依然希望自定义层。本节将介绍如何使用`torch`来自定义一个网络层,从而可以被重复调用。\n",
10 | "\n",
11 | "\n",
12 | "## 不含模型参数的自定义层\n",
13 | "\n",
14 | "我们先介绍如何定义一个不含模型参数的自定义层。事实上,这和[“模型构造”](model-construction.ipynb)一节中介绍的使用`Module`类构造模型类似。下面的`CenteredLayer`类通过继承`Module`类自定义了一个将输入减掉均值后输出的层,并将层的计算定义在了`forward`函数里。这个层里不含模型参数。"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import torch\n",
24 | "from torch import nn\n",
25 | "\n",
26 | "class CenteredLayer(nn.Module):\n",
27 | " def __init__(self, **kwargs):\n",
28 | " super(CenteredLayer, self).__init__(**kwargs)\n",
29 | " \n",
30 | " def forward(self, x):\n",
31 | " return x - x.mean()"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {},
37 | "source": [
38 | "我们可以实例化这个层,然后做前向计算。"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 2,
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "tensor([-2., -1., 0., 1., 2.])"
50 | ]
51 | },
52 | "execution_count": 2,
53 | "metadata": {},
54 | "output_type": "execute_result"
55 | }
56 | ],
57 | "source": [
58 | "layer = CenteredLayer()\n",
59 | "layer(torch.Tensor([1, 2, 3, 4, 5]))"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "我们也可以用它来构造更复杂的模型。"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 3,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "net = nn.Sequential(\n",
76 | " nn.Linear(8, 128),\n",
77 | " CenteredLayer()\n",
78 | ")"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "下面打印自定义层各个输出的均值。因为均值是浮点数,所以它的值是一个很接近0的数。"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 4,
91 | "metadata": {},
92 | "outputs": [
93 | {
94 | "data": {
95 | "text/plain": [
96 | "7.450580596923828e-09"
97 | ]
98 | },
99 | "execution_count": 4,
100 | "metadata": {},
101 | "output_type": "execute_result"
102 | }
103 | ],
104 | "source": [
105 | "y = net(torch.rand(4, 8))\n",
106 | "y.mean().item()"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "## 含模型参数的自定义层\n",
114 | "\n",
115 | "我们还可以自定义含模型参数的自定义层。其中的模型参数可以通过训练学出。\n",
116 | "\n",
117 | "对于模型的可学习参数,需要使用`slef.param_name = nn.Parameter(初始化形式)`进行创建。当Paramenter赋值给Module的属性的时候,他会自动的被加到 Module的 参数列表中(即:会出现在 parameters() 迭代器中),并且参数名为`param_name`。"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "现在我们尝试实现一个含权重参数和偏差参数的全连接层。它使用ReLU函数作为激活函数。其中`in_units`和`units`分别代表输入个数和输出个数。"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 5,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "class MyLinear(nn.Module):\n",
134 | " # in_features为该层的输入特征数, out_features为该层的输出特征数\n",
135 | " def __init__(self, in_features, out_features, **kwargs):\n",
136 | " super(MyLinear, self).__init__(**kwargs)\n",
137 | " self.weight = nn.Parameter(torch.rand(in_features, out_features))\n",
138 | " self.bias = nn.Parameter(torch.rand(out_features))\n",
139 | " \n",
140 | " def forward(self, x):\n",
141 | " linear = x.mm(self.weight) + self.bias\n",
142 | " return torch.relu(linear)"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "下面,我们实例化`MyDense`类并访问它的模型参数。"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 6,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "name": "stdout",
159 | "output_type": "stream",
160 | "text": [
161 | "weight torch.Size([5, 3]) torch.float32\n",
162 | "bias torch.Size([3]) torch.float32\n"
163 | ]
164 | }
165 | ],
166 | "source": [
167 | "linear = MyLinear(in_features=5, out_features=3)\n",
168 | "for name, param in linear.named_parameters():\n",
169 | " print(name, param.shape, param.dtype)"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "我们可以直接使用自定义层做前向计算。"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 7,
182 | "metadata": {},
183 | "outputs": [
184 | {
185 | "data": {
186 | "text/plain": [
187 | "tensor([[0.5579, 1.2363, 1.2546],\n",
188 | " [0.7231, 1.3465, 1.1097]], grad_fn=)"
189 | ]
190 | },
191 | "execution_count": 7,
192 | "metadata": {},
193 | "output_type": "execute_result"
194 | }
195 | ],
196 | "source": [
197 | "linear(torch.rand(2, 5))"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "我们也可以使用自定义层构造模型。它和PyTorch的其他层在使用上很类似。"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 8,
210 | "metadata": {},
211 | "outputs": [
212 | {
213 | "data": {
214 | "text/plain": [
215 | "tensor([[39.8864],\n",
216 | " [39.3420]], grad_fn=)"
217 | ]
218 | },
219 | "execution_count": 8,
220 | "metadata": {},
221 | "output_type": "execute_result"
222 | }
223 | ],
224 | "source": [
225 | "net = nn.Sequential(\n",
226 | " MyLinear(64, 8),\n",
227 | " MyLinear(8, 1)\n",
228 | ")\n",
229 | "\n",
230 | "net(torch.rand(2, 64))"
231 | ]
232 | },
233 | {
234 | "cell_type": "markdown",
235 | "metadata": {},
236 | "source": [
237 | "## 小结\n",
238 | "\n",
239 | "* 可以通过`Module`类自定义神经网络中的层,从而可以被重复调用。\n",
240 | "\n",
241 | "\n",
242 | "## 练习\n",
243 | "\n",
244 | "* 自定义一个层,使用它做一次前向计算。\n",
245 | "\n",
246 | "\n",
247 | "\n",
248 | "\n",
249 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/1256)\n",
250 | "\n",
251 | ""
252 | ]
253 | }
254 | ],
255 | "metadata": {
256 | "kernelspec": {
257 | "display_name": "Python [conda env:pytorch]",
258 | "language": "python",
259 | "name": "conda-env-pytorch-py"
260 | },
261 | "language_info": {
262 | "codemirror_mode": {
263 | "name": "ipython",
264 | "version": 3
265 | },
266 | "file_extension": ".py",
267 | "mimetype": "text/x-python",
268 | "name": "python",
269 | "nbconvert_exporter": "python",
270 | "pygments_lexer": "ipython3",
271 | "version": "3.6.9"
272 | },
273 | "toc": {
274 | "base_numbering": 1,
275 | "nav_menu": {},
276 | "number_sections": true,
277 | "sideBar": true,
278 | "skip_h1_title": false,
279 | "title_cell": "Table of Contents",
280 | "title_sidebar": "Contents",
281 | "toc_cell": false,
282 | "toc_position": {},
283 | "toc_section_display": true,
284 | "toc_window_display": false
285 | }
286 | },
287 | "nbformat": 4,
288 | "nbformat_minor": 4
289 | }
290 |
--------------------------------------------------------------------------------
/chapter_deep-learning-computation/deferred-init.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "**因为PyTorch的网络层都需要指定输入特征数及输出特征数,所以PyTorch不存在延迟初始化问题。故本章内容对PyTorch无效**"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# 模型参数的延后初始化\n",
15 | "\n",
16 | "如果做了上一节练习,你会发现模型`net`在调用初始化函数`initialize`之后、在做前向计算`net(X)`之前时,权重参数的形状中出现了0。虽然直觉上`initialize`完成了所有参数初始化过程,然而这在Gluon中却是不一定的。我们在本节中详细讨论这个话题。\n",
17 | "\n",
18 | "\n",
19 | "## 延后初始化\n",
20 | "\n",
21 | "也许读者早就注意到了,在之前使用Gluon创建的全连接层都没有指定输入个数。例如,在上一节使用的多层感知机`net`里,我们创建的隐藏层仅仅指定了输出大小为256。当调用`initialize`函数时,由于隐藏层输入个数依然未知,系统也无法得知该层权重参数的形状。只有在当我们将形状是(2, 20)的输入`X`传进网络做前向计算`net(X)`时,系统才推断出该层的权重参数形状为(256, 20)。因此,这时候我们才能真正开始初始化参数。\n",
22 | "\n",
23 | "让我们使用上一节中定义的`MyInit`类来演示这一过程。我们创建多层感知机,并使用`MyInit`实例来初始化模型参数。"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 1,
29 | "metadata": {
30 | "attributes": {
31 | "classes": [],
32 | "id": "",
33 | "n": "22"
34 | }
35 | },
36 | "outputs": [],
37 | "source": [
38 | "from mxnet import init, nd\n",
39 | "from mxnet.gluon import nn\n",
40 | "\n",
41 | "class MyInit(init.Initializer):\n",
42 | " def _init_weight(self, name, data):\n",
43 | " print('Init', name, data.shape)\n",
44 | " # 实际的初始化逻辑在此省略了\n",
45 | "\n",
46 | "net = nn.Sequential()\n",
47 | "net.add(nn.Dense(256, activation='relu'),\n",
48 | " nn.Dense(10))\n",
49 | "\n",
50 | "net.initialize(init=MyInit())"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "注意,虽然`MyInit`被调用时会打印模型参数的相关信息,但上面的`initialize`函数执行完并未打印任何信息。由此可见,调用`initialize`函数时并没有真正初始化参数。下面我们定义输入并执行一次前向计算。"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 2,
63 | "metadata": {
64 | "attributes": {
65 | "classes": [],
66 | "id": "",
67 | "n": "25"
68 | }
69 | },
70 | "outputs": [
71 | {
72 | "name": "stdout",
73 | "output_type": "stream",
74 | "text": [
75 | "Init dense0_weight (256, 20)\n",
76 | "Init dense1_weight (10, 256)\n"
77 | ]
78 | }
79 | ],
80 | "source": [
81 | "X = nd.random.uniform(shape=(2, 20))\n",
82 | "Y = net(X)"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "这时候,有关模型参数的信息被打印出来。在根据输入`X`做前向计算时,系统能够根据输入的形状自动推断出所有层的权重参数的形状。系统在创建这些参数之后,调用`MyInit`实例对它们进行初始化,然后才进行前向计算。\n",
90 | "\n",
91 | "当然,这个初始化只会在第一次前向计算时被调用。之后我们再运行前向计算`net(X)`时则不会重新初始化,因此不会再次产生`MyInit`实例的输出。"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 3,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "Y = net(X)"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "系统将真正的参数初始化延后到获得足够信息时才执行的行为叫作延后初始化(deferred initialization)。它可以让模型的创建更加简单:只需要定义每个层的输出大小,而不用人工推测它们的输入个数。这对于之后将介绍的定义多达数十甚至数百层的网络来说尤其方便。\n",
108 | "\n",
109 | "然而,任何事物都有两面性。正如本节开头提到的那样,延后初始化也可能会带来一定的困惑。在第一次前向计算之前,我们无法直接操作模型参数,例如无法使用`data`函数和`set_data`函数来获取和修改参数。因此,我们经常会额外做一次前向计算来迫使参数被真正地初始化。\n",
110 | "\n",
111 | "## 避免延后初始化\n",
112 | "\n",
113 | "如果系统在调用`initialize`函数时能够知道所有参数的形状,那么延后初始化就不会发生。我们在这里分别介绍两种这样的情况。\n",
114 | "\n",
115 | "第一种情况是我们要对已初始化的模型重新初始化时。因为参数形状不会发生变化,所以系统能够立即进行重新初始化。"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 4,
121 | "metadata": {},
122 | "outputs": [
123 | {
124 | "name": "stdout",
125 | "output_type": "stream",
126 | "text": [
127 | "Init dense0_weight (256, 20)\n",
128 | "Init dense1_weight (10, 256)\n"
129 | ]
130 | }
131 | ],
132 | "source": [
133 | "net.initialize(init=MyInit(), force_reinit=True)"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "第二种情况是我们在创建层的时候指定了它的输入个数,使系统不需要额外的信息来推测参数形状。下例中我们通过`in_units`来指定每个全连接层的输入个数,使初始化能够在`initialize`函数被调用时立即发生。"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 5,
146 | "metadata": {},
147 | "outputs": [
148 | {
149 | "name": "stdout",
150 | "output_type": "stream",
151 | "text": [
152 | "Init dense2_weight (256, 20)\n",
153 | "Init dense3_weight (10, 256)\n"
154 | ]
155 | }
156 | ],
157 | "source": [
158 | "net = nn.Sequential()\n",
159 | "net.add(nn.Dense(256, in_units=20, activation='relu'))\n",
160 | "net.add(nn.Dense(10, in_units=256))\n",
161 | "\n",
162 | "net.initialize(init=MyInit())"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "## 小结\n",
170 | "\n",
171 | "* 系统将真正的参数初始化延后到获得足够信息时才执行的行为叫作延后初始化。\n",
172 | "* 延后初始化的主要好处是让模型构造更加简单。例如,我们无须人工推测每个层的输入个数。\n",
173 | "* 也可以避免延后初始化。\n",
174 | "\n",
175 | "\n",
176 | "## 练习\n",
177 | "\n",
178 | "* 如果在下一次前向计算`net(X)`前改变输入`X`的形状,包括批量大小和输入个数,会发生什么?\n",
179 | "\n",
180 | "\n",
181 | "\n",
182 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/6320)\n",
183 | "\n",
184 | ""
185 | ]
186 | }
187 | ],
188 | "metadata": {
189 | "kernelspec": {
190 | "display_name": "Python 3",
191 | "language": "python",
192 | "name": "python3"
193 | },
194 | "language_info": {
195 | "codemirror_mode": {
196 | "name": "ipython",
197 | "version": 3
198 | },
199 | "file_extension": ".py",
200 | "mimetype": "text/x-python",
201 | "name": "python",
202 | "nbconvert_exporter": "python",
203 | "pygments_lexer": "ipython3",
204 | "version": "3.7.3"
205 | },
206 | "toc": {
207 | "base_numbering": 1,
208 | "nav_menu": {},
209 | "number_sections": true,
210 | "sideBar": true,
211 | "skip_h1_title": false,
212 | "title_cell": "Table of Contents",
213 | "title_sidebar": "Contents",
214 | "toc_cell": false,
215 | "toc_position": {},
216 | "toc_section_display": true,
217 | "toc_window_display": false
218 | }
219 | },
220 | "nbformat": 4,
221 | "nbformat_minor": 2
222 | }
223 |
--------------------------------------------------------------------------------
/chapter_deep-learning-computation/index.md:
--------------------------------------------------------------------------------
1 | # 深度学习计算
2 |
3 | 上一章介绍了包括多层感知机在内的简单深度学习模型的原理和实现。本章我们将简要概括深度学习计算的各个重要组成部分:模型构造,参数的访问和初始化,自定义层,以及模型的读取、存储和GPU的使用。通过本章的学习,我们将能够深入了解模型实现和计算的各个细节,并为在之后章节实现更复杂模型打下坚实的基础。
4 |
5 | ```eval_rst
6 |
7 | .. toctree::
8 | :maxdepth: 2
9 |
10 | model-construction
11 | parameters
12 | deferred-init
13 | custom-layer
14 | read-write
15 | use-gpu
16 |
17 | ```
18 |
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/chapter_deep-learning-computation/read-write.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 读取和存储\n",
8 | "\n",
9 | "到目前为止,我们介绍了如何处理数据以及如何构建、训练和测试深度学习模型。然而在实际中,我们有时需要把训练好的模型部署到很多不同的设备。在这种情况下,我们可以把内存中训练好的模型参数存储在硬盘上供后续读取使用。\n",
10 | "\n",
11 | "\n",
12 | "## 读写`Tensor`\n",
13 | "\n",
14 | "我们可以直接使用`save`函数和`load`函数分别存储和读取`Tensor`。下面的例子创建了`Tensor`变量`x`,并将其存在文件名同为`x`的文件里。"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import torch\n",
24 | "from torch import nn\n",
25 | "\n",
26 | "x = torch.ones(3)\n",
27 | "torch.save(x, 'x')"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "然后我们将数据从存储的文件读回内存。"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 2,
40 | "metadata": {},
41 | "outputs": [
42 | {
43 | "data": {
44 | "text/plain": [
45 | "tensor([1., 1., 1.])"
46 | ]
47 | },
48 | "execution_count": 2,
49 | "metadata": {},
50 | "output_type": "execute_result"
51 | }
52 | ],
53 | "source": [
54 | "x2 = torch.load('x')\n",
55 | "x2"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "我们还可以存储一列`Tensor`并读回内存。"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 3,
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/plain": [
73 | "(tensor([1., 1., 1.]), tensor([0., 0., 0., 0.]))"
74 | ]
75 | },
76 | "execution_count": 3,
77 | "metadata": {},
78 | "output_type": "execute_result"
79 | }
80 | ],
81 | "source": [
82 | "y = torch.zeros(4)\n",
83 | "torch.save([x, y], 'xy')\n",
84 | "x2, y2 = torch.load('xy')\n",
85 | "(x2, y2)"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "我们甚至可以存储并读取一个从字符串映射到`Tensor`的字典。"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 4,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "data": {
102 | "text/plain": [
103 | "{'x': tensor([1., 1., 1.]), 'y': tensor([0., 0., 0., 0.])}"
104 | ]
105 | },
106 | "execution_count": 4,
107 | "metadata": {},
108 | "output_type": "execute_result"
109 | }
110 | ],
111 | "source": [
112 | "mydict = {'x': x, 'y': y}\n",
113 | "torch.save(mydict, 'mydict')\n",
114 | "mydict2 = torch.load('mydict')\n",
115 | "mydict2"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "## 读写模型的参数\n",
123 | "\n",
124 | "除`Tensor`以外,我们还可以读写模型的参数。我们可以使用`save`方法来保存模型的`state_dict`,`Module`类提供了`load_state_dict`函数来读取模型参数。为了演示方便,我们先创建一个多层感知机,并将其初始化。"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 5,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "class MLP(nn.Module):\n",
134 | " def __init__(self, **kwargs):\n",
135 | " super(MLP, self).__init__(**kwargs)\n",
136 | " self.hidden = nn.Linear(20, 256)\n",
137 | " self.activation = nn.ReLU()\n",
138 | " self.output = nn.Linear(256, 10)\n",
139 | " \n",
140 | " def forward(self, x):\n",
141 | " return self.output(self.activation(self.hidden(x)))\n",
142 | " \n",
143 | "net = MLP()\n",
144 | "X = torch.rand(2, 20)\n",
145 | "Y = net(X)"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "下面把该模型的参数存成文件,文件名为mlp.params。"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 6,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "filename = 'mlp.params'\n",
162 | "torch.save(net.state_dict(), filename)"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "接下来,我们再实例化一次定义好的多层感知机。与随机初始化模型参数不同,我们在这里直接读取保存在文件里的参数。"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 7,
175 | "metadata": {},
176 | "outputs": [
177 | {
178 | "data": {
179 | "text/plain": [
180 | ""
181 | ]
182 | },
183 | "execution_count": 7,
184 | "metadata": {},
185 | "output_type": "execute_result"
186 | }
187 | ],
188 | "source": [
189 | "net2 = MLP()\n",
190 | "net2.load_state_dict(torch.load(filename))"
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {},
196 | "source": [
197 | "因为这两个实例都有同样的模型参数,那么对同一个输入`X`的计算结果将会是一样的。我们来验证一下。"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": 8,
203 | "metadata": {},
204 | "outputs": [
205 | {
206 | "data": {
207 | "text/plain": [
208 | "tensor([[True, True, True, True, True, True, True, True, True, True],\n",
209 | " [True, True, True, True, True, True, True, True, True, True]])"
210 | ]
211 | },
212 | "execution_count": 8,
213 | "metadata": {},
214 | "output_type": "execute_result"
215 | }
216 | ],
217 | "source": [
218 | "Y2 = net2(X)\n",
219 | "Y2 == Y"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {},
225 | "source": [
226 | "## 小结\n",
227 | "\n",
228 | "* 通过`save`函数和`load`函数可以很方便地读写`Tensor`。\n",
229 | "* 通过`load_state_dict`函数可以很方便地读取模型的参数。\n",
230 | "\n",
231 | "## 练习\n",
232 | "\n",
233 | "* 即使无须把训练好的模型部署到不同的设备,存储模型参数在实际中还有哪些好处?\n",
234 | "\n",
235 | "\n",
236 | "\n",
237 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/1255)\n",
238 | "\n",
239 | ""
240 | ]
241 | }
242 | ],
243 | "metadata": {
244 | "kernelspec": {
245 | "display_name": "Python [conda env:pytorch]",
246 | "language": "python",
247 | "name": "conda-env-pytorch-py"
248 | },
249 | "language_info": {
250 | "codemirror_mode": {
251 | "name": "ipython",
252 | "version": 3
253 | },
254 | "file_extension": ".py",
255 | "mimetype": "text/x-python",
256 | "name": "python",
257 | "nbconvert_exporter": "python",
258 | "pygments_lexer": "ipython3",
259 | "version": "3.6.9"
260 | },
261 | "toc": {
262 | "base_numbering": 1,
263 | "nav_menu": {},
264 | "number_sections": true,
265 | "sideBar": true,
266 | "skip_h1_title": false,
267 | "title_cell": "Table of Contents",
268 | "title_sidebar": "Contents",
269 | "toc_cell": false,
270 | "toc_position": {},
271 | "toc_section_display": true,
272 | "toc_window_display": false
273 | }
274 | },
275 | "nbformat": 4,
276 | "nbformat_minor": 4
277 | }
278 |
--------------------------------------------------------------------------------
/chapter_how-to-use/how-to-use.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 如何使用本书\n",
8 | "\n",
9 | "本书将全面介绍深度学习从模型构造到模型训练的方方面面,以及它们在计算机视觉和自然语言处理中的应用。我们不仅将阐述算法原理,还将基于PyTorch对算法进行实现,并实际运行它们。本书的每一节都是一个Jupyter记事本。它将文字、公式、图像、代码和运行结果结合在了一起。读者不但能直接阅读它们,而且可以运行它们以获得交互式的学习体验。\n",
10 | "\n",
11 | "\n",
12 | "## 面向的读者\n",
13 | "\n",
14 | "本书面向希望了解深度学习,特别是对实际使用深度学习感兴趣的大学生、工程师和研究人员。本书并不要求读者有任何深度学习或者机器学习的背景知识,我们将从头开始解释每一个概念。虽然深度学习技术与应用的阐述涉及了数学和编程,但读者只需了解基础的数学和编程,如基础的线性代数、微分和概率,以及基础的Python编程。在附录中我们提供了本书所涉及的主要数学知识供读者参考。如果读者之前没有接触过Python,可以参考中文教程 http://www.runoob.com/python/python-tutorial.html 或英文教程 http://learnpython.org/ 。当然,如果读者只对本书中的数学部分感兴趣,可以忽略掉编程部分,反之亦然。\n",
15 | "\n",
16 | "\n",
17 | "## 内容和结构\n",
18 | "\n",
19 | "本书内容大体可以分为3个部分:\n",
20 | "\n",
21 | "\n",
22 | "* 第一部分(第1章至第3章)涵盖预备工作和基础知识。第1章介绍深度学习的背景。第2章提供动手学深度学习所需要的预备知识,例如,如何获取并运行本书中的代码。第3章包括深度学习最基础的概念和技术,如多层感知机和模型正则化。如果读者时间有限,并且只想了解深度学习最基础的概念和技术,那么只需阅读第一部分。\n",
23 | "\n",
24 | "* 第二部分(第4章至第6章)关注现代深度学习技术。第4章描述深度学习计算的各个重要组成部分,并为实现后续更复杂的模型打下基础。第5章解释近年来令深度学习在计算机视觉领域大获成功的卷积神经网络。第6章阐述近年来常用于处理序列数据的循环神经网络。阅读第二部分有助于掌握现代深度学习技术。\n",
25 | "\n",
26 | "* 第三部分(第7章至第10章)讨论计算性能和应用。第7章评价各种用来训练深度学习模型的优化算法。第8章检验影响深度学习计算性能的几个重要因素。第9章和第10章分别列举深度学习在计算机视觉和自然语言处理中的重要应用。这部分内容读者可根据兴趣选择阅读。\n",
27 | "\n",
28 | "图1描绘了本书的结构。\n",
29 | "\n",
30 | "\n",
31 | "\n",
32 | "图1中由甲章指向乙章的箭头表明甲章的知识有助于理解乙章的内容。如果读者想短时间了解深度学习最基础的概念和技术,只需阅读第1章至第3章;如果读者希望掌握现代深度学习技术,还需阅读第4章至第6章。第7章至第10章读者可以根据兴趣选择阅读。\n",
33 | "\n",
34 | "\n",
35 | "## 代码\n",
36 | "\n",
37 | "本书的一大特点是每一节的代码都是可以运行的。读者可以改动代码后重新运行,并通过运行结果进一步理解改动所带来的影响。我们认为,这种交互式的学习体验对于学习深度学习非常重要。因为深度学习目前并没有很好的理论解释框架,很多论断只可意会。文字解释在这时候可能比较苍白无力,而且不足以覆盖所有细节。读者需要不断改动代码、观察运行结果并总结经验,从而逐步领悟和掌握深度学习。\n",
38 | "\n",
39 | "本书的代码在原书基础上改用PyTorch实现。为避免重复描述,我们将本书中多次使用的函数和类封装在`d2ltorch`包中。这些函数和类的定义的所在章节已在附录[“d2ltorch包索引”](../chapter_appendix/d2ltorch.ipynb)里列出。由于深度学习发展极为迅速,未来版本的PyTorch可能会造成书中部分代码无法正常运行。遇到相关问题可参考[“获取和运行本书的代码”](../chapter_prerequisite/install.ipynb)一节来更新代码和运行环境。如果你想了解运行本书代码所依赖的PyTorch和`d2ltorch`包的版本号,也可参考[“获取和运行本书的代码”](../chapter_prerequisite/install.ipynb)一节。\n",
40 | "\n",
41 | "我们提供代码的主要目的在于增加一个在文字、图像和公式外的学习深度学习算法的方式,以及一个便于理解各算法在真实数据上的实际效果的交互式环境。书中只使用了PyTorch的`nn`、`autograd`、`optim`等模块或包的基础功能,从而使读者尽可能了解深度学习算法的实现细节。即便读者在研究和工作中使用的是其他深度学习框架,书中的代码也能有助于读者更好地理解和应用深度学习算法。\n",
42 | "\n",
43 | "\n",
44 | "## 讨论区\n",
45 | "\n",
46 | "原书的网站是 https://zh.d2l.ai ,上面提供了学习社区地址(https://discuss.gluon.ai/ )和GitHub开源地址(https://github.com/d2l-ai/d2l-zh )。如果读者对书中某节内容有疑惑,可扫一扫该节末尾的二维码参与该节内容的讨论。值得一提的是,在有关Kaggle比赛章节的讨论区中,众多社区成员提供了丰富的高水平方法,我们强烈推荐给大家。希望诸位积极参与学习社区中的讨论,并相信大家一定会有所收获。原书作者和MXNet开发人员也时常参与社区中的讨论。\n",
47 | "\n",
48 | "PyTorch版本的读者或者希望为本项目做贡献的朋友可以加入QQ群讨论:\n",
49 | "\n",
50 | "\n",
51 | "\n",
52 | "\n",
53 | "## 小结\n",
54 | "\n",
55 | "* 我们选择PyTorch作为本书使用的深度学习框架。\n",
56 | "* 本书力求提供一个多方位交互式的学习体验。\n",
57 | "\n",
58 | "\n",
59 | "## 练习\n",
60 | "\n",
61 | "* 在本书的学习社区 https://discuss.gluon.ai/ 上注册一个账号。搜索关键字Kaggle,浏览其中回复量最大的几个帖子。\n",
62 | "\n",
63 | "\n",
64 | "\n",
65 | "\n",
66 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/6915)\n",
67 | "\n",
68 | ""
69 | ]
70 | }
71 | ],
72 | "metadata": {
73 | "kernelspec": {
74 | "display_name": "Python [conda env:pytorch]",
75 | "language": "python",
76 | "name": "conda-env-pytorch-py"
77 | },
78 | "language_info": {
79 | "codemirror_mode": {
80 | "name": "ipython",
81 | "version": 3
82 | },
83 | "file_extension": ".py",
84 | "mimetype": "text/x-python",
85 | "name": "python",
86 | "nbconvert_exporter": "python",
87 | "pygments_lexer": "ipython3",
88 | "version": "3.6.9"
89 | }
90 | },
91 | "nbformat": 4,
92 | "nbformat_minor": 4
93 | }
94 |
--------------------------------------------------------------------------------
/chapter_natural-language-processing/approx-training.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 近似训练\n",
8 | "\n",
9 | "回忆上一节的内容。跳字模型的核心在于使用softmax运算得到给定中心词$w_c$来生成背景词$w_o$的条件概率\n",
10 | "\n",
11 | "$$P(w_o \\mid w_c) = \\frac{\\text{exp}(\\boldsymbol{u}_o^\\top \\boldsymbol{v}_c)}{ \\sum_{i \\in \\mathcal{V}} \\text{exp}(\\boldsymbol{u}_i^\\top \\boldsymbol{v}_c)}.$$\n",
12 | "\n",
13 | "该条件概率相应的对数损失\n",
14 | "\n",
15 | "$$-\\log P(w_o \\mid w_c) =\n",
16 | "-\\boldsymbol{u}_o^\\top \\boldsymbol{v}_c + \\log\\left(\\sum_{i \\in \\mathcal{V}} \\text{exp}(\\boldsymbol{u}_i^\\top \\boldsymbol{v}_c)\\right).$$\n",
17 | "\n",
18 | "\n",
19 | "由于softmax运算考虑了背景词可能是词典$\\mathcal{V}$中的任一词,以上损失包含了词典大小数目的项的累加。在上一节中我们看到,不论是跳字模型还是连续词袋模型,由于条件概率使用了softmax运算,每一步的梯度计算都包含词典大小数目的项的累加。对于含几十万或上百万词的较大词典,每次的梯度计算开销可能过大。为了降低该计算复杂度,本节将介绍两种近似训练方法,即负采样(negative sampling)或层序softmax(hierarchical softmax)。由于跳字模型和连续词袋模型类似,本节仅以跳字模型为例介绍这两种方法。\n",
20 | "\n",
21 | "\n",
22 | "\n",
23 | "## 负采样\n",
24 | "\n",
25 | "负采样修改了原来的目标函数。给定中心词$w_c$的一个背景窗口,我们把背景词$w_o$出现在该背景窗口看作一个事件,并将该事件的概率计算为\n",
26 | "\n",
27 | "$$P(D=1\\mid w_c, w_o) = \\sigma(\\boldsymbol{u}_o^\\top \\boldsymbol{v}_c),$$\n",
28 | "\n",
29 | "其中的$\\sigma$函数与sigmoid激活函数的定义相同:\n",
30 | "\n",
31 | "$$\\sigma(x) = \\frac{1}{1+\\exp(-x)}.$$\n",
32 | "\n",
33 | "我们先考虑最大化文本序列中所有该事件的联合概率来训练词向量。具体来说,给定一个长度为$T$的文本序列,设时间步$t$的词为$w^{(t)}$且背景窗口大小为$m$,考虑最大化联合概率\n",
34 | "\n",
35 | "$$ \\prod_{t=1}^{T} \\prod_{-m \\leq j \\leq m,\\ j \\neq 0} P(D=1\\mid w^{(t)}, w^{(t+j)}).$$\n",
36 | "\n",
37 | "然而,以上模型中包含的事件仅考虑了正类样本。这导致当所有词向量相等且值为无穷大时,以上的联合概率才被最大化为1。很明显,这样的词向量毫无意义。负采样通过采样并添加负类样本使目标函数更有意义。设背景词$w_o$出现在中心词$w_c$的一个背景窗口为事件$P$,我们根据分布$P(w)$采样$K$个未出现在该背景窗口中的词,即噪声词。设噪声词$w_k$($k=1, \\ldots, K$)不出现在中心词$w_c$的该背景窗口为事件$N_k$。假设同时含有正类样本和负类样本的事件$P, N_1, \\ldots, N_K$相互独立,负采样将以上需要最大化的仅考虑正类样本的联合概率改写为\n",
38 | "\n",
39 | "\n",
40 | "$$ \\prod_{t=1}^{T} \\prod_{-m \\leq j \\leq m,\\ j \\neq 0} P(w^{(t+j)} \\mid w^{(t)}),$$\n",
41 | "\n",
42 | "其中条件概率被近似表示为\n",
43 | "$$ P(w^{(t+j)} \\mid w^{(t)}) =P(D=1\\mid w^{(t)}, w^{(t+j)})\\prod_{k=1,\\ w_k \\sim P(w)}^K P(D=0\\mid w^{(t)}, w_k).$$\n",
44 | "\n",
45 | "\n",
46 | "设文本序列中时间步$t$的词$w^{(t)}$在词典中的索引为$i_t$,噪声词$w_k$在词典中的索引为$h_k$。有关以上条件概率的对数损失为\n",
47 | "\n",
48 | "$$\n",
49 | "\\begin{aligned}\n",
50 | "-\\log P(w^{(t+j)} \\mid w^{(t)})\n",
51 | "=& -\\log P(D=1\\mid w^{(t)}, w^{(t+j)}) - \\sum_{k=1,\\ w_k \\sim P(w)}^K \\log P(D=0\\mid w^{(t)}, w_k)\\\\\n",
52 | "=&- \\log\\, \\sigma\\left(\\boldsymbol{u}_{i_{t+j}}^\\top \\boldsymbol{v}_{i_t}\\right) - \\sum_{k=1,\\ w_k \\sim P(w)}^K \\log\\left(1-\\sigma\\left(\\boldsymbol{u}_{h_k}^\\top \\boldsymbol{v}_{i_t}\\right)\\right)\\\\\n",
53 | "=&- \\log\\, \\sigma\\left(\\boldsymbol{u}_{i_{t+j}}^\\top \\boldsymbol{v}_{i_t}\\right) - \\sum_{k=1,\\ w_k \\sim P(w)}^K \\log\\sigma\\left(-\\boldsymbol{u}_{h_k}^\\top \\boldsymbol{v}_{i_t}\\right).\n",
54 | "\\end{aligned}\n",
55 | "$$\n",
56 | "\n",
57 | "现在,训练中每一步的梯度计算开销不再与词典大小相关,而与$K$线性相关。当$K$取较小的常数时,负采样在每一步的梯度计算开销较小。\n",
58 | "\n",
59 | "\n",
60 | "## 层序softmax\n",
61 | "\n",
62 | "层序softmax是另一种近似训练法。它使用了二叉树这一数据结构,树的每个叶结点代表词典$\\mathcal{V}$中的每个词。\n",
63 | "\n",
64 | "\n",
65 | "\n",
66 | "\n",
67 | "假设$L(w)$为从二叉树的根结点到词$w$的叶结点的路径(包括根结点和叶结点)上的结点数。设$n(w,j)$为该路径上第$j$个结点,并设该结点的背景词向量为$\\boldsymbol{u}_{n(w,j)}$。以图10.3为例,$L(w_3) = 4$。层序softmax将跳字模型中的条件概率近似表示为\n",
68 | "\n",
69 | "$$P(w_o \\mid w_c) = \\prod_{j=1}^{L(w_o)-1} \\sigma\\left( [\\![ n(w_o, j+1) = \\text{leftChild}(n(w_o,j)) ]\\!] \\cdot \\boldsymbol{u}_{n(w_o,j)}^\\top \\boldsymbol{v}_c\\right),$$\n",
70 | "\n",
71 | "其中$\\sigma$函数与[“多层感知机”](../chapter_deep-learning-basics/mlp.ipynb)一节中sigmoid激活函数的定义相同,$\\text{leftChild}(n)$是结点$n$的左子结点:如果判断$x$为真,$[\\![x]\\!] = 1$;反之$[\\![x]\\!] = -1$。\n",
72 | "让我们计算图10.3中给定词$w_c$生成词$w_3$的条件概率。我们需要将$w_c$的词向量$\\boldsymbol{v}_c$和根结点到$w_3$路径上的非叶结点向量一一求内积。由于在二叉树中由根结点到叶结点$w_3$的路径上需要向左、向右再向左地遍历(图10.3中加粗的路径),我们得到\n",
73 | "\n",
74 | "$$P(w_3 \\mid w_c) = \\sigma(\\boldsymbol{u}_{n(w_3,1)}^\\top \\boldsymbol{v}_c) \\cdot \\sigma(-\\boldsymbol{u}_{n(w_3,2)}^\\top \\boldsymbol{v}_c) \\cdot \\sigma(\\boldsymbol{u}_{n(w_3,3)}^\\top \\boldsymbol{v}_c).$$\n",
75 | "\n",
76 | "由于$\\sigma(x)+\\sigma(-x) = 1$,给定中心词$w_c$生成词典$\\mathcal{V}$中任一词的条件概率之和为1这一条件也将满足:\n",
77 | "\n",
78 | "$$\\sum_{w \\in \\mathcal{V}} P(w \\mid w_c) = 1.$$\n",
79 | "\n",
80 | "此外,由于$L(w_o)-1$的数量级为$\\mathcal{O}(\\text{log}_2|\\mathcal{V}|)$,当词典$\\mathcal{V}$很大时,层序softmax在训练中每一步的梯度计算开销相较未使用近似训练时大幅降低。\n",
81 | "\n",
82 | "## 小结\n",
83 | "\n",
84 | "* 负采样通过考虑同时含有正类样本和负类样本的相互独立事件来构造损失函数。其训练中每一步的梯度计算开销与采样的噪声词的个数线性相关。\n",
85 | "* 层序softmax使用了二叉树,并根据根结点到叶结点的路径来构造损失函数。其训练中每一步的梯度计算开销与词典大小的对数相关。\n",
86 | "\n",
87 | "## 练习\n",
88 | "\n",
89 | "\n",
90 | "* 在阅读下一节之前,你觉得在负采样中应如何采样噪声词?\n",
91 | "* 本节中最后一个公式为什么成立?\n",
92 | "* 如何将负采样或层序softmax用于训练连续词袋模型?\n",
93 | "\n",
94 | "\n",
95 | "\n",
96 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/8135)\n",
97 | "\n",
98 | ""
99 | ]
100 | }
101 | ],
102 | "metadata": {
103 | "kernelspec": {
104 | "display_name": "Python 3",
105 | "language": "python",
106 | "name": "python3"
107 | },
108 | "language_info": {
109 | "codemirror_mode": {
110 | "name": "ipython",
111 | "version": 3
112 | },
113 | "file_extension": ".py",
114 | "mimetype": "text/x-python",
115 | "name": "python",
116 | "nbconvert_exporter": "python",
117 | "pygments_lexer": "ipython3",
118 | "version": "3.7.4"
119 | },
120 | "toc": {
121 | "base_numbering": 1,
122 | "nav_menu": {},
123 | "number_sections": true,
124 | "sideBar": true,
125 | "skip_h1_title": false,
126 | "title_cell": "Table of Contents",
127 | "title_sidebar": "Contents",
128 | "toc_cell": false,
129 | "toc_position": {},
130 | "toc_section_display": true,
131 | "toc_window_display": false
132 | }
133 | },
134 | "nbformat": 4,
135 | "nbformat_minor": 4
136 | }
137 |
--------------------------------------------------------------------------------
/chapter_natural-language-processing/attention.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 注意力机制\n",
8 | "\n",
9 | "在[“编码器—解码器(seq2seq)”](seq2seq.ipynb)一节里,解码器在各个时间步依赖相同的背景变量来获取输入序列信息。当编码器为循环神经网络时,背景变量来自它最终时间步的隐藏状态。\n",
10 | "\n",
11 | "现在,让我们再次思考那一节提到的翻译例子:输入为英语序列“They”“are”“watching”“.”,输出为法语序列“Ils”“regardent”“.”。不难想到,解码器在生成输出序列中的每一个词时可能只需利用输入序列某一部分的信息。例如,在输出序列的时间步1,解码器可以主要依赖“They”“are”的信息来生成“Ils”,在时间步2则主要使用来自“watching”的编码信息生成“regardent”,最后在时间步3则直接映射句号“.”。这看上去就像是在解码器的每一时间步对输入序列中不同时间步的表征或编码信息分配不同的注意力一样。这也是注意力机制的由来 [1]。\n",
12 | "\n",
13 | "仍然以循环神经网络为例,注意力机制通过对编码器所有时间步的隐藏状态做加权平均来得到背景变量。解码器在每一时间步调整这些权重,即注意力权重,从而能够在不同时间步分别关注输入序列中的不同部分并编码进相应时间步的背景变量。本节我们将讨论注意力机制是怎么工作的。\n",
14 | "\n",
15 | "\n",
16 | "在[“编码器—解码器(seq2seq)”](seq2seq.ipynb)一节里我们区分了输入序列或编码器的索引$t$与输出序列或解码器的索引$t'$。该节中,解码器在时间步$t'$的隐藏状态$\\boldsymbol{s}_{t'} = g(\\boldsymbol{y}_{t'-1}, \\boldsymbol{c}, \\boldsymbol{s}_{t'-1})$,其中$\\boldsymbol{y}_{t'-1}$是上一时间步$t'-1$的输出$y_{t'-1}$的表征,且任一时间步$t'$使用相同的背景变量$\\boldsymbol{c}$。但在注意力机制中,解码器的每一时间步将使用可变的背景变量。记$\\boldsymbol{c}_{t'}$是解码器在时间步$t'$的背景变量,那么解码器在该时间步的隐藏状态可以改写为\n",
17 | "\n",
18 | "$$\\boldsymbol{s}_{t'} = g(\\boldsymbol{y}_{t'-1}, \\boldsymbol{c}_{t'}, \\boldsymbol{s}_{t'-1}).$$\n",
19 | "\n",
20 | "这里的关键是如何计算背景变量$\\boldsymbol{c}_{t'}$和如何利用它来更新隐藏状态$\\boldsymbol{s}_{t'}$。下面将分别描述这两个关键点。\n",
21 | "\n",
22 | "\n",
23 | "## 计算背景变量\n",
24 | "\n",
25 | "我们先描述第一个关键点,即计算背景变量。图10.12描绘了注意力机制如何为解码器在时间步2计算背景变量。首先,函数$a$根据解码器在时间步1的隐藏状态和编码器在各个时间步的隐藏状态计算softmax运算的输入。softmax运算输出概率分布并对编码器各个时间步的隐藏状态做加权平均,从而得到背景变量。\n",
26 | "\n",
27 | "\n",
28 | "\n",
29 | "\n",
30 | "具体来说,令编码器在时间步$t$的隐藏状态为$\\boldsymbol{h}_t$,且总时间步数为$T$。那么解码器在时间步$t'$的背景变量为所有编码器隐藏状态的加权平均:\n",
31 | "\n",
32 | "$$\\boldsymbol{c}_{t'} = \\sum_{t=1}^T \\alpha_{t' t} \\boldsymbol{h}_t,$$\n",
33 | "\n",
34 | "其中给定$t'$时,权重$\\alpha_{t' t}$在$t=1,\\ldots,T$的值是一个概率分布。为了得到概率分布,我们可以使用softmax运算:\n",
35 | "\n",
36 | "$$\\alpha_{t' t} = \\frac{\\exp(e_{t' t})}{ \\sum_{k=1}^T \\exp(e_{t' k}) },\\quad t=1,\\ldots,T.$$\n",
37 | "\n",
38 | "现在,我们需要定义如何计算上式中softmax运算的输入$e_{t' t}$。由于$e_{t' t}$同时取决于解码器的时间步$t'$和编码器的时间步$t$,我们不妨以解码器在时间步$t'-1$的隐藏状态$\\boldsymbol{s}_{t' - 1}$与编码器在时间步$t$的隐藏状态$\\boldsymbol{h}_t$为输入,并通过函数$a$计算$e_{t' t}$:\n",
39 | "\n",
40 | "$$e_{t' t} = a(\\boldsymbol{s}_{t' - 1}, \\boldsymbol{h}_t).$$\n",
41 | "\n",
42 | "\n",
43 | "这里函数$a$有多种选择,如果两个输入向量长度相同,一个简单的选择是计算它们的内积$a(\\boldsymbol{s}, \\boldsymbol{h})=\\boldsymbol{s}^\\top \\boldsymbol{h}$。而最早提出注意力机制的论文则将输入连结后通过含单隐藏层的多层感知机变换 [1]:\n",
44 | "\n",
45 | "$$a(\\boldsymbol{s}, \\boldsymbol{h}) = \\boldsymbol{v}^\\top \\tanh(\\boldsymbol{W}_s \\boldsymbol{s} + \\boldsymbol{W}_h \\boldsymbol{h}),$$\n",
46 | "\n",
47 | "其中$\\boldsymbol{v}$、$\\boldsymbol{W}_s$、$\\boldsymbol{W}_h$都是可以学习的模型参数。\n",
48 | "\n",
49 | "### 矢量化计算\n",
50 | "\n",
51 | "我们还可以对注意力机制采用更高效的矢量化计算。广义上,注意力机制的输入包括查询项以及一一对应的键项和值项,其中值项是需要加权平均的一组项。在加权平均中,值项的权重来自查询项以及与该值项对应的键项的计算。\n",
52 | "\n",
53 | "在上面的例子中,查询项为解码器的隐藏状态,键项和值项均为编码器的隐藏状态。\n",
54 | "让我们考虑一个常见的简单情形,即编码器和解码器的隐藏单元个数均为$h$,且函数$a(\\boldsymbol{s}, \\boldsymbol{h})=\\boldsymbol{s}^\\top \\boldsymbol{h}$。假设我们希望根据解码器单个隐藏状态$\\boldsymbol{s}_{t' - 1} \\in \\mathbb{R}^{h}$和编码器所有隐藏状态$\\boldsymbol{h}_t \\in \\mathbb{R}^{h}, t = 1,\\ldots,T$来计算背景向量$\\boldsymbol{c}_{t'}\\in \\mathbb{R}^{h}$。\n",
55 | "我们可以将查询项矩阵$\\boldsymbol{Q} \\in \\mathbb{R}^{1 \\times h}$设为$\\boldsymbol{s}_{t' - 1}^\\top$,并令键项矩阵$\\boldsymbol{K} \\in \\mathbb{R}^{T \\times h}$和值项矩阵$\\boldsymbol{V} \\in \\mathbb{R}^{T \\times h}$相同且第$t$行均为$\\boldsymbol{h}_t^\\top$。此时,我们只需要通过矢量化计算\n",
56 | "\n",
57 | "$$\\text{softmax}(\\boldsymbol{Q}\\boldsymbol{K}^\\top)\\boldsymbol{V}$$\n",
58 | "\n",
59 | "即可算出转置后的背景向量$\\boldsymbol{c}_{t'}^\\top$。当查询项矩阵$\\boldsymbol{Q}$的行数为$n$时,上式将得到$n$行的输出矩阵。输出矩阵与查询项矩阵在相同行上一一对应。\n",
60 | "\n",
61 | "\n",
62 | "\n",
63 | "## 更新隐藏状态\n",
64 | "\n",
65 | "现在我们描述第二个关键点,即更新隐藏状态。以门控循环单元为例,在解码器中我们可以对[“门控循环单元(GRU)”](../chapter_recurrent-neural-networks/gru.ipynb)一节中门控循环单元的设计稍作修改,从而变换上一时间步$t'-1$的输出$\\boldsymbol{y}_{t'-1}$、隐藏状态$\\boldsymbol{s}_{t' - 1}$和当前时间步$t'$的含注意力机制的背景变量$\\boldsymbol{c}_{t'}$ [1]。解码器在时间步$t'$的隐藏状态为\n",
66 | "\n",
67 | "$$\\boldsymbol{s}_{t'} = \\boldsymbol{z}_{t'} \\odot \\boldsymbol{s}_{t'-1} + (1 - \\boldsymbol{z}_{t'}) \\odot \\tilde{\\boldsymbol{s}}_{t'},$$\n",
68 | "\n",
69 | "其中的重置门、更新门和候选隐藏状态分别为\n",
70 | "\n",
71 | "$$\n",
72 | "\\begin{aligned}\n",
73 | "\\boldsymbol{r}_{t'} &= \\sigma(\\boldsymbol{W}_{yr} \\boldsymbol{y}_{t'-1} + \\boldsymbol{W}_{sr} \\boldsymbol{s}_{t' - 1} + \\boldsymbol{W}_{cr} \\boldsymbol{c}_{t'} + \\boldsymbol{b}_r),\\\\\n",
74 | "\\boldsymbol{z}_{t'} &= \\sigma(\\boldsymbol{W}_{yz} \\boldsymbol{y}_{t'-1} + \\boldsymbol{W}_{sz} \\boldsymbol{s}_{t' - 1} + \\boldsymbol{W}_{cz} \\boldsymbol{c}_{t'} + \\boldsymbol{b}_z),\\\\\n",
75 | "\\tilde{\\boldsymbol{s}}_{t'} &= \\text{tanh}(\\boldsymbol{W}_{ys} \\boldsymbol{y}_{t'-1} + \\boldsymbol{W}_{ss} (\\boldsymbol{s}_{t' - 1} \\odot \\boldsymbol{r}_{t'}) + \\boldsymbol{W}_{cs} \\boldsymbol{c}_{t'} + \\boldsymbol{b}_s),\n",
76 | "\\end{aligned}\n",
77 | "$$\n",
78 | "\n",
79 | "其中含下标的$\\boldsymbol{W}$和$\\boldsymbol{b}$分别为门控循环单元的权重参数和偏差参数。\n",
80 | "\n",
81 | "\n",
82 | "\n",
83 | "## 发展\n",
84 | "\n",
85 | "本质上,注意力机制能够为表征中较有价值的部分分配较多的计算资源。这个有趣的想法自提出后得到了快速发展,特别是启发了依靠注意力机制来编码输入序列并解码出输出序列的变换器(Transformer)模型的设计 [2]。变换器抛弃了卷积神经网络和循环神经网络的架构。它在计算效率上比基于循环神经网络的编码器—解码器模型通常更具明显优势。含注意力机制的变换器的编码结构在后来的BERT预训练模型中得以应用并令后者大放异彩:微调后的模型在多达11项自然语言处理任务中取得了当时最先进的结果 [3]。不久后,同样是基于变换器设计的GPT-2模型于新收集的语料数据集预训练后,在7个未参与训练的语言模型数据集上均取得了当时最先进的结果 [4]。除了自然语言处理领域,注意力机制还被广泛用于图像分类、自动图像描述、唇语解读以及语音识别。\n",
86 | "\n",
87 | "\n",
88 | "## 小结\n",
89 | "\n",
90 | "* 可以在解码器的每个时间步使用不同的背景变量,并对输入序列中不同时间步编码的信息分配不同的注意力。\n",
91 | "* 广义上,注意力机制的输入包括查询项以及一一对应的键项和值项。\n",
92 | "* 注意力机制可以采用更为高效的矢量化计算。\n",
93 | "\n",
94 | "\n",
95 | "## 练习\n",
96 | "\n",
97 | "* 基于本节的模型设计,为什么不可以将解码器在不同时间步的隐藏状态$\\boldsymbol{s}_{t' - 1}^\\top \\in \\mathbb{R}^{1 \\times h}, t' \\in 1, \\ldots, T'$连结成查询项矩阵$\\boldsymbol{Q} \\in \\mathbb{R}^{T' \\times h}$,从而同时计算不同时间步的含注意力机制的背景变量$\\boldsymbol{c}_{t'}^\\top, t' \\in 1, \\ldots, T'$?\n",
98 | "\n",
99 | "* 不修改[“门控循环单元(GRU)”](../chapter_recurrent-neural-networks/gru.ipynb)一节中的`gru`函数,应如何用它实现本节介绍的解码器?\n",
100 | "\n",
101 | "\n",
102 | "\n",
103 | "\n",
104 | "\n",
105 | "## 参考文献\n",
106 | "\n",
107 | "[1] Bahdanau, D., Cho, K., & Bengio, Y. (2014). Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473.\n",
108 | "\n",
109 | "[2] Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., ... & Polosukhin, I. (2017). Attention is all you need. In Advances in Neural Information Processing Systems (pp. 5998-6008).\n",
110 | "\n",
111 | "[3] Devlin, J., Chang, M. W., Lee, K., & Toutanova, K. (2018). Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805.\n",
112 | "\n",
113 | "[4] Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever I. (2019). Language Models are Unsupervised Multitask Learners. OpenAI.\n",
114 | "\n",
115 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/6759)\n",
116 | "\n",
117 | ""
118 | ]
119 | }
120 | ],
121 | "metadata": {
122 | "kernelspec": {
123 | "display_name": "Python 3",
124 | "language": "python",
125 | "name": "python3"
126 | },
127 | "language_info": {
128 | "codemirror_mode": {
129 | "name": "ipython",
130 | "version": 3
131 | },
132 | "file_extension": ".py",
133 | "mimetype": "text/x-python",
134 | "name": "python",
135 | "nbconvert_exporter": "python",
136 | "pygments_lexer": "ipython3",
137 | "version": "3.7.4"
138 | },
139 | "toc": {
140 | "base_numbering": 1,
141 | "nav_menu": {},
142 | "number_sections": true,
143 | "sideBar": true,
144 | "skip_h1_title": false,
145 | "title_cell": "Table of Contents",
146 | "title_sidebar": "Contents",
147 | "toc_cell": false,
148 | "toc_position": {},
149 | "toc_section_display": true,
150 | "toc_window_display": false
151 | }
152 | },
153 | "nbformat": 4,
154 | "nbformat_minor": 4
155 | }
156 |
--------------------------------------------------------------------------------
/chapter_natural-language-processing/beam-search.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 束搜索\n",
8 | "\n",
9 | "上一节介绍了如何训练输入和输出均为不定长序列的编码器—解码器。本节我们介绍如何使用编码器—解码器来预测不定长的序列。\n",
10 | "\n",
11 | "上一节里已经提到,在准备训练数据集时,我们通常会在样本的输入序列和输出序列后面分别附上一个特殊符号“<eos>”表示序列的终止。我们在接下来的讨论中也将沿用上一节的全部数学符号。为了便于讨论,假设解码器的输出是一段文本序列。设输出文本词典$\\mathcal{Y}$(包含特殊符号“<eos>”)的大小为$\\left|\\mathcal{Y}\\right|$,输出序列的最大长度为$T'$。所有可能的输出序列一共有$\\mathcal{O}(\\left|\\mathcal{Y}\\right|^{T'})$种。这些输出序列中所有特殊符号“<eos>”后面的子序列将被舍弃。\n",
12 | "\n",
13 | "\n",
14 | "## 贪婪搜索\n",
15 | "\n",
16 | "让我们先来看一个简单的解决方案:贪婪搜索(greedy search)。对于输出序列任一时间步$t'$,我们从$|\\mathcal{Y}|$个词中搜索出条件概率最大的词\n",
17 | "\n",
18 | "$$y_{t'} = \\operatorname*{argmax}_{y \\in \\mathcal{Y}} P(y \\mid y_1, \\ldots, y_{t'-1}, \\boldsymbol{c})$$\n",
19 | "\n",
20 | "作为输出。一旦搜索出“<eos>”符号,或者输出序列长度已经达到了最大长度$T'$,便完成输出。\n",
21 | "\n",
22 | "我们在描述解码器时提到,基于输入序列生成输出序列的条件概率是$\\prod_{t'=1}^{T'} P(y_{t'} \\mid y_1, \\ldots, y_{t'-1}, \\boldsymbol{c})$。我们将该条件概率最大的输出序列称为最优输出序列。而贪婪搜索的主要问题是不能保证得到最优输出序列。\n",
23 | "\n",
24 | "下面来看一个例子。假设输出词典里面有“A”“B”“C”和“<eos>”这4个词。图10.9中每个时间步下的4个数字分别代表了该时间步生成“A”“B”“C”和“<eos>”这4个词的条件概率。在每个时间步,贪婪搜索选取条件概率最大的词。因此,图10.9中将生成输出序列“A”“B”“C”“<eos>”。该输出序列的条件概率是$0.5\\times0.4\\times0.4\\times0.6 = 0.048$。\n",
25 | "\n",
26 | "\n",
27 | "\n",
28 | "\n",
29 | "\n",
30 | "接下来,观察图10.10演示的例子。与图10.9中不同,图10.10在时间步2中选取了条件概率第二大的词“C”。由于时间步3所基于的时间步1和2的输出子序列由图10.9中的“A”“B”变为了图10.10中的“A”“C”,图10.10中时间步3生成各个词的条件概率发生了变化。我们选取条件概率最大的词“B”。此时时间步4所基于的前3个时间步的输出子序列为“A”“C”“B”,与图10.9中的“A”“B”“C”不同。因此,图10.10中时间步4生成各个词的条件概率也与图10.9中的不同。我们发现,此时的输出序列“A”“C”“B”“<eos>”的条件概率是$0.5\\times0.3\\times0.6\\times0.6=0.054$,大于贪婪搜索得到的输出序列的条件概率。因此,贪婪搜索得到的输出序列“A”“B”“C”“<eos>”并非最优输出序列。\n",
31 | "\n",
32 | "\n",
33 | "\n",
34 | "## 穷举搜索\n",
35 | "\n",
36 | "如果目标是得到最优输出序列,我们可以考虑穷举搜索(exhaustive search):穷举所有可能的输出序列,输出条件概率最大的序列。\n",
37 | "\n",
38 | "虽然穷举搜索可以得到最优输出序列,但它的计算开销$\\mathcal{O}(\\left|\\mathcal{Y}\\right|^{T'})$很容易过大。例如,当$|\\mathcal{Y}|=10000$且$T'=10$时,我们将评估$10000^{10} = 10^{40}$个序列:这几乎不可能完成。而贪婪搜索的计算开销是$\\mathcal{O}(\\left|\\mathcal{Y}\\right|T')$,通常显著小于穷举搜索的计算开销。例如,当$|\\mathcal{Y}|=10000$且$T'=10$时,我们只需评估$10000\\times10=10^5$个序列。\n",
39 | "\n",
40 | "\n",
41 | "## 束搜索\n",
42 | "\n",
43 | "束搜索(beam search)是对贪婪搜索的一个改进算法。它有一个束宽(beam size)超参数。我们将它设为$k$。在时间步1时,选取当前时间步条件概率最大的$k$个词,分别组成$k$个候选输出序列的首词。在之后的每个时间步,基于上个时间步的$k$个候选输出序列,从$k\\left|\\mathcal{Y}\\right|$个可能的输出序列中选取条件概率最大的$k$个,作为该时间步的候选输出序列。最终,我们从各个时间步的候选输出序列中筛选出包含特殊符号“<eos>”的序列,并将它们中所有特殊符号“<eos>”后面的子序列舍弃,得到最终候选输出序列的集合。\n",
44 | "\n",
45 | "\n",
46 | "\n",
47 | "\n",
48 | "图10.11通过一个例子演示了束搜索的过程。假设输出序列的词典中只包含5个元素,即$\\mathcal{Y} = \\{A, B, C, D, E\\}$,且其中一个为特殊符号“<eos>”。设束搜索的束宽等于2,输出序列最大长度为3。在输出序列的时间步1时,假设条件概率$P(y_1 \\mid \\boldsymbol{c})$最大的2个词为$A$和$C$。我们在时间步2时将对所有的$y_2 \\in \\mathcal{Y}$都分别计算$P(A, y_2 \\mid \\boldsymbol{c}) = P(A \\mid \\boldsymbol{c})P(y_2 \\mid A, \\boldsymbol{c})$和$P(C, y_2 \\mid \\boldsymbol{c}) = P(C \\mid \\boldsymbol{c})P(y_2 \\mid C, \\boldsymbol{c})$,并从计算出的10个条件概率中取最大的2个,假设为$P(A, B \\mid \\boldsymbol{c})$和$P(C, E \\mid \\boldsymbol{c})$。那么,我们在时间步3时将对所有的$y_3 \\in \\mathcal{Y}$都分别计算$P(A, B, y_3 \\mid \\boldsymbol{c}) = P(A, B \\mid \\boldsymbol{c})P(y_3 \\mid A, B, \\boldsymbol{c})$和$P(C, E, y_3 \\mid \\boldsymbol{c}) = P(C, E \\mid \\boldsymbol{c})P(y_3 \\mid C, E, \\boldsymbol{c})$,并从计算出的10个条件概率中取最大的2个,假设为$P(A, B, D \\mid \\boldsymbol{c})$和$P(C, E, D \\mid \\boldsymbol{c})$。如此一来,我们得到6个候选输出序列:(1)$A$;(2)$C$;(3)$A$、$B$;(4)$C$、$E$;(5)$A$、$B$、$D$和(6)$C$、$E$、$D$。接下来,我们将根据这6个序列得出最终候选输出序列的集合。\n",
49 | "\n",
50 | "\n",
51 | "\n",
52 | "在最终候选输出序列的集合中,我们取以下分数最高的序列作为输出序列:\n",
53 | "\n",
54 | "$$ \\frac{1}{L^\\alpha} \\log P(y_1, \\ldots, y_{L}) = \\frac{1}{L^\\alpha} \\sum_{t'=1}^L \\log P(y_{t'} \\mid y_1, \\ldots, y_{t'-1}, \\boldsymbol{c}),$$\n",
55 | "\n",
56 | "其中$L$为最终候选序列长度,$\\alpha$一般可选为0.75。分母上的$L^\\alpha$是为了惩罚较长序列在以上分数中较多的对数相加项。分析可知,束搜索的计算开销为$\\mathcal{O}(k\\left|\\mathcal{Y}\\right|T')$。这介于贪婪搜索和穷举搜索的计算开销之间。此外,贪婪搜索可看作是束宽为1的束搜索。束搜索通过灵活的束宽$k$来权衡计算开销和搜索质量。\n",
57 | "\n",
58 | "\n",
59 | "## 小结\n",
60 | "\n",
61 | "* 预测不定长序列的方法包括贪婪搜索、穷举搜索和束搜索。\n",
62 | "* 束搜索通过灵活的束宽来权衡计算开销和搜索质量。\n",
63 | "\n",
64 | "\n",
65 | "## 练习\n",
66 | "\n",
67 | "* 穷举搜索可否看作特殊束宽的束搜索?为什么?\n",
68 | "* 在[“循环神经网络的从零开始实现”](../chapter_recurrent-neural-networks/rnn-scratch.ipynb)一节中,我们使用语言模型创作歌词。它的输出属于哪种搜索?你能改进它吗?\n",
69 | "\n",
70 | "\n",
71 | "\n",
72 | "\n",
73 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/6817)\n",
74 | "\n",
75 | ""
76 | ]
77 | }
78 | ],
79 | "metadata": {
80 | "kernelspec": {
81 | "display_name": "Python [conda env:pytorch]",
82 | "language": "python",
83 | "name": "conda-env-pytorch-py"
84 | },
85 | "language_info": {
86 | "codemirror_mode": {
87 | "name": "ipython",
88 | "version": 3
89 | },
90 | "file_extension": ".py",
91 | "mimetype": "text/x-python",
92 | "name": "python",
93 | "nbconvert_exporter": "python",
94 | "pygments_lexer": "ipython3",
95 | "version": "3.6.9"
96 | },
97 | "toc": {
98 | "base_numbering": 1,
99 | "nav_menu": {},
100 | "number_sections": true,
101 | "sideBar": true,
102 | "skip_h1_title": false,
103 | "title_cell": "Table of Contents",
104 | "title_sidebar": "Contents",
105 | "toc_cell": false,
106 | "toc_position": {},
107 | "toc_section_display": true,
108 | "toc_window_display": false
109 | }
110 | },
111 | "nbformat": 4,
112 | "nbformat_minor": 4
113 | }
114 |
--------------------------------------------------------------------------------
/chapter_natural-language-processing/fasttext.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 子词嵌入(fastText)\n",
8 | "\n",
9 | "英语单词通常有其内部结构和形成方式。例如,我们可以从“dog”“dogs”和“dogcatcher”的字面上推测它们的关系。这些词都有同一个词根“dog”,但使用不同的后缀来改变词的含义。而且,这个关联可以推广至其他词汇。例如,“dog”和“dogs”的关系如同“cat”和“cats”的关系,“boy”和“boyfriend”的关系如同“girl”和“girlfriend”的关系。这一特点并非为英语所独有。在法语和西班牙语中,很多动词根据场景不同有40多种不同的形态,而在芬兰语中,一个名词可能有15种以上的形态。事实上,构词学(morphology)作为语言学的一个重要分支,研究的正是词的内部结构和形成方式。\n",
10 | "\n",
11 | "在word2vec中,我们并没有直接利用构词学中的信息。无论是在跳字模型还是连续词袋模型中,我们都将形态不同的单词用不同的向量来表示。例如,“dog”和“dogs”分别用两个不同的向量表示,而模型中并未直接表达这两个向量之间的关系。鉴于此,fastText提出了子词嵌入(subword embedding)的方法,从而试图将构词信息引入word2vec中的跳字模型 [1]。\n",
12 | "\n",
13 | "在fastText中,每个中心词被表示成子词的集合。下面我们用单词“where”作为例子来了解子词是如何产生的。首先,我们在单词的首尾分别添加特殊字符“<”和“>”以区分作为前后缀的子词。然后,将单词当成一个由字符构成的序列来提取$n$元语法。例如,当$n=3$时,我们得到所有长度为3的子词:“<wh”“whe”“her”“ere”“re>”以及特殊子词“<where>”。\n",
14 | "\n",
15 | "在fastText中,对于一个词$w$,我们将它所有长度在$3 \\sim 6$的子词和特殊子词的并集记为$\\mathcal{G}_w$。那么词典则是所有词的子词集合的并集。假设词典中子词$g$的向量为$\\boldsymbol{z}_g$,那么跳字模型中词$w$的作为中心词的向量$\\boldsymbol{v}_w$则表示成\n",
16 | "\n",
17 | "$$\\boldsymbol{v}_w = \\sum_{g\\in\\mathcal{G}_w} \\boldsymbol{z}_g.$$\n",
18 | "\n",
19 | "fastText的其余部分同跳字模型一致,不在此重复。可以看到,与跳字模型相比,fastText中词典规模更大,造成模型参数更多,同时一个词的向量需要对所有子词向量求和,继而导致计算复杂度更高。但与此同时,较生僻的复杂单词,甚至是词典中没有的单词,可能会从同它结构类似的其他词那里获取更好的词向量表示。\n",
20 | "\n",
21 | "\n",
22 | "## 小结\n",
23 | "\n",
24 | "* fastText提出了子词嵌入方法。它在word2vec中的跳字模型的基础上,将中心词向量表示成单词的子词向量之和。\n",
25 | "* 子词嵌入利用构词上的规律,通常可以提升生僻词表示的质量。\n",
26 | "\n",
27 | "\n",
28 | "## 练习\n",
29 | "\n",
30 | "* 子词过多(例如,6字英文组合数约为$3\\times 10^8$)会有什么问题?你有什么办法来解决它吗?提示:可参考fastText论文3.2节末尾 [1]。\n",
31 | "* 如何基于连续词袋模型设计子词嵌入模型?\n",
32 | "\n",
33 | "\n",
34 | "\n",
35 | "\n",
36 | "\n",
37 | "\n",
38 | "## 参考文献\n",
39 | "\n",
40 | "[1] Bojanowski, P., Grave, E., Joulin, A., & Mikolov, T. (2016). Enriching word vectors with subword information. arXiv preprint arXiv:1607.04606.\n",
41 | "\n",
42 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/8057)\n",
43 | "\n",
44 | ""
45 | ]
46 | }
47 | ],
48 | "metadata": {
49 | "kernelspec": {
50 | "display_name": "Python 3",
51 | "language": "python",
52 | "name": "python3"
53 | },
54 | "language_info": {
55 | "codemirror_mode": {
56 | "name": "ipython",
57 | "version": 3
58 | },
59 | "file_extension": ".py",
60 | "mimetype": "text/x-python",
61 | "name": "python",
62 | "nbconvert_exporter": "python",
63 | "pygments_lexer": "ipython3",
64 | "version": "3.7.4"
65 | },
66 | "toc": {
67 | "base_numbering": 1,
68 | "nav_menu": {},
69 | "number_sections": true,
70 | "sideBar": true,
71 | "skip_h1_title": false,
72 | "title_cell": "Table of Contents",
73 | "title_sidebar": "Contents",
74 | "toc_cell": false,
75 | "toc_position": {},
76 | "toc_section_display": true,
77 | "toc_window_display": false
78 | }
79 | },
80 | "nbformat": 4,
81 | "nbformat_minor": 4
82 | }
83 |
--------------------------------------------------------------------------------
/chapter_natural-language-processing/glove.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 全局向量的词嵌入(GloVe)\n",
8 | "\n",
9 | "让我们先回顾一下word2vec中的跳字模型。将跳字模型中使用softmax运算表达的条件概率$P(w_j\\mid w_i)$记作$q_{ij}$,即\n",
10 | "\n",
11 | "$$q_{ij}=\\frac{\\exp(\\boldsymbol{u}_j^\\top \\boldsymbol{v}_i)}{ \\sum_{k \\in \\mathcal{V}} \\text{exp}(\\boldsymbol{u}_k^\\top \\boldsymbol{v}_i)},$$\n",
12 | "\n",
13 | "其中$\\boldsymbol{v}_i$和$\\boldsymbol{u}_i$分别是索引为$i$的词$w_i$作为中心词和背景词时的向量表示,$\\mathcal{V} = \\{0, 1, \\ldots, |\\mathcal{V}|-1\\}$为词典索引集。\n",
14 | "\n",
15 | "对于词$w_i$,它在数据集中可能多次出现。我们将每一次以它作为中心词的所有背景词全部汇总并保留重复元素,记作多重集(multiset)$\\mathcal{C}_i$。一个元素在多重集中的个数称为该元素的重数(multiplicity)。举例来说,假设词$w_i$在数据集中出现2次:文本序列中以这2个$w_i$作为中心词的背景窗口分别包含背景词索引$2,1,5,2$和$2,3,2,1$。那么多重集$\\mathcal{C}_i = \\{1,1,2,2,2,2,3,5\\}$,其中元素1的重数为2,元素2的重数为4,元素3和5的重数均为1。将多重集$\\mathcal{C}_i$中元素$j$的重数记作$x_{ij}$:它表示了整个数据集中所有以$w_i$为中心词的背景窗口中词$w_j$的个数。那么,跳字模型的损失函数还可以用另一种方式表达:\n",
16 | "\n",
17 | "$$-\\sum_{i\\in\\mathcal{V}}\\sum_{j\\in\\mathcal{V}} x_{ij} \\log\\,q_{ij}.$$\n",
18 | "\n",
19 | "我们将数据集中所有以词$w_i$为中心词的背景词的数量之和$\\left|\\mathcal{C}_i\\right|$记为$x_i$,并将以$w_i$为中心词生成背景词$w_j$的条件概率$x_{ij}/x_i$记作$p_{ij}$。我们可以进一步改写跳字模型的损失函数为\n",
20 | "\n",
21 | "$$-\\sum_{i\\in\\mathcal{V}} x_i \\sum_{j\\in\\mathcal{V}} p_{ij} \\log\\,q_{ij}.$$\n",
22 | "\n",
23 | "上式中,$-\\sum_{j\\in\\mathcal{V}} p_{ij} \\log\\,q_{ij}$计算的是以$w_i$为中心词的背景词条件概率分布$p_{ij}$和模型预测的条件概率分布$q_{ij}$的交叉熵,且损失函数使用所有以词$w_i$为中心词的背景词的数量之和来加权。最小化上式中的损失函数会令预测的条件概率分布尽可能接近真实的条件概率分布。\n",
24 | "\n",
25 | "然而,作为常用损失函数的一种,交叉熵损失函数有时并不是好的选择。一方面,正如我们在[“近似训练”](approx-training.ipynb)一节中所提到的,令模型预测$q_{ij}$成为合法概率分布的代价是它在分母中基于整个词典的累加项。这很容易带来过大的计算开销。另一方面,词典中往往有大量生僻词,它们在数据集中出现的次数极少。而有关大量生僻词的条件概率分布在交叉熵损失函数中的最终预测往往并不准确。\n",
26 | "\n",
27 | "\n",
28 | "\n",
29 | "## GloVe模型\n",
30 | "\n",
31 | "鉴于此,作为在word2vec之后提出的词嵌入模型,GloVe模型采用了平方损失,并基于该损失对跳字模型做了3点改动 [1]:\n",
32 | "\n",
33 | "1. 使用非概率分布的变量$p'_{ij}=x_{ij}$和$q'_{ij}=\\exp(\\boldsymbol{u}_j^\\top \\boldsymbol{v}_i)$,并对它们取对数。因此,平方损失项是$\\left(\\log\\,p'_{ij} - \\log\\,q'_{ij}\\right)^2 = \\left(\\boldsymbol{u}_j^\\top \\boldsymbol{v}_i - \\log\\,x_{ij}\\right)^2$。\n",
34 | "2. 为每个词$w_i$增加两个为标量的模型参数:中心词偏差项$b_i$和背景词偏差项$c_i$。\n",
35 | "3. 将每个损失项的权重替换成函数$h(x_{ij})$。权重函数$h(x)$是值域在$[0,1]$的单调递增函数。\n",
36 | "\n",
37 | "如此一来,GloVe模型的目标是最小化损失函数\n",
38 | "\n",
39 | "$$\\sum_{i\\in\\mathcal{V}} \\sum_{j\\in\\mathcal{V}} h(x_{ij}) \\left(\\boldsymbol{u}_j^\\top \\boldsymbol{v}_i + b_i + c_j - \\log\\,x_{ij}\\right)^2.$$\n",
40 | "\n",
41 | "其中权重函数$h(x)$的一个建议选择是:当$x < c$时(如$c = 100$),令$h(x) = (x/c)^\\alpha$(如$\\alpha = 0.75$),反之令$h(x) = 1$。因为$h(0)=0$,所以对于$x_{ij}=0$的平方损失项可以直接忽略。当使用小批量随机梯度下降来训练时,每个时间步我们随机采样小批量非零$x_{ij}$,然后计算梯度来迭代模型参数。这些非零$x_{ij}$是预先基于整个数据集计算得到的,包含了数据集的全局统计信息。因此,GloVe模型的命名取“全局向量”(Global Vectors)之意。\n",
42 | "\n",
43 | "需要强调的是,如果词$w_i$出现在词$w_j$的背景窗口里,那么词$w_j$也会出现在词$w_i$的背景窗口里。也就是说,$x_{ij}=x_{ji}$。不同于word2vec中拟合的是非对称的条件概率$p_{ij}$,GloVe模型拟合的是对称的$\\log\\, x_{ij}$。因此,任意词的中心词向量和背景词向量在GloVe模型中是等价的。但由于初始化值的不同,同一个词最终学习到的两组词向量可能不同。当学习得到所有词向量以后,GloVe模型使用中心词向量与背景词向量之和作为该词的最终词向量。\n",
44 | "\n",
45 | "\n",
46 | "## 从条件概率比值理解GloVe模型\n",
47 | "\n",
48 | "我们还可以从另外一个角度来理解GloVe模型。沿用本节前面的符号,$P(w_j \\mid w_i)$表示数据集中以$w_i$为中心词生成背景词$w_j$的条件概率,并记作$p_{ij}$。作为源于某大型语料库的真实例子,以下列举了两组分别以“ice”(冰)和“steam”(蒸汽)为中心词的条件概率以及它们之间的比值 [1]:\n",
49 | "\n",
50 | "|$w_k$=|“solid”|“gas”|“water”|“fashion”|\n",
51 | "|--:|:-:|:-:|:-:|:-:|\n",
52 | "|$p_1=P(w_k\\mid$ “ice” $)$|0.00019|0.000066|0.003|0.000017|\n",
53 | "|$p_2=P(w_k\\mid$ “steam” $)$|0.000022|0.00078|0.0022|0.000018|\n",
54 | "|$p_1/p_2$|8.9|0.085|1.36|0.96|\n",
55 | "\n",
56 | "\n",
57 | "我们可以观察到以下现象。\n",
58 | "\n",
59 | "* 对于与“ice”相关而与“steam”不相关的词$w_k$,如$w_k=$“solid”(固体),我们期望条件概率比值较大,如上表最后一行中的值8.9;\n",
60 | "* 对于与“ice”不相关而与“steam”相关的词$w_k$,如$w_k=$“gas”(气体),我们期望条件概率比值较小,如上表最后一行中的值0.085;\n",
61 | "* 对于与“ice”和“steam”都相关的词$w_k$,如$w_k=$“water”(水),我们期望条件概率比值接近1,如上表最后一行中的值1.36;\n",
62 | "* 对于与“ice”和“steam”都不相关的词$w_k$,如$w_k=$“fashion”(时尚),我们期望条件概率比值接近1,如上表最后一行中的值0.96。\n",
63 | "\n",
64 | "由此可见,条件概率比值能比较直观地表达词与词之间的关系。我们可以构造一个词向量函数使它能有效拟合条件概率比值。我们知道,任意一个这样的比值需要3个词$w_i$、$w_j$和$w_k$。以$w_i$作为中心词的条件概率比值为${p_{ij}}/{p_{ik}}$。我们可以找一个函数,它使用词向量来拟合这个条件概率比值\n",
65 | "\n",
66 | "$$f(\\boldsymbol{u}_j, \\boldsymbol{u}_k, {\\boldsymbol{v}}_i) \\approx \\frac{p_{ij}}{p_{ik}}.$$\n",
67 | "\n",
68 | "这里函数$f$可能的设计并不唯一,我们只需考虑一种较为合理的可能性。注意到条件概率比值是一个标量,我们可以将$f$限制为一个标量函数:$f(\\boldsymbol{u}_j, \\boldsymbol{u}_k, {\\boldsymbol{v}}_i) = f\\left((\\boldsymbol{u}_j - \\boldsymbol{u}_k)^\\top {\\boldsymbol{v}}_i\\right)$。交换索引$j$和$k$后可以看到函数$f$应该满足$f(x)f(-x)=1$,因此一种可能是$f(x)=\\exp(x)$,于是\n",
69 | "\n",
70 | "$$f(\\boldsymbol{u}_j, \\boldsymbol{u}_k, {\\boldsymbol{v}}_i) = \\frac{\\exp\\left(\\boldsymbol{u}_j^\\top {\\boldsymbol{v}}_i\\right)}{\\exp\\left(\\boldsymbol{u}_k^\\top {\\boldsymbol{v}}_i\\right)} \\approx \\frac{p_{ij}}{p_{ik}}.$$\n",
71 | "\n",
72 | "满足最右边约等号的一种可能是$\\exp\\left(\\boldsymbol{u}_j^\\top {\\boldsymbol{v}}_i\\right) \\approx \\alpha p_{ij}$,这里$\\alpha$是一个常数。考虑到$p_{ij}=x_{ij}/x_i$,取对数后$\\boldsymbol{u}_j^\\top {\\boldsymbol{v}}_i \\approx \\log\\,\\alpha + \\log\\,x_{ij} - \\log\\,x_i$。我们使用额外的偏差项来拟合$- \\log\\,\\alpha + \\log\\,x_i$,例如,中心词偏差项$b_i$和背景词偏差项$c_j$:\n",
73 | "\n",
74 | "$$\\boldsymbol{u}_j^\\top \\boldsymbol{v}_i + b_i + c_j \\approx \\log(x_{ij}).$$\n",
75 | "\n",
76 | "对上式左右两边取平方误差并加权,我们可以得到GloVe模型的损失函数。\n",
77 | "\n",
78 | "\n",
79 | "## 小结\n",
80 | "\n",
81 | "* 在有些情况下,交叉熵损失函数有劣势。GloVe模型采用了平方损失,并通过词向量拟合预先基于整个数据集计算得到的全局统计信息。\n",
82 | "* 任意词的中心词向量和背景词向量在GloVe模型中是等价的。\n",
83 | "\n",
84 | "\n",
85 | "## 练习\n",
86 | "\n",
87 | "* 如果一个词出现在另一个词的背景窗口中,如何利用它们之间在文本序列的距离重新设计条件概率$p_{ij}$的计算方式?(提示:可参考GloVe论文4.2节 [1]。)\n",
88 | "* 对于任意词,它在GloVe模型的中心词偏差项和背景词偏差项是否等价?为什么?\n",
89 | "\n",
90 | "\n",
91 | "\n",
92 | "## 参考文献\n",
93 | "\n",
94 | "[1] Pennington, J., Socher, R., & Manning, C. (2014). Glove: Global vectors for word representation. In Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP) (pp. 1532-1543).\n",
95 | "\n",
96 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/4372)\n",
97 | "\n",
98 | ""
99 | ]
100 | }
101 | ],
102 | "metadata": {
103 | "kernelspec": {
104 | "display_name": "Python 3",
105 | "language": "python",
106 | "name": "python3"
107 | },
108 | "language_info": {
109 | "codemirror_mode": {
110 | "name": "ipython",
111 | "version": 3
112 | },
113 | "file_extension": ".py",
114 | "mimetype": "text/x-python",
115 | "name": "python",
116 | "nbconvert_exporter": "python",
117 | "pygments_lexer": "ipython3",
118 | "version": "3.7.4"
119 | },
120 | "toc": {
121 | "base_numbering": 1,
122 | "nav_menu": {},
123 | "number_sections": true,
124 | "sideBar": true,
125 | "skip_h1_title": false,
126 | "title_cell": "Table of Contents",
127 | "title_sidebar": "Contents",
128 | "toc_cell": false,
129 | "toc_position": {},
130 | "toc_section_display": true,
131 | "toc_window_display": false
132 | }
133 | },
134 | "nbformat": 4,
135 | "nbformat_minor": 4
136 | }
137 |
--------------------------------------------------------------------------------
/chapter_natural-language-processing/index.md:
--------------------------------------------------------------------------------
1 | # 自然语言处理
2 |
3 | 自然语言处理关注计算机与人类之间的自然语言交互。在实际中,我们常常使用自然语言处理技术,如“循环神经网络”一章中介绍的语言模型,来处理和分析大量的自然语言数据。本章中,根据输入与输出的不同形式,我们按“定长到定长”、“不定长到定长”、“不定长到不定长”的顺序,逐步展示在自然语言处理中如何表征并变换定长的词或类别以及不定长的句子或段落序列。
4 |
5 | 我们先介绍如何用向量表示词,并在语料库上训练词向量。之后,我们把在更大语料库上预训练的词向量应用于求近义词和类比词,即“定长到定长”。接着,在文本分类这种“不定长到定长”的任务中,我们进一步应用词向量来分析文本情感,并分别基于循环神经网络和卷积神经网络为表征时序数据提供两种思路。此外,自然语言处理任务中很多输出是不定长的,如任意长度的句子或段落。我们将描述应对这类问题的编码器—解码器模型、束搜索和注意力机制,并动手实践“不定长到不定长”的机器翻译任务。
6 |
7 | ```eval_rst
8 |
9 | .. toctree::
10 | :maxdepth: 2
11 |
12 | word2vec
13 | approx-training
14 | word2vec-nn
15 | fasttext
16 | glove
17 | similarity-analogy
18 | sentiment-analysis-rnn
19 | sentiment-analysis-cnn
20 | seq2seq
21 | beam-search
22 | attention
23 | machine-translation
24 | ```
25 |
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/chapter_natural-language-processing/seq2seq.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 编码器—解码器(seq2seq)\n",
8 | "\n",
9 | "我们已经在前两节中表征并变换了不定长的输入序列。但在自然语言处理的很多应用中,输入和输出都可以是不定长序列。以机器翻译为例,输入可以是一段不定长的英语文本序列,输出可以是一段不定长的法语文本序列,例如\n",
10 | "\n",
11 | "> 英语输入:“They”、“are”、“watching”、“.”\n",
12 | "\n",
13 | "> 法语输出:“Ils”、“regardent”、“.”\n",
14 | "\n",
15 | "当输入和输出都是不定长序列时,我们可以使用编码器—解码器(encoder-decoder)[1] 或者seq2seq模型 [2]。这两个模型本质上都用到了两个循环神经网络,分别叫做编码器和解码器。编码器用来分析输入序列,解码器用来生成输出序列。\n",
16 | "\n",
17 | "图10.8描述了使用编码器—解码器将上述英语句子翻译成法语句子的一种方法。在训练数据集中,我们可以在每个句子后附上特殊符号“<eos>”(end of sequence)以表示序列的终止。编码器每个时间步的输入依次为英语句子中的单词、标点和特殊符号“<eos>”。图10.8中使用了编码器在最终时间步的隐藏状态作为输入句子的表征或编码信息。解码器在各个时间步中使用输入句子的编码信息和上个时间步的输出以及隐藏状态作为输入。\n",
18 | "我们希望解码器在各个时间步能正确依次输出翻译后的法语单词、标点和特殊符号“<eos>”。\n",
19 | "需要注意的是,解码器在最初时间步的输入用到了一个表示序列开始的特殊符号“<bos>”(beginning of sequence)。\n",
20 | "\n",
21 | "\n",
22 | "\n",
23 | "接下来,我们分别介绍编码器和解码器的定义。\n",
24 | "\n",
25 | "## 编码器\n",
26 | "\n",
27 | "编码器的作用是把一个不定长的输入序列变换成一个定长的背景变量$\\boldsymbol{c}$,并在该背景变量中编码输入序列信息。常用的编码器是循环神经网络。\n",
28 | "\n",
29 | "让我们考虑批量大小为1的时序数据样本。假设输入序列是$x_1,\\ldots,x_T$,例如$x_i$是输入句子中的第$i$个词。在时间步$t$,循环神经网络将输入$x_t$的特征向量$\\boldsymbol{x}_t$和上个时间步的隐藏状态$\\boldsymbol{h}_{t-1}$变换为当前时间步的隐藏状态$\\boldsymbol{h}_t$。我们可以用函数$f$表达循环神经网络隐藏层的变换:\n",
30 | "\n",
31 | "$$\\boldsymbol{h}_t = f(\\boldsymbol{x}_t, \\boldsymbol{h}_{t-1}). $$\n",
32 | "\n",
33 | "接下来,编码器通过自定义函数$q$将各个时间步的隐藏状态变换为背景变量\n",
34 | "\n",
35 | "$$\\boldsymbol{c} = q(\\boldsymbol{h}_1, \\ldots, \\boldsymbol{h}_T).$$\n",
36 | "\n",
37 | "例如,当选择$q(\\boldsymbol{h}_1, \\ldots, \\boldsymbol{h}_T) = \\boldsymbol{h}_T$时,背景变量是输入序列最终时间步的隐藏状态$\\boldsymbol{h}_T$。\n",
38 | "\n",
39 | "以上描述的编码器是一个单向的循环神经网络,每个时间步的隐藏状态只取决于该时间步及之前的输入子序列。我们也可以使用双向循环神经网络构造编码器。在这种情况下,编码器每个时间步的隐藏状态同时取决于该时间步之前和之后的子序列(包括当前时间步的输入),并编码了整个序列的信息。\n",
40 | "\n",
41 | "\n",
42 | "## 解码器\n",
43 | "\n",
44 | "刚刚已经介绍,编码器输出的背景变量$\\boldsymbol{c}$编码了整个输入序列$x_1, \\ldots, x_T$的信息。给定训练样本中的输出序列$y_1, y_2, \\ldots, y_{T'}$,对每个时间步$t'$(符号与输入序列或编码器的时间步$t$有区别),解码器输出$y_{t'}$的条件概率将基于之前的输出序列$y_1,\\ldots,y_{t'-1}$和背景变量$\\boldsymbol{c}$,即$P(y_{t'} \\mid y_1, \\ldots, y_{t'-1}, \\boldsymbol{c})$。\n",
45 | "\n",
46 | "为此,我们可以使用另一个循环神经网络作为解码器。\n",
47 | "在输出序列的时间步$t^\\prime$,解码器将上一时间步的输出$y_{t^\\prime-1}$以及背景变量$\\boldsymbol{c}$作为输入,并将它们与上一时间步的隐藏状态$\\boldsymbol{s}_{t^\\prime-1}$变换为当前时间步的隐藏状态$\\boldsymbol{s}_{t^\\prime}$。因此,我们可以用函数$g$表达解码器隐藏层的变换:\n",
48 | "\n",
49 | "$$\\boldsymbol{s}_{t^\\prime} = g(y_{t^\\prime-1}, \\boldsymbol{c}, \\boldsymbol{s}_{t^\\prime-1}).$$\n",
50 | "\n",
51 | "有了解码器的隐藏状态后,我们可以使用自定义的输出层和softmax运算来计算$P(y_{t^\\prime} \\mid y_1, \\ldots, y_{t^\\prime-1}, \\boldsymbol{c})$,例如,基于当前时间步的解码器隐藏状态 $\\boldsymbol{s}_{t^\\prime}$、上一时间步的输出$y_{t^\\prime-1}$以及背景变量$\\boldsymbol{c}$来计算当前时间步输出$y_{t^\\prime}$的概率分布。\n",
52 | "\n",
53 | "\n",
54 | "## 训练模型\n",
55 | "\n",
56 | "根据最大似然估计,我们可以最大化输出序列基于输入序列的条件概率\n",
57 | "\n",
58 | "$$\n",
59 | "\\begin{aligned}\n",
60 | "P(y_1, \\ldots, y_{T'} \\mid x_1, \\ldots, x_T)\n",
61 | "&= \\prod_{t'=1}^{T'} P(y_{t'} \\mid y_1, \\ldots, y_{t'-1}, x_1, \\ldots, x_T)\\\\\n",
62 | "&= \\prod_{t'=1}^{T'} P(y_{t'} \\mid y_1, \\ldots, y_{t'-1}, \\boldsymbol{c}),\n",
63 | "\\end{aligned}\n",
64 | "$$\n",
65 | "\n",
66 | "并得到该输出序列的损失\n",
67 | "\n",
68 | "$$- \\log P(y_1, \\ldots, y_{T'} \\mid x_1, \\ldots, x_T) = -\\sum_{t'=1}^{T'} \\log P(y_{t'} \\mid y_1, \\ldots, y_{t'-1}, \\boldsymbol{c}),$$\n",
69 | "\n",
70 | "在模型训练中,所有输出序列损失的均值通常作为需要最小化的损失函数。在图10.8所描述的模型预测中,我们需要将解码器在上一个时间步的输出作为当前时间步的输入。与此不同,在训练中我们也可以将标签序列(训练集的真实输出序列)在上一个时间步的标签作为解码器在当前时间步的输入。这叫作强制教学(teacher forcing)。\n",
71 | "\n",
72 | "\n",
73 | "## 小结\n",
74 | "\n",
75 | "* 编码器-解码器(seq2seq)可以输入并输出不定长的序列。\n",
76 | "* 编码器—解码器使用了两个循环神经网络。\n",
77 | "* 在编码器—解码器的训练中,可以采用强制教学。\n",
78 | "\n",
79 | "\n",
80 | "## 练习\n",
81 | "\n",
82 | "* 除了机器翻译,你还能想到编码器-解码器的哪些应用?\n",
83 | "* 有哪些方法可以设计解码器的输出层?\n",
84 | "\n",
85 | "\n",
86 | "\n",
87 | "\n",
88 | "## 参考文献\n",
89 | "\n",
90 | "[1] Cho, K., Van Merriënboer, B., Gulcehre, C., Bahdanau, D., Bougares, F., Schwenk, H., & Bengio, Y. (2014). Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078.\n",
91 | "\n",
92 | "[2] Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. In Advances in neural information processing systems (pp. 3104-3112).\n",
93 | "\n",
94 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/4523)\n",
95 | "\n",
96 | ""
97 | ]
98 | }
99 | ],
100 | "metadata": {
101 | "kernelspec": {
102 | "display_name": "Python [conda env:pytorch]",
103 | "language": "python",
104 | "name": "conda-env-pytorch-py"
105 | },
106 | "language_info": {
107 | "codemirror_mode": {
108 | "name": "ipython",
109 | "version": 3
110 | },
111 | "file_extension": ".py",
112 | "mimetype": "text/x-python",
113 | "name": "python",
114 | "nbconvert_exporter": "python",
115 | "pygments_lexer": "ipython3",
116 | "version": "3.6.9"
117 | },
118 | "toc": {
119 | "base_numbering": 1,
120 | "nav_menu": {},
121 | "number_sections": true,
122 | "sideBar": true,
123 | "skip_h1_title": false,
124 | "title_cell": "Table of Contents",
125 | "title_sidebar": "Contents",
126 | "toc_cell": false,
127 | "toc_position": {},
128 | "toc_section_display": true,
129 | "toc_window_display": false
130 | }
131 | },
132 | "nbformat": 4,
133 | "nbformat_minor": 4
134 | }
135 |
--------------------------------------------------------------------------------
/chapter_natural-language-processing/word2vec.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 词嵌入(word2vec)\n",
8 | "\n",
9 | "\n",
10 | "自然语言是一套用来表达含义的复杂系统。在这套系统中,词是表义的基本单元。顾名思义,词向量是用来表示词的向量,也可被认为是词的特征向量或表征。把词映射为实数域向量的技术也叫词嵌入(word embedding)。近年来,词嵌入已逐渐成为自然语言处理的基础知识。\n",
11 | "\n",
12 | "\n",
13 | "## 为何不采用one-hot向量\n",
14 | "\n",
15 | "我们在[“循环神经网络的从零开始实现”](../chapter_recurrent-neural-networks/rnn-scratch.ipynb)一节中使用one-hot向量表示词(字符为词)。回忆一下,假设词典中不同词的数量(词典大小)为$N$,每个词可以和从0到$N-1$的连续整数一一对应。这些与词对应的整数叫作词的索引。\n",
16 | "假设一个词的索引为$i$,为了得到该词的one-hot向量表示,我们创建一个全0的长为$N$的向量,并将其第$i$位设成1。这样一来,每个词就表示成了一个长度为$N$的向量,可以直接被神经网络使用。\n",
17 | "\n",
18 | "虽然one-hot词向量构造起来很容易,但通常并不是一个好选择。一个主要的原因是,one-hot词向量无法准确表达不同词之间的相似度,如我们常常使用的余弦相似度。对于向量$\\boldsymbol{x}, \\boldsymbol{y} \\in \\mathbb{R}^d$,它们的余弦相似度是它们之间夹角的余弦值\n",
19 | "\n",
20 | "$$\\frac{\\boldsymbol{x}^\\top \\boldsymbol{y}}{\\|\\boldsymbol{x}\\| \\|\\boldsymbol{y}\\|} \\in [-1, 1].$$\n",
21 | "\n",
22 | "由于任何两个不同词的one-hot向量的余弦相似度都为0,多个不同词之间的相似度难以通过one-hot向量准确地体现出来。\n",
23 | "\n",
24 | "word2vec工具的提出正是为了解决上面这个问题 [1]。它将每个词表示成一个定长的向量,并使得这些向量能较好地表达不同词之间的相似和类比关系。word2vec工具包含了两个模型,即跳字模型(skip-gram)[2] 和连续词袋模型(continuous bag of words,CBOW)[3]。接下来让我们分别介绍这两个模型以及它们的训练方法。\n",
25 | "\n",
26 | "\n",
27 | "## 跳字模型\n",
28 | "\n",
29 | "跳字模型假设基于某个词来生成它在文本序列周围的词。举个例子,假设文本序列是“the”“man”“loves”“his”“son”。以“loves”作为中心词,设背景窗口大小为2。如图10.1所示,跳字模型所关心的是,给定中心词“loves”,生成与它距离不超过2个词的背景词“the”“man”“his”“son”的条件概率,即\n",
30 | "\n",
31 | "$$P(\\textrm{``the\"},\\textrm{``man\"},\\textrm{``his\"},\\textrm{``son\"}\\mid\\textrm{``loves\"}).$$\n",
32 | "\n",
33 | "假设给定中心词的情况下,背景词的生成是相互独立的,那么上式可以改写成\n",
34 | "\n",
35 | "$$P(\\textrm{``the\"}\\mid\\textrm{``loves\"})\\cdot P(\\textrm{``man\"}\\mid\\textrm{``loves\"})\\cdot P(\\textrm{``his\"}\\mid\\textrm{``loves\"})\\cdot P(\\textrm{``son\"}\\mid\\textrm{``loves\"}).$$\n",
36 | "\n",
37 | "\n",
38 | "\n",
39 | "\n",
40 | "在跳字模型中,每个词被表示成两个$d$维向量,用来计算条件概率。假设这个词在词典中索引为$i$,当它为中心词时向量表示为$\\boldsymbol{v}_i\\in\\mathbb{R}^d$,而为背景词时向量表示为$\\boldsymbol{u}_i\\in\\mathbb{R}^d$。设中心词$w_c$在词典中索引为$c$,背景词$w_o$在词典中索引为$o$,给定中心词生成背景词的条件概率可以通过对向量内积做softmax运算而得到:\n",
41 | "\n",
42 | "$$P(w_o \\mid w_c) = \\frac{\\text{exp}(\\boldsymbol{u}_o^\\top \\boldsymbol{v}_c)}{ \\sum_{i \\in \\mathcal{V}} \\text{exp}(\\boldsymbol{u}_i^\\top \\boldsymbol{v}_c)},$$\n",
43 | "\n",
44 | "其中词典索引集$\\mathcal{V} = \\{0, 1, \\ldots, |\\mathcal{V}|-1\\}$。假设给定一个长度为$T$的文本序列,设时间步$t$的词为$w^{(t)}$。假设给定中心词的情况下背景词的生成相互独立,当背景窗口大小为$m$时,跳字模型的似然函数即给定任一中心词生成所有背景词的概率\n",
45 | "\n",
46 | "$$ \\prod_{t=1}^{T} \\prod_{-m \\leq j \\leq m,\\ j \\neq 0} P(w^{(t+j)} \\mid w^{(t)}),$$\n",
47 | "\n",
48 | "这里小于1和大于$T$的时间步可以忽略。\n",
49 | "\n",
50 | "### 训练跳字模型\n",
51 | "\n",
52 | "跳字模型的参数是每个词所对应的中心词向量和背景词向量。训练中我们通过最大化似然函数来学习模型参数,即最大似然估计。这等价于最小化以下损失函数:\n",
53 | "\n",
54 | "$$ - \\sum_{t=1}^{T} \\sum_{-m \\leq j \\leq m,\\ j \\neq 0} \\text{log}\\, P(w^{(t+j)} \\mid w^{(t)}).$$\n",
55 | "\n",
56 | "\n",
57 | "如果使用随机梯度下降,那么在每一次迭代里我们随机采样一个较短的子序列来计算有关该子序列的损失,然后计算梯度来更新模型参数。梯度计算的关键是条件概率的对数有关中心词向量和背景词向量的梯度。根据定义,首先看到\n",
58 | "\n",
59 | "\n",
60 | "$$\\log P(w_o \\mid w_c) =\n",
61 | "\\boldsymbol{u}_o^\\top \\boldsymbol{v}_c - \\log\\left(\\sum_{i \\in \\mathcal{V}} \\text{exp}(\\boldsymbol{u}_i^\\top \\boldsymbol{v}_c)\\right)$$\n",
62 | "\n",
63 | "通过微分,我们可以得到上式中$\\boldsymbol{v}_c$的梯度\n",
64 | "\n",
65 | "$$\n",
66 | "\\begin{aligned}\n",
67 | "\\frac{\\partial \\text{log}\\, P(w_o \\mid w_c)}{\\partial \\boldsymbol{v}_c} \n",
68 | "&= \\boldsymbol{u}_o - \\frac{\\sum_{j \\in \\mathcal{V}} \\exp(\\boldsymbol{u}_j^\\top \\boldsymbol{v}_c)\\boldsymbol{u}_j}{\\sum_{i \\in \\mathcal{V}} \\exp(\\boldsymbol{u}_i^\\top \\boldsymbol{v}_c)}\\\\\n",
69 | "&= \\boldsymbol{u}_o - \\sum_{j \\in \\mathcal{V}} \\left(\\frac{\\text{exp}(\\boldsymbol{u}_j^\\top \\boldsymbol{v}_c)}{ \\sum_{i \\in \\mathcal{V}} \\text{exp}(\\boldsymbol{u}_i^\\top \\boldsymbol{v}_c)}\\right) \\boldsymbol{u}_j\\\\ \n",
70 | "&= \\boldsymbol{u}_o - \\sum_{j \\in \\mathcal{V}} P(w_j \\mid w_c) \\boldsymbol{u}_j.\n",
71 | "\\end{aligned}\n",
72 | "$$\n",
73 | "\n",
74 | "它的计算需要词典中所有词以$w_c$为中心词的条件概率。有关其他词向量的梯度同理可得。\n",
75 | "\n",
76 | "训练结束后,对于词典中的任一索引为$i$的词,我们均得到该词作为中心词和背景词的两组词向量$\\boldsymbol{v}_i$和$\\boldsymbol{u}_i$。在自然语言处理应用中,一般使用跳字模型的中心词向量作为词的表征向量。\n",
77 | "\n",
78 | "\n",
79 | "## 连续词袋模型\n",
80 | "\n",
81 | "连续词袋模型与跳字模型类似。与跳字模型最大的不同在于,连续词袋模型假设基于某中心词在文本序列前后的背景词来生成该中心词。在同样的文本序列“the”“man”“loves”“his”“son”里,以“loves”作为中心词,且背景窗口大小为2时,连续词袋模型关心的是,给定背景词“the”“man”“his”“son”生成中心词“loves”的条件概率(如图10.2所示),也就是\n",
82 | "\n",
83 | "$$P(\\textrm{``loves\"}\\mid\\textrm{``the\"},\\textrm{``man\"},\\textrm{``his\"},\\textrm{``son\"}).$$\n",
84 | "\n",
85 | "\n",
86 | "\n",
87 | "因为连续词袋模型的背景词有多个,我们将这些背景词向量取平均,然后使用和跳字模型一样的方法来计算条件概率。设$\\boldsymbol{v_i}\\in\\mathbb{R}^d$和$\\boldsymbol{u_i}\\in\\mathbb{R}^d$分别表示词典中索引为$i$的词作为背景词和中心词的向量(注意符号的含义与跳字模型中的相反)。设中心词$w_c$在词典中索引为$c$,背景词$w_{o_1}, \\ldots, w_{o_{2m}}$在词典中索引为$o_1, \\ldots, o_{2m}$,那么给定背景词生成中心词的条件概率\n",
88 | "\n",
89 | "$$P(w_c \\mid w_{o_1}, \\ldots, w_{o_{2m}}) = \\frac{\\text{exp}\\left(\\frac{1}{2m}\\boldsymbol{u}_c^\\top (\\boldsymbol{v}_{o_1} + \\ldots + \\boldsymbol{v}_{o_{2m}}) \\right)}{ \\sum_{i \\in \\mathcal{V}} \\text{exp}\\left(\\frac{1}{2m}\\boldsymbol{u}_i^\\top (\\boldsymbol{v}_{o_1} + \\ldots + \\boldsymbol{v}_{o_{2m}}) \\right)}.$$\n",
90 | "\n",
91 | "为了让符号更加简单,我们记$\\mathcal{W}_o= \\{w_{o_1}, \\ldots, w_{o_{2m}}\\}$,且$\\bar{\\boldsymbol{v}}_o = \\left(\\boldsymbol{v}_{o_1} + \\ldots + \\boldsymbol{v}_{o_{2m}} \\right)/(2m)$,那么上式可以简写成\n",
92 | "\n",
93 | "$$P(w_c \\mid \\mathcal{W}_o) = \\frac{\\exp\\left(\\boldsymbol{u}_c^\\top \\bar{\\boldsymbol{v}}_o\\right)}{\\sum_{i \\in \\mathcal{V}} \\exp\\left(\\boldsymbol{u}_i^\\top \\bar{\\boldsymbol{v}}_o\\right)}.$$\n",
94 | "\n",
95 | "给定一个长度为$T$的文本序列,设时间步$t$的词为$w^{(t)}$,背景窗口大小为$m$。连续词袋模型的似然函数是由背景词生成任一中心词的概率\n",
96 | "\n",
97 | "$$ \\prod_{t=1}^{T} P(w^{(t)} \\mid w^{(t-m)}, \\ldots, w^{(t-1)}, w^{(t+1)}, \\ldots, w^{(t+m)}).$$\n",
98 | "\n",
99 | "### 训练连续词袋模型\n",
100 | "\n",
101 | "训练连续词袋模型同训练跳字模型基本一致。连续词袋模型的最大似然估计等价于最小化损失函数\n",
102 | "\n",
103 | "$$ -\\sum_{t=1}^T \\text{log}\\, P(w^{(t)} \\mid w^{(t-m)}, \\ldots, w^{(t-1)}, w^{(t+1)}, \\ldots, w^{(t+m)}).$$\n",
104 | "\n",
105 | "注意到\n",
106 | "\n",
107 | "$$\\log\\,P(w_c \\mid \\mathcal{W}_o) = \\boldsymbol{u}_c^\\top \\bar{\\boldsymbol{v}}_o - \\log\\,\\left(\\sum_{i \\in \\mathcal{V}} \\exp\\left(\\boldsymbol{u}_i^\\top \\bar{\\boldsymbol{v}}_o\\right)\\right).$$\n",
108 | "\n",
109 | "通过微分,我们可以计算出上式中条件概率的对数有关任一背景词向量$\\boldsymbol{v}_{o_i}$($i = 1, \\ldots, 2m$)的梯度\n",
110 | "\n",
111 | "$$\\frac{\\partial \\log\\, P(w_c \\mid \\mathcal{W}_o)}{\\partial \\boldsymbol{v}_{o_i}} = \\frac{1}{2m} \\left(\\boldsymbol{u}_c - \\sum_{j \\in \\mathcal{V}} \\frac{\\exp(\\boldsymbol{u}_j^\\top \\bar{\\boldsymbol{v}}_o)\\boldsymbol{u}_j}{ \\sum_{i \\in \\mathcal{V}} \\text{exp}(\\boldsymbol{u}_i^\\top \\bar{\\boldsymbol{v}}_o)} \\right) = \\frac{1}{2m}\\left(\\boldsymbol{u}_c - \\sum_{j \\in \\mathcal{V}} P(w_j \\mid \\mathcal{W}_o) \\boldsymbol{u}_j \\right).$$\n",
112 | "\n",
113 | "有关其他词向量的梯度同理可得。同跳字模型不一样的一点在于,我们一般使用连续词袋模型的背景词向量作为词的表征向量。\n",
114 | "\n",
115 | "## 小结\n",
116 | "\n",
117 | "* 词向量是用来表示词的向量。把词映射为实数域向量的技术也叫词嵌入。\n",
118 | "* word2vec包含跳字模型和连续词袋模型。跳字模型假设基于中心词来生成背景词。连续词袋模型假设基于背景词来生成中心词。\n",
119 | "\n",
120 | "\n",
121 | "## 练习\n",
122 | "\n",
123 | "* 每次梯度的计算复杂度是多少?当词典很大时,会有什么问题?\n",
124 | "* 英语中有些固定短语由多个词组成,如“new york”。如何训练它们的词向量?提示:可参考word2vec论文第4节 [2]。\n",
125 | "* 让我们以跳字模型为例思考word2vec模型的设计。跳字模型中两个词向量的内积与余弦相似度有什么关系?对语义相近的一对词来说,为什么它们的词向量的余弦相似度可能会高?\n",
126 | "\n",
127 | "\n",
128 | "\n",
129 | "\n",
130 | "\n",
131 | "## 参考文献\n",
132 | "\n",
133 | "[1] word2vec工具。https://code.google.com/archive/p/word2vec/\n",
134 | "\n",
135 | "[2] Mikolov, T., Sutskever, I., Chen, K., Corrado, G. S., & Dean, J. (2013). Distributed representations of words and phrases and their compositionality. In Advances in neural information processing systems (pp. 3111-3119).\n",
136 | "\n",
137 | "[3] Mikolov, T., Chen, K., Corrado, G., & Dean, J. (2013). Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781.\n",
138 | "\n",
139 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/4203)\n",
140 | "\n",
141 | ""
142 | ]
143 | }
144 | ],
145 | "metadata": {
146 | "kernelspec": {
147 | "display_name": "Python 3",
148 | "language": "python",
149 | "name": "python3"
150 | },
151 | "language_info": {
152 | "codemirror_mode": {
153 | "name": "ipython",
154 | "version": 3
155 | },
156 | "file_extension": ".py",
157 | "mimetype": "text/x-python",
158 | "name": "python",
159 | "nbconvert_exporter": "python",
160 | "pygments_lexer": "ipython3",
161 | "version": "3.7.4"
162 | },
163 | "toc": {
164 | "base_numbering": 1,
165 | "nav_menu": {},
166 | "number_sections": true,
167 | "sideBar": true,
168 | "skip_h1_title": false,
169 | "title_cell": "Table of Contents",
170 | "title_sidebar": "Contents",
171 | "toc_cell": false,
172 | "toc_position": {},
173 | "toc_section_display": true,
174 | "toc_window_display": false
175 | }
176 | },
177 | "nbformat": 4,
178 | "nbformat_minor": 4
179 | }
180 |
--------------------------------------------------------------------------------
/chapter_optimization/index.md:
--------------------------------------------------------------------------------
1 | # 优化算法
2 |
3 | 如果你一直按照本书的顺序读到这里,那么你已经使用了优化算法来训练深度学习模型。具体来说,在训练模型时,我们会使用优化算法不断迭代模型参数以降低模型损失函数的值。当迭代终止时,模型的训练随之终止,此时的模型参数就是模型通过训练所学习到的参数。
4 |
5 | 优化算法对于深度学习十分重要。一方面,训练一个复杂的深度学习模型可能需要数小时、数日,甚至数周时间,而优化算法的表现直接影响模型的训练效率;另一方面,理解各种优化算法的原理以及其中超参数的意义将有助于我们更有针对性地调参,从而使深度学习模型表现更好。
6 |
7 | 本章将详细介绍深度学习中常用的优化算法。
8 |
9 | ```eval_rst
10 |
11 | .. toctree::
12 | :maxdepth: 2
13 |
14 | optimization-intro
15 | gd-sgd
16 | minibatch-sgd
17 | momentum
18 | adagrad
19 | rmsprop
20 | adadelta
21 | adam
22 | ```
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/chapter_preface/preface.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 前言\n",
8 | "\n",
9 | "就在几年前,不管在大公司还是创业公司,都鲜有工程师和科学家来将深度学习应用到智能产品与服务中。作为深度学习前身的神经网络,才刚刚摆脱被机器学习学术界认为是过时工具的印象。那个时候,即使是机器学习也非新闻头条的常客。它仅仅被看作是一门具有前瞻性,并拥有一系列小范围实际应用的学科。在包含计算机视觉和自然语言处理在内的实际应用通常需要大量的相关领域知识:这些实际应用被视为相互独立的领域,而机器学习只占其中一小部分。\n",
10 | "\n",
11 | "然而仅仅在这几年之内,深度学习便令全世界大吃一惊。它非常有力地推动了计算机视觉、自然语言处理、自动语音识别、强化学习和统计建模等多个领域的快速发展。随着这些领域的不断进步,我们现在可以制造自动驾驶的汽车,基于短信、邮件甚至电话的自动回复系统,以及在围棋中击败最优秀人类选手的软件。这些由深度学习带来的新工具也正产生着广泛的影响:它们改变了电影制作和疾病诊断的方式,并在从天体物理学到生物学等各个基础科学中扮演越来越重要的角色。\n",
12 | "\n",
13 | "与此同时,深度学习也给它的使用者们带来了独一无二的挑战:任何单一的应用都汇集了各学科的知识。具体来说,应用深度学习需要同时理解:\n",
14 | "\n",
15 | "1. 问题的动机和特点;\n",
16 | "1. 将大量不同类型神经网络层通过特定方式组合在一起的模型背后的数学原理;\n",
17 | "1. 在原始数据上拟合极复杂的深层模型的优化算法;\n",
18 | "1. 有效训练模型、避免数值计算陷阱以及充分利用硬件性能所需的工程技能;\n",
19 | "1. 为解决方案挑选合适的变量(超参数)组合的经验。\n",
20 | "\n",
21 | "同样,我们几位作者也面临前所未有的挑战:我们需要在有限的篇幅里糅合深度学习的多方面知识,从而使读者能够较快理解并应用深度学习技术。本书代表了我们的一种尝试:我们将教给读者概念、背景知识和代码;我们将在同一个地方阐述剖析问题所需的批判性思维、解决问题所需的数学知识,以及实现解决方案所需的工程技能。\n",
22 | "\n",
23 | "\n",
24 | "## 包含代码、数学、网页、讨论的统一资源\n",
25 | "\n",
26 | "我们在2017年7月启动了写作这本书的项目。当时我们需要向用户解释Apache MXNet在那时的新接口Gluon。不幸的是,我们并没有找到任何一个资源可以同时满足以下几点需求:\n",
27 | "\n",
28 | "1. 包含较新的方法和应用,并不断更新;\n",
29 | "1. 广泛覆盖现代深度学习技术并具有一定的技术深度;\n",
30 | "1. 既是严谨的教科书,又是包含可运行代码的生动的教程。\n",
31 | "\n",
32 | "那时,我们在博客和GitHub上找到了大量的演示特定深度学习框架(例如用TensorFlow进行数值计算)或实现特定模型(例如AlexNet、ResNet等)的示例代码。这些示例代码的一大价值在于提供了教科书或论文往往省略的实现细节,比如数据的处理和运算的高效率实现。如果不了解这些,即使能将算法倒背如流,也难以将算法应用到自己的项目中去。此外,这些示例代码还使得用户能通过观察修改代码所导致的结果变化而快速验证想法、积累经验。因此,我们坚信动手实践对于学习深度学习的重要性。然而可惜的是,这些示例代码通常侧重于如何实现给定的方法,却忽略了有关算法设计的探究或者实现细节的解释。虽然在像Distill这样的网站和某些博客上出现了一些有关算法设计和实现细节的讨论,但它们常常缺少示例代码,并通常仅覆盖深度学习的一小部分。\n",
33 | "\n",
34 | "另外,我们欣喜地看到了一些有关深度学习的教科书不断问世,其中最著名的要数Goodfellow、Bengio和Courville的《深度学习》。该书梳理了深度学习背后的众多概念与方法,是一本极为优秀的教材。然而,这类资源并没有将概念描述与实际代码相结合,以至于有时会令读者对如何实现它们感到毫无头绪。除了这些以外,商业课程提供者们虽然制作了众多的优质资源,但它们的付费门槛依然令不少用户望而生畏。\n",
35 | "\n",
36 | "正因为这样,深度学习用户,尤其是初学者,往往不得不参考来源不同的多种资料。例如,通过教科书或者论文来掌握算法及其相关数学知识,阅读线上文档学习深度学习框架的使用方法,然后寻找感兴趣的算法在这个框架上的实现并摸索如何将它应用到自己的项目中去。如果你正亲身经历这一过程,你可能会感到痛苦:不同来源的资料有时难以相互一一对应,即便能够对应也可能需要花费大量的精力。例如,我们需要将某篇论文公式中的数学变量与某段网上实现中的程序变量一一对应,并在代码中找到论文可能没交代清楚的实现细节,甚至要为运行不同的代码安装不同的运行环境。\n",
37 | "\n",
38 | "针对以上存在的痛点,我们正在着手创建一个为实现以下目标的统一资源:\n",
39 | "\n",
40 | "1. 所有人均可在网上免费获取;\n",
41 | "1. 提供足够的技术深度,从而帮助读者实际成为深度学习应用科学家:既理解数学原理,又能够实现并不断改进方法;\n",
42 | "1. 包含可运行的代码,为读者展示如何在实际中解决问题。这样不仅直接将数学公式对应成实际代码,而且可以修改代码、观察结果并及时获取经验;\n",
43 | "1. 允许我们和整个社区不断快速迭代内容,从而紧跟仍在高速发展的深度学习领域;\n",
44 | "1. 由包含有关技术细节问答的论坛作为补充,使大家可以相互答疑并交换经验。\n",
45 | "\n",
46 | "这些目标往往互有冲突:公式、定理和引用最容易通过LaTeX进行管理和展示,代码自然应该用简单易懂的Python描述,而网页本身应该是一堆HTML及配套的CSS和JavaScript。此外,我们希望这个资源可以作为可执行代码、实体书以及网站。然而,目前并没有任何工具可以完美地满足以上所有需求。\n",
47 | "\n",
48 | "因此,我们不得不自己来集成这样的一个工作流。我们决定在GitHub上分享源代码并允许提交编辑,通过Jupyter记事本来整合代码、公式、文本、图片等,使用Sphinx作为渲染引擎来生成不同格式的输出,并使用Discourse作为论坛。虽然我们的系统尚未完善,但这些选择在互有冲突的目标之间取得了较好的折中。这很可能是使用这种集成工作流发布的第一本书。\n",
49 | "\n",
50 | "\n",
51 | "## 从在线课程到纸质书\n",
52 | "\n",
53 | "本书的两位中国作者曾每周末在线免费讲授“动手学深度学习”系列课程。课程的讲义自然成为了本书内容的蓝本。这个课程持续了5个月,其间近3,000名同学参与了讨论,并贡献了近5,000多个有价值的讨论,特别是其中几个参加比赛的练习很受欢迎。这个课程的受欢迎程度出乎我们的意料。尽管我们将课件和课程视频都公开在了网上,但我们同时觉得出版成纸质书也许能让更多喜爱纸质阅读的读者受益。因此,我们委托人民邮电出版社来出版这本书。\n",
54 | "\n",
55 | "从蓝本到成书花费了更多的时间。我们对所有涉及的所有技术点补充了背景介绍,并使用了更加严谨的写作风格,还对版式和示意图做了大量修改。书中所有的代码执行结果都是自动生成的,任何改动都会触发对书中每一段代码的测试,以保证读者在动手实践时能复现结果。\n",
56 | "\n",
57 | "我们的初衷是让更多人更容易地使用深度学习。为了让大家能够便利地获取这些资源,我们保留了免费的网站内容,并且通过不收取出版稿费的方式来降低纸质书的价格,使更多人有能力购买。\n",
58 | "\n",
59 | "\n",
60 | "## 致谢\n",
61 | "\n",
62 | "我们无比感谢本书的中英文版稿件贡献者和论坛用户们。他们帮助增添或改进了书中内容并提供了有价值的反馈。特别地,我们要感谢每一位为这本中文版开源书提交内容改动的贡献者们。这些贡献者的GitHub用户名或姓名是(排名不分先后):许致中、邓杨、崔永明、Aaron Sun、陈斌斌、曾元豪、周长安、李昂、王晨光、Chaitanya Prakash Bapat、金杰、赵小华、戴作卓、刘捷、张建浩、梓善、唐佐林、DHRUV536、丁海、郭晶博、段弘、杨英明、林海滨、范舟、李律、李阳、夏鲁豫、张鹏、徐曦、Kangel Zenn、Richard CUI、郭云鹏、hank123456、金颢、hardfish82、何通、高剑伟、王海龙、htoooth、hufuyu、Kun Hu、刘俊朋、沈海晨、韩承宇、张钟越、罗晶、jiqirer、贾忠祥、姜蔚蔚、田宇琛、王曜、李凯、兰青、王乐园、Leonard Lausen、张雷、鄭宇翔、linbojin、lingss0918、杨大卫、刘佳、戴玮、贾老坏、陆明、张亚鹏、李超、周俊佐、Liang Jinzheng、童话、彭小平、王皓、彭大发、彭远卓、黄瓒、解浚源、彭艺宇、刘铭、吴俊、刘睿、张绍明、施洪、刘天池、廖翊康、施行健、孙畔勇、查晟、郑帅、任杰骥、王海珍、王鑫、wangzhe258369、王振荟、周军、吴侃、汪磊、wudayo、徐驰、夏根源、何孝霆、谢国超、刘新伟、肖梅峰、黄晓烽、燕文磊、王贻达、马逸飞、邱怡轩、吴勇、杨培文、余峰、Peng Yu、王雨薇、王宇翔、喻心悦、赵越、刘忆智、张航、郑达、陈志、周航、张帜、周远、汪汇泽、谢乘胜、aitehappiness、张满闯、孙焱、林健、董进、陈宇泓、魏耀武、田慧媛、陈琛、许柏楠、bowcr、张宇楠、王晨、李居正、王宗冰、刘垣德。谢谢你们为每一位读者改进这本开源书。\n",
63 | "\n",
64 | "本书的初稿在中国科学技术大学、上海财经大学的“深度学习”课程以及浙江大学的“物联网与信息处理”课程和上海交通大学的“面向视觉识别的卷积神经网络”课程中被用于教学。我们在此感谢这些课程的师生,特别是连德富教授、王智教授和罗家佳教授,感谢他们对改进本书提供的宝贵意见。\n",
65 | "\n",
66 | "此外,我们感谢Amazon Web Services,特别是Swami Sivasubramanian、Raju Gulabani、Charlie Bell和Andrew Jassy在我们撰写本书时给予的慷慨支持。如果没有可用的时间、资源以及来自同事们的讨论和鼓励,就没有这本书的项目。我们还要感谢Apache MXNet团队实现了很多本书所使用的特性。另外,经过同事们的校勘,本书的质量得到了极大的提升。在此我们一一列出章节和校勘人,以表示我们由衷的感谢:引言的校勘人为金颢,预备知识的校勘人为吴俊,深度学习基础的校勘人为张航、王晨光、林海滨,深度学习计算的校勘人为查晟,卷积神经网络的校勘人为张帜、何通,循环神经网络的校勘人为查晟,优化算法的校勘人为郑帅,计算性能的校勘人为郑达、吴俊,计算机视觉的校勘人为解浚源、张帜、何通、张航,自然语言处理的校勘人为王晨光,附录的校勘人为金颢。\n",
67 | "\n",
68 | "感谢将门创投,特别是王慧、高欣欣、常铭珊和白玉为本书的两位中国作者在讲授”动手学深度学习“系列课程时所提供的平台。感谢所有参与这一系列课程的数千名同学们。感谢Amazon Web Services中国团队的同事们,特别是费良宏和王晨对作者的支持与鼓励。感谢本书论坛的3位版主:王鑫、夏鲁豫和杨培文。他们牺牲了自己宝贵的休息时间来回复大家的提问。感谢人民邮电出版社的杨海玲编辑为我们在本书的出版过程中所提供的各种帮助。\n",
69 | "\n",
70 | "最后,我们要感谢我们的家人。谢谢你们一直陪伴着我们。\n",
71 | "\n",
72 | "\n",
73 | "## 教学资源和反馈\n",
74 | "\n",
75 | "本书的英文版Dive into Deep Learning是加州大学伯克利分校2019年春学期“Introduction to Deep Learning”(深度学习导论)课程的教材。截至2019年春学期,本书中的内容已被全球15所知名大学用于教学。本书的学习社区、免费教学资源(课件、教学视频、更多习题等),以及用于本书学习或教学的免费计算资源(仅限学生和老师)的申请方法在本书网站 https://zh.d2l.ai 上发布。诚然,将算法、公式、图片、代码和样例统一进一本适合阅读的书,而且又是一系列有交互式体验的Jupyter记事本,是对我们极大的挑战。书中难免有很多疏忽的地方,敬请大家原谅,并希望你能通过每一节后面的二维码向我们反馈问题。\n",
76 | "\n",
77 | "结尾处,附上陆游的一句诗作为勉励:\n",
78 | "\n",
79 | "> “纸上得来终觉浅,绝知此事要躬行。”\n",
80 | "\n",
81 | "\n",
82 | "阿斯顿·张、李沐、扎卡里 C. 立顿、亚历山大 J. 斯莫拉\n",
83 | "\n",
84 | "2019年4月"
85 | ]
86 | }
87 | ],
88 | "metadata": {
89 | "kernelspec": {
90 | "display_name": "Python [conda env:pytorch]",
91 | "language": "python",
92 | "name": "conda-env-pytorch-py"
93 | },
94 | "language_info": {
95 | "codemirror_mode": {
96 | "name": "ipython",
97 | "version": 3
98 | },
99 | "file_extension": ".py",
100 | "mimetype": "text/x-python",
101 | "name": "python",
102 | "nbconvert_exporter": "python",
103 | "pygments_lexer": "ipython3",
104 | "version": "3.6.9"
105 | }
106 | },
107 | "nbformat": 4,
108 | "nbformat_minor": 4
109 | }
110 |
--------------------------------------------------------------------------------
/chapter_prerequisite/autograd.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 自动求梯度\n",
8 | "\n",
9 | "在深度学习中,我们经常需要对函数求梯度(gradient)。本节将介绍如何使用PyTorch提供的`autograd`模块来自动求梯度。如果对本节中的数学概念(如梯度)不是很熟悉,可以参阅附录中[“数学基础”](../chapter_appendix/math.ipynb)一节。"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import torch\n",
19 | "from torch import autograd"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## 简单例子\n",
27 | "\n",
28 | "我们先看一个简单例子:对函数 $y = 2\\boldsymbol{x}^{\\top}\\boldsymbol{x}$ 求关于列向量 $\\boldsymbol{x}$ 的梯度。我们先创建变量`x`,并赋初值。"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {},
35 | "outputs": [
36 | {
37 | "data": {
38 | "text/plain": [
39 | "tensor([[0.],\n",
40 | " [1.],\n",
41 | " [2.],\n",
42 | " [3.]])"
43 | ]
44 | },
45 | "execution_count": 2,
46 | "metadata": {},
47 | "output_type": "execute_result"
48 | }
49 | ],
50 | "source": [
51 | "x = torch.arange(4, dtype=torch.float).reshape(4, 1)\n",
52 | "x"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "为了求有关变量`x`的梯度,我们需要设置`x`的`requires_grad`属性为`True`。\n",
60 | "\n",
61 | "\n",
62 | "**注: 在 PyTorch 中设置一个 `Tensor` 的 `requires_grad` 属性为 `True` 要求这个 `Tensor` 的数据类型必须是 `float`**"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 3,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "x.requires_grad = True"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "下面定义有关变量`x`的函数。如果一个`Tensor`的`requires_grad`属性被设置为`True`,那么所有依赖于它运算得到的`Tensor`的`requires_grad`属性也会被设置为`True`。"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 4,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "data": {
88 | "text/plain": [
89 | "True"
90 | ]
91 | },
92 | "execution_count": 4,
93 | "metadata": {},
94 | "output_type": "execute_result"
95 | }
96 | ],
97 | "source": [
98 | "y = 2 * torch.mm(x.t(), x)\n",
99 | "y.requires_grad"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "由于`x`的形状为(4, 1),`y`是一个标量。接下来我们可以通过调用`backward`函数自动求梯度。需要注意的是,如果`y`不是一个标量,那么需要调用`backward(torch.ones_like(y))`函数自动求梯度。"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 5,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "y.backward()"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "函数 $y = 2\\boldsymbol{x}^{\\top}\\boldsymbol{x}$ 关于$\\boldsymbol{x}$ 的梯度应为$4\\boldsymbol{x}$。现在我们来验证一下求出来的梯度是正确的。"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 6,
128 | "metadata": {},
129 | "outputs": [
130 | {
131 | "data": {
132 | "text/plain": [
133 | "tensor([[ 0.],\n",
134 | " [ 4.],\n",
135 | " [ 8.],\n",
136 | " [12.]])"
137 | ]
138 | },
139 | "execution_count": 6,
140 | "metadata": {},
141 | "output_type": "execute_result"
142 | }
143 | ],
144 | "source": [
145 | "assert (x.grad - 4 * x).norm().item() == 0\n",
146 | "x.grad"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "## 训练模式和预测模式\n",
154 | "\n",
155 | "默认情况下PyTorch处于训练模式,但在预测模式时,我们不需要对变量进行求导,可以使用`with torch.no_grad():`,该上下文管理器后面进行的计算所得到的`Tensor`都不会保存梯度。\n",
156 | "\n",
157 | "**注:也可以通过 `model.train()` 和 `model.eval()` 切换训练模型的两种模式。**"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 7,
163 | "metadata": {},
164 | "outputs": [
165 | {
166 | "name": "stdout",
167 | "output_type": "stream",
168 | "text": [
169 | "True\n",
170 | "False\n"
171 | ]
172 | }
173 | ],
174 | "source": [
175 | "print(y.requires_grad)\n",
176 | "with torch.no_grad():\n",
177 | " y2 = 2 * torch.mm(x.t(), x)\n",
178 | " print(y2.requires_grad)"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "在有些情况下,同一个模型在训练模式和预测模式下的行为并不相同。我们会在后面的章节详细介绍这些区别。\n",
186 | "\n",
187 | "\n",
188 | "## 对Python控制流求梯度\n",
189 | "\n",
190 | "使用PyTorch的一个便利之处是,即使函数的计算图包含了Python的控制流(如条件和循环控制),我们也有可能对变量求梯度。\n",
191 | "\n",
192 | "考虑下面程序,其中包含Python的条件和循环控制。需要强调的是,这里循环(while循环)迭代的次数和条件判断(if语句)的执行都取决于输入`a`的值。"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 8,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "def f(a):\n",
202 | " b = a * 2\n",
203 | " while b.norm().item() < 1000:\n",
204 | " b = b * 2\n",
205 | " if b.sum().item() > 0:\n",
206 | " c = b\n",
207 | " else:\n",
208 | " c = 100 * b\n",
209 | " return c"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "metadata": {},
215 | "source": [
216 | "我们像之前一样使用`record`函数记录计算,并调用`backward`函数求梯度。"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": 9,
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "a = torch.randn(1)\n",
226 | "a.requires_grad = True\n",
227 | "c = f(a)\n",
228 | "c.backward()"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "我们来分析一下上面定义的`f`函数。事实上,给定任意输入`a`,其输出必然是 `f(a) = x * a`的形式,其中标量系数`x`的值取决于输入`a`。由于`c = f(a)`有关`a`的梯度为`x`,且值为`c / a`,我们可以像下面这样验证对本例中控制流求梯度的结果的正确性。"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 10,
241 | "metadata": {},
242 | "outputs": [
243 | {
244 | "data": {
245 | "text/plain": [
246 | "tensor([1], dtype=torch.uint8)"
247 | ]
248 | },
249 | "execution_count": 10,
250 | "metadata": {},
251 | "output_type": "execute_result"
252 | }
253 | ],
254 | "source": [
255 | "a.grad == c / a"
256 | ]
257 | },
258 | {
259 | "cell_type": "markdown",
260 | "metadata": {},
261 | "source": [
262 | "## 小结\n",
263 | "\n",
264 | "* PyTorch提供`autograd`模块来自动化求导过程。\n",
265 | "* PyTorch的`autograd`模块可以对一般的命令式程序进行求导。\n",
266 | "* PyTorch的运行模式包括训练模式和预测模式。我们可以通过`autograd.is_training()`来判断运行模式。\n",
267 | "\n",
268 | "## 练习\n",
269 | "\n",
270 | "* 在本节对控制流求梯度的例子中,把变量`a`改成一个随机向量或矩阵。此时计算结果`c`不再是标量,运行结果将有何变化?该如何分析该结果?\n",
271 | "* 重新设计一个对控制流求梯度的例子。运行并分析结果。\n",
272 | "\n",
273 | "\n",
274 | "\n",
275 | "\n",
276 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/744)\n",
277 | "\n",
278 | ""
279 | ]
280 | }
281 | ],
282 | "metadata": {
283 | "kernelspec": {
284 | "display_name": "Python [conda env:pytorch]",
285 | "language": "python",
286 | "name": "conda-env-pytorch-py"
287 | },
288 | "language_info": {
289 | "codemirror_mode": {
290 | "name": "ipython",
291 | "version": 3
292 | },
293 | "file_extension": ".py",
294 | "mimetype": "text/x-python",
295 | "name": "python",
296 | "nbconvert_exporter": "python",
297 | "pygments_lexer": "ipython3",
298 | "version": "3.6.9"
299 | },
300 | "toc": {
301 | "base_numbering": 1,
302 | "nav_menu": {},
303 | "number_sections": true,
304 | "sideBar": true,
305 | "skip_h1_title": false,
306 | "title_cell": "Table of Contents",
307 | "title_sidebar": "Contents",
308 | "toc_cell": false,
309 | "toc_position": {},
310 | "toc_section_display": true,
311 | "toc_window_display": false
312 | }
313 | },
314 | "nbformat": 4,
315 | "nbformat_minor": 4
316 | }
317 |
--------------------------------------------------------------------------------
/chapter_prerequisite/index.md:
--------------------------------------------------------------------------------
1 | # 预备知识
2 |
3 | 在动手学习之前,我们需要获取本书的代码,并安装运行本书的代码所需要的软件。作为动手学深度学习的基础,我们还需要了解如何对内存中的数据进行操作,以及对函数求梯度的方法。最后,我们应养成主动查阅文档来学习代码的良好习惯。
4 |
5 | ```eval_rst
6 |
7 | .. toctree::
8 | :maxdepth: 2
9 |
10 | install
11 | tensor
12 | autograd
13 | lookup-api
14 |
15 | ```
16 |
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/chapter_prerequisite/install.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 获取和运行本书的代码\n",
8 | "\n",
9 | "本节将介绍如何获取本书的代码和安装运行代码所依赖的软件。虽然跳过本节不会影响后面的阅读,但我们还是强烈建议读者按照下面的步骤来动手操作一遍。本书大部分章节的练习都涉及改动代码并观察运行结果。因此,本节是完成这些练习的基础。\n",
10 | "\n",
11 | "## 获取代码并安装运行环境\n",
12 | "\n",
13 | "本书的内容和代码均可在网上免费获取。我们推荐使用conda来安装运行代码所依赖的软件。conda是一个流行的Python包管理软件。Windows和Linux/macOS用户可分别参照以下步骤。\n",
14 | "\n",
15 | "### Windows用户\n",
16 | "\n",
17 | "第一次运行需要完整完成下面5个步骤。如果是再次运行,可以忽略前面3步的下载和安装,直接跳转到第四步和第五步。\n",
18 | "\n",
19 | "第一步是根据操作系统下载并安装[Miniconda](https://conda.io/en/master/miniconda.html),在安装过程中需要勾选“Add Anaconda to the system PATH environment variable”选项(如当conda版本为4.6.14时)。\n",
20 | "\n",
21 | "第二步是使用git下载包含本书全部代码的代码库:`git clone git@github.com:sangyx/d2l-torch.git`。下载完成后将会得到一个名为“d2l-torch”的文件夹。在该目录文件资源管理器的地址栏输入`cmd`进入命令行模式。\n",
22 | "\n",
23 | "第三步是使用conda创建虚拟(运行)环境。conda和pip默认使用国外站点来下载软件,我们可以配置国内镜像来加速下载(国外用户无须此操作)。"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "```\n",
31 | "# 配置清华PyPI镜像(如无法运行,将pip版本升级到>=10.0.0)\n",
32 | "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple\n",
33 | "```\n"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "接下来使用conda创建虚拟环境并安装本书需要的软件。这里`environment.yml`是放置在代码压缩包中的文件。使用文本编辑器打开该文件,即可查看运行压缩包中本书的代码所依赖的软件(如torch和`d2ltorch`包)及版本号。 \n",
41 | "*(因为 torchsummary 模块使用git安装我在github上的定制版本,可能会比较慢,请耐心等待)*"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "```\n",
49 | "conda env create -f environment.yml\n",
50 | "```\n"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "若使用国内镜像后出现安装错误,首先取消PyPI镜像配置,即执行命令`pip config unset global.index-url`。然后重试命令`conda env create -f environment.yml`。\n",
58 | "\n",
59 | "第四步是激活之前创建的环境。激活该环境是能够运行本书的代码的前提。如需退出虚拟环境,可使用命令`conda deactivate`(若conda版本低于4.4,使用命令`deactivate`)。"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "```\n",
67 | "conda activate pytorch # 若conda版本低于4.4,使用命令activate pytorch\n",
68 | "```\n"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "第五步是打开Jupyter记事本。"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "```\n",
83 | "jupyter notebook\n",
84 | "```\n"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {},
90 | "source": [
91 | "这时在浏览器打开 http://localhost:8888 (通常会自动打开)就可以查看和运行本书中每一节的代码了。"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "### Linux/macOS用户\n",
99 | "\n",
100 | "第一步是根据操作系统下载[Miniconda](https://conda.io/miniconda.html),它是一个sh文件。打开Terminal应用进入命令行来执行这个sh文件,例如:"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "```\n",
108 | "# 以Miniconda官方网站上的安装文件名为准\n",
109 | "sh Miniconda3-latest-Linux-x86_64.sh\n",
110 | "```\n"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "安装时会显示使用条款,按“↓”继续阅读,按“Q”退出阅读。之后需要回答下面几个问题(如当conda版本为4.6.14时):"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "```\n",
125 | "Do you accept the license terms? [yes|no]\n",
126 | "[no] >>> yes\n",
127 | "Do you wish the installer to initialize Miniconda3\n",
128 | "by running conda init? [yes|no]\n",
129 | "[no] >>> yes\n",
130 | "```\n"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "安装完成后,需要让conda生效。Linux用户需要运行一次`source ~/.bashrc`或重启命令行应用;macOS用户需要运行一次`source ~/.bash_profile`或重启命令行应用。\n",
138 | "\n",
139 | "第二步是下载包含本书全部代码的压缩包,解压后进入文件夹。运行以下命令。Linux用户如未安装`git`,可运行命令`sudo apt install git`安装。"
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "```\n",
147 | "git clone git@github.com:sangyx/d2l-torch.git\n",
148 | "```"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "第三步至第五步可参考前面Windows下的安装步骤。若conda版本低于4.4,其中第四步需将命令替换为`source activate pytorch`,并使用命令`source deactivate`退出虚拟环境。\n",
156 | "\n",
157 | "## 更新代码和运行环境\n",
158 | "\n",
159 | "为了适应深度学习和MXNet的快速发展,本书的开源内容将定期发布新版本。我们推荐大家定期更新本书的开源内容(如代码)和相应的运行环境(如新版MXNet)。以下是更新的具体步骤。\n",
160 | "\n",
161 | "第一步是重新下载最新的包含本书全部代码的代码库(如果原版本中的修改对您有用,请先备份原版本)。进入d2ltorch目录执行如下命令:\n",
162 | "\n",
163 | "```\n",
164 | "git fetch --all\n",
165 | "git reset --hard origin/master\n",
166 | "git pull\n",
167 | "```\n",
168 | "\n",
169 | "第二步是使用下面的命令更新运行环境:"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "```\n",
177 | "conda env update -f environment.yml\n",
178 | "```\n"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "之后的激活环境和运行Jupyter步骤跟本节前面介绍的一致。\n",
186 | "\n",
187 | "\n",
188 | "## 使用GPU版的PyTorch\n",
189 | "\n",
190 | "通过前面介绍的方式安装的PyTorch只支持CPU计算。本书中部分章节需要或推荐使用GPU来运行。如果你的计算机上有NVIDIA显卡并安装了CUDA,建议使用GPU版的PyTorch。\n",
191 | "\n",
192 | "第一步是卸载CPU版本PyTorch。如果没有安装虚拟环境,可以跳过此步。如果已安装虚拟环境,需要先激活该环境,再卸载CPU版本的PyTorch。"
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "metadata": {},
198 | "source": [
199 | "```\n",
200 | "pip uninstall torch torchvision\n",
201 | "```\n"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {},
207 | "source": [
208 | "然后退出虚拟环境。\n",
209 | "\n",
210 | "第二步是更新依赖为GPU版本的PyTorch。首先进入选择适合自己计算机的PyTorch版本的pip安装命令。然后使用文本编辑器打开本书的代码所在根目录下的文件`environment.yml`,将里面的字符串“pytorch”替换成对应版本的安装命令。例如,如果计算机系统为Windows,装的是9.0版本的CUDA,将该文件中的字符串“torch==1.1.0”改为“https://download.pytorch.org/whl/cu90/torch-1.1.0-cp36-cp36m-win_amd64.whl” ,将字符串“torchvision==0.3.0”改为“https://download.pytorch.org/whl/cu90/torchvision-0.3.0-cp36-cp36m-win_amd64.whl” 。保存文件后退出。\n",
211 | "\n",
212 | "\n",
213 | "\n",
214 | "\n",
215 | "\n",
216 | "第三步是更新虚拟环境,执行命令"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "```\n",
224 | "conda env update -f environment.yml\n",
225 | "```\n"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "之后,我们只需要再激活安装环境就可以使用GPU版的PyTorch运行本书中的代码了。需要提醒的是,如果之后下载了新代码,那么还需要重复这3步操作以使用GPU版的PyTorch。\n",
233 | "\n",
234 | "\n",
235 | "## 小结\n",
236 | "\n",
237 | "* 为了能够动手学深度学习,需要获取本书的代码并安装运行环境。\n",
238 | "* 建议大家定期更新代码和运行环境。\n",
239 | "\n",
240 | "\n",
241 | "## 练习\n",
242 | "\n",
243 | "* 获取本书的代码并安装运行环境。如果你在安装时遇到任何问题,请扫一扫本节末尾的二维码。在讨论区,你可以查阅疑难问题汇总或者提问。\n",
244 | "\n",
245 | "\n",
246 | "\n",
247 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/249)\n",
248 | "\n",
249 | ""
250 | ]
251 | }
252 | ],
253 | "metadata": {
254 | "kernelspec": {
255 | "display_name": "Python 3",
256 | "language": "python",
257 | "name": "python3"
258 | },
259 | "language_info": {
260 | "codemirror_mode": {
261 | "name": "ipython",
262 | "version": 3
263 | },
264 | "file_extension": ".py",
265 | "mimetype": "text/x-python",
266 | "name": "python",
267 | "nbconvert_exporter": "python",
268 | "pygments_lexer": "ipython3",
269 | "version": "3.7.4"
270 | },
271 | "toc": {
272 | "base_numbering": 1,
273 | "nav_menu": {},
274 | "number_sections": true,
275 | "sideBar": true,
276 | "skip_h1_title": false,
277 | "title_cell": "Table of Contents",
278 | "title_sidebar": "Contents",
279 | "toc_cell": false,
280 | "toc_position": {},
281 | "toc_section_display": true,
282 | "toc_window_display": false
283 | }
284 | },
285 | "nbformat": 4,
286 | "nbformat_minor": 4
287 | }
288 |
--------------------------------------------------------------------------------
/chapter_prerequisite/lookup-api.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 查阅文档\n",
8 | "\n",
9 | "受篇幅所限,本书无法对所有用到的PyTorch函数和类一一详细介绍。读者可以查阅相关文档来做更深入的了解。\n",
10 | "\n",
11 | "## 查找模块里的所有函数和类\n",
12 | "\n",
13 | "当我们想知道一个模块里面提供了哪些可以调用的函数和类的时候,可以使用`dir`函数。下面我们打印`torch.cuda`模块中所有的成员或属性。 \n",
14 | "\n",
15 | "\n",
16 | "*吐槽一下,原书这里用的是 `nd.random` 模块,而 PyTorch 的模块组织确实很烂,这里挑一个稍微有点条理的做示范*"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 1,
22 | "metadata": {
23 | "attributes": {
24 | "classes": [],
25 | "id": "",
26 | "n": "1"
27 | }
28 | },
29 | "outputs": [
30 | {
31 | "name": "stdout",
32 | "output_type": "stream",
33 | "text": [
34 | "['BFloat16Storage', 'BFloat16Tensor', 'BoolStorage', 'BoolTensor', 'ByteStorage', 'ByteTensor', 'CharStorage', 'CharTensor', 'CudaError', 'DeferredCudaCallError', 'DoubleStorage', 'DoubleTensor', 'Event', 'FloatStorage', 'FloatTensor', 'HalfStorage', 'HalfTensor', 'IntStorage', 'IntTensor', 'LongStorage', 'LongTensor', 'PIPE', 'Popen', 'ShortStorage', 'ShortTensor', 'Stream', '_CudaBase', '_StorageBase', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_after_fork', '_check_capability', '_check_driver', '_cudart', '_dummy_type', '_free_mutex', '_get_device_index', '_host_allocator', '_in_bad_fork', '_initialization_lock', '_initialized', '_lazy_call', '_lazy_init', '_lazy_new', '_load_cudart', '_original_pid', '_queued_calls', '_register_after_fork', '_sleep', '_tls', '_utils', 'check_error', 'comm', 'contextlib', 'ctypes', 'cudaStatus', 'cudart', 'current_blas_handle', 'current_device', 'current_stream', 'default_stream', 'device', 'device_count', 'device_of', 'empty_cache', 'find_cuda_windows_lib', 'get_device_capability', 'get_device_name', 'get_device_properties', 'get_rng_state', 'get_rng_state_all', 'init', 'initial_seed', 'ipc_collect', 'is_available', 'manual_seed', 'manual_seed_all', 'max_memory_allocated', 'max_memory_cached', 'memory_allocated', 'memory_cached', 'nccl', 'nvtx', 'os', 'platform', 'profiler', 'raise_from', 'random', 'reset_max_memory_allocated', 'reset_max_memory_cached', 'seed', 'seed_all', 'set_device', 'set_rng_state', 'set_rng_state_all', 'sparse', 'stream', 'streams', 'synchronize', 'sys', 'threading', 'torch', 'traceback', 'warnings']\n"
35 | ]
36 | }
37 | ],
38 | "source": [
39 | "import torch\n",
40 | "\n",
41 | "print(dir(torch.cuda))"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "通常我们可以忽略掉由`__`开头和结尾的函数(Python的特别对象)或者由`_`开头的函数(一般为内部函数)。通过其余成员的名字我们大致猜测出这个模块提供了各种对GPU的控制方法,包括可用设备数量(`device_count`)、GPU是否可用(`is_available`)、清空显存(`empty_cache`)等。\n",
49 | "\n",
50 | "## 查找特定函数和类的使用\n",
51 | "\n",
52 | "想了解某个函数或者类的具体用法时,可以使用`help`函数。让我们以`torch`中的`ones_like`函数为例,查阅它的用法。"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 2,
58 | "metadata": {},
59 | "outputs": [
60 | {
61 | "name": "stdout",
62 | "output_type": "stream",
63 | "text": [
64 | "Help on built-in function ones_like:\n",
65 | "\n",
66 | "ones_like(...)\n",
67 | " ones_like(input, dtype=None, layout=None, device=None, requires_grad=False) -> Tensor\n",
68 | " \n",
69 | " Returns a tensor filled with the scalar value `1`, with the same size as\n",
70 | " :attr:`input`. ``torch.ones_like(input)`` is equivalent to\n",
71 | " ``torch.ones(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``.\n",
72 | " \n",
73 | " .. warning::\n",
74 | " As of 0.4, this function does not support an :attr:`out` keyword. As an alternative,\n",
75 | " the old ``torch.ones_like(input, out=output)`` is equivalent to\n",
76 | " ``torch.ones(input.size(), out=output)``.\n",
77 | " \n",
78 | " Args:\n",
79 | " input (Tensor): the size of :attr:`input` will determine size of the output tensor.\n",
80 | " dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor.\n",
81 | " Default: if ``None``, defaults to the dtype of :attr:`input`.\n",
82 | " layout (:class:`torch.layout`, optional): the desired layout of returned tensor.\n",
83 | " Default: if ``None``, defaults to the layout of :attr:`input`.\n",
84 | " device (:class:`torch.device`, optional): the desired device of returned tensor.\n",
85 | " Default: if ``None``, defaults to the device of :attr:`input`.\n",
86 | " requires_grad (bool, optional): If autograd should record operations on the\n",
87 | " returned tensor. Default: ``False``.\n",
88 | " \n",
89 | " Example::\n",
90 | " \n",
91 | " >>> input = torch.empty(2, 3)\n",
92 | " >>> torch.ones_like(input)\n",
93 | " tensor([[ 1., 1., 1.],\n",
94 | " [ 1., 1., 1.]])\n",
95 | "\n"
96 | ]
97 | }
98 | ],
99 | "source": [
100 | "help(torch.ones_like)"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "从文档信息我们了解到,`ones_like`函数会创建和输入`Tensor`形状相同且元素为1的新`Tensor`。我们可以验证一下:"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 3,
113 | "metadata": {},
114 | "outputs": [
115 | {
116 | "data": {
117 | "text/plain": [
118 | "tensor([[1., 1., 1.],\n",
119 | " [1., 1., 1.]])"
120 | ]
121 | },
122 | "execution_count": 3,
123 | "metadata": {},
124 | "output_type": "execute_result"
125 | }
126 | ],
127 | "source": [
128 | "x = torch.Tensor([[0, 0, 0], [2, 2, 2]])\n",
129 | "y = torch.ones_like(x)\n",
130 | "y"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "在Jupyter记事本里,我们可以使用`?`来将文档显示在另外一个窗口中。例如,使用`torch.rand?`将得到与`help(torch.rand)`几乎一样的内容,但会显示在额外窗口里。此外,如果使用`torch.rand??`,那么会额外显示该函数实现的代码。\n",
138 | "\n",
139 | "\n",
140 | "## 在PyTorch网站上查阅\n",
141 | "\n",
142 | "读者也可以在PyTorch的网站上查阅相关文档。访问PyTorch网站 [https://pytorch.org/](https://pytorch.org/) (如图2.1所示),点击网页顶部的下拉菜单“Docs”可查阅各个前端语言的接口。此外,也可以在点击网页右上方的搜索图标直接搜索函数或类名称。\n",
143 | "\n",
144 | "\n",
145 | "\n",
146 | "图2.2展示了PyTorch网站上有关`ones_like`函数的文档。\n",
147 | "\n",
148 | "\n",
149 | "\n",
150 | "## 小结\n",
151 | "\n",
152 | "* 遇到不熟悉的PyTorch API时,可以主动查阅它的相关文档。\n",
153 | "* 查阅PyTorch文档可以使用`dir`和`help`函数,或访问PyTorch官方网站。\n",
154 | "\n",
155 | "\n",
156 | "## 练习\n",
157 | "\n",
158 | "* 查阅`PyTorch`支持的其他操作。\n",
159 | "\n",
160 | "\n",
161 | "\n",
162 | "\n",
163 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/7116)\n",
164 | "\n",
165 | ""
166 | ]
167 | }
168 | ],
169 | "metadata": {
170 | "kernelspec": {
171 | "display_name": "Python 3",
172 | "language": "python",
173 | "name": "python3"
174 | },
175 | "language_info": {
176 | "codemirror_mode": {
177 | "name": "ipython",
178 | "version": 3
179 | },
180 | "file_extension": ".py",
181 | "mimetype": "text/x-python",
182 | "name": "python",
183 | "nbconvert_exporter": "python",
184 | "pygments_lexer": "ipython3",
185 | "version": "3.7.4"
186 | },
187 | "toc": {
188 | "base_numbering": 1,
189 | "nav_menu": {},
190 | "number_sections": true,
191 | "sideBar": true,
192 | "skip_h1_title": false,
193 | "title_cell": "Table of Contents",
194 | "title_sidebar": "Contents",
195 | "toc_cell": false,
196 | "toc_position": {},
197 | "toc_section_display": true,
198 | "toc_window_display": false
199 | }
200 | },
201 | "nbformat": 4,
202 | "nbformat_minor": 4
203 | }
204 |
--------------------------------------------------------------------------------
/chapter_recurrent-neural-networks/bi-rnn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 双向循环神经网络\n",
8 | "\n",
9 | "之前介绍的循环神经网络模型都是假设当前时间步是由前面的较早时间步的序列决定的,因此它们都将信息通过隐藏状态从前往后传递。有时候,当前时间步也可能由后面时间步决定。例如,当我们写下一个句子时,可能会根据句子后面的词来修改句子前面的用词。双向循环神经网络通过增加从后往前传递信息的隐藏层来更灵活地处理这类信息。图6.12演示了一个含单隐藏层的双向循环神经网络的架构。\n",
10 | "\n",
11 | "\n",
12 | "\n",
13 | "下面我们来介绍具体的定义。\n",
14 | "给定时间步$t$的小批量输入$\\boldsymbol{X}_t \\in \\mathbb{R}^{n \\times d}$(样本数为$n$,输入个数为$d$)和隐藏层激活函数为$\\phi$。在双向循环神经网络的架构中,\n",
15 | "设该时间步正向隐藏状态为$\\overrightarrow{\\boldsymbol{H}}_t \\in \\mathbb{R}^{n \\times h}$(正向隐藏单元个数为$h$),\n",
16 | "反向隐藏状态为$\\overleftarrow{\\boldsymbol{H}}_t \\in \\mathbb{R}^{n \\times h}$(反向隐藏单元个数为$h$)。我们可以分别计算正向隐藏状态和反向隐藏状态:\n",
17 | "\n",
18 | "$$\n",
19 | "\\begin{aligned}\n",
20 | "\\overrightarrow{\\boldsymbol{H}}_t &= \\phi(\\boldsymbol{X}_t \\boldsymbol{W}_{xh}^{(f)} + \\overrightarrow{\\boldsymbol{H}}_{t-1} \\boldsymbol{W}_{hh}^{(f)} + \\boldsymbol{b}_h^{(f)}),\\\\\n",
21 | "\\overleftarrow{\\boldsymbol{H}}_t &= \\phi(\\boldsymbol{X}_t \\boldsymbol{W}_{xh}^{(b)} + \\overleftarrow{\\boldsymbol{H}}_{t+1} \\boldsymbol{W}_{hh}^{(b)} + \\boldsymbol{b}_h^{(b)}),\n",
22 | "\\end{aligned}\n",
23 | "$$\n",
24 | "\n",
25 | "其中权重$\\boldsymbol{W}_{xh}^{(f)} \\in \\mathbb{R}^{d \\times h}$、$\\boldsymbol{W}_{hh}^{(f)} \\in \\mathbb{R}^{h \\times h}$、$\\boldsymbol{W}_{xh}^{(b)} \\in \\mathbb{R}^{d \\times h}$、$\\boldsymbol{W}_{hh}^{(b)} \\in \\mathbb{R}^{h \\times h}$和偏差 $\\boldsymbol{b}_h^{(f)} \\in \\mathbb{R}^{1 \\times h}$、$\\boldsymbol{b}_h^{(b)} \\in \\mathbb{R}^{1 \\times h}$均为模型参数。\n",
26 | "\n",
27 | "然后我们连结两个方向的隐藏状态$\\overrightarrow{\\boldsymbol{H}}_t$和$\\overleftarrow{\\boldsymbol{H}}_t$来得到隐藏状态$\\boldsymbol{H}_t \\in \\mathbb{R}^{n \\times 2h}$,并将其输入到输出层。输出层计算输出$\\boldsymbol{O}_t \\in \\mathbb{R}^{n \\times q}$(输出个数为$q$):\n",
28 | "\n",
29 | "$$\\boldsymbol{O}_t = \\boldsymbol{H}_t \\boldsymbol{W}_{hq} + \\boldsymbol{b}_q,$$\n",
30 | "\n",
31 | "其中权重$\\boldsymbol{W}_{hq} \\in \\mathbb{R}^{2h \\times q}$和偏差$\\boldsymbol{b}_q \\in \\mathbb{R}^{1 \\times q}$为输出层的模型参数。不同方向上的隐藏单元个数也可以不同。\n",
32 | "\n",
33 | "## 小结\n",
34 | "\n",
35 | "* 双向循环神经网络在每个时间步的隐藏状态同时取决于该时间步之前和之后的子序列(包括当前时间步的输入)。\n",
36 | "\n",
37 | "\n",
38 | "## 练习\n",
39 | "\n",
40 | "* 如果不同方向上使用不同的隐藏单元个数,$\\boldsymbol{H}_t$的形状会发生怎样的改变?\n",
41 | "* 参考图6.11和图6.12,设计含多个隐藏层的双向循环神经网络。\n",
42 | "\n",
43 | "\n",
44 | "\n",
45 | "\n",
46 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/6732)\n",
47 | "\n",
48 | ""
49 | ]
50 | }
51 | ],
52 | "metadata": {
53 | "kernelspec": {
54 | "display_name": "Python [conda env:pytorch]",
55 | "language": "python",
56 | "name": "conda-env-pytorch-py"
57 | },
58 | "language_info": {
59 | "codemirror_mode": {
60 | "name": "ipython",
61 | "version": 3
62 | },
63 | "file_extension": ".py",
64 | "mimetype": "text/x-python",
65 | "name": "python",
66 | "nbconvert_exporter": "python",
67 | "pygments_lexer": "ipython3",
68 | "version": "3.6.9"
69 | }
70 | },
71 | "nbformat": 4,
72 | "nbformat_minor": 4
73 | }
74 |
--------------------------------------------------------------------------------
/chapter_recurrent-neural-networks/bptt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 通过时间反向传播\n",
8 | "\n",
9 | "如果读者做了上一节的练习,就会发现,如果不裁剪梯度,模型将无法正常训练。为了深刻理解这一现象,本节将介绍循环神经网络中梯度的计算和存储方法,即通过时间反向传播(back-propagation through time)。\n",
10 | "\n",
11 | "我们在[“正向传播、反向传播和计算图”](../chapter_deep-learning-basics/backprop.ipynb)一节中介绍了神经网络中梯度计算与存储的一般思路,并强调正向传播和反向传播相互依赖。正向传播在循环神经网络中比较直观,而通过时间反向传播其实是反向传播在循环神经网络中的具体应用。我们需要将循环神经网络按时间步展开,从而得到模型变量和参数之间的依赖关系,并依据链式法则应用反向传播计算并存储梯度。\n",
12 | "\n",
13 | "\n",
14 | "## 定义模型\n",
15 | "\n",
16 | "简单起见,我们考虑一个无偏差项的循环神经网络,且激活函数为恒等映射($\\phi(x)=x$)。设时间步$t$的输入为单样本$\\boldsymbol{x}_t \\in \\mathbb{R}^d$,标签为$y_t$,那么隐藏状态$\\boldsymbol{h}_t \\in \\mathbb{R}^h$的计算表达式为\n",
17 | "\n",
18 | "$$\\boldsymbol{h}_t = \\boldsymbol{W}_{hx} \\boldsymbol{x}_t + \\boldsymbol{W}_{hh} \\boldsymbol{h}_{t-1},$$\n",
19 | "\n",
20 | "其中$\\boldsymbol{W}_{hx} \\in \\mathbb{R}^{h \\times d}$和$\\boldsymbol{W}_{hh} \\in \\mathbb{R}^{h \\times h}$是隐藏层权重参数。设输出层权重参数$\\boldsymbol{W}_{qh} \\in \\mathbb{R}^{q \\times h}$,时间步$t$的输出层变量$\\boldsymbol{o}_t \\in \\mathbb{R}^q$计算为\n",
21 | "\n",
22 | "$$\\boldsymbol{o}_t = \\boldsymbol{W}_{qh} \\boldsymbol{h}_{t}.$$\n",
23 | "\n",
24 | "设时间步$t$的损失为$\\ell(\\boldsymbol{o}_t, y_t)$。时间步数为$T$的损失函数$L$定义为\n",
25 | "\n",
26 | "$$L = \\frac{1}{T} \\sum_{t=1}^T \\ell (\\boldsymbol{o}_t, y_t).$$\n",
27 | "\n",
28 | "我们将$L$称为有关给定时间步的数据样本的目标函数,并在本节后续讨论中简称为目标函数。\n",
29 | "\n",
30 | "\n",
31 | "## 模型计算图\n",
32 | "\n",
33 | "为了可视化循环神经网络中模型变量和参数在计算中的依赖关系,我们可以绘制模型计算图,如图6.3所示。例如,时间步3的隐藏状态$\\boldsymbol{h}_3$的计算依赖模型参数$\\boldsymbol{W}_{hx}$、$\\boldsymbol{W}_{hh}$、上一时间步隐藏状态$\\boldsymbol{h}_2$以及当前时间步输入$\\boldsymbol{x}_3$。\n",
34 | "\n",
35 | "\n",
36 | "\n",
37 | "\n",
38 | "## 方法\n",
39 | "\n",
40 | "刚刚提到,图6.3中的模型的参数是$\\boldsymbol{W}_{hx}$、$\\boldsymbol{W}_{hh}$和$\\boldsymbol{W}_{qh}$。与[“正向传播、反向传播和计算图”](../chapter_deep-learning-basics/backprop.ipynb)一节中的类似,训练模型通常需要模型参数的梯度$\\partial L/\\partial \\boldsymbol{W}_{hx}$、$\\partial L/\\partial \\boldsymbol{W}_{hh}$和$\\partial L/\\partial \\boldsymbol{W}_{qh}$。\n",
41 | "根据图6.3中的依赖关系,我们可以按照其中箭头所指的反方向依次计算并存储梯度。为了表述方便,我们依然采用[“正向传播、反向传播和计算图”](../chapter_deep-learning-basics/backprop.ipynb)一节中表达链式法则的运算符prod。\n",
42 | "\n",
43 | "首先,目标函数有关各时间步输出层变量的梯度$\\partial L/\\partial \\boldsymbol{o}_t \\in \\mathbb{R}^q$很容易计算:\n",
44 | "\n",
45 | "$$\\frac{\\partial L}{\\partial \\boldsymbol{o}_t} = \\frac{\\partial \\ell (\\boldsymbol{o}_t, y_t)}{T \\cdot \\partial \\boldsymbol{o}_t}.$$\n",
46 | "\n",
47 | "下面,我们可以计算目标函数有关模型参数$\\boldsymbol{W}_{qh}$的梯度$\\partial L/\\partial \\boldsymbol{W}_{qh} \\in \\mathbb{R}^{q \\times h}$。根据图6.3,$L$通过$\\boldsymbol{o}_1, \\ldots, \\boldsymbol{o}_T$依赖$\\boldsymbol{W}_{qh}$。依据链式法则,\n",
48 | "\n",
49 | "$$\n",
50 | "\\frac{\\partial L}{\\partial \\boldsymbol{W}_{qh}} \n",
51 | "= \\sum_{t=1}^T \\text{prod}\\left(\\frac{\\partial L}{\\partial \\boldsymbol{o}_t}, \\frac{\\partial \\boldsymbol{o}_t}{\\partial \\boldsymbol{W}_{qh}}\\right) \n",
52 | "= \\sum_{t=1}^T \\frac{\\partial L}{\\partial \\boldsymbol{o}_t} \\boldsymbol{h}_t^\\top.\n",
53 | "$$\n",
54 | "\n",
55 | "\n",
56 | "其次,我们注意到隐藏状态之间也存在依赖关系。\n",
57 | "在图6.3中,$L$只通过$\\boldsymbol{o}_T$依赖最终时间步$T$的隐藏状态$\\boldsymbol{h}_T$。因此,我们先计算目标函数有关最终时间步隐藏状态的梯度$\\partial L/\\partial \\boldsymbol{h}_T \\in \\mathbb{R}^h$。依据链式法则,我们得到\n",
58 | "\n",
59 | "$$\n",
60 | "\\frac{\\partial L}{\\partial \\boldsymbol{h}_T} = \\text{prod}\\left(\\frac{\\partial L}{\\partial \\boldsymbol{o}_T}, \\frac{\\partial \\boldsymbol{o}_T}{\\partial \\boldsymbol{h}_T} \\right) = \\boldsymbol{W}_{qh}^\\top \\frac{\\partial L}{\\partial \\boldsymbol{o}_T}.\n",
61 | "$$\n",
62 | "\n",
63 | "\n",
64 | "\n",
65 | "接下来对于时间步$t < T$,\n",
66 | "在图6.3中,$L$通过$\\boldsymbol{h}_{t+1}$和$\\boldsymbol{o}_t$依赖$\\boldsymbol{h}_t$。依据链式法则,\n",
67 | "目标函数有关时间步$t < T$的隐藏状态的梯度$\\partial L/\\partial \\boldsymbol{h}_t \\in \\mathbb{R}^h$需要按照时间步从大到小依次计算:\n",
68 | "\n",
69 | "\n",
70 | "$$\n",
71 | "\\frac{\\partial L}{\\partial \\boldsymbol{h}_t} \n",
72 | "= \\text{prod}\\left(\\frac{\\partial L}{\\partial \\boldsymbol{h}_{t+1}}, \\frac{\\partial \\boldsymbol{h}_{t+1}}{\\partial \\boldsymbol{h}_t} \\right) \n",
73 | "+ \\text{prod}\\left(\\frac{\\partial L}{\\partial \\boldsymbol{o}_t}, \\frac{\\partial \\boldsymbol{o}_t}{\\partial \\boldsymbol{h}_t} \\right) \n",
74 | "= \\boldsymbol{W}_{hh}^\\top \\frac{\\partial L}{\\partial \\boldsymbol{h}_{t+1}} + \\boldsymbol{W}_{qh}^\\top \\frac{\\partial L}{\\partial \\boldsymbol{o}_t}.\n",
75 | "$$\n",
76 | "\n",
77 | "将上面的递归公式展开,对任意时间步$1 \\leq t \\leq T$,我们可以得到目标函数有关隐藏状态梯度的通项公式\n",
78 | "\n",
79 | "$$\n",
80 | "\\frac{\\partial L}{\\partial \\boldsymbol{h}_t} \n",
81 | "= \\sum_{i=t}^T {\\left(\\boldsymbol{W}_{hh}^\\top\\right)}^{T-i} \\boldsymbol{W}_{qh}^\\top \\frac{\\partial L}{\\partial \\boldsymbol{o}_{T+t-i}}.\n",
82 | "$$\n",
83 | "\n",
84 | "由上式中的指数项可见,当时间步数$T$较大或者时间步$t$较小时,目标函数有关隐藏状态的梯度较容易出现衰减和爆炸。这也会影响其他包含$\\partial L / \\partial \\boldsymbol{h}_t$项的梯度,例如隐藏层中模型参数的梯度$\\partial L / \\partial \\boldsymbol{W}_{hx} \\in \\mathbb{R}^{h \\times d}$和$\\partial L / \\partial \\boldsymbol{W}_{hh} \\in \\mathbb{R}^{h \\times h}$。\n",
85 | "在图6.3中,$L$通过$\\boldsymbol{h}_1, \\ldots, \\boldsymbol{h}_T$依赖这些模型参数。\n",
86 | "依据链式法则,我们有\n",
87 | "\n",
88 | "$$\n",
89 | "\\begin{aligned}\n",
90 | "\\frac{\\partial L}{\\partial \\boldsymbol{W}_{hx}} \n",
91 | "&= \\sum_{t=1}^T \\text{prod}\\left(\\frac{\\partial L}{\\partial \\boldsymbol{h}_t}, \\frac{\\partial \\boldsymbol{h}_t}{\\partial \\boldsymbol{W}_{hx}}\\right) \n",
92 | "= \\sum_{t=1}^T \\frac{\\partial L}{\\partial \\boldsymbol{h}_t} \\boldsymbol{x}_t^\\top,\\\\\n",
93 | "\\frac{\\partial L}{\\partial \\boldsymbol{W}_{hh}} \n",
94 | "&= \\sum_{t=1}^T \\text{prod}\\left(\\frac{\\partial L}{\\partial \\boldsymbol{h}_t}, \\frac{\\partial \\boldsymbol{h}_t}{\\partial \\boldsymbol{W}_{hh}}\\right) \n",
95 | "= \\sum_{t=1}^T \\frac{\\partial L}{\\partial \\boldsymbol{h}_t} \\boldsymbol{h}_{t-1}^\\top.\n",
96 | "\\end{aligned}\n",
97 | "$$\n",
98 | "\n",
99 | "\n",
100 | "我们已在[“正向传播、反向传播和计算图”](../chapter_deep-learning-basics/backprop.ipynb)一节里解释过,每次迭代中,我们在依次计算完以上各个梯度后,会将它们存储起来,从而避免重复计算。例如,由于隐藏状态梯度$\\partial L/\\partial \\boldsymbol{h}_t$被计算和存储,之后的模型参数梯度$\\partial L/\\partial \\boldsymbol{W}_{hx}$和$\\partial L/\\partial \\boldsymbol{W}_{hh}$的计算可以直接读取$\\partial L/\\partial \\boldsymbol{h}_t$的值,而无须重复计算它们。此外,反向传播中的梯度计算可能会依赖变量的当前值。它们正是通过正向传播计算出来的。\n",
101 | "举例来说,参数梯度$\\partial L/\\partial \\boldsymbol{W}_{hh}$的计算需要依赖隐藏状态在时间步$t = 0, \\ldots, T-1$的当前值$\\boldsymbol{h}_t$($\\boldsymbol{h}_0$是初始化得到的)。这些值是通过从输入层到输出层的正向传播计算并存储得到的。\n",
102 | "\n",
103 | "\n",
104 | "## 小结\n",
105 | "\n",
106 | "* 通过时间反向传播是反向传播在循环神经网络中的具体应用。\n",
107 | "* 当时间步数较大或者时间步较小时,循环神经网络的梯度较容易出现衰减或爆炸。\n",
108 | "\n",
109 | "\n",
110 | "## 练习\n",
111 | "\n",
112 | "* 除了梯度裁剪,你还能想到别的什么方法应对循环神经网络中的梯度爆炸?\n",
113 | "\n",
114 | "\n",
115 | "\n",
116 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/3711)\n",
117 | "\n",
118 | ""
119 | ]
120 | }
121 | ],
122 | "metadata": {
123 | "kernelspec": {
124 | "display_name": "Python 3",
125 | "language": "python",
126 | "name": "python3"
127 | },
128 | "language_info": {
129 | "codemirror_mode": {
130 | "name": "ipython",
131 | "version": 3
132 | },
133 | "file_extension": ".py",
134 | "mimetype": "text/x-python",
135 | "name": "python",
136 | "nbconvert_exporter": "python",
137 | "pygments_lexer": "ipython3",
138 | "version": "3.7.4"
139 | },
140 | "toc": {
141 | "base_numbering": 1,
142 | "nav_menu": {},
143 | "number_sections": true,
144 | "sideBar": true,
145 | "skip_h1_title": false,
146 | "title_cell": "Table of Contents",
147 | "title_sidebar": "Contents",
148 | "toc_cell": false,
149 | "toc_position": {},
150 | "toc_section_display": true,
151 | "toc_window_display": false
152 | }
153 | },
154 | "nbformat": 4,
155 | "nbformat_minor": 4
156 | }
157 |
--------------------------------------------------------------------------------
/chapter_recurrent-neural-networks/deep-rnn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 深度循环神经网络\n",
8 | "\n",
9 | "本章到目前为止介绍的循环神经网络只有一个单向的隐藏层,在深度学习应用里,我们通常会用到含有多个隐藏层的循环神经网络,也称作深度循环神经网络。图6.11演示了一个有$L$个隐藏层的深度循环神经网络,每个隐藏状态不断传递至当前层的下一时间步和当前时间步的下一层。\n",
10 | "\n",
11 | "\n",
12 | "\n",
13 | "\n",
14 | "具体来说,在时间步$t$里,设小批量输入$\\boldsymbol{X}_t \\in \\mathbb{R}^{n \\times d}$(样本数为$n$,输入个数为$d$),第$\\ell$隐藏层($\\ell=1,\\ldots,L$)的隐藏状态为$\\boldsymbol{H}_t^{(\\ell)} \\in \\mathbb{R}^{n \\times h}$(隐藏单元个数为$h$),输出层变量为$\\boldsymbol{O}_t \\in \\mathbb{R}^{n \\times q}$(输出个数为$q$),且隐藏层的激活函数为$\\phi$。第1隐藏层的隐藏状态和之前的计算一样:\n",
15 | "\n",
16 | "$$\\boldsymbol{H}_t^{(1)} = \\phi(\\boldsymbol{X}_t \\boldsymbol{W}_{xh}^{(1)} + \\boldsymbol{H}_{t-1}^{(1)} \\boldsymbol{W}_{hh}^{(1)} + \\boldsymbol{b}_h^{(1)}),$$\n",
17 | "\n",
18 | "\n",
19 | "其中权重$\\boldsymbol{W}_{xh}^{(1)} \\in \\mathbb{R}^{d \\times h}$、$\\boldsymbol{W}_{hh}^{(1)} \\in \\mathbb{R}^{h \\times h}$和偏差 $\\boldsymbol{b}_h^{(1)} \\in \\mathbb{R}^{1 \\times h}$分别为第1隐藏层的模型参数。\n",
20 | "\n",
21 | "当$1 < \\ell \\leq L$时,第$\\ell$隐藏层的隐藏状态的表达式为\n",
22 | "\n",
23 | "$$\\boldsymbol{H}_t^{(\\ell)} = \\phi(\\boldsymbol{H}_t^{(\\ell-1)} \\boldsymbol{W}_{xh}^{(\\ell)} + \\boldsymbol{H}_{t-1}^{(\\ell)} \\boldsymbol{W}_{hh}^{(\\ell)} + \\boldsymbol{b}_h^{(\\ell)}),$$\n",
24 | "\n",
25 | "\n",
26 | "其中权重$\\boldsymbol{W}_{xh}^{(\\ell)} \\in \\mathbb{R}^{h \\times h}$、$\\boldsymbol{W}_{hh}^{(\\ell)} \\in \\mathbb{R}^{h \\times h}$和偏差 $\\boldsymbol{b}_h^{(\\ell)} \\in \\mathbb{R}^{1 \\times h}$分别为第$\\ell$隐藏层的模型参数。\n",
27 | "\n",
28 | "最终,输出层的输出只需基于第$L$隐藏层的隐藏状态:\n",
29 | "\n",
30 | "$$\\boldsymbol{O}_t = \\boldsymbol{H}_t^{(L)} \\boldsymbol{W}_{hq} + \\boldsymbol{b}_q,$$\n",
31 | "\n",
32 | "其中权重$\\boldsymbol{W}_{hq} \\in \\mathbb{R}^{h \\times q}$和偏差$\\boldsymbol{b}_q \\in \\mathbb{R}^{1 \\times q}$为输出层的模型参数。\n",
33 | "\n",
34 | "同多层感知机一样,隐藏层个数$L$和隐藏单元个数$h$都是超参数。此外,如果将隐藏状态的计算换成门控循环单元或者长短期记忆的计算,我们可以得到深度门控循环神经网络。\n",
35 | "\n",
36 | "## 小结\n",
37 | "\n",
38 | "* 在深度循环神经网络中,隐藏状态的信息不断传递至当前层的下一时间步和当前时间步的下一层。\n",
39 | "\n",
40 | "\n",
41 | "## 练习\n",
42 | "\n",
43 | "* 将[“循环神经网络的从零开始实现”](rnn-scratch.ipynb)一节中的模型改为含有2个隐藏层的循环神经网络。观察并分析实验现象。\n",
44 | "\n",
45 | "\n",
46 | "\n",
47 | "\n",
48 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/6730)\n",
49 | "\n",
50 | ""
51 | ]
52 | }
53 | ],
54 | "metadata": {
55 | "kernelspec": {
56 | "display_name": "Python [conda env:pytorch]",
57 | "language": "python",
58 | "name": "conda-env-pytorch-py"
59 | },
60 | "language_info": {
61 | "codemirror_mode": {
62 | "name": "ipython",
63 | "version": 3
64 | },
65 | "file_extension": ".py",
66 | "mimetype": "text/x-python",
67 | "name": "python",
68 | "nbconvert_exporter": "python",
69 | "pygments_lexer": "ipython3",
70 | "version": "3.6.9"
71 | },
72 | "toc": {
73 | "base_numbering": 1,
74 | "nav_menu": {},
75 | "number_sections": true,
76 | "sideBar": true,
77 | "skip_h1_title": false,
78 | "title_cell": "Table of Contents",
79 | "title_sidebar": "Contents",
80 | "toc_cell": false,
81 | "toc_position": {},
82 | "toc_section_display": true,
83 | "toc_window_display": false
84 | }
85 | },
86 | "nbformat": 4,
87 | "nbformat_minor": 4
88 | }
89 |
--------------------------------------------------------------------------------
/chapter_recurrent-neural-networks/index.md:
--------------------------------------------------------------------------------
1 | # Recurrent Neural Networks
2 |
3 | Unlike the multilayer perceptrons introduced earlier and the convolutional neural networks that handle spatial information effectively, recurrent neural networks are designed to better handle sequential information. They introduce state variables that store past information, which is combined with the current input to determine the current output.
4 |
5 | Recurrent neural networks are commonly used to process sequence data, such as a passage of text or audio, the order of purchases or movie views, or even a row or column of pixels in an image. They therefore have extremely broad practical applications, such as language models, text classification, machine translation, speech recognition, image analysis, handwriting recognition, and recommender systems.
6 |
7 | Because the applications in this chapter are based on language models, we first introduce the basic concepts of language models and use them to motivate the design of recurrent neural networks. Next, we describe how gradients are computed in recurrent neural networks, to investigate problems that may arise when training them. Some of these problems can be addressed by the gated recurrent neural networks introduced later in this chapter. Finally, we extend the architecture of recurrent neural networks.
8 |
9 | ```eval_rst
10 |
11 | .. toctree::
12 | :maxdepth: 2
13 |
14 | lang-model
15 | rnn
16 | lang-model-dataset
17 | rnn-scratch
18 | rnn-nn
19 | bptt
20 | gru
21 | lstm
22 | deep-rnn
23 | bi-rnn
24 | ```
25 |
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/chapter_recurrent-neural-networks/lang-model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 语言模型\n",
8 | "\n",
9 | "语言模型(language model)是自然语言处理的重要技术。自然语言处理中最常见的数据是文本数据。我们可以把一段自然语言文本看作一段离散的时间序列。假设一段长度为$T$的文本中的词依次为$w_1, w_2, \\ldots, w_T$,那么在离散的时间序列中,$w_t$($1 \\leq t \\leq T$)可看作在时间步(time step)$t$的输出或标签。给定一个长度为$T$的词的序列$w_1, w_2, \\ldots, w_T$,语言模型将计算该序列的概率:\n",
10 | "\n",
11 | "$$P(w_1, w_2, \\ldots, w_T).$$\n",
12 | "\n",
13 | "\n",
14 | "语言模型可用于提升语音识别和机器翻译的性能。例如,在语音识别中,给定一段“厨房里食油用完了”的语音,有可能会输出“厨房里食油用完了”和“厨房里石油用完了”这两个读音完全一样的文本序列。如果语言模型判断出前者的概率大于后者的概率,我们就可以根据相同读音的语音输出“厨房里食油用完了”的文本序列。在机器翻译中,如果对英文“you go first”逐词翻译成中文的话,可能得到“你走先”“你先走”等排列方式的文本序列。如果语言模型判断出“你先走”的概率大于其他排列方式的文本序列的概率,我们就可以把“you go first”翻译成“你先走”。\n",
15 | "\n",
16 | "\n",
17 | "## 语言模型的计算\n",
18 | "\n",
19 | "\n",
20 | "既然语言模型很有用,那该如何计算它呢?假设序列$w_1, w_2, \\ldots, w_T$中的每个词是依次生成的,我们有\n",
21 | "\n",
22 | "$$P(w_1, w_2, \\ldots, w_T) = \\prod_{t=1}^T P(w_t \\mid w_1, \\ldots, w_{t-1}).$$\n",
23 | "\n",
24 | "例如,一段含有4个词的文本序列的概率\n",
25 | "\n",
26 | "$$P(w_1, w_2, w_3, w_4) = P(w_1) P(w_2 \\mid w_1) P(w_3 \\mid w_1, w_2) P(w_4 \\mid w_1, w_2, w_3).$$\n",
27 | "\n",
28 | "为了计算语言模型,我们需要计算词的概率,以及一个词在给定前几个词的情况下的条件概率,即语言模型参数。设训练数据集为一个大型文本语料库,如维基百科的所有条目。词的概率可以通过该词在训练数据集中的相对词频来计算。例如,$P(w_1)$可以计算为$w_1$在训练数据集中的词频(词出现的次数)与训练数据集的总词数之比。因此,根据条件概率定义,一个词在给定前几个词的情况下的条件概率也可以通过训练数据集中的相对词频计算。例如,$P(w_2 \\mid w_1)$可以计算为$w_1, w_2$两词相邻的频率与$w_1$词频的比值,因为该比值即$P(w_1, w_2)$与$P(w_1)$之比;而$P(w_3 \\mid w_1, w_2)$同理可以计算为$w_1$、$w_2$和$w_3$三词相邻的频率与$w_1$和$w_2$两词相邻的频率的比值。以此类推。\n",
29 | "\n",
30 | "\n",
31 | "## $n$元语法\n",
32 | "\n",
33 | "当序列长度增加时,计算和存储多个词共同出现的概率的复杂度会呈指数级增加。$n$元语法通过马尔可夫假设(虽然并不一定成立)简化了语言模型的计算。这里的马尔可夫假设是指一个词的出现只与前面$n$个词相关,即$n$阶马尔可夫链(Markov chain of order $n$)。如果$n=1$,那么有$P(w_3 \\mid w_1, w_2) = P(w_3 \\mid w_2)$。如果基于$n-1$阶马尔可夫链,我们可以将语言模型改写为\n",
34 | "\n",
35 | "$$P(w_1, w_2, \\ldots, w_T) \\approx \\prod_{t=1}^T P(w_t \\mid w_{t-(n-1)}, \\ldots, w_{t-1}) .$$\n",
36 | "\n",
37 | "\n",
38 | "以上也叫$n$元语法($n$-grams)。它是基于$n - 1$阶马尔可夫链的概率语言模型。当$n$分别为1、2和3时,我们将其分别称作一元语法(unigram)、二元语法(bigram)和三元语法(trigram)。例如,长度为4的序列$w_1, w_2, w_3, w_4$在一元语法、二元语法和三元语法中的概率分别为\n",
39 | "\n",
40 | "$$\n",
41 | "\\begin{aligned}\n",
42 | "P(w_1, w_2, w_3, w_4) &= P(w_1) P(w_2) P(w_3) P(w_4) ,\\\\\n",
43 | "P(w_1, w_2, w_3, w_4) &= P(w_1) P(w_2 \\mid w_1) P(w_3 \\mid w_2) P(w_4 \\mid w_3) ,\\\\\n",
44 | "P(w_1, w_2, w_3, w_4) &= P(w_1) P(w_2 \\mid w_1) P(w_3 \\mid w_1, w_2) P(w_4 \\mid w_2, w_3) .\n",
45 | "\\end{aligned}\n",
46 | "$$\n",
47 | "\n",
48 | "当$n$较小时,$n$元语法往往并不准确。例如,在一元语法中,由三个词组成的句子“你走先”和“你先走”的概率是一样的。然而,当$n$较大时,$n$元语法需要计算并存储大量的词频和多词相邻频率。\n",
49 | "\n",
50 | "那么,有没有方法在语言模型中更好地平衡以上这两点呢?我们将在本章探究这样的方法。\n",
51 | "\n",
52 | "## 小结\n",
53 | "\n",
54 | "* 语言模型是自然语言处理的重要技术。\n",
55 | "* $N$元语法是基于$n-1$阶马尔可夫链的概率语言模型,其中$n$权衡了计算复杂度和模型准确性。\n",
56 | "\n",
57 | "\n",
58 | "## 练习\n",
59 | "\n",
60 | "* 假设训练数据集中有10万个词,四元语法需要存储多少词频和多词相邻频率?\n",
61 | "* 你还能想到哪些语言模型的应用?\n",
62 | "\n",
63 | "\n",
64 | "\n",
65 | "\n",
66 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/6650)\n",
67 | "\n",
68 | ""
69 | ]
70 | }
71 | ],
72 | "metadata": {
73 | "kernelspec": {
74 | "display_name": "Python 3",
75 | "language": "python",
76 | "name": "python3"
77 | },
78 | "language_info": {
79 | "codemirror_mode": {
80 | "name": "ipython",
81 | "version": 3
82 | },
83 | "file_extension": ".py",
84 | "mimetype": "text/x-python",
85 | "name": "python",
86 | "nbconvert_exporter": "python",
87 | "pygments_lexer": "ipython3",
88 | "version": "3.7.4"
89 | },
90 | "toc": {
91 | "base_numbering": 1,
92 | "nav_menu": {},
93 | "number_sections": true,
94 | "sideBar": true,
95 | "skip_h1_title": false,
96 | "title_cell": "Table of Contents",
97 | "title_sidebar": "Contents",
98 | "toc_cell": false,
99 | "toc_position": {},
100 | "toc_section_display": true,
101 | "toc_window_display": false
102 | }
103 | },
104 | "nbformat": 4,
105 | "nbformat_minor": 4
106 | }
107 |
--------------------------------------------------------------------------------
/chapter_recurrent-neural-networks/rnn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 循环神经网络\n",
8 | "\n",
9 | "上一节介绍的$n$元语法中,时间步$t$的词$w_t$基于前面所有词的条件概率只考虑了最近时间步的$n-1$个词。如果要考虑比$t-(n-1)$更早时间步的词对$w_t$的可能影响,我们需要增大$n$。但这样模型参数的数量将随之呈指数级增长(可参考上一节的练习)。\n",
10 | "\n",
11 | "本节将介绍循环神经网络。它并非刚性地记忆所有固定长度的序列,而是通过隐藏状态来存储之前时间步的信息。首先我们回忆一下前面介绍过的多层感知机,然后描述如何添加隐藏状态来将它变成循环神经网络。\n",
12 | "\n",
13 | "\n",
14 | "## 不含隐藏状态的神经网络\n",
15 | "\n",
16 | "让我们考虑一个含单隐藏层的多层感知机。给定样本数为$n$、输入个数(特征数或特征向量维度)为$d$的小批量数据样本$\\boldsymbol{X} \\in \\mathbb{R}^{n \\times d}$。设隐藏层的激活函数为$\\phi$,那么隐藏层的输出$\\boldsymbol{H} \\in \\mathbb{R}^{n \\times h}$计算为\n",
17 | "\n",
18 | "$$\\boldsymbol{H} = \\phi(\\boldsymbol{X} \\boldsymbol{W}_{xh} + \\boldsymbol{b}_h),$$\n",
19 | "\n",
20 | "其中隐藏层权重参数$\\boldsymbol{W}_{xh} \\in \\mathbb{R}^{d \\times h}$,隐藏层偏差参数 $\\boldsymbol{b}_h \\in \\mathbb{R}^{1 \\times h}$,$h$为隐藏单元个数。上式相加的两项形状不同,因此将按照广播机制相加(参见[“数据操作”](../chapter_prerequisite/tensor.ipynb)一节)。把隐藏变量$\\boldsymbol{H}$作为输出层的输入,且设输出个数为$q$(如分类问题中的类别数),输出层的输出为\n",
21 | "\n",
22 | "$$\\boldsymbol{O} = \\boldsymbol{H} \\boldsymbol{W}_{hq} + \\boldsymbol{b}_q,$$\n",
23 | "\n",
24 | "其中输出变量$\\boldsymbol{O} \\in \\mathbb{R}^{n \\times q}$, 输出层权重参数$\\boldsymbol{W}_{hq} \\in \\mathbb{R}^{h \\times q}$, 输出层偏差参数$\\boldsymbol{b}_q \\in \\mathbb{R}^{1 \\times q}$。如果是分类问题,我们可以使用$\\text{softmax}(\\boldsymbol{O})$来计算输出类别的概率分布。\n",
25 | "\n",
26 | "\n",
27 | "## 含隐藏状态的循环神经网络\n",
28 | "\n",
29 | "现在我们考虑输入数据存在时间相关性的情况。假设$\\boldsymbol{X}_t \\in \\mathbb{R}^{n \\times d}$是序列中时间步$t$的小批量输入,$\\boldsymbol{H}_t \\in \\mathbb{R}^{n \\times h}$是该时间步的隐藏变量。与多层感知机不同的是,这里我们保存上一时间步的隐藏变量$\\boldsymbol{H}_{t-1}$,并引入一个新的权重参数$\\boldsymbol{W}_{hh} \\in \\mathbb{R}^{h \\times h}$,该参数用来描述在当前时间步如何使用上一时间步的隐藏变量。具体来说,时间步$t$的隐藏变量的计算由当前时间步的输入和上一时间步的隐藏变量共同决定:\n",
30 | "\n",
31 | "$$\\boldsymbol{H}_t = \\phi(\\boldsymbol{X}_t \\boldsymbol{W}_{xh} + \\boldsymbol{H}_{t-1} \\boldsymbol{W}_{hh} + \\boldsymbol{b}_h).$$\n",
32 | "\n",
33 | "与多层感知机相比,我们在这里添加了$\\boldsymbol{H}_{t-1} \\boldsymbol{W}_{hh}$一项。由上式中相邻时间步的隐藏变量$\\boldsymbol{H}_t$和$\\boldsymbol{H}_{t-1}$之间的关系可知,这里的隐藏变量能够捕捉截至当前时间步的序列的历史信息,就像是神经网络当前时间步的状态或记忆一样。因此,该隐藏变量也称为隐藏状态。由于隐藏状态在当前时间步的定义使用了上一时间步的隐藏状态,上式的计算是循环的。使用循环计算的网络即循环神经网络(recurrent neural network)。\n",
34 | "\n",
35 | "循环神经网络有很多种不同的构造方法。含上式所定义的隐藏状态的循环神经网络是极为常见的一种。若无特别说明,本章中的循环神经网络均基于上式中隐藏状态的循环计算。在时间步$t$,输出层的输出和多层感知机中的计算类似:\n",
36 | "\n",
37 | "$$\\boldsymbol{O}_t = \\boldsymbol{H}_t \\boldsymbol{W}_{hq} + \\boldsymbol{b}_q.$$\n",
38 | "\n",
39 | "循环神经网络的参数包括隐藏层的权重$\\boldsymbol{W}_{xh} \\in \\mathbb{R}^{d \\times h}$、$\\boldsymbol{W}_{hh} \\in \\mathbb{R}^{h \\times h}$和偏差 $\\boldsymbol{b}_h \\in \\mathbb{R}^{1 \\times h}$,以及输出层的权重$\\boldsymbol{W}_{hq} \\in \\mathbb{R}^{h \\times q}$和偏差$\\boldsymbol{b}_q \\in \\mathbb{R}^{1 \\times q}$。值得一提的是,即便在不同时间步,循环神经网络也始终使用这些模型参数。因此,循环神经网络模型参数的数量不随时间步的增加而增长。\n",
40 | "\n",
41 | "图6.1展示了循环神经网络在3个相邻时间步的计算逻辑。在时间步$t$,隐藏状态的计算可以看成是将输入$\\boldsymbol{X}_t$和前一时间步隐藏状态$\\boldsymbol{H}_{t-1}$连结后输入一个激活函数为$\\phi$的全连接层。该全连接层的输出就是当前时间步的隐藏状态$\\boldsymbol{H}_t$,且模型参数为$\\boldsymbol{W}_{xh}$与$\\boldsymbol{W}_{hh}$的连结,偏差为$\\boldsymbol{b}_h$。当前时间步$t$的隐藏状态$\\boldsymbol{H}_t$将参与下一个时间步$t+1$的隐藏状态$\\boldsymbol{H}_{t+1}$的计算,并输入到当前时间步的全连接输出层。\n",
42 | "\n",
43 | "\n",
44 | "\n",
45 | "我们刚刚提到,隐藏状态中$\\boldsymbol{X}_t \\boldsymbol{W}_{xh} + \\boldsymbol{H}_{t-1} \\boldsymbol{W}_{hh}$的计算等价于$\\boldsymbol{X}_t$与$\\boldsymbol{H}_{t-1}$连结后的矩阵乘以$\\boldsymbol{W}_{xh}$与$\\boldsymbol{W}_{hh}$连结后的矩阵。接下来,我们用一个具体的例子来验证这一点。首先,我们构造矩阵`X`、`W_xh`、`H`和`W_hh`,它们的形状分别为(3, 1)、(1, 4)、(3, 4)和(4, 4)。将`X`与`W_xh`、`H`与`W_hh`分别相乘,再把两个乘法运算的结果相加,得到形状为(3, 4)的矩阵。"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 1,
51 | "metadata": {},
52 | "outputs": [
53 | {
54 | "data": {
55 | "text/plain": [
56 | "tensor([[-0.3641, -1.9787, -1.7125, 3.6584],\n",
57 | " [-3.2291, 1.8014, 1.4115, 1.1779],\n",
58 | " [ 1.8880, -2.8201, -1.3570, -0.6905]])"
59 | ]
60 | },
61 | "execution_count": 1,
62 | "metadata": {},
63 | "output_type": "execute_result"
64 | }
65 | ],
66 | "source": [
67 | "import torch\n",
68 | "\n",
69 | "X, W_xh = torch.randn(3, 1), torch.randn(1, 4)\n",
70 | "H, W_hh = torch.randn(3, 4), torch.randn(4, 4)\n",
71 | "torch.mm(X, W_xh) + torch.mm(H, W_hh)"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "将矩阵`X`和`H`按列(维度1)连结,连结后的矩阵形状为(3, 5)。可见,连结后矩阵在维度1的长度为矩阵`X`和`H`在维度1的长度之和($1+4$)。然后,将矩阵`W_xh`和`W_hh`按行(维度0)连结,连结后的矩阵形状为(5, 4)。最后将两个连结后的矩阵相乘,得到与上面代码输出相同的形状为(3, 4)的矩阵。"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 2,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "data": {
88 | "text/plain": [
89 | "tensor([[-0.3641, -1.9787, -1.7125, 3.6584],\n",
90 | " [-3.2291, 1.8014, 1.4115, 1.1779],\n",
91 | " [ 1.8880, -2.8201, -1.3570, -0.6905]])"
92 | ]
93 | },
94 | "execution_count": 2,
95 | "metadata": {},
96 | "output_type": "execute_result"
97 | }
98 | ],
99 | "source": [
100 | "torch.mm(torch.cat((X, H), dim=1), torch.cat((W_xh, W_hh), dim=0))"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "## 应用:基于字符级循环神经网络的语言模型\n",
108 | "\n",
109 | "最后我们介绍如何应用循环神经网络来构建一个语言模型。设小批量中样本数为1,文本序列为“想”“要”“有”“直”“升”“机”。图6.2演示了如何使用循环神经网络基于当前和过去的字符来预测下一个字符。在训练时,我们对每个时间步的输出层输出使用softmax运算,然后使用交叉熵损失函数来计算它与标签的误差。在图6.2中,由于隐藏层中隐藏状态的循环计算,时间步3的输出$\\boldsymbol{O}_3$取决于文本序列“想”“要”“有”。 由于训练数据中该序列的下一个词为“直”,时间步3的损失将取决于该时间步基于序列“想”“要”“有”生成下一个词的概率分布与该时间步的标签“直”。\n",
110 | "\n",
111 | "\n",
112 | "\n",
113 | "因为每个输入词是一个字符,因此这个模型被称为字符级循环神经网络(character-level recurrent neural network)。因为不同字符的个数远小于不同词的个数(对于英文尤其如此),所以字符级循环神经网络的计算通常更加简单。在接下来的几节里,我们将介绍它的具体实现。\n",
114 | "\n",
115 | "\n",
116 | "## 小结\n",
117 | "\n",
118 | "* 使用循环计算的网络即循环神经网络。\n",
119 | "* 循环神经网络的隐藏状态可以捕捉截至当前时间步的序列的历史信息。\n",
120 | "* 循环神经网络模型参数的数量不随时间步的增加而增长。\n",
121 | "* 可以基于字符级循环神经网络来创建语言模型。\n",
122 | "\n",
123 | "## 练习\n",
124 | "\n",
125 | "* 如果使用循环神经网络来预测一段文本序列的下一个词,输出个数应该设为多少?\n",
126 | "* 为什么循环神经网络可以表达某时间步的词基于文本序列中所有过去的词的条件概率?\n",
127 | "\n",
128 | "\n",
129 | "\n",
130 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/6669)\n",
131 | "\n",
132 | ""
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "## 参考答案\n",
140 | "* 输出个数应该为文本序列中所有不同词的个数\n",
141 | "* 因为该时间步计算使用的隐藏变量$H_{t-1}$包含了文本序列中所有过去的词的信息"
142 | ]
143 | }
144 | ],
145 | "metadata": {
146 | "kernelspec": {
147 | "display_name": "Python [conda env:pytorch]",
148 | "language": "python",
149 | "name": "conda-env-pytorch-py"
150 | },
151 | "language_info": {
152 | "codemirror_mode": {
153 | "name": "ipython",
154 | "version": 3
155 | },
156 | "file_extension": ".py",
157 | "mimetype": "text/x-python",
158 | "name": "python",
159 | "nbconvert_exporter": "python",
160 | "pygments_lexer": "ipython3",
161 | "version": "3.6.9"
162 | },
163 | "toc": {
164 | "base_numbering": 1,
165 | "nav_menu": {},
166 | "number_sections": true,
167 | "sideBar": true,
168 | "skip_h1_title": false,
169 | "title_cell": "Table of Contents",
170 | "title_sidebar": "Contents",
171 | "toc_cell": false,
172 | "toc_position": {},
173 | "toc_section_display": true,
174 | "toc_window_display": false
175 | }
176 | },
177 | "nbformat": 4,
178 | "nbformat_minor": 4
179 | }
180 |
--------------------------------------------------------------------------------
/d2ltorch/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import *
2 |
3 | __version__ = '0.8.11'
4 |
--------------------------------------------------------------------------------
/data/fr-en-small.txt:
--------------------------------------------------------------------------------
1 | elle est vieille . she is old .
2 | elle est tranquille . she is quiet .
3 | elle a tort . she is wrong .
4 | elle est canadienne . she is canadian .
5 | elle est japonaise . she is japanese .
6 | ils sont russes . they are russian .
7 | ils se disputent . they are arguing .
8 | ils regardent . they are watching .
9 | ils sont acteurs . they are actors .
10 | elles sont crevees . they are exhausted .
11 | il est mon genre ! he is my type !
12 | il a des ennuis . he is in trouble .
13 | c est mon frere . he is my brother .
14 | c est mon oncle . he is my uncle .
15 | il a environ mon age . he is about my age .
16 | elles sont toutes deux bonnes . they are both good .
17 | elle est bonne nageuse . she is a good swimmer .
18 | c est une personne adorable . he is a lovable person .
19 | il fait du velo . he is riding a bicycle .
20 | ils sont de grands amis . they are great friends .
21 |
--------------------------------------------------------------------------------
/data/jaychou_lyrics.txt.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/data/jaychou_lyrics.txt.zip
--------------------------------------------------------------------------------
/data/kaggle_cifar10/test_tiny.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/data/kaggle_cifar10/test_tiny.zip
--------------------------------------------------------------------------------
/data/kaggle_cifar10/trainLabels.csv.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/data/kaggle_cifar10/trainLabels.csv.zip
--------------------------------------------------------------------------------
/data/kaggle_cifar10/train_tiny.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/data/kaggle_cifar10/train_tiny.zip
--------------------------------------------------------------------------------
/data/kaggle_dog/train_valid_test_tiny.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/data/kaggle_dog/train_valid_test_tiny.zip
--------------------------------------------------------------------------------
/data/ptb.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/data/ptb.zip
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: pytorch
2 | dependencies:
3 | - python=3.6
4 | - pip:
5 | - torch==1.1.0
6 | - torchvision==0.3.0
7 | - torchtext==0.3.1
8 | - d2ltorch==0.0.2
9 | - jupyter==1.0.0
10 | - matplotlib==2.2.2
11 | - pandas==0.23.4
12 | - git+https://github.com/sangyx/pytorch-summary
13 |
--------------------------------------------------------------------------------
/img/404.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/404.jpg
--------------------------------------------------------------------------------
/img/autumn_oak.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/autumn_oak.jpg
--------------------------------------------------------------------------------
/img/aws.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/aws.png
--------------------------------------------------------------------------------
/img/cat1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/cat1.jpg
--------------------------------------------------------------------------------
/img/catdog.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/catdog.jpg
--------------------------------------------------------------------------------
/img/cifar10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/cifar10.png
--------------------------------------------------------------------------------
/img/connect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/connect.png
--------------------------------------------------------------------------------
/img/contrib01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/contrib01.png
--------------------------------------------------------------------------------
/img/contrib02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/contrib02.png
--------------------------------------------------------------------------------
/img/contrib03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/contrib03.png
--------------------------------------------------------------------------------
/img/contrib04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/contrib04.png
--------------------------------------------------------------------------------
/img/contrib05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/contrib05.png
--------------------------------------------------------------------------------
/img/contrib06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/contrib06.png
--------------------------------------------------------------------------------
/img/cuda.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/cuda.png
--------------------------------------------------------------------------------
/img/disk.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/disk.png
--------------------------------------------------------------------------------
/img/ec2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/ec2.png
--------------------------------------------------------------------------------
/img/gtx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/gtx.png
--------------------------------------------------------------------------------
/img/house_pricing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/house_pricing.png
--------------------------------------------------------------------------------
/img/install_gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/install_gpu.png
--------------------------------------------------------------------------------
/img/jupyter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/jupyter.png
--------------------------------------------------------------------------------
/img/jupyter00.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/jupyter00.png
--------------------------------------------------------------------------------
/img/jupyter01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/jupyter01.png
--------------------------------------------------------------------------------
/img/jupyter02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/jupyter02.png
--------------------------------------------------------------------------------
/img/jupyter03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/jupyter03.png
--------------------------------------------------------------------------------
/img/jupyter04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/jupyter04.png
--------------------------------------------------------------------------------
/img/jupyter05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/jupyter05.png
--------------------------------------------------------------------------------
/img/jupyter06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/jupyter06.png
--------------------------------------------------------------------------------
/img/kaggle-dog.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/kaggle-dog.png
--------------------------------------------------------------------------------
/img/kaggle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/kaggle.png
--------------------------------------------------------------------------------
/img/kaggle_cifar10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/kaggle_cifar10.png
--------------------------------------------------------------------------------
/img/kaggle_submit2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/kaggle_submit2.png
--------------------------------------------------------------------------------
/img/keypair.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/keypair.png
--------------------------------------------------------------------------------
/img/koebel.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/koebel.jpg
--------------------------------------------------------------------------------
/img/launching.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/launching.png
--------------------------------------------------------------------------------
/img/limits.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/limits.png
--------------------------------------------------------------------------------
/img/neural-style-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/neural-style-1.png
--------------------------------------------------------------------------------
/img/neural-style-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/neural-style-2.png
--------------------------------------------------------------------------------
/img/ones_like.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/ones_like.png
--------------------------------------------------------------------------------
/img/os.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/os.png
--------------------------------------------------------------------------------
/img/p2x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/p2x.png
--------------------------------------------------------------------------------
/img/pikachu.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/pikachu.jpg
--------------------------------------------------------------------------------
/img/pytorch-website.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/pytorch-website.png
--------------------------------------------------------------------------------
/img/qq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/qq.png
--------------------------------------------------------------------------------
/img/rainier.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/rainier.jpg
--------------------------------------------------------------------------------
/img/ssh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangyx/d2l-torch/31b757807e3ff637436765c1dff09315d97dcff8/img/ssh.png
--------------------------------------------------------------------------------
/todo/chapter_computational-performance/auto-parallelism.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 自动并行计算\n",
8 | "\n",
9 | "MXNet后端会自动构建计算图。通过计算图,系统可以知道所有计算的依赖关系,并可以选择将没有依赖关系的多个任务并行执行来获得计算性能的提升。例如[“异步计算”](async-computation.ipynb)一节的第一个例子里依次执行了`a = nd.ones((1, 2))`和`b = nd.ones((1, 2))`。这两步计算之间并没有依赖关系,因此系统可以选择并行执行它们。\n",
10 | "\n",
11 | "通常,一个运算符会用到所有CPU或单块GPU上全部的计算资源。例如,`dot`运算符会用到所有CPU(即使是一台机器上有多个CPU处理器)或单块GPU上所有的线程。如果每个运算符的计算量足够大,只在CPU上或者单块GPU上并行运行多个运算符时,每个运算符的运行只分到CPU或单块GPU上部分计算资源。即使这些计算可以并行,最终计算性能的提升可能也并不明显。本节中探讨的自动并行计算主要关注同时使用CPU和GPU的并行计算,以及计算和通信的并行。\n",
12 | "\n",
13 | "首先导入本节中实验所需的包或模块。注意,需要至少一块GPU才能运行本节实验。"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 1,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "import d2lzh as d2l\n",
23 | "import mxnet as mx\n",
24 | "from mxnet import nd"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## CPU和GPU的并行计算\n",
32 | "\n",
33 | "我们先介绍CPU和GPU的并行计算,例如,程序中的计算既发生在CPU上,又发生在GPU上。先定义`run`函数,令它做10次矩阵乘法。"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "def run(x):\n",
43 | " return [nd.dot(x, x) for _ in range(10)]"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "接下来,分别在内存和显存上创建`NDArray`。"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 3,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "x_cpu = nd.random.uniform(shape=(2000, 2000))\n",
60 | "x_gpu = nd.random.uniform(shape=(6000, 6000), ctx=mx.gpu(0))"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "然后,分别使用它们在CPU和GPU上运行`run`函数并打印运行所需时间。"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 4,
73 | "metadata": {},
74 | "outputs": [
75 | {
76 | "name": "stdout",
77 | "output_type": "stream",
78 | "text": [
79 | "Run on CPU. time: 0.6203 sec\n"
80 | ]
81 | },
82 | {
83 | "name": "stdout",
84 | "output_type": "stream",
85 | "text": [
86 | "Then run on GPU. time: 0.3033 sec\n"
87 | ]
88 | }
89 | ],
90 | "source": [
91 | "run(x_cpu) # 预热开始\n",
92 | "run(x_gpu)\n",
93 | "nd.waitall() # 预热结束\n",
94 | "\n",
95 | "with d2l.Benchmark('Run on CPU.'):\n",
96 | " run(x_cpu)\n",
97 | " nd.waitall()\n",
98 | "\n",
99 | "with d2l.Benchmark('Then run on GPU.'):\n",
100 | " run(x_gpu)\n",
101 | " nd.waitall()"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {},
107 | "source": [
108 | "我们去掉`run(x_cpu)`和`run(x_gpu)`这两个计算任务之间的`waitall`同步函数,并希望系统能自动并行这两个任务。"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 5,
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "name": "stdout",
118 | "output_type": "stream",
119 | "text": [
120 | "Run on both CPU and GPU in parallel. time: 0.6193 sec\n"
121 | ]
122 | }
123 | ],
124 | "source": [
125 | "with d2l.Benchmark('Run on both CPU and GPU in parallel.'):\n",
126 | " run(x_cpu)\n",
127 | " run(x_gpu)\n",
128 | " nd.waitall()"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "可以看到,当两个计算任务一起执行时,执行总时间小于它们分开执行的总和。这表明,MXNet能有效地在CPU和GPU上自动并行计算。\n",
136 | "\n",
137 | "\n",
138 | "## 计算和通信的并行计算\n",
139 | "\n",
140 | "在同时使用CPU和GPU的计算中,经常需要在内存和显存之间复制数据,造成数据的通信。在下面的例子中,我们在GPU上计算,然后将结果复制回CPU使用的内存。我们分别打印GPU上计算时间和显存到内存的通信时间。"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 6,
146 | "metadata": {},
147 | "outputs": [
148 | {
149 | "name": "stdout",
150 | "output_type": "stream",
151 | "text": [
152 | "Run on GPU. time: 0.3038 sec\n"
153 | ]
154 | },
155 | {
156 | "name": "stdout",
157 | "output_type": "stream",
158 | "text": [
159 | "Then copy to CPU. time: 1.0442 sec\n"
160 | ]
161 | }
162 | ],
163 | "source": [
164 | "def copy_to_cpu(x):\n",
165 | " return [y.copyto(mx.cpu()) for y in x]\n",
166 | "\n",
167 | "with d2l.Benchmark('Run on GPU.'):\n",
168 | " y = run(x_gpu)\n",
169 | " nd.waitall()\n",
170 | "\n",
171 | "with d2l.Benchmark('Then copy to CPU.'):\n",
172 | " copy_to_cpu(y)\n",
173 | " nd.waitall()"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "我们去掉计算和通信之间的`waitall`同步函数,打印这两个任务完成的总时间。"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 7,
186 | "metadata": {},
187 | "outputs": [
188 | {
189 | "name": "stdout",
190 | "output_type": "stream",
191 | "text": [
192 | "Run and copy in parallel. time: 1.0701 sec\n"
193 | ]
194 | }
195 | ],
196 | "source": [
197 | "with d2l.Benchmark('Run and copy in parallel.'):\n",
198 | " y = run(x_gpu)\n",
199 | " copy_to_cpu(y)\n",
200 | " nd.waitall()"
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {},
206 | "source": [
207 | "可以看到,执行计算和通信的总时间小于两者分别执行的耗时之和。需要注意的是,这个计算并通信的任务不同于本节之前介绍的同时使用CPU和GPU并行计算的任务。这里的运行和通信之间有依赖关系:`y[i]`必须先在GPU上计算好才能复制到CPU使用的内存。所幸的是,在计算`y[i]`的时候系统可以复制`y[i-1]`,从而减少计算和通信的总运行时间。\n",
208 | "\n",
209 | "## 小结\n",
210 | "\n",
211 | "* MXNet能够通过自动并行计算提升计算性能,例如CPU和GPU的并行计算以及计算和通信的并行。\n",
212 | "\n",
213 | "\n",
214 | "## 练习\n",
215 | "\n",
216 | "* 本节中定义的`run`函数里做了10次运算。它们之间也没有依赖关系。设计实验,看看MXNet有没有自动并行执行它们。\n",
217 | "* 设计包含更加复杂的数据依赖的计算任务,通过实验观察MXNet能否得到正确的结果并提升计算性能。\n",
218 | "* 当运算符的计算量足够小时,仅在CPU或单块GPU上并行计算也可能提升计算性能。设计实验来验证这一点。\n",
219 | "\n",
220 | "\n",
221 | "\n",
222 | "\n",
223 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/1883)\n",
224 | "\n",
225 | ""
226 | ]
227 | }
228 | ],
229 | "metadata": {
230 | "language_info": {
231 | "name": "python"
232 | }
233 | },
234 | "nbformat": 4,
235 | "nbformat_minor": 2
236 | }
--------------------------------------------------------------------------------
/todo/chapter_computational-performance/index.md:
--------------------------------------------------------------------------------
1 | # Computational Performance
2 |
3 | In deep learning, data sets are usually large and model computation is complex, so we care a great deal about computational performance. This chapter focuses on the key factors that affect it: imperative programming, symbolic programming, asynchronous computation, automatic parallel computation, and multi-GPU computation. Studying this chapter will most likely let you further improve the computational performance of the models implemented in earlier chapters, for example by reducing training time without affecting model accuracy.
4 |
5 | ```eval_rst
6 |
7 | .. toctree::
8 | :maxdepth: 2
9 |
10 | hybridize
11 | async-computation
12 | auto-parallelism
13 | multiple-gpus
14 | multiple-gpus-gluon
15 | ```
16 |
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/todo/chapter_computer-vision/index.md:
--------------------------------------------------------------------------------
1 | # Computer Vision
2 |
3 | Whether it is medical diagnosis, self-driving vehicles, camera surveillance, or smart filters, many applications in computer vision are closely tied to our lives now and in the future. In recent years, deep learning has dramatically advanced the performance of computer vision systems. It is fair to say that the most advanced computer vision applications today are almost inseparable from deep learning. This chapter therefore focuses on the field of computer vision and selects methods and applications that are influential in academia and industry to showcase the power of deep learning.
4 |
5 | In the "Convolutional Neural Networks" chapter we introduced the deep learning models commonly used in computer vision and practiced on a simple image classification task. At the start of this chapter we introduce two methods that help improve a model's generalization, image augmentation and fine-tuning, and apply them to image classification. Because deep neural networks can represent images effectively level by level, such representations have been used with great success in mainstream computer vision tasks such as object detection, semantic segmentation, and style transfer. Following this key idea, we first describe the workflow and the main families of methods for object detection. Then we explore how to use fully convolutional networks for semantic segmentation of images. Next, we explain how to use style transfer techniques to generate images like the cover of this book. Finally, we apply the material of this and earlier chapters on two important computer vision data sets.
6 |
7 | ```eval_rst
8 |
9 | .. toctree::
10 | :maxdepth: 2
11 |
12 | image-augmentation
13 | fine-tuning
14 | bounding-box
15 | anchor
16 | multiscale-object-detection
17 | object-detection-dataset
18 | ssd
19 | rcnn
20 | semantic-segmentation-and-dataset
21 | fcn
22 | neural-style
23 | kaggle-gluon-cifar10
24 | kaggle-gluon-dog
25 | ```
26 |
27 |
28 |
29 |
30 |
--------------------------------------------------------------------------------
/todo/chapter_computer-vision/rcnn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 区域卷积神经网络(R-CNN)系列\n",
8 | "\n",
9 | "\n",
10 | "区域卷积神经网络(region-based CNN或regions with CNN features,R-CNN)是将深度模型应用于目标检测的开创性工作之一 [1]。在本节中,我们将介绍R-CNN和它的一系列改进方法:快速的R-CNN(Fast R-CNN)[3]、更快的R-CNN(Faster R-CNN)[4] 以及掩码R-CNN(Mask R-CNN)[5]。限于篇幅,这里只介绍这些模型的设计思路。\n",
11 | "\n",
12 | "\n",
13 | "## R-CNN\n",
14 | "\n",
15 | "R-CNN首先对图像选取若干提议区域(如锚框也是一种选取方法)并标注它们的类别和边界框(如偏移量)。然后,用卷积神经网络对每个提议区域做前向计算抽取特征。之后,我们用每个提议区域的特征预测类别和边界框。图9.5描述了R-CNN模型。\n",
16 | "\n",
17 | "\n",
18 | "\n",
19 | "具体来说,R-CNN主要由以下4步构成。\n",
20 | "\n",
21 | "1. 对输入图像使用选择性搜索(selective search)来选取多个高质量的提议区域 [2]。这些提议区域通常是在多个尺度下选取的,并具有不同的形状和大小。每个提议区域将被标注类别和真实边界框。\n",
22 | "1. 选取一个预训练的卷积神经网络,并将其在输出层之前截断。将每个提议区域变形为网络需要的输入尺寸,并通过前向计算输出抽取的提议区域特征。\n",
23 | "1. 将每个提议区域的特征连同其标注的类别作为一个样本,训练多个支持向量机对目标分类。其中每个支持向量机用来判断样本是否属于某一个类别。\n",
24 | "1. 将每个提议区域的特征连同其标注的边界框作为一个样本,训练线性回归模型来预测真实边界框。\n",
25 | "\n",
26 | "R-CNN虽然通过预训练的卷积神经网络有效抽取了图像特征,但它的主要缺点是速度慢。想象一下,我们可能从一张图像中选出上千个提议区域,对该图像做目标检测将导致上千次的卷积神经网络的前向计算。这个巨大的计算量令R-CNN难以在实际应用中被广泛采用。\n",
27 | "\n",
28 | "\n",
29 | "## Fast R-CNN\n",
30 | "\n",
31 | "R-CNN的主要性能瓶颈在于需要对每个提议区域独立抽取特征。由于这些区域通常有大量重叠,独立的特征抽取会导致大量的重复计算。Fast R-CNN对R-CNN的一个主要改进在于只对整个图像做卷积神经网络的前向计算。\n",
32 | "\n",
33 | "图9.6描述了Fast R-CNN模型。\n",
34 | "\n",
35 | "\n",
36 | "\n",
37 | "它的主要计算步骤如下。\n",
38 | "\n",
39 | "1. 与R-CNN相比,Fast R-CNN用来提取特征的卷积神经网络的输入是整个图像,而不是各个提议区域。而且,这个网络通常会参与训练,即更新模型参数。设输入为一张图像,将卷积神经网络的输出的形状记为$1 \\times c \\times h_1 \\times w_1$。\n",
40 | "1. 假设选择性搜索生成$n$个提议区域。这些形状各异的提议区域在卷积神经网络的输出上分别标出形状各异的兴趣区域。这些兴趣区域需要抽取出形状相同的特征(假设高和宽均分别指定为$h_2$和$w_2$)以便于连结后输出。Fast R-CNN引入兴趣区域池化(region of interest pooling,RoI池化)层,将卷积神经网络的输出和提议区域作为输入,输出连结后的各个提议区域抽取的特征,形状为$n \\times c \\times h_2 \\times w_2$。\n",
41 | "1. 通过全连接层将输出形状变换为$n \\times d$,其中超参数$d$取决于模型设计。\n",
42 | "1. 预测类别时,将全连接层的输出的形状再变换为$n \\times q$并使用softmax回归($q$为类别个数)。预测边界框时,将全连接层的输出的形状变换为$n \\times 4$。也就是说,我们为每个提议区域预测类别和边界框。\n",
43 | "\n",
44 | "Fast R-CNN中提出的兴趣区域池化层与我们在[“池化层”](../chapter_convolutional-neural-networks/pooling.ipynb)一节介绍过的池化层有所不同。在池化层中,我们通过设置池化窗口、填充和步幅来控制输出形状。而兴趣区域池化层对每个区域的输出形状是可以直接指定的,例如,指定每个区域输出的高和宽分别为$h_2$和$w_2$。假设某一兴趣区域窗口的高和宽分别为$h$和$w$,该窗口将被划分为形状为$h_2 \\times w_2$的子窗口网格,且每个子窗口的大小大约为$(h/h_2) \\times (w/w_2)$。任一子窗口的高和宽要取整,其中的最大元素作为该子窗口的输出。因此,兴趣区域池化层可从形状各异的兴趣区域中均抽取出形状相同的特征。\n",
45 | "\n",
46 | "图9.7中,我们在$4 \\times 4$的输入上选取了左上角的$3\\times 3$区域作为兴趣区域。对于该兴趣区域,我们通过$2\\times 2$兴趣区域池化层得到一个$2\\times 2$的输出。4个划分后的子窗口分别含有元素0、1、4、5(5最大),2、6(6最大),8、9(9最大),10。\n",
47 | "\n",
48 | "\n",
49 | "\n",
50 | "我们使用`ROIPooling`函数来演示兴趣区域池化层的计算。假设卷积神经网络抽取的特征`X`的高和宽均为4且只有单通道。"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 1,
56 | "metadata": {
57 | "attributes": {
58 | "classes": [],
59 | "id": "",
60 | "n": "4"
61 | }
62 | },
63 | "outputs": [
64 | {
65 | "data": {
66 | "text/plain": [
67 | "\n",
68 | "[[[[ 0. 1. 2. 3.]\n",
69 | " [ 4. 5. 6. 7.]\n",
70 | " [ 8. 9. 10. 11.]\n",
71 | " [12. 13. 14. 15.]]]]\n",
72 | ""
73 | ]
74 | },
75 | "execution_count": 1,
76 | "metadata": {},
77 | "output_type": "execute_result"
78 | }
79 | ],
80 | "source": [
81 | "from mxnet import nd\n",
82 | "\n",
83 | "X = nd.arange(16).reshape((1, 1, 4, 4))\n",
84 | "X"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {},
90 | "source": [
91 | "假设图像的高和宽均为40像素。再假设选择性搜索在图像上生成了两个提议区域:每个区域由5个元素表示,分别为区域目标类别、左上角的$x$和$y$轴坐标以及右下角的$x$和$y$轴坐标。"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 2,
97 | "metadata": {
98 | "attributes": {
99 | "classes": [],
100 | "id": "",
101 | "n": "5"
102 | }
103 | },
104 | "outputs": [],
105 | "source": [
106 | "rois = nd.array([[0, 0, 0, 20, 20], [0, 0, 10, 30, 30]])"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "由于`X`的高和宽是图像的高和宽的$1/10$,以上两个提议区域中的坐标先按`spatial_scale`自乘0.1,然后在`X`上分别标出兴趣区域`X[:,:,0:3,0:3]`和`X[:,:,1:4,0:4]`。最后对这两个兴趣区域分别划分子窗口网格并抽取高和宽为2的特征。"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 3,
119 | "metadata": {
120 | "attributes": {
121 | "classes": [],
122 | "id": "",
123 | "n": "6"
124 | }
125 | },
126 | "outputs": [
127 | {
128 | "data": {
129 | "text/plain": [
130 | "\n",
131 | "[[[[ 5. 6.]\n",
132 | " [ 9. 10.]]]\n",
133 | "\n",
134 | "\n",
135 | " [[[ 9. 11.]\n",
136 | " [13. 15.]]]]\n",
137 | ""
138 | ]
139 | },
140 | "execution_count": 3,
141 | "metadata": {},
142 | "output_type": "execute_result"
143 | }
144 | ],
145 | "source": [
146 | "nd.ROIPooling(X, rois, pooled_size=(2, 2), spatial_scale=0.1)"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "## Faster R-CNN\n",
154 | "\n",
155 | "Fast R-CNN通常需要在选择性搜索中生成较多的提议区域,以获得较精确的目标检测结果。Faster R-CNN提出将选择性搜索替换成区域提议网络(region proposal network),从而减少提议区域的生成数量,并保证目标检测的精度。\n",
156 | "\n",
157 | "\n",
158 | "\n",
159 | "\n",
160 | "\n",
161 | "图9.8描述了Faster R-CNN模型。与Fast R-CNN相比,只有生成提议区域的方法从选择性搜索变成了区域提议网络,而其他部分均保持不变。具体来说,区域提议网络的计算步骤如下。\n",
162 | "\n",
163 | "1. 使用填充为1的$3\\times 3$卷积层变换卷积神经网络的输出,并将输出通道数记为$c$。这样,卷积神经网络为图像抽取的特征图中的每个单元均得到一个长度为$c$的新特征。\n",
164 | "1. 以特征图每个单元为中心,生成多个不同大小和宽高比的锚框并标注它们。\n",
165 | "1. 用锚框中心单元长度为$c$的特征分别预测该锚框的二元类别(含目标还是背景)和边界框。\n",
166 | "1. 使用非极大值抑制,从预测类别为目标的预测边界框中移除相似的结果。最终输出的预测边界框即兴趣区域池化层所需要的提议区域。\n",
167 | "\n",
168 | "\n",
169 | "值得一提的是,区域提议网络作为Faster R-CNN的一部分,是和整个模型一起训练得到的。也就是说,Faster R-CNN的目标函数既包括目标检测中的类别和边界框预测,又包括区域提议网络中锚框的二元类别和边界框预测。最终,区域提议网络能够学习到如何生成高质量的提议区域,从而在减少提议区域数量的情况下也能保证目标检测的精度。\n",
170 | "\n",
171 | "\n",
172 | "## Mask R-CNN\n",
173 | "\n",
174 | "如果训练数据还标注了每个目标在图像上的像素级位置,那么Mask R-CNN能有效利用这些详尽的标注信息进一步提升目标检测的精度。\n",
175 | "\n",
176 | "\n",
177 | "\n",
178 | "如图9.9所示,Mask R-CNN在Faster R-CNN的基础上做了修改。Mask R-CNN将兴趣区域池化层替换成了兴趣区域对齐层,即通过双线性插值(bilinear interpolation)来保留特征图上的空间信息,从而更适于像素级预测。兴趣区域对齐层的输出包含了所有兴趣区域的形状相同的特征图。它们既用来预测兴趣区域的类别和边界框,又通过额外的全卷积网络预测目标的像素级位置。我们将在[“全卷积网络(FCN)”](fcn.ipynb)一节介绍如何使用全卷积网络预测图像中像素级的语义。\n",
179 | "\n",
180 | "\n",
181 | "\n",
182 | "## 小结\n",
183 | "\n",
184 | "* R-CNN对图像选取若干提议区域,然后用卷积神经网络对每个提议区域做前向计算抽取特征,再用这些特征预测提议区域的类别和边界框。\n",
185 | "* Fast R-CNN对R-CNN的一个主要改进在于只对整个图像做卷积神经网络的前向计算。它引入了兴趣区域池化层,从而令兴趣区域能够抽取出形状相同的特征。\n",
186 | "* Faster R-CNN将Fast R-CNN中的选择性搜索替换成区域提议网络,从而减少提议区域的生成数量,并保证目标检测的精度。\n",
187 | "* Mask R-CNN在Faster R-CNN基础上引入一个全卷积网络,从而借助目标的像素级位置进一步提升目标检测的精度。\n",
188 | "\n",
189 | "\n",
190 | "## 练习\n",
191 | "\n",
192 | "* 了解GluonCV工具包中有关本节中各个模型的实现 [6]。\n",
193 | "\n",
194 | "\n",
195 | "\n",
196 | "\n",
197 | "\n",
198 | "## 参考文献\n",
199 | "\n",
200 | "[1] Girshick, R., Donahue, J., Darrell, T., & Malik, J. (2014). Rich feature hierarchies for accurate object detection and semantic segmentation. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 580-587).\n",
201 | "\n",
202 | "[2] Uijlings, J. R., Van De Sande, K. E., Gevers, T., & Smeulders, A. W. (2013). Selective search for object recognition. International journal of computer vision, 104(2), 154-171.\n",
203 | "\n",
204 | "[3] Girshick, R. (2015). Fast r-cnn. arXiv preprint arXiv:1504.08083.\n",
205 | "\n",
206 | "[4] Ren, S., He, K., Girshick, R., & Sun, J. (2015). Faster r-cnn: Towards real-time object detection with region proposal networks. In Advances in neural information processing systems (pp. 91-99).\n",
207 | "\n",
208 | "[5] He, K., Gkioxari, G., Dollár, P., & Girshick, R. (2017, October). Mask r-cnn. In Computer Vision (ICCV), 2017 IEEE International Conference on (pp. 2980-2988). IEEE.\n",
209 | "\n",
210 | "[6] GluonCV 工具包。https://gluon-cv.mxnet.io/\n",
211 | "\n",
212 | "## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/7219)\n",
213 | "\n",
214 | ""
215 | ]
216 | }
217 | ],
218 | "metadata": {
219 | "language_info": {
220 | "name": "python"
221 | }
222 | },
223 | "nbformat": 4,
224 | "nbformat_minor": 2
225 | }
--------------------------------------------------------------------------------