├── README.md ├── TERMINOLOGY.ipynb ├── chapter_appendix-tools-for-deep-learning ├── aws.ipynb ├── contributing.ipynb ├── d2l.ipynb ├── index.ipynb ├── jupyter.ipynb ├── sagemaker.ipynb └── selecting-servers-gpus.ipynb ├── chapter_attention-mechanisms ├── attention-cues.ipynb ├── attention-scoring-functions.ipynb ├── bahdanau-attention.ipynb ├── index.ipynb ├── multihead-attention.ipynb ├── nadaraya-waston.ipynb ├── self-attention-and-positional-encoding.ipynb └── transformer.ipynb ├── chapter_computational-performance ├── async-computation.ipynb ├── auto-parallelism.ipynb ├── hardware.ipynb ├── hybridize.ipynb ├── index.ipynb ├── multiple-gpus-concise.ipynb ├── multiple-gpus.ipynb ├── my_mlp └── parameterserver.ipynb ├── chapter_computer-vision ├── anchor.ipynb ├── bounding-box.ipynb ├── fcn.ipynb ├── fine-tuning.ipynb ├── image-augmentation.ipynb ├── index.ipynb ├── kaggle-cifar10.ipynb ├── kaggle-dog.ipynb ├── multiscale-object-detection.ipynb ├── neural-style.ipynb ├── object-detection-dataset.ipynb ├── rcnn.ipynb ├── semantic-segmentation-and-dataset.ipynb ├── ssd.ipynb ├── submission.csv └── transposed-conv.ipynb ├── chapter_convolutional-modern ├── alexnet.ipynb ├── batch-norm.ipynb ├── densenet.ipynb ├── googlenet.ipynb ├── index.ipynb ├── nin.ipynb ├── resnet.ipynb └── vgg.ipynb ├── chapter_convolutional-neural-networks ├── channels.ipynb ├── conv-layer.ipynb ├── index.ipynb ├── lenet.ipynb ├── padding-and-strides.ipynb ├── pooling.ipynb └── why-conv.ipynb ├── chapter_deep-learning-computation ├── custom-layer.ipynb ├── deferred-init.ipynb ├── index.ipynb ├── mlp.params ├── model-construction.ipynb ├── mydict ├── parameters.ipynb ├── read-write.ipynb ├── use-gpu.ipynb ├── x-file └── x-files ├── chapter_installation └── index.ipynb ├── chapter_introduction └── index.ipynb ├── chapter_linear-networks ├── image-classification-dataset.ipynb ├── index.ipynb ├── linear-regression-concise.ipynb ├── linear-regression-scratch.ipynb ├── linear-regression.ipynb ├── softmax-regression-concise.ipynb ├── softmax-regression-scratch.ipynb └── softmax-regression.ipynb ├── chapter_multilayer-perceptrons ├── backprop.ipynb ├── dropout.ipynb ├── environment.ipynb ├── index.ipynb ├── kaggle-house-price.ipynb ├── mlp-concise.ipynb ├── mlp-scratch.ipynb ├── mlp.ipynb ├── numerical-stability-and-init.ipynb ├── submission.csv ├── underfit-overfit.ipynb └── weight-decay.ipynb ├── chapter_natural-language-processing-applications ├── finetuning-bert.ipynb ├── index.ipynb ├── natural-language-inference-and-dataset.ipynb ├── natural-language-inference-attention.ipynb ├── natural-language-inference-bert.ipynb ├── sentiment-analysis-and-dataset.ipynb ├── sentiment-analysis-cnn.ipynb └── sentiment-analysis-rnn.ipynb ├── chapter_natural-language-processing-pretraining ├── approx-training.ipynb ├── bert-dataset.ipynb ├── bert-pretraining.ipynb ├── bert.ipynb ├── glove.ipynb ├── index.ipynb ├── similarity-analogy.ipynb ├── subword-embedding.ipynb ├── word-embedding-dataset.ipynb ├── word2vec-pretraining.ipynb └── word2vec.ipynb ├── chapter_notation └── index.ipynb ├── chapter_optimization ├── adadelta.ipynb ├── adagrad.ipynb ├── adam.ipynb ├── convexity.ipynb ├── gd.ipynb ├── index.ipynb ├── lr-scheduler.ipynb ├── minibatch-sgd.ipynb ├── momentum.ipynb ├── optimization-intro.ipynb ├── rmsprop.ipynb └── sgd.ipynb ├── chapter_preface └── index.ipynb ├── chapter_preliminaries ├── autograd.ipynb ├── calculus.ipynb ├── index.ipynb ├── linear-algebra.ipynb ├── lookup-api.ipynb ├── ndarray.ipynb ├── pandas.ipynb └── 
probability.ipynb ├── chapter_recurrent-modern ├── beam-search.ipynb ├── bi-rnn.ipynb ├── deep-rnn.ipynb ├── encoder-decoder.ipynb ├── gru.ipynb ├── index.ipynb ├── lstm.ipynb ├── machine-translation-and-dataset.ipynb └── seq2seq.ipynb ├── chapter_recurrent-neural-networks ├── bptt.ipynb ├── index.ipynb ├── language-models-and-dataset.ipynb ├── rnn-concise.ipynb ├── rnn-scratch.ipynb ├── rnn.ipynb ├── sequence.ipynb └── text-preprocessing.ipynb ├── chapter_references └── zreferences.ipynb ├── d2l.bib ├── img ├── Marginal.svg ├── Neuron.svg ├── a77.svg ├── add_norm.svg ├── alexnet-original.svg ├── alexnet.svg ├── anchor-label.svg ├── asyncgraph.svg ├── attention-output.svg ├── attention.svg ├── autumn-oak.jpg ├── aws.png ├── banana.jpg ├── beam-search.svg ├── bert-input.svg ├── bert-one-seq.svg ├── bert-qa.svg ├── bert-tagging.svg ├── bert-two-seqs.svg ├── birnn.svg ├── blocks.svg ├── book-org.svg ├── bw-hierarchy.svg ├── capacity-vs-error.svg ├── capacity_vs_error.svg ├── cat-dog-pixels.png ├── cat-dog-test.svg ├── cat-dog-train.svg ├── cat1.jpg ├── cat2.jpg ├── cat3.jpg ├── catdog.jpg ├── cbow.svg ├── chain-net1.svg ├── chain-net2.svg ├── chmod.png ├── cnn-rnn-self-attention.svg ├── colab-2.png ├── colab.png ├── computegraph.svg ├── connect.png ├── contribute.svg ├── conv-1x1.svg ├── conv-multi-in.svg ├── conv-pad.svg ├── conv-stride.svg ├── conv1d-2d.svg ├── conv1d-channel.svg ├── conv1d.svg ├── convex-intersect.svg ├── copyto.svg ├── correlation.svg ├── cuda101.png ├── data-collection.svg ├── data-parallel.svg ├── death-cap.jpg ├── deep-rnn.svg ├── deeplearning-amazon.jpg ├── densenet-block.svg ├── densenet.svg ├── disk.png ├── dog1.jpg ├── dog2.jpg ├── dropout2.svg ├── ec2.png ├── edit-file.png ├── elmo-gpt-bert.svg ├── encoder-decoder.svg ├── eye-book.png ├── eye-book.svg ├── eye-coffee.png ├── eye-coffee.svg ├── falseshare.svg ├── falsesharing.svg ├── fast-rcnn.svg ├── faster-rcnn.svg ├── fcn.svg ├── filters.png ├── finetune.svg ├── fit-linreg.svg ├── flopsvsprice.svg ├── forward.svg ├── frontends.png ├── frontends.svg ├── frontends │ ├── Canvas 1.svg │ ├── image10.tiff │ ├── image2.tiff │ ├── image3.tiff │ ├── image4.tiff │ ├── image5.pdf │ └── image8.tiff ├── ftse100.png ├── functionclasses.svg ├── gan.svg ├── git-clone.png ├── git-createpr.png ├── git-fork.png ├── git-forked.png ├── git-newpr.png ├── grid-points.svg ├── grid-transform-filled.svg ├── grid-transform.svg ├── gru-1.svg ├── gru-2.svg ├── gru-3.svg ├── hi-softmax.svg ├── hmm.svg ├── house-pricing.png ├── inception-full.svg ├── inception.svg ├── iou.svg ├── jupyter.png ├── jupyter00.png ├── jupyter01.png ├── jupyter02.png ├── jupyter03.png ├── jupyter04.png ├── jupyter05.png ├── jupyter06.png ├── kaggle-cifar10.png ├── kaggle-dog.jpg ├── kaggle-submit2.png ├── kaggle.png ├── keypair.png ├── koebel.jpg ├── lang-model-data.svg ├── latencynumbers.png ├── launching.png ├── lenet-vert.svg ├── lenet.svg ├── limits.png ├── lstm-0.svg ├── lstm-1.svg ├── lstm-2.svg ├── lstm-3.svg ├── marginal.svg ├── mask-rcnn.svg ├── ml-loop.svg ├── mlp.svg ├── mobo-symbol.svg ├── multi-head-attention.svg ├── mutual-information.svg ├── negSecDer.svg ├── neon128.svg ├── neural-style.jpg ├── neural-style.svg ├── neuron.svg ├── nin-compare.svg ├── nin.svg ├── nli-attention.svg ├── nli_attention.svg ├── nlp-map-app.svg ├── nlp-map-nli-attention.svg ├── nlp-map-nli-bert.svg ├── nlp-map-pretrain.svg ├── nlp-map-sa-cnn.svg ├── nlp-map-sa-rnn.svg ├── nonconvex.svg ├── nvlink-twoloop.svg ├── nvlink.svg ├── p2x.png ├── pacman.svg ├── par-vec.svg ├── 
pikachu.jpg ├── polygon-circle.svg ├── pooling.svg ├── popvssoda.png ├── posSecDer.svg ├── proj-vec.svg ├── projections.svg ├── ps-distributed.svg ├── ps-multimachine.svg ├── ps-multips.svg ├── ps.svg ├── qkv.svg ├── r-cnn.svg ├── rainier.jpg ├── rec-caser.svg ├── rec-deepfm.svg ├── rec-intro.svg ├── rec-mf.svg ├── rec-neumf.svg ├── rec-ranking.svg ├── rec-seq-data.svg ├── rect-trans.svg ├── residual-block.svg ├── resnet-block.svg ├── resnet18.svg ├── ringsync.svg ├── rl-environment.svg ├── rnn-bptt.svg ├── rnn-train.svg ├── rnn.svg ├── roi.svg ├── s2s-prob1.svg ├── s2s-prob2.svg ├── sagemaker-create-2.png ├── sagemaker-create-3-pytorch.png ├── sagemaker-create-3-tensorflow.png ├── sagemaker-create-3.png ├── sagemaker-create.png ├── sagemaker-open.png ├── sagemaker-stop.png ├── sagemaker-terminal.png ├── sagemaker.png ├── segmentation.svg ├── self-attention.svg ├── seq2seq-attention-details.svg ├── seq2seq-attention.svg ├── seq2seq-details.svg ├── seq2seq-predict.svg ├── seq2seq.svg ├── sequence-model.svg ├── singlelayer.svg ├── singleneuron.svg ├── skip-gram.svg ├── skylake.svg ├── softmaxreg.svg ├── space-division-3d.svg ├── space-division.svg ├── speech.png ├── splitting.svg ├── ssd.svg ├── stackedanimals.png ├── statistical-significance.svg ├── style-transfer.svg ├── sub-area.svg ├── sum-order.svg ├── supervised-learning.svg ├── tensorcore.jpg ├── textcnn.svg ├── threading.svg ├── timemachine-5gram.svg ├── trans_conv.svg ├── trans_conv_stride2.svg ├── transformer.svg ├── truncated-bptt.svg ├── turing-processing-block.png ├── turing.png ├── twogpu.svg ├── ubuntu-new.png ├── vec-add.svg ├── vec-angle.svg ├── vgg.svg ├── wake-word.svg ├── waldo-mask.jpg ├── wattvsprice.svg ├── where-wally-walker-books.jpg └── zeroSecDer.svg ├── index.ipynb └── setup.py /README.md: -------------------------------------------------------------------------------- 1 | # d2l-zh-pytorch-sagemaker 2 | Automatically Generated d2l-zh PyTorch Notebooks for SageMaker 3 | -------------------------------------------------------------------------------- /TERMINOLOGY.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8e7fb728", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "## 英汉术语对照\n", 11 | "\n", 12 | "鞍点,saddle point\n", 13 | "\n", 14 | "变换,transform\n", 15 | "\n", 16 | "编码器,encoder\n", 17 | "\n", 18 | "标签,label\n", 19 | "\n", 20 | "步幅,stride\n", 21 | "\n", 22 | "参数,parameter\n", 23 | "\n", 24 | "长短期记忆网络,long short-term memory (LSTM)\n", 25 | "\n", 26 | "超参数,hyperparameter\n", 27 | "\n", 28 | "层序softmax,hierarchical softmax\n", 29 | "\n", 30 | "查准率,precision\n", 31 | "\n", 32 | "成本,cost\n", 33 | "\n", 34 | "词表,vocabulary\n", 35 | "\n", 36 | "词嵌入,word embedding\n", 37 | "\n", 38 | "词向量,word vector\n", 39 | "\n", 40 | "词元,token\n", 41 | "\n", 42 | "词元分析器,tokenizer\n", 43 | "\n", 44 | "词元化,tokenize\n", 45 | "\n", 46 | "汇聚层,pooling layer\n", 47 | "\n", 48 | "稠密,dense\n", 49 | "\n", 50 | "大小,size\n", 51 | "\n", 52 | "导入,import\n", 53 | "\n", 54 | "轮,epoch\n", 55 | "\n", 56 | "暂退法,dropout\n", 57 | "\n", 58 | "动量法,momentum (method)\n", 59 | "\n", 60 | "独立同分布,independent and identically distributed (i.i.d.)\n", 61 | "\n", 62 | "端到端,end-to-end\n", 63 | "\n", 64 | "多层感知机,multilayer perceptron\n", 65 | "\n", 66 | "多头注意力,multi-head attention\n", 67 | "\n", 68 | "二元分类,binary classification\n", 69 | "\n", 70 | "二元,bigram\n", 71 | "\n", 72 | "子采样,subsample\n", 73 | "\n", 74 | "发散,diverge\n", 75 | "\n", 76 | 
"泛化,generalization\n", 77 | "\n", 78 | "泛化误差,generalization error\n", 79 | "\n", 80 | "方差,variance\n", 81 | "\n", 82 | "分类,classification\n", 83 | "\n", 84 | "分类器,classifier\n", 85 | "\n", 86 | "负采样,negative sampling\n", 87 | "\n", 88 | "感受野,receptive field\n", 89 | "\n", 90 | "格拉姆矩阵,Gram matrix\n", 91 | "\n", 92 | "共现,co-occurrence\n", 93 | "\n", 94 | "广播,broadcast\n", 95 | "\n", 96 | "规范化,normalization\n", 97 | "\n", 98 | "过拟合,overfitting\n", 99 | "\n", 100 | "核回归,kernel regression\n", 101 | "\n", 102 | "恒等映射,identity mapping\n", 103 | "\n", 104 | "假设,hypothesis\n", 105 | "\n", 106 | "基准,baseline\n", 107 | "\n", 108 | "激活函数,activation function\n", 109 | "\n", 110 | "解码器,decoder\n", 111 | "\n", 112 | "近似法,approximate method\n", 113 | "\n", 114 | "经验风险最小化,empirical risk minimization\n", 115 | "\n", 116 | "局部最小值,local minimum\n", 117 | "\n", 118 | "卷积核,convolutional kernel\n", 119 | "\n", 120 | "卷积神经网络,convolutional neural network\n", 121 | "\n", 122 | "决策边界,decision boundary\n", 123 | "\n", 124 | "均值,mean\n", 125 | "\n", 126 | "均方误差,mean squared error\n", 127 | "\n", 128 | "均匀采样,uniform sampling\n", 129 | "\n", 130 | "块,block\n", 131 | "\n", 132 | "困惑度,perplexity\n", 133 | "\n", 134 | "拉普拉斯平滑,Laplace smoothing\n", 135 | "\n", 136 | "连结,concatenate\n", 137 | "\n", 138 | "类,class\n", 139 | "\n", 140 | "交叉熵,cross-entropy\n", 141 | "\n", 142 | "连续词袋,continous bag-of-words (CBOW)\n", 143 | "\n", 144 | "零张量,zero tensor\n", 145 | "\n", 146 | "流水线,pipeline\n", 147 | "\n", 148 | "滤波器,filter\n", 149 | "\n", 150 | "门控循环单元,gated recurrent units (GRU)\n", 151 | "\n", 152 | "目标检测,object detection\n", 153 | "\n", 154 | "偏置,bias\n", 155 | "\n", 156 | "偏导数,partial derivative\n", 157 | "\n", 158 | "偏移量,offset\n", 159 | "\n", 160 | "批量,batch\n", 161 | "\n", 162 | "齐普夫定律,Zipf's law\n", 163 | "\n", 164 | "欠拟合,underfitting\n", 165 | "\n", 166 | "情感分析,sentiment analysis\n", 167 | "\n", 168 | "全连接层,fully-connected layer\n", 169 | "\n", 170 | "权重,weight\n", 171 | "\n", 172 | "三元,trigram\n", 173 | "\n", 174 | "上采样,upsample\n", 175 | "\n", 176 | "上下文变量,context variable\n", 177 | "\n", 178 | "上下文窗口,context window\n", 179 | "\n", 180 | "上下文词,context word\n", 181 | "\n", 182 | "上下文向量,context vector\n", 183 | "\n", 184 | "实例/示例,instance\n", 185 | "\n", 186 | "收敛,converge\n", 187 | "\n", 188 | "属性,property\n", 189 | "\n", 190 | "数值方法,numerical method\n", 191 | "\n", 192 | "数据集,dataset\n", 193 | "\n", 194 | "数据示例,data instance\n", 195 | "\n", 196 | "数据样例,data example\n", 197 | "\n", 198 | "顺序分区,sequential partitioning\n", 199 | "\n", 200 | "softmax回归,softmax regression\n", 201 | "\n", 202 | "随机采样,random sampling\n", 203 | "\n", 204 | "损失函数,loss function\n", 205 | "\n", 206 | "双向循环神经网络,bidirectional recurrent neural network\n", 207 | "\n", 208 | "特征,feature\n", 209 | "\n", 210 | "特征图,feature map\n", 211 | "\n", 212 | "特征值,eigenvalue\n", 213 | "\n", 214 | "梯度,gradient\n", 215 | "\n", 216 | "梯度裁剪,gradient clipping\n", 217 | "\n", 218 | "梯度消失,vanishing gradients\n", 219 | "\n", 220 | "填充,padding\n", 221 | "\n", 222 | "跳元模型,skip-gram model\n", 223 | "\n", 224 | "调参,tune hyperparameter\n", 225 | "\n", 226 | "停用词,stop words\n", 227 | "\n", 228 | "通道,channel\n", 229 | "\n", 230 | "凸优化,convex optimization\n", 231 | "\n", 232 | "图像,image\n", 233 | "\n", 234 | "未知词元,unknown token\n", 235 | "\n", 236 | "无偏估计,unbiased estimate\n", 237 | "\n", 238 | "误差,error\n", 239 | "\n", 240 | "小批量,minibatch\n", 241 | "\n", 242 | "小批量梯度,minibatch gradient\n", 243 | "\n", 244 | "线性模型,linear model\n", 245 | "\n", 246 | "线性回归,linear regression\n", 247 
| "\n", 248 | "协同过滤,collaborative filtering\n", 249 | "\n", 250 | "学习率,learning rate\n", 251 | "\n", 252 | "训练误差,training error\n", 253 | "\n", 254 | "循环神经网络,recurrent neural network (RNN)\n", 255 | "\n", 256 | "样例,example\n", 257 | "\n", 258 | "一维梯度下降,gradient descent in one-dimensional space\n", 259 | "\n", 260 | "一元,unigram\n", 261 | "\n", 262 | "隐藏变量,hidden variable\n", 263 | "\n", 264 | "隐藏层,hidden layer\n", 265 | "\n", 266 | "优化器,optimizer\n", 267 | "\n", 268 | "语料库,corpus\n", 269 | "\n", 270 | "运算符,operator\n", 271 | "\n", 272 | "自注意力,self-attention\n", 273 | "\n", 274 | "真实值,ground truth\n", 275 | "\n", 276 | "指标,metric\n", 277 | "\n", 278 | "支持向量机,support vector machine\n", 279 | "\n", 280 | "注意力机制,attention mechanism\n", 281 | "\n", 282 | "注意力模型,attention model\n", 283 | "\n", 284 | "注意力提示,attention cue\n", 285 | "\n", 286 | "准确率/精度,accuracy\n" 287 | ] 288 | } 289 | ], 290 | "metadata": { 291 | "kernelspec": { 292 | "display_name": "conda_pytorch_p36", 293 | "name": "conda_pytorch_p36" 294 | }, 295 | "language_info": { 296 | "name": "python" 297 | }, 298 | "required_libs": [] 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 5 302 | } -------------------------------------------------------------------------------- /chapter_appendix-tools-for-deep-learning/contributing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "14e728be", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 为本书做贡献\n", 11 | ":label:`sec_how_to_contribute`\n", 12 | "\n", 13 | "读者们的投稿大大帮助我们改进了本书的质量。\n", 14 | "如果你发现笔误、无效的链接、一些你认为我们遗漏了引文的地方,\n", 15 | "代码看起来不优雅,或者解释不清楚的地方,请回复我们以帮助读者。\n", 16 | "在常规书籍中,两次印刷之间的间隔(即修订笔误的间隔)常常需要几年,\n", 17 | "但这本书的改进通常需要几小时到几天的时间。\n", 18 | "由于版本控制和持续自动集成(CI)测试,这一切颇为高效。\n", 19 | "为此,你需要向gihub存储库提交一个\n", 20 | "[pull request](https://github.com/d2l-ai/d2l-en/pulls)。\n", 21 | "当你的pull请求被作者合并到代码库中时,\n", 22 | "你将成为[贡献者](https://github.com/d2l-ai/d2l-en/graphs/contributors)。\n", 23 | "\n", 24 | "## 提交微小更改\n", 25 | "\n", 26 | "最常见的贡献是编辑一句话或修正笔误。\n", 27 | "我们建议你在[GitHub存储库](https://github.com/d2l-ai/d2l-en)\n", 28 | "中查找源文件,以定位源文件(一个markdown文件)。\n", 29 | "然后单击右上角的“Edit this file”按钮,在markdown文件中进行更改。\n", 30 | "\n", 31 | "![在Github上编辑文件](../img/edit-file.png)\n", 32 | ":width:`300px`\n", 33 | ":label:`fig_edit_file`\n", 34 | "\n", 35 | "完成后,在页面底部的“Propose file change”(“提交文件修改”)\n", 36 | "面板中填写更改说明,然后单击“Propose file change”按钮。\n", 37 | "它会重定向到新页面以查看你的更改( :numref:`fig_git_createpr`)。\n", 38 | "如果一切正常,你可以通过点击“Create pull request”按钮提交pull请求。\n", 39 | "\n", 40 | "## 大量文本或代码修改\n", 41 | "\n", 42 | "如果你计划修改大量文本或代码,那么你需要更多地了解本书使用的格式。\n", 43 | "源文件基于[markdown格式](https://daringfireball.net/projects/markdown/syntax),\n", 44 | "并通过[d2lbook](http://book.d2l.ai/user/markdown.html)包提供了一组扩展,\n", 45 | "例如引用公式、图像、章节和引文。\n", 46 | "你可以使用任何markdown编辑器打开这些文件并进行更改。\n", 47 | "\n", 48 | "如果你想要更改代码,我们建议你使用Jupyter Notebook打开这些标记文件,\n", 49 | "如 :numref:`sec_jupyter`中所述。\n", 50 | "这样你就可以运行并测试你的更改。\n", 51 | "请记住在提交更改之前清除所有输出,我们的CI系统将执行你更新的部分以生成输出。\n", 52 | "\n", 53 | "某些部分可能支持多个框架实现。如果你添加的新代码块不是使用mxnet,\n", 54 | "请使用`#@tab`来标记代码块的起始行。\n", 55 | "例如`#@tab pytorch`用于一个PyTorch代码块,\n", 56 | "`#@tab tensorflow`用于一个TensorFlow代码块,\n", 57 | "`#@tab paddle`用于一个PaddlePaddle代码块,\n", 58 | "或者`#@tab all`是所有实现的共享代码块。\n", 59 | "你可以参考[d2lbook](http://book.d2l.ai/user/code_tabs.html)包了解更多信息。\n", 60 | "\n", 61 | "## 提交主要更改\n", 62 | "\n", 63 | "我们建议你使用标准的Git流程提交大量修改。\n", 64 | "简而言之,该过程的工作方式如 :numref:`fig_contribute`中所述。\n", 
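To make the `#@tab` convention described above concrete, the following is a minimal sketch of what a contributed PyTorch code cell could look like. The marker sits on the first line of the cell; the tensor computation underneath is only a placeholder for illustration and is not code taken from the book.

```python
#@tab pytorch
import torch

# Placeholder computation: any framework-specific code for this tab goes here.
x = torch.arange(12, dtype=torch.float32).reshape(3, 4)
print(x.sum())
```

A code block shared by all framework implementations would instead begin with `#@tab all`, as noted above.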
65 | "\n", 66 | "![为这本书作贡献](../img/contribute.svg)\n", 67 | ":label:`fig_contribute`\n", 68 | "\n", 69 | "我们将向你详细介绍这些步骤。\n", 70 | "如果你已经熟悉Git,可以跳过本部分。\n", 71 | "在介绍时,我们假设贡献者的用户名为“astonzhang”。\n", 72 | "\n", 73 | "### 安装Git\n", 74 | "\n", 75 | "Git开源书籍描述了[如何安装git](https://git-scm.com/book/en/v2)。\n", 76 | "这通常通过Ubuntu Linux上的`apt install git`,\n", 77 | "在MacOS上安装Xcode开发人员工具或使用gihub的\n", 78 | "[桌面客户端](https://desktop.github.com)来实现。\n", 79 | "如果你没有GitHub帐户,则需要注册一个帐户。\n", 80 | "\n", 81 | "### 登录GitHub\n", 82 | "\n", 83 | "在浏览器中输入本书代码存储库的[地址](https://github.com/d2l-ai/d2l-en/)。\n", 84 | "单击 :numref:`fig_git_fork`右上角红色框中的`Fork`按钮,以复制本书的存储库。\n", 85 | "这将是你的副本,你可以随心所欲地更改它。\n", 86 | "\n", 87 | "![代码存储库页面](../img/git-fork.png)\n", 88 | ":width:`700px`\n", 89 | ":label:`fig_git_fork`\n", 90 | "\n", 91 | "现在,本书的代码库将被分叉(即复制)到你的用户名,\n", 92 | "例如`astonzhang/d2l-en`显示在 :numref:`fig_git_forked`的左上角。\n", 93 | "\n", 94 | "![分叉代码存储库](../img/git-forked.png)\n", 95 | ":width:`700px`\n", 96 | ":label:`fig_git_forked`\n", 97 | "\n", 98 | "### 克隆存储库\n", 99 | "\n", 100 | "要克隆存储库(即制作本地副本),我们需要获取其存储库地址。\n", 101 | "点击 :numref:`fig_git_clone`中的绿色按钮显示此信息。\n", 102 | "如果你决定将此分支保留更长时间,请确保你的本地副本与主存储库保持最新。\n", 103 | "现在,只需按照 :ref:`chap_installation`中的说明开始。\n", 104 | "主要区别在于,你现在下载的是你自己的存储库分支。\n", 105 | "\n", 106 | "![克隆存储库](../img/git-clone.png)\n", 107 | ":width:`700px`\n", 108 | ":label:`fig_git_clone`\n", 109 | "\n", 110 | "```\n", 111 | "# 将your_github_username替换为你的github用户名\n", 112 | "git clone https://github.com/your_github_username/d2l-en.git\n", 113 | "```\n", 114 | "\n", 115 | "### 编辑和推送\n", 116 | "\n", 117 | "现在是编辑这本书的时候了。最好按照 :numref:`sec_jupyter`中的说明在Jupyter Notebook中编辑它。进行更改并检查它们是否正常。假设我们已经修改了文件`~/d2l-en/chapter_appendix_tools/how-to-contribute.md`中的一个拼写错误。你可以检查你更改了哪些文件。\n", 118 | "\n", 119 | "此时,Git将提示`chapter_appendix_tools/how-to-contribute.md`文件已被修改。\n", 120 | "\n", 121 | "```\n", 122 | "mylaptop:d2l-en me$ git status\n", 123 | "On branch master\n", 124 | "Your branch is up-to-date with 'origin/master'.\n", 125 | "\n", 126 | "Changes not staged for commit:\n", 127 | " (use \"git add ...\" to update what will be committed)\n", 128 | " (use \"git checkout -- ...\" to discard changes in working directory)\n", 129 | "\n", 130 | "\tmodified: chapter_appendix_tools/how-to-contribute.md\n", 131 | "```\n", 132 | "\n", 133 | "在确认这是你想要的之后,执行以下命令:\n", 134 | "\n", 135 | "```\n", 136 | "git add chapter_appendix_tools/how-to-contribute.md\n", 137 | "git commit -m 'fix typo in git documentation'\n", 138 | "git push\n", 139 | "```\n", 140 | "\n", 141 | "然后,更改后的代码将位于存储库的个人分支中。要请求添加更改,你必须为本书的官方存储库创建一个Pull请求。\n", 142 | "\n", 143 | "### 提交Pull请求\n", 144 | "\n", 145 | "如 :numref:`fig_git_newpr`所示,进入gihub上的存储库分支,选择“New pull request”。这将打开一个页面,显示你的编辑与本书主存储库中的当前内容之间的更改。\n", 146 | "\n", 147 | "![新的Pull请求](../img/git-newpr.png)\n", 148 | ":width:`700px`\n", 149 | ":label:`fig_git_newpr`\n", 150 | "\n", 151 | "最后,单击按钮提交Pull请求,如 :numref:`fig_git_createpr`所示。请务必描述你在Pull请求中所做的更改。这将使作者更容易审阅它,并将其与本书合并。根据更改的不同,这可能会立即被接受,也可能会被拒绝,或者更有可能的是,你会收到一些关于更改的反馈。一旦你把它们合并了,你就做完了。\n", 152 | "\n", 153 | "![创建Pull请求](../img/git-createpr.png)\n", 154 | ":width:`700px`\n", 155 | ":label:`fig_git_createpr`\n", 156 | "\n", 157 | "## 小结\n", 158 | "\n", 159 | "* 你可以使用GitHub为本书做贡献。\n", 160 | "* 你可以直接在GitHub上编辑文件以进行微小更改。\n", 161 | "* 要进行重大更改,请分叉存储库,在本地编辑内容,并在准备好后再做出贡献。\n", 162 | "* 尽量不要提交巨大的Pull请求,因为这会使它们难以理解和合并。最好拆分为几个小一点的。\n", 163 | "\n", 164 | "## 练习\n", 165 | "\n", 166 | "1. 启动并分叉`d2l-ai/d2l-en`存储库。\n", 167 | "1. 如果发现任何需要改进的地方(例如,缺少引用),请提交Pull请求。\n", 168 | "1. 
通常更好的做法是使用新分支创建Pull请求。学习如何用[Git分支](https://git-scm.com/book/en/v2/Git-Branching-Branches-in-a-Nutshell)来做这件事。\n", 169 | "\n", 170 | "[Discussions](https://discuss.d2l.ai/t/5730)\n" 171 | ] 172 | } 173 | ], 174 | "metadata": { 175 | "kernelspec": { 176 | "display_name": "conda_pytorch_p36", 177 | "name": "conda_pytorch_p36" 178 | }, 179 | "language_info": { 180 | "name": "python" 181 | }, 182 | "required_libs": [] 183 | }, 184 | "nbformat": 4, 185 | "nbformat_minor": 5 186 | } -------------------------------------------------------------------------------- /chapter_appendix-tools-for-deep-learning/d2l.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1069a72a", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# `d2l` API 文档\n", 11 | ":label:`sec_d2l`\n", 12 | "\n", 13 | "`d2l`包以下成员的实现及其定义和解释部分可在[源文件](https://github.com/d2l-ai/d2l-en/tree/master/d2l)中找到。\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "c81dbb31", 19 | "metadata": { 20 | "origin_pos": 2, 21 | "tab": [ 22 | "pytorch" 23 | ] 24 | }, 25 | "source": [ 26 | "```eval_rst\n", 27 | ".. currentmodule:: d2l.torch\n", 28 | "```\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "7f0df80c", 34 | "metadata": { 35 | "origin_pos": 5 36 | }, 37 | "source": [ 38 | "## 模型\n", 39 | "\n", 40 | "```eval_rst\n", 41 | ".. autoclass:: Module\n", 42 | " :members:\n", 43 | "\n", 44 | ".. autoclass:: LinearRegressionScratch\n", 45 | " :members:\n", 46 | "\n", 47 | ".. autoclass:: LinearRegression\n", 48 | " :members:\n", 49 | "\n", 50 | ".. autoclass:: Classification\n", 51 | " :members:\n", 52 | "```\n", 53 | "\n", 54 | "## 数据\n", 55 | "\n", 56 | "```eval_rst\n", 57 | ".. autoclass:: DataModule\n", 58 | " :members:\n", 59 | "\n", 60 | ".. autoclass:: SyntheticRegressionData\n", 61 | " :members:\n", 62 | "\n", 63 | ".. autoclass:: FashionMNIST\n", 64 | " :members:\n", 65 | "```\n", 66 | "\n", 67 | "## 训练\n", 68 | "\n", 69 | "```eval_rst\n", 70 | ".. autoclass:: Trainer\n", 71 | " :members:\n", 72 | "\n", 73 | ".. autoclass:: SGD\n", 74 | " :members:\n", 75 | "```\n", 76 | "\n", 77 | "## 公用\n", 78 | "\n", 79 | "```eval_rst\n", 80 | ".. autofunction:: add_to_class\n", 81 | "\n", 82 | ".. autofunction:: cpu\n", 83 | "\n", 84 | ".. autofunction:: gpu\n", 85 | "\n", 86 | ".. autofunction:: num_gpus\n", 87 | "\n", 88 | ".. autoclass:: ProgressBoard\n", 89 | " :members:\n", 90 | "\n", 91 | ".. 
autoclass:: HyperParameters\n", 92 | " :members:\n", 93 | "```\n" 94 | ] 95 | } 96 | ], 97 | "metadata": { 98 | "kernelspec": { 99 | "display_name": "conda_pytorch_p36", 100 | "name": "conda_pytorch_p36" 101 | }, 102 | "language_info": { 103 | "name": "python" 104 | }, 105 | "required_libs": [] 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 5 109 | } -------------------------------------------------------------------------------- /chapter_appendix-tools-for-deep-learning/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b6a8abfc", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 附录:深度学习工具\n", 11 | ":label:`chap_appendix_tools`\n", 12 | "\n", 13 | "为了充分利用《动手学深度学习》,本书将在本附录中介绍不同工具,\n", 14 | "例如如何运行这本交互式开源书籍和为本书做贡献。\n", 15 | "\n", 16 | ":begin_tab:toc\n", 17 | " - [jupyter](jupyter.ipynb)\n", 18 | " - [sagemaker](sagemaker.ipynb)\n", 19 | " - [aws](aws.ipynb)\n", 20 | " - [selecting-servers-gpus](selecting-servers-gpus.ipynb)\n", 21 | " - [contributing](contributing.ipynb)\n", 22 | " - [d2l](d2l.ipynb)\n", 23 | ":end_tab:\n" 24 | ] 25 | } 26 | ], 27 | "metadata": { 28 | "kernelspec": { 29 | "display_name": "conda_pytorch_p36", 30 | "name": "conda_pytorch_p36" 31 | }, 32 | "language_info": { 33 | "name": "python" 34 | }, 35 | "required_libs": [] 36 | }, 37 | "nbformat": 4, 38 | "nbformat_minor": 5 39 | } -------------------------------------------------------------------------------- /chapter_appendix-tools-for-deep-learning/jupyter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5d3957b1", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 使用Jupyter Notebook\n", 11 | ":label:`sec_jupyter`\n", 12 | "\n", 13 | "本节介绍如何使用Jupyter Notebook编辑和运行本书各章中的代码。确保你已按照 :ref:`chap_installation`中的说明安装了Jupyter并下载了代码。如果你想了解更多关于Jupyter的信息,请参阅其[文档](https://jupyter.readthedocs.io/en/latest/)中的优秀教程。 \n", 14 | "\n", 15 | "## 在本地编辑和运行代码\n", 16 | "\n", 17 | "假设本书代码的本地路径为`xx/yy/d2l-en/`。使用shell将目录更改为此路径(`cd xx/yy/d2l-en`)并运行命令`jupyter notebook`。如果浏览器未自动打开,请打开http://localhost:8888。此时你将看到Jupyter的界面以及包含本书代码的所有文件夹,如 :numref:`fig_jupyter00`所示\n", 18 | "\n", 19 | "![包含本书代码的文件夹](../img/jupyter00.png)\n", 20 | ":width:`600px`\n", 21 | ":label:`fig_jupyter00`\n", 22 | "\n", 23 | "你可以通过单击网页上显示的文件夹来访问notebook文件。它们通常有后缀“.ipynb”。为了简洁起见,我们创建了一个临时的“test.ipynb”文件。单击后显示的内容如 :numref:`fig_jupyter01`所示。此notebook包括一个标记单元格和一个代码单元格。标记单元格中的内容包括“This Is a Title”和“This is text.”。代码单元包含两行Python代码。 \n", 24 | "\n", 25 | "![“test.ipynb”文件中的markdown和代码块](../img/jupyter01.png)\n", 26 | ":width:`600px`\n", 27 | ":label:`fig_jupyter01`\n", 28 | "\n", 29 | "双击标记单元格以进入编辑模式。在单元格末尾添加一个新的文本字符串“Hello world.”,如 :numref:`fig_jupyter02`所示。 \n", 30 | "\n", 31 | "![编辑markdown单元格](../img/jupyter02.png)\n", 32 | ":width:`600px`\n", 33 | ":label:`fig_jupyter02`\n", 34 | "\n", 35 | "如 :numref:`fig_jupyter03`所示,单击菜单栏中的“Cell” $\\rightarrow$ “Run Cells”以运行编辑后的单元格。 \n", 36 | "\n", 37 | "![运行单元格](../img/jupyter03.png)\n", 38 | ":width:`600px`\n", 39 | ":label:`fig_jupyter03`\n", 40 | "\n", 41 | "运行后,markdown单元格如 :numref:`fig_jupyter04`所示。 \n", 42 | "\n", 43 | "![编辑后的markdown单元格](../img/jupyter04.png)\n", 44 | ":width:`600px`\n", 45 | ":label:`fig_jupyter04`\n", 46 | "\n", 47 | "接下来,单击代码单元。将最后一行代码后的元素乘以2,如 :numref:`fig_jupyter05`所示。 \n", 48 | "\n", 49 | "![编辑代码单元格](../img/jupyter05.png)\n", 50 | ":width:`600px`\n", 
51 | ":label:`fig_jupyter05`\n", 52 | "\n", 53 | "你还可以使用快捷键(默认情况下为Ctrl+Enter)运行单元格,并从 :numref:`fig_jupyter06`获取输出结果。 \n", 54 | "\n", 55 | "![运行代码单元格以获得输出](../img/jupyter06.png)\n", 56 | ":width:`600px`\n", 57 | ":label:`fig_jupyter06`\n", 58 | "\n", 59 | "当一个notebook包含更多单元格时,我们可以单击菜单栏中的“Kernel”$\\rightarrow$“Restart & Run All”来运行整个notebook中的所有单元格。通过单击菜单栏中的“Help”$\\rightarrow$“Edit Keyboard Shortcuts”,可以根据你的首选项编辑快捷键。 \n", 60 | "\n", 61 | "## 高级选项\n", 62 | "\n", 63 | "除了本地编辑,还有两件事非常重要:以markdown格式编辑notebook和远程运行Jupyter。当我们想要在更快的服务器上运行代码时,后者很重要。前者很重要,因为Jupyter原生的ipynb格式存储了大量辅助数据,这些数据实际上并不特定于notebook中的内容,主要与代码的运行方式和运行位置有关。这让git感到困惑,并且使得合并贡献非常困难。幸运的是,还有另一种选择——在markdown中进行本地编辑。 \n", 64 | "\n", 65 | "### Jupyter中的Markdown文件\n", 66 | "\n", 67 | "如果你希望对本书的内容有所贡献,则需要在GitHub上修改源文件(md文件,而不是ipynb文件)。使用notedown插件,我们可以直接在Jupyter中修改md格式的notebook。 \n", 68 | "\n", 69 | "首先,安装notedown插件,运行Jupyter Notebook并加载插件:\n", 70 | "\n", 71 | "```\n", 72 | "pip install d2l-notedown # 你可能需要卸载原始notedown\n", 73 | "jupyter notebook --NotebookApp.contents_manager_class='notedown.NotedownContentsManager'\n", 74 | "```\n", 75 | "\n", 76 | "要在运行Jupyter Notebook时默认打开notedown插件,请执行以下操作:首先,生成一个Jupyter Notebook配置文件(如果已经生成了,可以跳过此步骤)。\n", 77 | "\n", 78 | "```\n", 79 | "jupyter notebook --generate-config\n", 80 | "```\n", 81 | "\n", 82 | "然后,在Jupyter Notebook配置文件的末尾添加以下行(对于Linux/macOS,通常位于`~/.jupyter/jupyter_notebook_config.py`):\n", 83 | "\n", 84 | "```\n", 85 | "c.NotebookApp.contents_manager_class = 'notedown.NotedownContentsManager'\n", 86 | "```\n", 87 | "\n", 88 | "在这之后,你只需要运行`jupyter notebook`命令就可以默认打开notedown插件。 \n", 89 | "\n", 90 | "### 在远程服务器上运行Jupyter Notebook\n", 91 | "\n", 92 | "有时,你可能希望在远程服务器上运行Jupyter Notebook,并通过本地计算机上的浏览器访问它。如果本地计算机上安装了Linux或MacOS(Windows也可以通过PuTTY等第三方软件支持此功能),则可以使用端口转发:\n", 93 | "\n", 94 | "```\n", 95 | "ssh myserver -L 8888:localhost:8888\n", 96 | "```\n", 97 | "\n", 98 | "以上是远程服务器`myserver`的地址。然后我们可以使用http://localhost:8888 访问运行Jupyter Notebook的远程服务器`myserver`。下一节将详细介绍如何在AWS实例上运行Jupyter Notebook。 \n", 99 | "\n", 100 | "### 执行时间\n", 101 | "\n", 102 | "我们可以使用`ExecuteTime`插件来计算Jupyter Notebook中每个代码单元的执行时间。使用以下命令安装插件:\n", 103 | "\n", 104 | "```\n", 105 | "pip install jupyter_contrib_nbextensions\n", 106 | "jupyter contrib nbextension install --user\n", 107 | "jupyter nbextension enable execute_time/ExecuteTime\n", 108 | "```\n", 109 | "\n", 110 | "## 小结\n", 111 | "\n", 112 | "* 使用Jupyter Notebook工具,我们可以编辑、运行和为本书做贡献。\n", 113 | "* 使用端口转发在远程服务器上运行Jupyter Notebook。\n", 114 | "\n", 115 | "## 练习\n", 116 | "\n", 117 | "1. 在本地计算机上使用Jupyter Notebook编辑并运行本书中的代码。\n", 118 | "1. 使用Jupyter Notebook通过端口转发来远程编辑和运行本书中的代码。\n", 119 | "1. 
对于两个方矩阵,测量$\\mathbf{A}^\\top \\mathbf{B}$与$\\mathbf{A} \\mathbf{B}$在$\\mathbb{R}^{1024 \\times 1024}$中的运行时间。哪一个更快?\n", 120 | "\n", 121 | "[Discussions](https://discuss.d2l.ai/t/5731)\n" 122 | ] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "conda_pytorch_p36", 128 | "name": "conda_pytorch_p36" 129 | }, 130 | "language_info": { 131 | "name": "python" 132 | }, 133 | "required_libs": [] 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 5 137 | } -------------------------------------------------------------------------------- /chapter_appendix-tools-for-deep-learning/sagemaker.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b0c43609", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 使用Amazon SageMaker\n", 11 | ":label:`sec_sagemaker`\n", 12 | "\n", 13 | "深度学习程序可能需要很多计算资源,这很容易超出你的本地计算机所能提供的范围。云计算服务允许你使用功能更强大的计算机更轻松地运行本书的GPU密集型代码。本节将介绍如何使用Amazon SageMaker运行本书的代码。\n", 14 | "\n", 15 | "## 注册\n", 16 | "\n", 17 | "首先,我们需要在注册一个帐户https://aws.amazon.com/。 为了增加安全性,鼓励使用双因素身份验证。设置详细的计费和支出警报也是一个好主意,以避免任何意外,例如,当忘记停止运行实例时。登录AWS帐户后,转到[console](http://console.aws.amazon.com/)并搜索“Amazon SageMaker”(参见 :numref:`fig_sagemaker`),然后单击它打开SageMaker面板。\n", 18 | "\n", 19 | "![搜索并打开SageMaker面板](../img/sagemaker.png)\n", 20 | ":width:`300px`\n", 21 | ":label:`fig_sagemaker`\n", 22 | "\n", 23 | "## 创建SageMaker实例\n", 24 | "\n", 25 | "接下来,让我们创建一个notebook实例,如 :numref:`fig_sagemaker-create`所示。\n", 26 | "\n", 27 | "![创建一个SageMaker实例](../img/sagemaker-create.png)\n", 28 | ":width:`400px`\n", 29 | ":label:`fig_sagemaker-create`\n", 30 | "\n", 31 | "SageMaker提供多个具有不同计算能力和价格的[实例类型](https://aws.amazon.com/sagemaker/pricing/instance-types/)。创建notebook实例时,可以指定其名称和类型。在 :numref:`fig_sagemaker-create-2`中,我们选择`ml.p3.2xlarge`:使用一个Tesla V100 GPU和一个8核CPU,这个实例的性能足够本书的大部分内容使用。\n", 32 | "\n", 33 | "![选择实例类型](../img/sagemaker-create-2.png)\n", 34 | ":width:`400px`\n", 35 | ":label:`fig_sagemaker-create-2`\n" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "id": "87a915ca", 41 | "metadata": { 42 | "origin_pos": 2, 43 | "tab": [ 44 | "pytorch" 45 | ] 46 | }, 47 | "source": [ 48 | "用于与SageMaker一起运行的ipynb格式的整本书可从https://github.com/d2l-ai/d2l-pytorch-sagemaker获得。\n", 49 | "我们可以指定此GitHub存储库URL( :numref:`fig_sagemaker-create-3`),以允许SageMaker在创建实例时克隆它。\n" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "061d3b04", 55 | "metadata": { 56 | "origin_pos": 4 57 | }, 58 | "source": [ 59 | "![指定GitHub存储库](../img/sagemaker-create-3.png)\n", 60 | ":width:`400px`\n", 61 | ":label:`fig_sagemaker-create-3`\n", 62 | "\n", 63 | "## 运行和停止实例\n", 64 | "\n", 65 | "创建实例可能需要几分钟的时间。当实例准备就绪时,单击它旁边的“Open Jupyter”链接( :numref:`fig_sagemaker-open`),以便你可以在此实例上编辑并运行本书的所有Jupyter Notebook(类似于 :numref:`sec_jupyter`中的步骤)。\n", 66 | "\n", 67 | "![在创建的SageMaker实例上打开Jupyter](../img/sagemaker-open.png)\n", 68 | ":width:`400px`\n", 69 | ":label:`fig_sagemaker-open`\n", 70 | "\n", 71 | "完成工作后,不要忘记停止实例以避免进一步收费( :numref:`fig_sagemaker-stop`)。\n", 72 | "\n", 73 | "![停止SageMaker实例](../img/sagemaker-stop.png)\n", 74 | ":width:`300px`\n", 75 | ":label:`fig_sagemaker-stop`\n", 76 | "\n", 77 | "## 更新Notebook\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "f55e7f4e", 83 | "metadata": { 84 | "origin_pos": 6, 85 | "tab": [ 86 | "pytorch" 87 | ] 88 | }, 89 | "source": [ 90 | 
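The timing exercise from the Jupyter appendix above (comparing $\mathbf{A}^\top \mathbf{B}$ with $\mathbf{A}\mathbf{B}$ for $1024 \times 1024$ matrices) can be tried with a short sketch like the one below. It measures CPU time only; results depend on hardware and BLAS backend, and GPU timing would additionally require `torch.cuda.synchronize()`.

```python
import time
import torch

n = 1024
A = torch.randn(n, n)
B = torch.randn(n, n)

def bench(fn, repeats=50):
    fn()  # warm-up run so one-time setup cost is not measured
    start = time.time()
    for _ in range(repeats):
        fn()
    return (time.time() - start) / repeats

t_ab  = bench(lambda: A @ B)    # A   B
t_atb = bench(lambda: A.T @ B)  # A^T B
print(f"A   B: {t_ab * 1e3:.2f} ms per multiplication")
print(f"A^T B: {t_atb * 1e3:.2f} ms per multiplication")
```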
"这本开源书的notebook将定期在GitHub上的[d2l-ai/d2l-pytorch-sagemaker](https://github.com/d2l-ai/d2l-pytorch-sagemaker)存储库中更新。要更新至最新版本,你可以在SageMaker实例( :numref:`fig_sagemaker-terminal`)上打开终端。\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "f2b7db7b", 96 | "metadata": { 97 | "origin_pos": 8 98 | }, 99 | "source": [ 100 | "![在SageMaker实例上打开终端](../img/sagemaker-terminal.png)\n", 101 | ":width:`300px`\n", 102 | ":label:`fig_sagemaker-terminal`\n", 103 | "\n", 104 | "你可能希望在从远程存储库提取更新之前提交本地更改。否则,只需在终端中使用以下命令放弃所有本地更改:\n" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "b1900934", 110 | "metadata": { 111 | "origin_pos": 10, 112 | "tab": [ 113 | "pytorch" 114 | ] 115 | }, 116 | "source": [ 117 | "```bash\n", 118 | "cd SageMaker/d2l-pytorch-sagemaker/\n", 119 | "git reset --hard\n", 120 | "git pull\n", 121 | "```\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "id": "5060f222", 127 | "metadata": { 128 | "origin_pos": 12 129 | }, 130 | "source": [ 131 | "## 小结\n", 132 | "\n", 133 | "* 我们可以使用Amazon SageMaker创建一个GPU的notebook实例来运行本书的密集型代码。\n", 134 | "* 我们可以通过Amazon SageMaker实例上的终端更新notebooks。\n", 135 | "\n", 136 | "## 练习\n", 137 | "\n", 138 | "1. 使用Amazon SageMaker编辑并运行任何需要GPU的部分。\n", 139 | "1. 打开终端以访问保存本书所有notebooks的本地目录。\n", 140 | "\n", 141 | "[Discussions](https://discuss.d2l.ai/t/5732)\n" 142 | ] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "conda_pytorch_p36", 148 | "name": "conda_pytorch_p36" 149 | }, 150 | "language_info": { 151 | "name": "python" 152 | }, 153 | "required_libs": [] 154 | }, 155 | "nbformat": 4, 156 | "nbformat_minor": 5 157 | } -------------------------------------------------------------------------------- /chapter_appendix-tools-for-deep-learning/selecting-servers-gpus.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c34c875c", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 选择服务器和GPU\n", 11 | ":label:`sec_buy_gpu`\n", 12 | "\n", 13 | "深度学习训练通常需要大量的计算。目前,GPU是深度学习最具成本效益的硬件加速器。与CPU相比,GPU更便宜,性能更高,通常超过一个数量级。此外,一台服务器可以支持多个GPU,高端服务器最多支持8个GPU。更典型的数字是工程工作站最多4个GPU,这是因为热量、冷却和电源需求会迅速增加,超出办公楼所能支持的范围。对于更大的部署,云计算(例如亚马逊的[P3](https://aws.amazon.com/ec2/instance-types/p3/)和[G4](https://aws.amazon.com/blogs/aws/in-the-works-ec2-instances-g4-with-nvidia-t4-gpus/)实例)是一个更实用的解决方案。\n", 14 | "\n", 15 | "## 选择服务器\n", 16 | "\n", 17 | "通常不需要购买具有多个线程的高端CPU,因为大部分计算都发生在GPU上。这就是说,由于Python中的全局解释器锁(GIL),CPU的单线程性能在有4-8个GPU的情况下可能很重要。所有的条件都是一样的,这意味着核数较少但时钟频率较高的CPU可能是更经济的选择。例如,当在6核4GHz和8核3.5GHz CPU之间进行选择时,前者更可取,即使其聚合速度较低。一个重要的考虑因素是,GPU使用大量的电能,从而释放大量的热量。这需要非常好的冷却和足够大的机箱来容纳GPU。如有可能,请遵循以下指南:\n", 18 | "\n", 19 | "1. **电源**。GPU使用大量的电源。每个设备预计高达350W(检查显卡的*峰值需求*而不是一般需求,因为高效代码可能会消耗大量能源)。如果电源不能满足需求,系统会变得不稳定。\n", 20 | "1. **机箱尺寸**。GPU很大,辅助电源连接器通常需要额外的空间。此外,大型机箱更容易冷却。\n", 21 | "1. **GPU散热**。如果有大量的GPU,可能需要投资水冷。此外,即使风扇较少,也应以“公版设计”为目标,因为它们足够薄,可以在设备之间进气。当使用多风扇GPU,安装多个GPU时,它可能太厚而无法获得足够的空气。\n", 22 | "1. 
**PCIe插槽**。在GPU之间来回移动数据(以及在GPU之间交换数据)需要大量带宽。建议使用16通道的PCIe 3.0插槽。当安装了多个GPU时,请务必仔细阅读主板说明,以确保在同时使用多个GPU时16$\\times$带宽仍然可用,并且使用的是PCIe3.0,而不是用于附加插槽的PCIe2.0。在安装多个GPU的情况下,一些主板的带宽降级到8$\\times$甚至4$\\times$。这部分是由于CPU提供的PCIe通道数量限制。\n", 23 | "\n", 24 | "简而言之,以下是构建深度学习服务器的一些建议。\n", 25 | "\n", 26 | "* **初学者**。购买低功耗的低端GPU(适合深度学习的廉价游戏GPU,功耗150-200W)。如果幸运的话,大家现在常用的计算机将支持它。\n", 27 | "* **1个GPU**。一个4核的低端CPU就足够了,大多数主板也足够了。以至少32 GB的DRAM为目标,投资SSD进行本地数据访问。600W的电源应足够。买一个有很多风扇的GPU。\n", 28 | "* **2个GPU**。一个4-6核的低端CPU就足够了。可以考虑64 GB的DRAM并投资于SSD。两个高端GPU将需要1000瓦的功率。对于主板,请确保它们具有*两个*PCIe 3.0 x16插槽。如果可以,请使用PCIe 3.0 x16插槽之间有两个可用空间(60毫米间距)的主板,以提供额外的空气。在这种情况下,购买两个具有大量风扇的GPU。\n", 29 | "* **4个GPU**。确保购买的CPU具有相对较快的单线程速度(即较高的时钟频率)。可能需要具有更多PCIe通道的CPU,例如AMD Threadripper。可能需要相对昂贵的主板才能获得4个PCIe 3.0 x16插槽,因为它们可能需要一个PLX来多路复用PCIe通道。购买带有公版设计的GPU,这些GPU很窄,并且让空气进入GPU之间。需要一个1600-2000W的电源,而办公室的插座可能不支持。此服务器可能在运行时*声音很大,很热*。不想把它放在桌子下面。建议使用128 GB的DRAM。获取一个用于本地存储的SSD(1-2 TB NVMe)和RAID配置的硬盘来存储数据。\n", 30 | "* **8 GPU**。需要购买带有多个冗余电源的专用多GPU服务器机箱(例如,每个电源为1600W时为2+1)。这将需要双插槽服务器CPU、256 GB ECC DRAM、快速网卡(建议使用10 GBE),并且需要检查服务器是否支持GPU的*物理外形*。用户GPU和服务器GPU之间的气流和布线位置存在显著差异(例如RTX 2080和Tesla V100)。这意味着可能无法在服务器中安装消费级GPU,因为电源线间隙不足或缺少合适的接线(本书一位合著者痛苦地发现了这一点)。\n", 31 | "\n", 32 | "## 选择GPU\n", 33 | "\n", 34 | "目前,AMD和NVIDIA是专用GPU的两大主要制造商。NVIDIA是第一个进入深度学习领域的公司,通过CUDA为深度学习框架提供更好的支持。因此,大多数买家选择NVIDIA GPU。\n", 35 | "\n", 36 | "NVIDIA提供两种类型的GPU,针对个人用户(例如,通过GTX和RTX系列)和企业用户(通过其Tesla系列)。这两种类型的GPU提供了相当的计算能力。但是,企业用户GPU通常使用强制(被动)冷却、更多内存和ECC(纠错)内存。这些GPU更适用于数据中心,通常成本是消费者GPU的十倍。\n", 37 | "\n", 38 | "如果是一个拥有100个服务器的大公司,则应该考虑英伟达Tesla系列,或者在云中使用GPU服务器。对于实验室或10+服务器的中小型公司,英伟达RTX系列可能是最具成本效益的,可以购买超微或华硕机箱的预配置服务器,这些服务器可以有效地容纳4-8个GPU。\n", 39 | "\n", 40 | "GPU供应商通常每一到两年发布一代,例如2017年发布的GTX 1000(Pascal)系列和2019年发布的RTX 2000(Turing)系列。每个系列都提供几种不同的型号,提供不同的性能级别。GPU性能主要是以下三个参数的组合:\n", 41 | "\n", 42 | "1. **计算能力**。通常大家会追求32位浮点计算能力。16位浮点训练(FP16)也进入主流。如果只对预测感兴趣,还可以使用8位整数。最新一代图灵GPU提供4-bit加速。不幸的是,目前训练低精度网络的算法还没有普及;\n", 43 | "1. **内存大小**。随着模型变大或训练期间使用的批量变大,将需要更多的GPU内存。检查HBM2(高带宽内存)与GDDR6(图形DDR)内存。HBM2速度更快,但成本更高;\n", 44 | "1. **内存带宽**。当有足够的内存带宽时,才能最大限度地利用计算能力。如果使用GDDR6,请追求宽内存总线。\n", 45 | "\n", 46 | "对于大多数用户,只需看看计算能力就足够了。请注意,许多GPU提供不同类型的加速。例如,NVIDIA的Tensor Cores将操作符子集的速度提高了5$\\times$。确保所使用的库支持这一点。GPU内存应不小于4GB(8GB更好)。尽量避免将GPU也用于显示GUI(改用内置显卡)。如果无法避免,请添加额外的2GB RAM以确保安全。\n", 47 | "\n", 48 | ":numref:`fig_flopsvsprice`比较了各种GTX 900、GTX 1000和RTX 2000系列的(GFlops)和价格(Price)。价格是维基百科上的建议价格。\n", 49 | "\n", 50 | "![浮点计算能力和价格比较](../img/flopsvsprice.svg)\n", 51 | ":label:`fig_flopsvsprice`\n", 52 | "\n", 53 | "由上图,可以看出很多事情:\n", 54 | "\n", 55 | "1. 在每个系列中,价格和性能大致成比例。Titan因拥有大GPU内存而有相当的溢价。然而,通过比较980 Ti和1080 Ti可以看出,较新型号具有更好的成本效益。RTX 2000系列的价格似乎没有多大提高。然而,它们提供了更优秀的低精度性能(FP16、INT8和INT4);\n", 56 | "2. GTX 1000系列的性价比大约是900系列的两倍;\n", 57 | "3. 
对于RTX 2000系列,浮点计算能力是价格的“仿射”函数。\n", 58 | "\n", 59 | "![浮点计算能力和能耗](../img/wattvsprice.svg)\n", 60 | ":label:`fig_wattvsprice`\n", 61 | "\n", 62 | ":numref:`fig_wattvsprice`显示了能耗与计算量基本成线性关系。其次,后一代更有效率。这似乎与对应于RTX 2000系列的图表相矛盾。然而,这是TensorCore不成比例的大能耗的结果。\n", 63 | "\n", 64 | "## 小结\n", 65 | "\n", 66 | "* 在构建服务器时,请注意电源、PCIe总线通道、CPU单线程速度和散热。\n", 67 | "* 如果可能,应该购买最新一代的GPU。\n", 68 | "* 使用云进行大型部署。\n", 69 | "* 高密度服务器可能不与所有GPU兼容。在购买之前,请检查一下机械和散热规格。\n", 70 | "* 为提高效率,请使用FP16或更低的精度。\n" 71 | ] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "conda_pytorch_p36", 77 | "name": "conda_pytorch_p36" 78 | }, 79 | "language_info": { 80 | "name": "python" 81 | }, 82 | "required_libs": [] 83 | }, 84 | "nbformat": 4, 85 | "nbformat_minor": 5 86 | } -------------------------------------------------------------------------------- /chapter_attention-mechanisms/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "e2119702", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 注意力机制\n", 11 | ":label:`chap_attention`\n", 12 | "\n", 13 | "灵长类动物的视觉系统接受了大量的感官输入,\n", 14 | "这些感官输入远远超过了大脑能够完全处理的程度。\n", 15 | "然而,并非所有刺激的影响都是相等的。\n", 16 | "意识的聚集和专注使灵长类动物能够在复杂的视觉环境中将注意力引向感兴趣的物体,例如猎物和天敌。\n", 17 | "只关注一小部分信息的能力对进化更加有意义,使人类得以生存和成功。\n", 18 | "\n", 19 | "自19世纪以来,科学家们一直致力于研究认知神经科学领域的注意力。\n", 20 | "本章的很多章节将涉及到一些研究。\n", 21 | "\n", 22 | "首先回顾一个经典注意力框架,解释如何在视觉场景中展开注意力。\n", 23 | "受此框架中的*注意力提示*(attention cues)的启发,\n", 24 | "我们将设计能够利用这些注意力提示的模型。\n", 25 | "1964年的Nadaraya-Waston核回归(kernel regression)正是具有\n", 26 | "*注意力机制*(attention mechanism)的机器学习的简单演示。\n", 27 | "\n", 28 | "然后继续介绍的是注意力函数,它们在深度学习的注意力模型设计中被广泛使用。\n", 29 | "具体来说,我们将展示如何使用这些函数来设计*Bahdanau注意力*。\n", 30 | "Bahdanau注意力是深度学习中的具有突破性价值的注意力模型,它双向对齐并且可以微分。\n", 31 | "\n", 32 | "最后将描述仅仅基于注意力机制的*Transformer*架构,\n", 33 | "该架构中使用了*多头注意力*(multi-head attention)\n", 34 | "和*自注意力*(self-attention)。\n", 35 | "自2017年横空出世,Transformer一直都普遍存在于现代的深度学习应用中,\n", 36 | "例如语言、视觉、语音和强化学习领域。\n", 37 | "\n", 38 | ":begin_tab:toc\n", 39 | " - [attention-cues](attention-cues.ipynb)\n", 40 | " - [nadaraya-waston](nadaraya-waston.ipynb)\n", 41 | " - [attention-scoring-functions](attention-scoring-functions.ipynb)\n", 42 | " - [bahdanau-attention](bahdanau-attention.ipynb)\n", 43 | " - [multihead-attention](multihead-attention.ipynb)\n", 44 | " - [self-attention-and-positional-encoding](self-attention-and-positional-encoding.ipynb)\n", 45 | " - [transformer](transformer.ipynb)\n", 46 | ":end_tab:\n" 47 | ] 48 | } 49 | ], 50 | "metadata": { 51 | "kernelspec": { 52 | "display_name": "conda_pytorch_p36", 53 | "name": "conda_pytorch_p36" 54 | }, 55 | "language_info": { 56 | "name": "python" 57 | }, 58 | "required_libs": [] 59 | }, 60 | "nbformat": 4, 61 | "nbformat_minor": 5 62 | } -------------------------------------------------------------------------------- /chapter_computational-performance/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "88dd3320", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 计算性能\n", 11 | ":label:`chap_performance`\n", 12 | "\n", 13 | "在深度学习中,数据集和模型通常都很大,导致计算量也会很大。\n", 14 | "因此,计算的性能非常重要。\n", 15 | "本章将集中讨论影响计算性能的主要因素:命令式编程、符号编程、\n", 16 | "异步计算、自动并行和多GPU计算。\n", 17 | "通过学习本章,对于前几章中实现的那些模型,可以进一步提高它们的计算性能。\n", 18 | "例如,我们可以在不影响准确性的前提下,大大减少训练时间。\n", 19 | "\n", 20 | ":begin_tab:toc\n", 21 | " - [hybridize](hybridize.ipynb)\n", 
22 | " - [async-computation](async-computation.ipynb)\n", 23 | " - [auto-parallelism](auto-parallelism.ipynb)\n", 24 | " - [hardware](hardware.ipynb)\n", 25 | " - [multiple-gpus](multiple-gpus.ipynb)\n", 26 | " - [multiple-gpus-concise](multiple-gpus-concise.ipynb)\n", 27 | " - [parameterserver](parameterserver.ipynb)\n", 28 | ":end_tab:\n" 29 | ] 30 | } 31 | ], 32 | "metadata": { 33 | "kernelspec": { 34 | "display_name": "conda_pytorch_p36", 35 | "name": "conda_pytorch_p36" 36 | }, 37 | "language_info": { 38 | "name": "python" 39 | }, 40 | "required_libs": [] 41 | }, 42 | "nbformat": 4, 43 | "nbformat_minor": 5 44 | } -------------------------------------------------------------------------------- /chapter_computational-performance/my_mlp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/chapter_computational-performance/my_mlp -------------------------------------------------------------------------------- /chapter_computational-performance/parameterserver.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8082b37d", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 参数服务器\n", 11 | ":label:`sec_parameterserver`\n", 12 | "\n", 13 | "当我们从一个GPU迁移到多个GPU时,以及再迁移到包含多个GPU的多个服务器时(可能所有服务器的分布跨越了多个机架和多个网络交换机),分布式并行训练算法也需要变得更加复杂。通过细节可以知道,一方面是不同的互连方式的带宽存在极大的区别(例如,NVLink可以通过设置实现跨$6$条链路的高达100GB/s的带宽,16通道的PCIe4.0提供32GB/s的带宽,而即使是高速100GbE以太网也只能提供大约10GB/s的带宽);另一方面是期望开发者既能完成统计学习建模还精通系统和网络也是不切实际的。\n", 14 | "\n", 15 | "参数服务器的核心思想首先是由 :cite:`Smola.Narayanamurthy.2010`在分布式隐变量模型的背景下引入的。然后,在 :cite:`Ahmed.Aly.Gonzalez.ea.2012`中描述了Push和Pull的语义,又在 :cite:`Li.Andersen.Park.ea.2014`中描述了系统和开源库。下面,我们将介绍用于提高计算效率的组件。\n", 16 | "\n", 17 | "## 数据并行训练\n", 18 | "\n", 19 | "让我们回顾一下在分布式架构中数据并行的训练方法,因为在实践中它的实现相对简单,因此本节将排除其他内容只对其进行介绍。由于当今的GPU拥有大量的显存,因此在实际场景中(不包括图深度学习)只有数据并行这种并行训练策略值得推荐。图 :numref:`fig_parameterserver`描述了在 :numref:`sec_multi_gpu`中实现的数据并行的变体。其中的关键是梯度的聚合需要在单个GPU(GPU 0)上完成,然后再将更新后的参数广播给所有GPU。\n", 20 | "\n", 21 | "![左图是单GPU训练;右图是多GPU训练的一个变体:(1)计算损失和梯度,(2)所有梯度聚合在一个GPU上,(3)发生参数更新,并将参数重新广播给所有GPU](../img/ps.svg)\n", 22 | ":label:`fig_parameterserver`\n", 23 | "\n", 24 | "回顾来看,选择GPU 0进行聚合似乎是个很随便的决定,当然也可以选择CPU上聚合,事实上只要优化算法支持,在实际操作中甚至可以在某个GPU上聚合其中一些参数,而在另一个GPU上聚合另一些参数。例如,如果有四个与参数向量相关的梯度$\\mathbf{g}_1, \\ldots, \\mathbf{g}_4$,还可以一个GPU对一个$\\mathbf{g}_i (i = 1, \\ldots, 4$)地进行梯度聚合。\n", 25 | "\n", 26 | "这样的推断似乎是轻率和武断的,毕竟数学应该是逻辑自洽的。但是,我们处理的是如 :numref:`sec_hardware`中所述的真实的物理硬件,其中不同的总线具有不同的带宽。考虑一个如 :numref:`sec_hardware`中所述的真实的$4$路GPU服务器。如果它的连接是特别完整的,那么可能拥有一个100GbE的网卡。更有代表性的数字是1-10GbE范围内,其有效带宽为100MB/s到1GB/s。因为CPU的PCIe通道太少(例如,消费级的Intel CPU有$24$个通道),所以无法直接与所有的GPU相连接,因此需要[multiplexer](https://www.broadcom.com/products/pcie-switches-bridges/pcie-switches)。CPU在16x Gen3链路上的带宽为16GB/s,这也是每个GPU连接到交换机的速度,这意味着GPU设备之间的通信更有效。\n", 27 | "\n", 28 | "![一个4路GPU服务器](../img/bw-hierarchy.svg)\n", 29 | ":label:`fig_bw_hierarchy`\n", 30 | "\n", 31 | "为了便于讨论,我们假设所有梯度共需160MB。在这种情况下,将其中$3$个GPU的梯度发送到第$4$个GPU上需要$30$毫秒(每次传输需要$10$毫秒=160MB/16GB/s)。再加上$30$毫秒将权重向量传输回来,得到的结果是总共需要$60$毫秒。如果将所有的数据发送到CPU,总共需要$80$毫秒,其中将有$40$毫秒的惩罚,因为$4$个GPU每个都需要将数据发送到CPU。最后,假设能够将梯度分为$4$个部分,每个部分为$40$MB,现在可以在不同的GPU上同时聚合每个部分。因为PCIe交换机在所有链路之间提供全带宽操作,所以传输需要$2.5\\times 3=7.5$毫秒,而不是$30$毫秒,因此同步操作总共需要$15$毫秒。简而言之,一样的参数同步操作基于不同的策略时间可能在$15$毫秒到$80$毫秒之间。 :numref:`fig_ps_distributed`描述了交换参数的不同策略。\n", 32 | "\n", 33 | 
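The bandwidth arithmetic in the paragraph above can be reproduced with a few lines of Python. This is only a back-of-the-envelope sketch under the stated assumptions (160 MB of gradients, 16 GB/s per PCIe 3.0 x16 link, 4 GPUs); real systems will deviate from these idealized numbers.

```python
grad_mb  = 160   # total gradient size in MB (assumption from the text)
link_gbs = 16    # per-link PCIe 3.0 x16 bandwidth in GB/s
n_gpus   = 4

transfer_ms = grad_mb / (link_gbs * 1000) * 1000            # 10 ms per full 160 MB copy

# Strategy 1: aggregate everything on one GPU, then broadcast the result back.
one_gpu_ms = (n_gpus - 1) * transfer_ms * 2                 # 30 ms in + 30 ms out = 60 ms

# Strategy 2: route through the CPU; each of the 4 GPUs sends and receives once.
cpu_ms = n_gpus * transfer_ms * 2                           # 40 ms + 40 ms = 80 ms

# Strategy 3: split the gradient into 4 chunks of 40 MB and aggregate each chunk
# on a different GPU; the PCIe switch gives full bandwidth between devices, so
# each of the 3 exchange steps moves only one 40 MB chunk per link.
chunk_ms = (grad_mb / n_gpus) / (link_gbs * 1000) * 1000    # 2.5 ms per chunk
split_ms = (n_gpus - 1) * chunk_ms * 2                      # 7.5 ms + 7.5 ms = 15 ms

print(one_gpu_ms, cpu_ms, split_ms)                         # 60.0 80.0 15.0
```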
"![参数同步策略](../img/ps-distributed.svg)\n", 34 | ":label:`fig_ps_distributed`\n", 35 | "\n", 36 | "请注意,我们还可以使用另一个工具来改善性能:在深度网络中,从顶部到底部计算所有梯度需要一些时间,因此即使还在忙着为某些参数计算梯度时,就可以开始为准备好的参数同步梯度了。想了解详细信息可以参见 :cite:`Sergeev.Del-Balso.2018`,想知道如何操作可参考[Horovod](https://github.com/horovod/horovod)。\n", 37 | "\n", 38 | "## 环同步(Ring Synchronization)\n", 39 | "\n", 40 | "当谈及现代深度学习硬件的同步问题时,我们经常会遇到大量的定制的网络连接。例如,AWS p3.16xlarge和NVIDIA DGX-2实例中的连接都使用了 :numref:`fig_nvlink`中的结构。每个GPU通过PCIe链路连接到主机CPU,该链路最多只能以16GB/s的速度运行。此外,每个GPU还具有$6$个NVLink连接,每个NVLink连接都能够以300Gbit/s进行双向传输。这相当于每个链路每个方向约$300\\div 8\\div 2\\approx 18 \\mathrm{GB/s}$。简言之,聚合的NVLink带宽明显高于PCIe带宽,问题是如何有效地使用它。\n", 41 | "\n", 42 | "![在8台V100 GPU服务器上连接NVLink(图片由英伟达提供)](../img/nvlink.svg)\n", 43 | ":label:`fig_nvlink`\n", 44 | "\n", 45 | " :cite:`Wang.Li.Liberty.ea.2018`的研究结果表明最优的同步策略是将网络分解成两个环,并基于两个环直接同步数据。\n", 46 | " :numref:`fig_nvlink_twoloop`描述了网络可以分解为一个具有双NVLink带宽的环(1-2-3-4-5-6-7-8-1)和一个具有常规带宽的环(1-4-6-3-5-8-2-7-1)。在这种情况下,设计一个高效的同步协议是非常重要的。\n", 47 | "\n", 48 | "![将NVLink网络分解为两个环。](../img/nvlink-twoloop.svg)\n", 49 | ":label:`fig_nvlink_twoloop`\n", 50 | "\n", 51 | "考虑下面的思维试验:给定由$n$个计算节点(或GPU)组成的一个环,梯度可以从第一个节点发送到第二个节点,在第二个结点将本地的梯度与传送的梯度相加并发送到第三个节点,依此类推。在$n-1$步之后,可以在最后访问的节点中找到聚合梯度。也就是说,聚合梯度的时间随节点数线性增长。但如果照此操作,算法是相当低效的。归根结底,在任何时候都只有一个节点在通信。如果我们将梯度分为$n$个块,并从节点$i$开始同步块$i$,会怎么样?因为每个块的大小是$1/n$,所以总时间现在是$(n-1)/n \\approx 1$。换句话说,当我们增大环的大小时,聚合梯度所花费的时间不会增加。这是一个相当惊人的结果。 :numref:`fig_ringsync`说明了$n=4$个节点上的步骤顺序。\n", 52 | "\n", 53 | "![跨4个节点的环同步。每个节点开始向其左邻居发送部分梯度,直到在其右邻居中找到聚合的梯度](../img/ringsync.svg)\n", 54 | ":label:`fig_ringsync`\n", 55 | "\n", 56 | "如果我们使用相同的例子,跨$8$个V100 GPU同步160MB,我们得到的结果大约是$2 \\times 160 \\mathrm{MB} \\div (3 \\times18 \\mathrm{GB/s}) \\approx 6 \\mathrm{ms}$。这比使用PCIe总线要好,即使我们现在使用的是$8$个GPU。请注意,这些数字在实践中通常会差一些,因为深度学习框架无法将通信组合成大的突发传输。\n", 57 | "\n", 58 | "注意到有一种常见的误解认为环同步与其他同步算法在本质上是不同的,实际上与简单的树算法相比其唯一的区别是同步路径稍微精细一些。\n", 59 | "\n", 60 | "## 多机训练\n", 61 | "\n", 62 | "新的挑战出现在多台机器上进行分布式训练:我们需要服务器之间相互通信,而这些服务器又只通过相对较低的带宽结构连接,在某些情况下这种连接的速度可能会慢一个数量级,因此跨设备同步是个棘手的问题。毕竟,在不同机器上运行训练代码的速度会有细微的差别,因此如果想使用分布式优化的同步算法就需要*同步*(synchronize)这些机器。\n", 63 | " :numref:`fig_ps_multimachine`说明了分布式并行训练是如何发生的。\n", 64 | "\n", 65 | "1. 在每台机器上读取一组(不同的)批量数据,在多个GPU之间分割数据并传输到GPU的显存中。基于每个GPU上的批量数据分别计算预测和梯度。\n", 66 | "2. 来自一台机器上的所有的本地GPU的梯度聚合在一个GPU上(或者在不同的GPU上聚合梯度的某些部分)。\n", 67 | "3. 每台机器的梯度被发送到其本地CPU中。\n", 68 | "4. 所有的CPU将梯度发送到中央参数服务器中,由该服务器聚合所有梯度。\n", 69 | "5. 然后使用聚合后的梯度来更新参数,并将更新后的参数广播回各个CPU中。\n", 70 | "6. 更新后的参数信息发送到本地一个(或多个)GPU中。\n", 71 | "7. 
所有GPU上的参数更新完成。\n", 72 | "\n", 73 | "![多机多GPU分布式并行训练](../img/ps-multimachine.svg)\n", 74 | ":label:`fig_ps_multimachine`\n", 75 | "\n", 76 | "以上这些操作似乎都相当简单,而且事实上它们可以在一台机器内高效地执行,但是当我们考虑多台机器时,就会发现中央的参数服务器成为了瓶颈。毕竟,每个服务器的带宽是有限的,因此对$m$个工作节点来说,将所有梯度发送到服务器所需的时间是$\\mathcal{O}(m)$。我们也可以通过将参数服务器数量增加到$n$来突破这一障碍。此时,每个服务器只需要存储$\\mathcal{O}(1/n)$个参数,因此更新和优化的总时间变为$\\mathcal{O}(m/n)$。这两个数字的匹配会产生稳定的伸缩性,而不用在乎我们需要处理多少工作节点。在实际应用中,我们使用同一台机器既作为工作节点还作为服务器。设计说明请参考 :numref:`fig_ps_multips`(技术细节请参考 :cite:`Li.Andersen.Park.ea.2014`)。特别是,确保多台机器只在没有不合理延迟的情况下工作是相当困难的。\n", 77 | "\n", 78 | "![上图:单参数服务器是一个瓶颈,因为它的带宽是有限的;下图:多参数服务器使用聚合带宽存储部分参数](../img/ps-multips.svg)\n", 79 | ":label:`fig_ps_multips`\n", 80 | "\n", 81 | "## 键值存储\n", 82 | "\n", 83 | "在实践中,实现分布式多GPU训练所需要的步骤绝非易事。这就是公共抽象值得使用的原因,公共抽象即重新定义具有更新语义的*键-值存储*(key-value store)的抽象。\n", 84 | "\n", 85 | "在许多工作节点和许多GPU中,梯度$i$的计算可以定义为\n", 86 | "\n", 87 | "$$\\mathbf{g}_{i} = \\sum_{k \\in \\text{workers}} \\sum_{j \\in \\text{GPUs}} \\mathbf{g}_{ijk},$$\n", 88 | "\n", 89 | "其中$\\mathbf{g}_{ijk}$是在工作节点$k$的GPU$j$上拆分的梯度$i$的一部分。这个运算的关键在于它是一个*交换归约*(commutative reduction),也就是说,它把许多向量变换成一个向量,而运算顺序在完成向量变换时并不重要。这对实现我们的目标来说是非常好的,因为不需要为何时接收哪个梯度进行细粒度的控制。此外,请注意,这个操作在不同的$i$之间是独立的。\n", 90 | "\n", 91 | "这就允许我们定义下面两个操作:*push*(用于累积梯度)和*pull*(用于取得聚合梯度)。因为我们有很多层,也就有很多不同的梯度集合,因此需要用一个键$i$来对梯度建索引。这个与Dynamo :cite:`DeCandia.Hastorun.Jampani.ea.2007`中引入的*键-值存储*之间存在相似性并非巧合。它们两个定义都拥有许多相似的性质,特别是在多个服务器之间分发参数时。\n", 92 | "\n", 93 | "*键-值存储*的push与pull操作描述如下:\n", 94 | "\n", 95 | "* **push(key,value)**将特定的梯度值从工作节点发送到公共存储,在那里通过某种方式(例如,相加)来聚合值;\n", 96 | "* **pull(key,value)**从公共存储中取得某种方式(例如,组合来自所有工作节点的梯度)的聚合值。\n", 97 | "\n", 98 | "通过将同步的所有复杂性隐藏在一个简单的push和pull操作背后,我们可以将统计建模人员(他们希望能够用简单的术语表达优化)和系统工程师(他们需要处理分布式同步中固有的复杂性)的关注点解耦。\n", 99 | "\n", 100 | "## 小结\n", 101 | "\n", 102 | "* 同步需要高度适应特定的网络基础设施和服务器内的连接,这种适应会严重影响同步所需的时间。\n", 103 | "* 环同步对于p3和DGX-2服务器是最佳的,而对于其他服务器则未必。\n", 104 | "* 当添加多个参数服务器以增加带宽时,分层同步策略可以工作的很好。\n", 105 | "\n", 106 | "## 练习\n", 107 | "\n", 108 | "1. 请尝试进一步提高环同步的性能吗。(提示:可以双向发送消息。)\n", 109 | "1. 在计算仍在进行中,可否允许执行异步通信?它将如何影响性能?\n", 110 | "1. 
怎样处理在长时间运行的计算过程中丢失了一台服务器这种问题?尝试设计一种容错机制来避免重启计算这种解决方案?\n", 111 | "\n", 112 | "[Discussions](https://discuss.d2l.ai/t/5774)\n" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "conda_pytorch_p36", 119 | "name": "conda_pytorch_p36" 120 | }, 121 | "language_info": { 122 | "name": "python" 123 | }, 124 | "required_libs": [] 125 | }, 126 | "nbformat": 4, 127 | "nbformat_minor": 5 128 | } -------------------------------------------------------------------------------- /chapter_computer-vision/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2858f745", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 计算机视觉\n", 11 | ":label:`chap_cv`\n", 12 | "\n", 13 | "近年来,深度学习一直是提高计算机视觉系统性能的变革力量。\n", 14 | "无论是医疗诊断、自动驾驶,还是智能滤波器、摄像头监控,许多计算机视觉领域的应用都与我们当前和未来的生活密切相关。\n", 15 | "可以说,最先进的计算机视觉应用与深度学习几乎是不可分割的。\n", 16 | "有鉴于此,本章将重点介绍计算机视觉领域,并探讨最近在学术界和行业中具有影响力的方法和应用。\n", 17 | "\n", 18 | "在 :numref:`chap_cnn`和 :numref:`chap_modern_cnn`中,我们研究了计算机视觉中常用的各种卷积神经网络,并将它们应用到简单的图像分类任务中。\n", 19 | "本章开头,我们将介绍两种可以改进模型泛化的方法,即*图像增广*和*微调*,并将它们应用于图像分类。\n", 20 | "由于深度神经网络可以有效地表示多个层次的图像,因此这种分层表示已成功用于各种计算机视觉任务,例如*目标检测*(object detection)、*语义分割*(semantic segmentation)和*样式迁移*(style transfer)。\n", 21 | "秉承计算机视觉中利用分层表示的关键思想,我们将从物体检测的主要组件和技术开始,继而展示如何使用*完全卷积网络*对图像进行语义分割,然后我们将解释如何使用样式迁移技术来生成像本书封面一样的图像。\n", 22 | "最后在结束本章时,我们将本章和前几章的知识应用于两个流行的计算机视觉基准数据集。\n", 23 | "\n", 24 | ":begin_tab:toc\n", 25 | " - [image-augmentation](image-augmentation.ipynb)\n", 26 | " - [fine-tuning](fine-tuning.ipynb)\n", 27 | " - [bounding-box](bounding-box.ipynb)\n", 28 | " - [anchor](anchor.ipynb)\n", 29 | " - [multiscale-object-detection](multiscale-object-detection.ipynb)\n", 30 | " - [object-detection-dataset](object-detection-dataset.ipynb)\n", 31 | " - [ssd](ssd.ipynb)\n", 32 | " - [rcnn](rcnn.ipynb)\n", 33 | " - [semantic-segmentation-and-dataset](semantic-segmentation-and-dataset.ipynb)\n", 34 | " - [transposed-conv](transposed-conv.ipynb)\n", 35 | " - [fcn](fcn.ipynb)\n", 36 | " - [neural-style](neural-style.ipynb)\n", 37 | " - [kaggle-cifar10](kaggle-cifar10.ipynb)\n", 38 | " - [kaggle-dog](kaggle-dog.ipynb)\n", 39 | ":end_tab:\n" 40 | ] 41 | } 42 | ], 43 | "metadata": { 44 | "kernelspec": { 45 | "display_name": "conda_pytorch_p36", 46 | "name": "conda_pytorch_p36" 47 | }, 48 | "language_info": { 49 | "name": "python" 50 | }, 51 | "required_libs": [] 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 5 55 | } -------------------------------------------------------------------------------- /chapter_computer-vision/submission.csv: -------------------------------------------------------------------------------- 1 | id,label 2 | 1,airplane 3 | 2,deer 4 | 3,horse 5 | 4,frog 6 | 5,cat 7 | -------------------------------------------------------------------------------- /chapter_convolutional-modern/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "048ad798", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 现代卷积神经网络\n", 11 | ":label:`chap_modern_cnn`\n", 12 | "\n", 13 | "上一章我们介绍了卷积神经网络的基本原理,本章将介绍现代的卷积神经网络架构,许多现代卷积神经网络的研究都是建立在这一章的基础上的。\n", 14 | "在本章中的每一个模型都曾一度占据主导地位,其中许多模型都是ImageNet竞赛的优胜者。ImageNet竞赛自2010年以来,一直是计算机视觉中监督学习进展的指向标。\n", 15 | "\n", 16 | "这些模型包括:\n", 17 | "\n", 18 | "- AlexNet。它是第一个在大规模视觉竞赛中击败传统计算机视觉模型的大型神经网络;\n", 19 | "- 使用重复块的网络(VGG)。它利用许多重复的神经网络块;\n", 20 | "- 
网络中的网络(NiN)。它重复使用由卷积层和$1\\times 1$卷积层(用来代替全连接层)来构建深层网络;\n", 21 | "- 含并行连结的网络(GoogLeNet)。它使用并行连结的网络,通过不同窗口大小的卷积层和最大汇聚层来并行抽取信息;\n", 22 | "- 残差网络(ResNet)。它通过残差块构建跨层的数据通道,是计算机视觉中最流行的体系架构;\n", 23 | "- 稠密连接网络(DenseNet)。它的计算成本很高,但给我们带来了更好的效果。\n", 24 | "\n", 25 | "虽然深度神经网络的概念非常简单——将神经网络堆叠在一起。但由于不同的网络架构和超参数选择,这些神经网络的性能会发生很大变化。\n", 26 | "本章介绍的神经网络是将人类直觉和相关数学见解结合后,经过大量研究试错后的结晶。\n", 27 | "我们会按时间顺序介绍这些模型,在追寻历史的脉络的同时,帮助培养对该领域发展的直觉。这将有助于研究开发自己的架构。\n", 28 | "例如,本章介绍的批量规范化(batch normalization)和残差网络(ResNet)为设计和训练深度神经网络提供了重要思想指导。\n", 29 | "\n", 30 | ":begin_tab:toc\n", 31 | " - [alexnet](alexnet.ipynb)\n", 32 | " - [vgg](vgg.ipynb)\n", 33 | " - [nin](nin.ipynb)\n", 34 | " - [googlenet](googlenet.ipynb)\n", 35 | " - [batch-norm](batch-norm.ipynb)\n", 36 | " - [resnet](resnet.ipynb)\n", 37 | " - [densenet](densenet.ipynb)\n", 38 | ":end_tab:\n" 39 | ] 40 | } 41 | ], 42 | "metadata": { 43 | "kernelspec": { 44 | "display_name": "conda_pytorch_p36", 45 | "name": "conda_pytorch_p36" 46 | }, 47 | "language_info": { 48 | "name": "python" 49 | }, 50 | "required_libs": [] 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 5 54 | } -------------------------------------------------------------------------------- /chapter_convolutional-neural-networks/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5231073b", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 卷积神经网络\n", 11 | ":label:`chap_cnn`\n", 12 | "\n", 13 | "在前面的章节中,我们遇到过图像数据。\n", 14 | "这种数据的每个样本都由一个二维像素网格组成,\n", 15 | "每个像素可能是一个或者多个数值,取决于是黑白还是彩色图像。\n", 16 | "到目前为止,我们处理这类结构丰富的数据的方式还不够有效。\n", 17 | "我们仅仅通过将图像数据展平成一维向量而忽略了每个图像的空间结构信息,再将数据送入一个全连接的多层感知机中。\n", 18 | "因为这些网络特征元素的顺序是不变的,因此最优的结果是利用先验知识,即利用相近像素之间的相互关联性,从图像数据中学习得到有效的模型。\n", 19 | "\n", 20 | "本章介绍的*卷积神经网络*(convolutional neural network,CNN)是一类强大的、为处理图像数据而设计的神经网络。\n", 21 | "基于卷积神经网络架构的模型在计算机视觉领域中已经占主导地位,当今几乎所有的图像识别、目标检测或语义分割相关的学术竞赛和商业应用都以这种方法为基础。\n", 22 | "\n", 23 | "现代卷积神经网络的设计得益于生物学、群论和一系列的补充实验。\n", 24 | "卷积神经网络需要的参数少于全连接架构的网络,而且卷积也很容易用GPU并行计算。\n", 25 | "因此卷积神经网络除了能够高效地采样从而获得精确的模型,还能够高效地计算。\n", 26 | "久而久之,从业人员越来越多地使用卷积神经网络。即使在通常使用循环神经网络的一维序列结构任务上(例如音频、文本和时间序列分析),卷积神经网络也越来越受欢迎。\n", 27 | "通过对卷积神经网络一些巧妙的调整,也使它们在图结构数据和推荐系统中发挥作用。\n", 28 | "\n", 29 | "在本章的开始,我们将介绍构成所有卷积网络主干的基本元素。\n", 30 | "这包括卷积层本身、填充(padding)和步幅(stride)的基本细节、用于在相邻区域汇聚信息的汇聚层(pooling)、在每一层中多通道(channel)的使用,以及有关现代卷积网络架构的仔细讨论。\n", 31 | "在本章的最后,我们将介绍一个完整的、可运行的LeNet模型:这是第一个成功应用的卷积神经网络,比现代深度学习兴起时间还要早。\n", 32 | "在下一章中,我们将深入研究一些流行的、相对较新的卷积神经网络架构的完整实现,这些网络架构涵盖了现代从业者通常使用的大多数经典技术。\n", 33 | "\n", 34 | ":begin_tab:toc\n", 35 | " - [why-conv](why-conv.ipynb)\n", 36 | " - [conv-layer](conv-layer.ipynb)\n", 37 | " - [padding-and-strides](padding-and-strides.ipynb)\n", 38 | " - [channels](channels.ipynb)\n", 39 | " - [pooling](pooling.ipynb)\n", 40 | " - [lenet](lenet.ipynb)\n", 41 | ":end_tab:\n" 42 | ] 43 | } 44 | ], 45 | "metadata": { 46 | "kernelspec": { 47 | "display_name": "conda_pytorch_p36", 48 | "name": "conda_pytorch_p36" 49 | }, 50 | "language_info": { 51 | "name": "python" 52 | }, 53 | "required_libs": [] 54 | }, 55 | "nbformat": 4, 56 | "nbformat_minor": 5 57 | } -------------------------------------------------------------------------------- /chapter_convolutional-neural-networks/why-conv.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "36224718", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 
从全连接层到卷积\n", 11 | ":label:`sec_why-conv`\n", 12 | "\n", 13 | "我们之前讨论的多层感知机十分适合处理表格数据,其中行对应样本,列对应特征。\n", 14 | "对于表格数据,我们寻找的模式可能涉及特征之间的交互,但是我们不能预先假设任何与特征交互相关的先验结构。\n", 15 | "此时,多层感知机可能是最好的选择,然而对于高维感知数据,这种缺少结构的网络可能会变得不实用。\n", 16 | "\n", 17 | "例如,在之前猫狗分类的例子中:假设我们有一个足够充分的照片数据集,数据集中是拥有标注的照片,每张照片具有百万级像素,这意味着网络的每次输入都有一百万个维度。\n", 18 | "即使将隐藏层维度降低到1000,这个全连接层也将有$10^6 \\times 10^3 = 10^9$个参数。\n", 19 | "想要训练这个模型将不可实现,因为需要有大量的GPU、分布式优化训练的经验和超乎常人的耐心。\n", 20 | "\n", 21 | "有些读者可能会反对这个观点,认为要求百万像素的分辨率可能不是必要的。\n", 22 | "然而,即使分辨率减小为十万像素,使用1000个隐藏单元的隐藏层也可能不足以学习到良好的图像特征,在真实的系统中我们仍然需要数十亿个参数。\n", 23 | "此外,拟合如此多的参数还需要收集大量的数据。\n", 24 | "然而,如今人类和机器都能很好地区分猫和狗:这是因为图像中本就拥有丰富的结构,而这些结构可以被人类和机器学习模型使用。\n", 25 | "*卷积神经网络*(convolutional neural networks,CNN)是机器学习利用自然图像中一些已知结构的创造性方法。\n", 26 | "\n", 27 | "## 不变性\n", 28 | "\n", 29 | "想象一下,假设我们想从一张图片中找到某个物体。\n", 30 | "合理的假设是:无论哪种方法找到这个物体,都应该和物体的位置无关。\n", 31 | "理想情况下,我们的系统应该能够利用常识:猪通常不在天上飞,飞机通常不在水里游泳。\n", 32 | "但是,如果一只猪出现在图片顶部,我们还是应该认出它。\n", 33 | "我们可以从儿童游戏”沃尔多在哪里”( :numref:`img_waldo`)中得到灵感:\n", 34 | "在这个游戏中包含了许多充斥着活动的混乱场景,而沃尔多通常潜伏在一些不太可能的位置,读者的目标就是找出他。\n", 35 | "尽管沃尔多的装扮很有特点,但是在眼花缭乱的场景中找到他也如大海捞针。\n", 36 | "然而沃尔多的样子并不取决于他潜藏的地方,因此我们可以使用一个“沃尔多检测器”扫描图像。\n", 37 | "该检测器将图像分割成多个区域,并为每个区域包含沃尔多的可能性打分。\n", 38 | "卷积神经网络正是将*空间不变性*(spatial invariance)的这一概念系统化,从而基于这个模型使用较少的参数来学习有用的表示。\n", 39 | "\n", 40 | "![沃尔多游戏示例图。](../img/where-wally-walker-books.jpg)\n", 41 | ":width:`400px`\n", 42 | ":label:`img_waldo`\n", 43 | "\n", 44 | "现在,我们将上述想法总结一下,从而帮助我们设计适合于计算机视觉的神经网络架构。\n", 45 | "\n", 46 | "1. *平移不变性*(translation invariance):不管检测对象出现在图像中的哪个位置,神经网络的前面几层应该对相同的图像区域具有相似的反应,即为“平移不变性”。\n", 47 | "1. *局部性*(locality):神经网络的前面几层应该只探索输入图像中的局部区域,而不过度在意图像中相隔较远区域的关系,这就是“局部性”原则。最终,可以聚合这些局部特征,以在整个图像级别进行预测。\n", 48 | "\n", 49 | "让我们看看这些原则是如何转化为数学表示的。\n", 50 | "\n", 51 | "## 多层感知机的限制\n", 52 | "\n", 53 | "首先,多层感知机的输入是二维图像$\\mathbf{X}$,其隐藏表示$\\mathbf{H}$在数学上是一个矩阵,在代码中表示为二维张量。\n", 54 | "其中$\\mathbf{X}$和$\\mathbf{H}$具有相同的形状。\n", 55 | "为了方便理解,我们可以认为,无论是输入还是隐藏表示都拥有空间结构。\n", 56 | "\n", 57 | "使用$[\\mathbf{X}]_{i, j}$和$[\\mathbf{H}]_{i, j}$分别表示输入图像和隐藏表示中位置($i$,$j$)处的像素。\n", 58 | "为了使每个隐藏神经元都能接收到每个输入像素的信息,我们将参数从权重矩阵(如同我们先前在多层感知机中所做的那样)替换为四阶权重张量$\\mathsf{W}$。假设$\\mathbf{U}$包含偏置参数,我们可以将全连接层形式化地表示为\n", 59 | "\n", 60 | "$$\\begin{aligned} \\left[\\mathbf{H}\\right]_{i, j} &= [\\mathbf{U}]_{i, j} + \\sum_k \\sum_l[\\mathsf{W}]_{i, j, k, l} [\\mathbf{X}]_{k, l}\\\\ &= [\\mathbf{U}]_{i, j} +\n", 61 | "\\sum_a \\sum_b [\\mathsf{V}]_{i, j, a, b} [\\mathbf{X}]_{i+a, j+b}.\\end{aligned}$$\n", 62 | "\n", 63 | "其中,从$\\mathsf{W}$到$\\mathsf{V}$的转换只是形式上的转换,因为在这两个四阶张量的元素之间存在一一对应的关系。\n", 64 | "我们只需重新索引下标$(k, l)$,使$k = i+a$、$l = j+b$,由此可得$[\\mathsf{V}]_{i, j, a, b} = [\\mathsf{W}]_{i, j, i+a, j+b}$。\n", 65 | "索引$a$和$b$通过在正偏移和负偏移之间移动覆盖了整个图像。\n", 66 | "对于隐藏表示中任意给定位置($i$,$j$)处的像素值$[\\mathbf{H}]_{i, j}$,可以通过在$x$中以$(i, j)$为中心对像素进行加权求和得到,加权使用的权重为$[\\mathsf{V}]_{i, j, a, b}$。\n", 67 | "\n", 68 | "### 平移不变性\n", 69 | "\n", 70 | "现在引用上述的第一个原则:平移不变性。\n", 71 | "这意味着检测对象在输入$\\mathbf{X}$中的平移,应该仅导致隐藏表示$\\mathbf{H}$中的平移。也就是说,$\\mathsf{V}$和$\\mathbf{U}$实际上不依赖于$(i, j)$的值,即$[\\mathsf{V}]_{i, j, a, b} = [\\mathbf{V}]_{a, b}$。并且$\\mathbf{U}$是一个常数,比如$u$。因此,我们可以简化$\\mathbf{H}$定义为:\n", 72 | "\n", 73 | "$$[\\mathbf{H}]_{i, j} = u + \\sum_a\\sum_b [\\mathbf{V}]_{a, b} [\\mathbf{X}]_{i+a, j+b}.$$\n", 74 | "\n", 75 | "这就是*卷积*(convolution)。我们是在使用系数$[\\mathbf{V}]_{a, b}$对位置$(i, j)$附近的像素$(i+a, j+b)$进行加权得到$[\\mathbf{H}]_{i, j}$。\n", 76 | "注意,$[\\mathbf{V}]_{a, b}$的系数比$[\\mathsf{V}]_{i, j, a, b}$少很多,因为前者不再依赖于图像中的位置。这就是显著的进步!\n", 77 | "\n", 78 | "### 局部性\n", 79 | "\n", 80 | 
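"在引入局部性之前,下面先用几行代码示意“平移不变性”一节中共享权重$[\mathbf{V}]_{a, b}$的计算。这只是一个最简化的示意:假设使用PyTorch,输入与核的尺寸均为随意选取;为了实现方便,偏移$a, b$从$0$开始编号,对应下一节将讨论的互相关形式。\n",
"\n",
"```python\n",
"import torch\n",
"\n",
"def conv2d_shared(X, V, u=0.0):\n",
"    # 按上式计算 [H]_{i,j} = u + sum_a sum_b [V]_{a,b} [X]_{i+a,j+b}\n",
"    h, w = V.shape\n",
"    H = torch.zeros(X.shape[0] - h + 1, X.shape[1] - w + 1)\n",
"    for i in range(H.shape[0]):\n",
"        for j in range(H.shape[1]):\n",
"            H[i, j] = u + (X[i:i + h, j:j + w] * V).sum()\n",
"    return H\n",
"\n",
"X = torch.arange(36.0).reshape(6, 6)  # 假设的输入\n",
"V = torch.ones(3, 3) / 9              # 假设的共享权重(核)\n",
"print(conv2d_shared(X, V).shape)      # torch.Size([4, 4])\n",
"```\n",
"\n",
"无论窗口滑动到哪个位置,使用的都是同一组权重$\mathbf{V}$;而局部性将进一步限制窗口的大小。\n",
"\n",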
"现在引用上述的第二个原则:局部性。如上所述,为了收集用来训练参数$[\\mathbf{H}]_{i, j}$的相关信息,我们不应偏离到距$(i, j)$很远的地方。这意味着在$|a|> \\Delta$或$|b| > \\Delta$的范围之外,我们可以设置$[\\mathbf{V}]_{a, b} = 0$。因此,我们可以将$[\\mathbf{H}]_{i, j}$重写为\n", 81 | "\n", 82 | "$$[\\mathbf{H}]_{i, j} = u + \\sum_{a = -\\Delta}^{\\Delta} \\sum_{b = -\\Delta}^{\\Delta} [\\mathbf{V}]_{a, b} [\\mathbf{X}]_{i+a, j+b}.$$\n", 83 | ":eqlabel:`eq_conv-layer`\n", 84 | "\n", 85 | "简而言之, :eqref:`eq_conv-layer`是一个*卷积层*(convolutional layer),而卷积神经网络是包含卷积层的一类特殊的神经网络。\n", 86 | "在深度学习研究社区中,$\\mathbf{V}$被称为*卷积核*(convolution kernel)或者*滤波器*(filter),亦或简单地称之为该卷积层的*权重*,通常该权重是可学习的参数。\n", 87 | "当图像处理的局部区域很小时,卷积神经网络与多层感知机的训练差异可能是巨大的:以前,多层感知机可能需要数十亿个参数来表示网络中的一层,而现在卷积神经网络通常只需要几百个参数,而且不需要改变输入或隐藏表示的维数。\n", 88 | "参数大幅减少的代价是,我们的特征现在是平移不变的,并且当确定每个隐藏活性值时,每一层只包含局部的信息。\n", 89 | "以上所有的权重学习都将依赖于归纳偏置。当这种偏置与现实相符时,我们就能得到样本有效的模型,并且这些模型能很好地泛化到未知数据中。\n", 90 | "但如果这偏置与现实不符时,比如当图像不满足平移不变时,我们的模型可能难以拟合我们的训练数据。\n", 91 | "\n", 92 | "## 卷积\n", 93 | "\n", 94 | "在进一步讨论之前,我们先简要回顾一下为什么上面的操作被称为卷积。在数学中,两个函数(比如$f, g: \\mathbb{R}^d \\to \\mathbb{R}$)之间的“卷积”被定义为\n", 95 | "\n", 96 | "$$(f * g)(\\mathbf{x}) = \\int f(\\mathbf{z}) g(\\mathbf{x}-\\mathbf{z}) d\\mathbf{z}.$$\n", 97 | "\n", 98 | "也就是说,卷积是当把一个函数“翻转”并移位$\\mathbf{x}$时,测量$f$和$g$之间的重叠。\n", 99 | "当为离散对象时,积分就变成求和。例如,对于由索引为$\\mathbb{Z}$的、平方可和的、无限维向量集合中抽取的向量,我们得到以下定义:\n", 100 | "\n", 101 | "$$(f * g)(i) = \\sum_a f(a) g(i-a).$$\n", 102 | "\n", 103 | "对于二维张量,则为$f$的索引$(a, b)$和$g$的索引$(i-a, j-b)$上的对应加和:\n", 104 | "\n", 105 | "$$(f * g)(i, j) = \\sum_a\\sum_b f(a, b) g(i-a, j-b).$$\n", 106 | ":eqlabel:`eq_2d-conv-discrete`\n", 107 | "\n", 108 | "这看起来类似于 :eqref:`eq_conv-layer`,但有一个主要区别:这里不是使用$(i+a, j+b)$,而是使用差值。然而,这种区别是表面的,因为我们总是可以匹配 :eqref:`eq_conv-layer`和 :eqref:`eq_2d-conv-discrete`之间的符号。我们在 :eqref:`eq_conv-layer`中的原始定义更正确地描述了*互相关*(cross-correlation),这个问题将在下一节中讨论。\n", 109 | "\n", 110 | "## “沃尔多在哪里”回顾\n", 111 | "\n", 112 | "回到上面的“沃尔多在哪里”游戏,让我们看看它到底是什么样子。卷积层根据滤波器$\\mathbf{V}$选取给定大小的窗口,并加权处理图片,如 :numref:`fig_waldo_mask`中所示。我们的目标是学习一个模型,以便探测出在“沃尔多”最可能出现的地方。\n", 113 | "\n", 114 | "![发现沃尔多。](../img/waldo-mask.jpg)\n", 115 | ":width:`400px`\n", 116 | ":label:`fig_waldo_mask`\n", 117 | "\n", 118 | "### 通道\n", 119 | ":label:`subsec_why-conv-channels`\n", 120 | "\n", 121 | "然而这种方法有一个问题:我们忽略了图像一般包含三个通道/三种原色(红色、绿色和蓝色)。\n", 122 | "实际上,图像不是二维张量,而是一个由高度、宽度和颜色组成的三维张量,比如包含$1024 \\times 1024 \\times 3$个像素。\n", 123 | "前两个轴与像素的空间位置有关,而第三个轴可以看作每个像素的多维表示。\n", 124 | "因此,我们将$\\mathsf{X}$索引为$[\\mathsf{X}]_{i, j, k}$。由此卷积相应地调整为$[\\mathsf{V}]_{a,b,c}$,而不是$[\\mathbf{V}]_{a,b}$。\n", 125 | "\n", 126 | "此外,由于输入图像是三维的,我们的隐藏表示$\\mathsf{H}$也最好采用三维张量。\n", 127 | "换句话说,对于每一个空间位置,我们想要采用一组而不是一个隐藏表示。这样一组隐藏表示可以想象成一些互相堆叠的二维网格。\n", 128 | "因此,我们可以把隐藏表示想象为一系列具有二维张量的*通道*(channel)。\n", 129 | "这些通道有时也被称为*特征映射*(feature maps),因为每个通道都向后续层提供一组空间化的学习特征。\n", 130 | "直观上可以想象在靠近输入的底层,一些通道专门识别边缘,而一些通道专门识别纹理。\n", 131 | "\n", 132 | "为了支持输入$\\mathsf{X}$和隐藏表示$\\mathsf{H}$中的多个通道,我们可以在$\\mathsf{V}$中添加第四个坐标,即$[\\mathsf{V}]_{a, b, c, d}$。综上所述,\n", 133 | "\n", 134 | "$$[\\mathsf{H}]_{i,j,d} = \\sum_{a = -\\Delta}^{\\Delta} \\sum_{b = -\\Delta}^{\\Delta} \\sum_c [\\mathsf{V}]_{a, b, c, d} [\\mathsf{X}]_{i+a, j+b, c},$$\n", 135 | ":eqlabel:`eq_conv-layer-channels`\n", 136 | "\n", 137 | "其中隐藏表示$\\mathsf{H}$中的索引$d$表示输出通道,而随后的输出将继续以三维张量$\\mathsf{H}$作为输入进入下一个卷积层。\n", 138 | "所以, :eqref:`eq_conv-layer-channels`可以定义具有多个通道的卷积层,而其中$\\mathsf{V}$是该卷积层的权重。\n", 139 | "\n", 140 | "然而,仍有许多问题亟待解决。\n", 141 | "例如,图像中是否到处都有存在沃尔多的可能?如何有效地计算输出层?如何选择适当的激活函数?为了训练有效的网络,如何做出合理的网络设计选择?我们将在本章的其它部分讨论这些问题。\n", 142 | "\n", 143 | "## 小结\n", 144 | "\n", 145 | "- 
图像的平移不变性使我们以相同的方式处理局部图像,而不在乎它的位置。\n", 146 | "- 局部性意味着计算相应的隐藏表示只需一小部分局部图像像素。\n", 147 | "- 在图像处理中,卷积层通常比全连接层需要更少的参数,但依旧获得高效用的模型。\n", 148 | "- 卷积神经网络(CNN)是一类特殊的神经网络,它可以包含多个卷积层。\n", 149 | "- 多个输入和输出通道使模型在每个空间位置可以获取图像的多方面特征。\n", 150 | "\n", 151 | "## 练习\n", 152 | "\n", 153 | "1. 假设卷积层 :eqref:`eq_conv-layer`覆盖的局部区域$\\Delta = 0$。在这种情况下,证明卷积内核为每组通道独立地实现一个全连接层。\n", 154 | "1. 为什么平移不变性可能也不是好主意呢?\n", 155 | "1. 当从图像边界像素获取隐藏表示时,我们需要思考哪些问题?\n", 156 | "1. 描述一个类似的音频卷积层的架构。\n", 157 | "1. 卷积层也适合于文本数据吗?为什么?\n", 158 | "1. 证明在 :eqref:`eq_2d-conv-discrete`中,$f * g = g * f$。\n", 159 | "\n", 160 | "[Discussions](https://discuss.d2l.ai/t/5767)\n" 161 | ] 162 | } 163 | ], 164 | "metadata": { 165 | "kernelspec": { 166 | "display_name": "conda_pytorch_p36", 167 | "name": "conda_pytorch_p36" 168 | }, 169 | "language_info": { 170 | "name": "python" 171 | }, 172 | "required_libs": [] 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 5 176 | } -------------------------------------------------------------------------------- /chapter_deep-learning-computation/deferred-init.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "59a11c8e", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 延后初始化\n", 11 | ":label:`sec_deferred_init`\n", 12 | "\n", 13 | "到目前为止,我们忽略了建立网络时需要做的以下这些事情:\n", 14 | "\n", 15 | "* 我们定义了网络架构,但没有指定输入维度。\n", 16 | "* 我们添加层时没有指定前一层的输出维度。\n", 17 | "* 我们在初始化参数时,甚至没有足够的信息来确定模型应该包含多少参数。\n", 18 | "\n", 19 | "有些读者可能会对我们的代码能运行感到惊讶。\n", 20 | "毕竟,深度学习框架无法判断网络的输入维度是什么。\n", 21 | "这里的诀窍是框架的*延后初始化*(defers initialization),\n", 22 | "即直到数据第一次通过模型传递时,框架才会动态地推断出每个层的大小。\n", 23 | "\n", 24 | "在以后,当使用卷积神经网络时,\n", 25 | "由于输入维度(即图像的分辨率)将影响每个后续层的维数,\n", 26 | "有了该技术将更加方便。\n", 27 | "现在我们在编写代码时无须知道维度是什么就可以设置参数,\n", 28 | "这种能力可以大大简化定义和修改模型的任务。\n", 29 | "接下来,我们将更深入地研究初始化机制。\n", 30 | "\n", 31 | "## 实例化网络\n", 32 | "\n", 33 | "首先,让我们实例化一个多层感知机。\n" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "id": "1d75086b", 39 | "metadata": { 40 | "origin_pos": 3 41 | }, 42 | "source": [ 43 | "此时,因为输入维数是未知的,所以网络不可能知道输入层权重的维数。\n", 44 | "因此,框架尚未初始化任何参数,我们通过尝试访问以下参数进行确认。\n" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "82b701e3", 50 | "metadata": { 51 | "origin_pos": 10 52 | }, 53 | "source": [ 54 | "接下来让我们将数据通过网络,最终使框架初始化参数。\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "094382a3", 60 | "metadata": { 61 | "origin_pos": 13 62 | }, 63 | "source": [ 64 | "一旦我们知道输入维数是20,框架可以通过代入值20来识别第一层权重矩阵的形状。\n", 65 | "识别出第一层的形状后,框架处理第二层,依此类推,直到所有形状都已知为止。\n", 66 | "注意,在这种情况下,只有第一层需要延迟初始化,但是框架仍是按顺序初始化的。\n", 67 | "等到知道了所有的参数形状,框架就可以初始化参数。\n", 68 | "\n", 69 | "## 小结\n", 70 | "\n", 71 | "* 延后初始化使框架能够自动推断参数形状,使修改模型架构变得容易,避免了一些常见的错误。\n", 72 | "* 我们可以通过模型传递数据,使框架最终初始化参数。\n", 73 | "\n", 74 | "## 练习\n", 75 | "\n", 76 | "1. 如果指定了第一层的输入尺寸,但没有指定后续层的尺寸,会发生什么?是否立即进行初始化?\n", 77 | "1. 如果指定了不匹配的维度会发生什么?\n", 78 | "1. 
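"作为本节内容的补充,下面给出一个延后初始化的最小示意(假设使用PyTorch的`nn.LazyLinear`;网络结构与输入维度均为随意选取,仅为示意):\n",
"\n",
"```python\n",
"import torch\n",
"from torch import nn\n",
"\n",
"# 此时输入维数未知,第一层权重的形状尚未确定\n",
"net = nn.Sequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))\n",
"print(net[0].weight)  # <UninitializedParameter>\n",
"\n",
"# 第一次把数据传入网络后,框架依次推断出每一层参数的形状并完成初始化\n",
"X = torch.rand(2, 20)\n",
"net(X)\n",
"print(net[0].weight.shape)  # torch.Size([256, 20])\n",
"```\n",
"\n",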
如果输入具有不同的维度,需要做什么?提示:查看参数绑定的相关内容。\n" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "7ed4b454", 84 | "metadata": { 85 | "origin_pos": 15, 86 | "tab": [ 87 | "pytorch" 88 | ] 89 | }, 90 | "source": [ 91 | "[Discussions](https://discuss.d2l.ai/t/5770)\n" 92 | ] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "conda_pytorch_p36", 98 | "name": "conda_pytorch_p36" 99 | }, 100 | "language_info": { 101 | "name": "python" 102 | }, 103 | "required_libs": [] 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 5 107 | } -------------------------------------------------------------------------------- /chapter_deep-learning-computation/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5cf9fbc8", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 深度学习计算\n", 11 | ":label:`chap_computation`\n", 12 | "\n", 13 | "除了庞大的数据集和强大的硬件,\n", 14 | "优秀的软件工具在深度学习的快速发展中发挥了不可或缺的作用。\n", 15 | "从2007年发布的开创性的Theano库开始,\n", 16 | "灵活的开源工具使研究人员能够快速开发模型原型,\n", 17 | "避免了我们使用标准组件时的重复工作,\n", 18 | "同时仍然保持了我们进行底层修改的能力。\n", 19 | "随着时间的推移,深度学习库已经演变成提供越来越粗糙的抽象。\n", 20 | "就像半导体设计师从指定晶体管到逻辑电路再到编写代码一样,\n", 21 | "神经网络研究人员已经从考虑单个人工神经元的行为转变为从层的角度构思网络,\n", 22 | "通常在设计架构时考虑的是更粗糙的块(block)。\n", 23 | "\n", 24 | "之前我们已经介绍了一些基本的机器学习概念,\n", 25 | "并慢慢介绍了功能齐全的深度学习模型。\n", 26 | "在上一章中,我们从零开始实现了多层感知机的每个组件,\n", 27 | "然后展示了如何利用高级API轻松地实现相同的模型。\n", 28 | "为了易于学习,我们调用了深度学习库,但是跳过了它们工作的细节。\n", 29 | "在本章中,我们将深入探索深度学习计算的关键组件,\n", 30 | "即模型构建、参数访问与初始化、设计自定义层和块、将模型读写到磁盘,\n", 31 | "以及利用GPU实现显著的加速。\n", 32 | "这些知识将使读者从深度学习“基础用户”变为“高级用户”。\n", 33 | "虽然本章不介绍任何新的模型或数据集,\n", 34 | "但后面的高级模型章节在很大程度上依赖于本章的知识。\n", 35 | "\n", 36 | ":begin_tab:toc\n", 37 | " - [model-construction](model-construction.ipynb)\n", 38 | " - [parameters](parameters.ipynb)\n", 39 | " - [deferred-init](deferred-init.ipynb)\n", 40 | " - [custom-layer](custom-layer.ipynb)\n", 41 | " - [read-write](read-write.ipynb)\n", 42 | " - [use-gpu](use-gpu.ipynb)\n", 43 | ":end_tab:\n" 44 | ] 45 | } 46 | ], 47 | "metadata": { 48 | "kernelspec": { 49 | "display_name": "conda_pytorch_p36", 50 | "name": "conda_pytorch_p36" 51 | }, 52 | "language_info": { 53 | "name": "python" 54 | }, 55 | "required_libs": [] 56 | }, 57 | "nbformat": 4, 58 | "nbformat_minor": 5 59 | } -------------------------------------------------------------------------------- /chapter_deep-learning-computation/mlp.params: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/chapter_deep-learning-computation/mlp.params -------------------------------------------------------------------------------- /chapter_deep-learning-computation/mydict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/chapter_deep-learning-computation/mydict -------------------------------------------------------------------------------- /chapter_deep-learning-computation/x-file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/chapter_deep-learning-computation/x-file -------------------------------------------------------------------------------- /chapter_deep-learning-computation/x-files: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/chapter_deep-learning-computation/x-files -------------------------------------------------------------------------------- /chapter_installation/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "6262ada7", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 安装\n", 11 | ":label:`chap_installation`\n", 12 | "\n", 13 | "我们需要配置一个环境来运行 Python、Jupyter Notebook、相关库以及运行本书所需的代码,以快速入门并获得动手学习经验。\n", 14 | "\n", 15 | "## 安装 Miniconda\n", 16 | "\n", 17 | "最简单的方法就是安装依赖Python 3.x的[Miniconda](https://conda.io/en/latest/miniconda.html)。\n", 18 | "如果已安装conda,则可以跳过以下步骤。访问Miniconda网站,根据Python3.x版本确定适合的版本。\n", 19 | "\n", 20 | "如果我们使用macOS,假设Python版本是3.9(我们的测试版本),将下载名称包含字符串“MacOSX”的bash脚本,并执行以下操作:\n", 21 | "\n", 22 | "```bash\n", 23 | "# 以Intel处理器为例,文件名可能会更改\n", 24 | "sh Miniconda3-py39_4.12.0-MacOSX-x86_64.sh -b\n", 25 | "```\n", 26 | "\n", 27 | "如果我们使用Linux,假设Python版本是3.9(我们的测试版本),将下载名称包含字符串“Linux”的bash脚本,并执行以下操作:\n", 28 | "\n", 29 | "```bash\n", 30 | "# 文件名可能会更改\n", 31 | "sh Miniconda3-py39_4.12.0-Linux-x86_64.sh -b\n", 32 | "```\n", 33 | "\n", 34 | "接下来,初始化终端Shell,以便我们可以直接运行`conda`。\n", 35 | "\n", 36 | "```bash\n", 37 | "~/miniconda3/bin/conda init\n", 38 | "```\n", 39 | "\n", 40 | "现在关闭并重新打开当前的shell。并使用下面的命令创建一个新的环境:\n", 41 | "\n", 42 | "```bash\n", 43 | "conda create --name d2l python=3.9 -y\n", 44 | "```\n", 45 | "\n", 46 | "现在激活 `d2l` 环境:\n", 47 | "\n", 48 | "```bash\n", 49 | "conda activate d2l\n", 50 | "```\n", 51 | "\n", 52 | "## 安装深度学习框架和`d2l`软件包\n", 53 | "\n", 54 | "在安装深度学习框架之前,请先检查计算机上是否有可用的GPU。\n", 55 | "例如可以查看计算机是否装有NVIDIA GPU并已安装[CUDA](https://developer.nvidia.com/cuda-downloads)。\n", 56 | "如果机器没有任何GPU,没有必要担心,因为CPU在前几章完全够用。\n", 57 | "但是,如果想流畅地学习全部章节,请提早获取GPU并且安装深度学习框架的GPU版本。\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "id": "ed0912e5", 63 | "metadata": { 64 | "origin_pos": 2, 65 | "tab": [ 66 | "pytorch" 67 | ] 68 | }, 69 | "source": [ 70 | "我们可以按如下方式安装PyTorch的CPU或GPU版本:\n", 71 | "\n", 72 | "```bash\n", 73 | "pip install torch==1.12.0\n", 74 | "pip install torchvision==0.13.0\n", 75 | "```\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "id": "4e508102", 81 | "metadata": { 82 | "origin_pos": 5 83 | }, 84 | "source": [ 85 | "我们的下一步是安装`d2l`包,以方便调取本书中经常使用的函数和类:\n", 86 | "\n", 87 | "```bash\n", 88 | "pip install d2l==0.17.6\n", 89 | "```\n", 90 | "\n", 91 | "## 下载 D2L Notebook\n", 92 | "\n", 93 | "接下来,需要下载这本书的代码。\n", 94 | "可以点击本书HTML页面顶部的“Jupyter 记事本”选项下载后解压代码,或者可以按照如下方式进行下载:\n" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "id": "d6f62ffb", 100 | "metadata": { 101 | "origin_pos": 7, 102 | "tab": [ 103 | "pytorch" 104 | ] 105 | }, 106 | "source": [ 107 | "```bash\n", 108 | "mkdir d2l-zh && cd d2l-zh\n", 109 | "curl https://zh-v2.d2l.ai/d2l-zh-2.0.0.zip -o d2l-zh.zip\n", 110 | "unzip d2l-zh.zip && rm d2l-zh.zip\n", 111 | "cd pytorch\n", 112 | "```\n", 113 | "\n", 114 | "\n", 115 | "注意:如果没有安装`unzip`,则可以通过运行`sudo apt install unzip`进行安装。\n" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "id": "668ab9cb", 121 | "metadata": { 122 | "origin_pos": 10 123 | }, 124 | "source": [ 125 | "安装完成后我们可以通过运行以下命令打开Jupyter笔记本(在Window系统的命令行窗口中运行以下命令前,需先将当前路径定位到刚下载的本书代码解压后的目录):\n", 126 | "\n", 127 | "```bash\n", 128 | "jupyter notebook\n", 129 | "```\n", 130 
| "\n", 131 | "现在可以在Web浏览器中打开(通常会自动打开)。\n", 132 | "由此,我们可以运行这本书中每个部分的代码。\n", 133 | "在运行书籍代码、更新深度学习框架或`d2l`软件包之前,请始终执行`conda activate d2l`以激活运行时环境。\n", 134 | "要退出环境,请运行`conda deactivate`。\n" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "04be90e9", 140 | "metadata": { 141 | "origin_pos": 12, 142 | "tab": [ 143 | "pytorch" 144 | ] 145 | }, 146 | "source": [ 147 | "[Discussions](https://discuss.d2l.ai/t/2083)\n" 148 | ] 149 | } 150 | ], 151 | "metadata": { 152 | "kernelspec": { 153 | "display_name": "conda_pytorch_p36", 154 | "name": "conda_pytorch_p36" 155 | }, 156 | "language_info": { 157 | "name": "python" 158 | }, 159 | "required_libs": [] 160 | }, 161 | "nbformat": 4, 162 | "nbformat_minor": 5 163 | } -------------------------------------------------------------------------------- /chapter_linear-networks/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "e65d2ddd", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 线性神经网络\n", 11 | ":label:`chap_linear`\n", 12 | "\n", 13 | "在介绍深度神经网络之前,我们需要了解神经网络训练的基础知识。\n", 14 | "本章我们将介绍神经网络的整个训练过程,\n", 15 | "包括:定义简单的神经网络架构、数据处理、指定损失函数和如何训练模型。\n", 16 | "为了更容易学习,我们将从经典算法————*线性*神经网络开始,介绍神经网络的基础知识。\n", 17 | "经典统计学习技术中的线性回归和softmax回归可以视为线性神经网络,\n", 18 | "这些知识将为本书其他部分中更复杂的技术奠定基础。\n", 19 | "\n", 20 | ":begin_tab:toc\n", 21 | " - [linear-regression](linear-regression.ipynb)\n", 22 | " - [linear-regression-scratch](linear-regression-scratch.ipynb)\n", 23 | " - [linear-regression-concise](linear-regression-concise.ipynb)\n", 24 | " - [softmax-regression](softmax-regression.ipynb)\n", 25 | " - [image-classification-dataset](image-classification-dataset.ipynb)\n", 26 | " - [softmax-regression-scratch](softmax-regression-scratch.ipynb)\n", 27 | " - [softmax-regression-concise](softmax-regression-concise.ipynb)\n", 28 | ":end_tab:\n" 29 | ] 30 | } 31 | ], 32 | "metadata": { 33 | "kernelspec": { 34 | "display_name": "conda_pytorch_p36", 35 | "name": "conda_pytorch_p36" 36 | }, 37 | "language_info": { 38 | "name": "python" 39 | }, 40 | "required_libs": [] 41 | }, 42 | "nbformat": 4, 43 | "nbformat_minor": 5 44 | } -------------------------------------------------------------------------------- /chapter_multilayer-perceptrons/backprop.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "9ed6d9cb", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 前向传播、反向传播和计算图\n", 11 | ":label:`sec_backprop`\n", 12 | "\n", 13 | "我们已经学习了如何用小批量随机梯度下降训练模型。\n", 14 | "然而当实现该算法时,我们只考虑了通过*前向传播*(forward propagation)所涉及的计算。\n", 15 | "在计算梯度时,我们只调用了深度学习框架提供的反向传播函数,而不知其所以然。\n", 16 | "\n", 17 | "梯度的自动计算(自动微分)大大简化了深度学习算法的实现。\n", 18 | "在自动微分之前,即使是对复杂模型的微小调整也需要手工重新计算复杂的导数,\n", 19 | "学术论文也不得不分配大量页面来推导更新规则。\n", 20 | "本节将通过一些基本的数学和计算图,\n", 21 | "深入探讨*反向传播*的细节。\n", 22 | "首先,我们将重点放在带权重衰减($L_2$正则化)的单隐藏层多层感知机上。\n", 23 | "\n", 24 | "## 前向传播\n", 25 | "\n", 26 | "*前向传播*(forward propagation或forward pass)\n", 27 | "指的是:按顺序(从输入层到输出层)计算和存储神经网络中每层的结果。\n", 28 | "\n", 29 | "我们将一步步研究单隐藏层神经网络的机制,\n", 30 | "为了简单起见,我们假设输入样本是 $\\mathbf{x}\\in \\mathbb{R}^d$,\n", 31 | "并且我们的隐藏层不包括偏置项。\n", 32 | "这里的中间变量是:\n", 33 | "\n", 34 | "$$\\mathbf{z}= \\mathbf{W}^{(1)} \\mathbf{x},$$\n", 35 | "\n", 36 | "其中$\\mathbf{W}^{(1)} \\in \\mathbb{R}^{h \\times d}$\n", 37 | "是隐藏层的权重参数。\n", 38 | "将中间变量$\\mathbf{z}\\in \\mathbb{R}^h$通过激活函数$\\phi$后,\n", 39 | 
"我们得到长度为$h$的隐藏激活向量:\n", 40 | "\n", 41 | "$$\\mathbf{h}= \\phi (\\mathbf{z}).$$\n", 42 | "\n", 43 | "隐藏变量$\\mathbf{h}$也是一个中间变量。\n", 44 | "假设输出层的参数只有权重$\\mathbf{W}^{(2)} \\in \\mathbb{R}^{q \\times h}$,\n", 45 | "我们可以得到输出层变量,它是一个长度为$q$的向量:\n", 46 | "\n", 47 | "$$\\mathbf{o}= \\mathbf{W}^{(2)} \\mathbf{h}.$$\n", 48 | "\n", 49 | "假设损失函数为$l$,样本标签为$y$,我们可以计算单个数据样本的损失项,\n", 50 | "\n", 51 | "$$L = l(\\mathbf{o}, y).$$\n", 52 | "\n", 53 | "根据$L_2$正则化的定义,给定超参数$\\lambda$,正则化项为\n", 54 | "\n", 55 | "$$s = \\frac{\\lambda}{2} \\left(\\|\\mathbf{W}^{(1)}\\|_F^2 + \\|\\mathbf{W}^{(2)}\\|_F^2\\right),$$\n", 56 | ":eqlabel:`eq_forward-s`\n", 57 | "\n", 58 | "其中矩阵的Frobenius范数是将矩阵展平为向量后应用的$L_2$范数。\n", 59 | "最后,模型在给定数据样本上的正则化损失为:\n", 60 | "\n", 61 | "$$J = L + s.$$\n", 62 | "\n", 63 | "在下面的讨论中,我们将$J$称为*目标函数*(objective function)。\n", 64 | "\n", 65 | "## 前向传播计算图\n", 66 | "\n", 67 | "绘制*计算图*有助于我们可视化计算中操作符和变量的依赖关系。\n", 68 | " :numref:`fig_forward` 是与上述简单网络相对应的计算图,\n", 69 | " 其中正方形表示变量,圆圈表示操作符。\n", 70 | " 左下角表示输入,右上角表示输出。\n", 71 | " 注意显示数据流的箭头方向主要是向右和向上的。\n", 72 | "\n", 73 | "![前向传播的计算图](../img/forward.svg)\n", 74 | ":label:`fig_forward`\n", 75 | "\n", 76 | "## 反向传播\n", 77 | "\n", 78 | "*反向传播*(backward propagation或backpropagation)指的是计算神经网络参数梯度的方法。\n", 79 | "简言之,该方法根据微积分中的*链式规则*,按相反的顺序从输出层到输入层遍历网络。\n", 80 | "该算法存储了计算某些参数梯度时所需的任何中间变量(偏导数)。\n", 81 | "假设我们有函数$\\mathsf{Y}=f(\\mathsf{X})$和$\\mathsf{Z}=g(\\mathsf{Y})$,\n", 82 | "其中输入和输出$\\mathsf{X}, \\mathsf{Y}, \\mathsf{Z}$是任意形状的张量。\n", 83 | "利用链式法则,我们可以计算$\\mathsf{Z}$关于$\\mathsf{X}$的导数\n", 84 | "\n", 85 | "$$\\frac{\\partial \\mathsf{Z}}{\\partial \\mathsf{X}} = \\text{prod}\\left(\\frac{\\partial \\mathsf{Z}}{\\partial \\mathsf{Y}}, \\frac{\\partial \\mathsf{Y}}{\\partial \\mathsf{X}}\\right).$$\n", 86 | "\n", 87 | "在这里,我们使用$\\text{prod}$运算符在执行必要的操作(如换位和交换输入位置)后将其参数相乘。\n", 88 | "对于向量,这很简单,它只是矩阵-矩阵乘法。\n", 89 | "对于高维张量,我们使用适当的对应项。\n", 90 | "运算符$\\text{prod}$指代了所有的这些符号。\n", 91 | "\n", 92 | "回想一下,在计算图 :numref:`fig_forward`中的单隐藏层简单网络的参数是\n", 93 | "$\\mathbf{W}^{(1)}$和$\\mathbf{W}^{(2)}$。\n", 94 | "反向传播的目的是计算梯度$\\partial J/\\partial \\mathbf{W}^{(1)}$和\n", 95 | "$\\partial J/\\partial \\mathbf{W}^{(2)}$。\n", 96 | "为此,我们应用链式法则,依次计算每个中间变量和参数的梯度。\n", 97 | "计算的顺序与前向传播中执行的顺序相反,因为我们需要从计算图的结果开始,并朝着参数的方向努力。第一步是计算目标函数$J=L+s$相对于损失项$L$和正则项$s$的梯度。\n", 98 | "\n", 99 | "$$\\frac{\\partial J}{\\partial L} = 1 \\; \\text{and} \\; \\frac{\\partial J}{\\partial s} = 1.$$\n", 100 | "\n", 101 | "接下来,我们根据链式法则计算目标函数关于输出层变量$\\mathbf{o}$的梯度:\n", 102 | "\n", 103 | "$$\n", 104 | "\\frac{\\partial J}{\\partial \\mathbf{o}}\n", 105 | "= \\text{prod}\\left(\\frac{\\partial J}{\\partial L}, \\frac{\\partial L}{\\partial \\mathbf{o}}\\right)\n", 106 | "= \\frac{\\partial L}{\\partial \\mathbf{o}}\n", 107 | "\\in \\mathbb{R}^q.\n", 108 | "$$\n", 109 | "\n", 110 | "接下来,我们计算正则化项相对于两个参数的梯度:\n", 111 | "\n", 112 | "$$\\frac{\\partial s}{\\partial \\mathbf{W}^{(1)}} = \\lambda \\mathbf{W}^{(1)}\n", 113 | "\\; \\text{and} \\;\n", 114 | "\\frac{\\partial s}{\\partial \\mathbf{W}^{(2)}} = \\lambda \\mathbf{W}^{(2)}.$$\n", 115 | "\n", 116 | "现在我们可以计算最接近输出层的模型参数的梯度\n", 117 | "$\\partial J/\\partial \\mathbf{W}^{(2)} \\in \\mathbb{R}^{q \\times h}$。\n", 118 | "使用链式法则得出:\n", 119 | "\n", 120 | "$$\\frac{\\partial J}{\\partial \\mathbf{W}^{(2)}}= \\text{prod}\\left(\\frac{\\partial J}{\\partial \\mathbf{o}}, \\frac{\\partial \\mathbf{o}}{\\partial \\mathbf{W}^{(2)}}\\right) + \\text{prod}\\left(\\frac{\\partial J}{\\partial s}, \\frac{\\partial s}{\\partial \\mathbf{W}^{(2)}}\\right)= \\frac{\\partial J}{\\partial 
\\mathbf{o}} \\mathbf{h}^\\top + \\lambda \\mathbf{W}^{(2)}.$$\n", 121 | ":eqlabel:`eq_backprop-J-h`\n", 122 | "\n", 123 | "为了获得关于$\\mathbf{W}^{(1)}$的梯度,我们需要继续沿着输出层到隐藏层反向传播。\n", 124 | "关于隐藏层输出的梯度$\\partial J/\\partial \\mathbf{h} \\in \\mathbb{R}^h$由下式给出:\n", 125 | "\n", 126 | "$$\n", 127 | "\\frac{\\partial J}{\\partial \\mathbf{h}}\n", 128 | "= \\text{prod}\\left(\\frac{\\partial J}{\\partial \\mathbf{o}}, \\frac{\\partial \\mathbf{o}}{\\partial \\mathbf{h}}\\right)\n", 129 | "= {\\mathbf{W}^{(2)}}^\\top \\frac{\\partial J}{\\partial \\mathbf{o}}.\n", 130 | "$$\n", 131 | "\n", 132 | "由于激活函数$\\phi$是按元素计算的,\n", 133 | "计算中间变量$\\mathbf{z}$的梯度$\\partial J/\\partial \\mathbf{z} \\in \\mathbb{R}^h$\n", 134 | "需要使用按元素乘法运算符,我们用$\\odot$表示:\n", 135 | "\n", 136 | "$$\n", 137 | "\\frac{\\partial J}{\\partial \\mathbf{z}}\n", 138 | "= \\text{prod}\\left(\\frac{\\partial J}{\\partial \\mathbf{h}}, \\frac{\\partial \\mathbf{h}}{\\partial \\mathbf{z}}\\right)\n", 139 | "= \\frac{\\partial J}{\\partial \\mathbf{h}} \\odot \\phi'\\left(\\mathbf{z}\\right).\n", 140 | "$$\n", 141 | "\n", 142 | "最后,我们可以得到最接近输入层的模型参数的梯度\n", 143 | "$\\partial J/\\partial \\mathbf{W}^{(1)} \\in \\mathbb{R}^{h \\times d}$。\n", 144 | "根据链式法则,我们得到:\n", 145 | "\n", 146 | "$$\n", 147 | "\\frac{\\partial J}{\\partial \\mathbf{W}^{(1)}}\n", 148 | "= \\text{prod}\\left(\\frac{\\partial J}{\\partial \\mathbf{z}}, \\frac{\\partial \\mathbf{z}}{\\partial \\mathbf{W}^{(1)}}\\right) + \\text{prod}\\left(\\frac{\\partial J}{\\partial s}, \\frac{\\partial s}{\\partial \\mathbf{W}^{(1)}}\\right)\n", 149 | "= \\frac{\\partial J}{\\partial \\mathbf{z}} \\mathbf{x}^\\top + \\lambda \\mathbf{W}^{(1)}.\n", 150 | "$$\n", 151 | "\n", 152 | "## 训练神经网络\n", 153 | "\n", 154 | "在训练神经网络时,前向传播和反向传播相互依赖。\n", 155 | "对于前向传播,我们沿着依赖的方向遍历计算图并计算其路径上的所有变量。\n", 156 | "然后将这些用于反向传播,其中计算顺序与计算图的相反。\n", 157 | "\n", 158 | "以上述简单网络为例:一方面,在前向传播期间计算正则项\n", 159 | " :eqref:`eq_forward-s`取决于模型参数$\\mathbf{W}^{(1)}$和\n", 160 | "$\\mathbf{W}^{(2)}$的当前值。\n", 161 | "它们是由优化算法根据最近迭代的反向传播给出的。\n", 162 | "另一方面,反向传播期间参数 :eqref:`eq_backprop-J-h`的梯度计算,\n", 163 | "取决于由前向传播给出的隐藏变量$\\mathbf{h}$的当前值。\n", 164 | "\n", 165 | "因此,在训练神经网络时,在初始化模型参数后,\n", 166 | "我们交替使用前向传播和反向传播,利用反向传播给出的梯度来更新模型参数。\n", 167 | "注意,反向传播重复利用前向传播中存储的中间值,以避免重复计算。\n", 168 | "带来的影响之一是我们需要保留中间值,直到反向传播完成。\n", 169 | "这也是训练比单纯的预测需要更多的内存(显存)的原因之一。\n", 170 | "此外,这些中间值的大小与网络层的数量和批量的大小大致成正比。\n", 171 | "因此,使用更大的批量来训练更深层次的网络更容易导致*内存不足*(out of memory)错误。\n", 172 | "\n", 173 | "## 小结\n", 174 | "\n", 175 | "* 前向传播在神经网络定义的计算图中按顺序计算和存储中间变量,它的顺序是从输入层到输出层。\n", 176 | "* 反向传播按相反的顺序(从输出层到输入层)计算和存储神经网络的中间变量和参数的梯度。\n", 177 | "* 在训练深度学习模型时,前向传播和反向传播是相互依赖的。\n", 178 | "* 训练比预测需要更多的内存。\n", 179 | "\n", 180 | "## 练习\n", 181 | "\n", 182 | "1. 假设一些标量函数$\\mathbf{X}$的输入$\\mathbf{X}$是$n \\times m$矩阵。$f$相对于$\\mathbf{X}$的梯度维数是多少?\n", 183 | "1. 向本节中描述的模型的隐藏层添加偏置项(不需要在正则化项中包含偏置项)。\n", 184 | " 1. 画出相应的计算图。\n", 185 | " 1. 推导正向和反向传播方程。\n", 186 | "1. 计算本节所描述的模型,用于训练和预测的内存占用。\n", 187 | "1. 假设想计算二阶导数。计算图发生了什么?预计计算需要多长时间?\n", 188 | "1. 假设计算图对当前拥有的GPU来说太大了。\n", 189 | " 1. 请试着把它划分到多个GPU上。\n", 190 | " 1. 
与小批量训练相比,有哪些优点和缺点?\n", 191 | "\n", 192 | "[Discussions](https://discuss.d2l.ai/t/5769)\n" 193 | ] 194 | } 195 | ], 196 | "metadata": { 197 | "kernelspec": { 198 | "display_name": "conda_pytorch_p36", 199 | "name": "conda_pytorch_p36" 200 | }, 201 | "language_info": { 202 | "name": "python" 203 | }, 204 | "required_libs": [] 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 5 208 | } -------------------------------------------------------------------------------- /chapter_multilayer-perceptrons/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f0f1791b", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 多层感知机\n", 11 | ":label:`chap_perceptrons`\n", 12 | "\n", 13 | "在本章中,我们将第一次介绍真正的*深度*网络。\n", 14 | "最简单的深度网络称为*多层感知机*。多层感知机由多层神经元组成,\n", 15 | "每一层与它的上一层相连,从中接收输入;\n", 16 | "同时每一层也与它的下一层相连,影响当前层的神经元。\n", 17 | "当我们训练容量较大的模型时,我们面临着*过拟合*的风险。\n", 18 | "因此,本章将从基本的概念介绍开始讲起,包括*过拟合*、*欠拟合*和模型选择。\n", 19 | "为了解决这些问题,本章将介绍*权重衰减*和*暂退法*等正则化技术。\n", 20 | "我们还将讨论数值稳定性和参数初始化相关的问题,\n", 21 | "这些问题是成功训练深度网络的关键。\n", 22 | "在本章的最后,我们将把所介绍的内容应用到一个真实的案例:房价预测。\n", 23 | "关于模型计算性能、可伸缩性和效率相关的问题,我们将放在后面的章节中讨论。\n", 24 | "\n", 25 | ":begin_tab:toc\n", 26 | " - [mlp](mlp.ipynb)\n", 27 | " - [mlp-scratch](mlp-scratch.ipynb)\n", 28 | " - [mlp-concise](mlp-concise.ipynb)\n", 29 | " - [underfit-overfit](underfit-overfit.ipynb)\n", 30 | " - [weight-decay](weight-decay.ipynb)\n", 31 | " - [dropout](dropout.ipynb)\n", 32 | " - [backprop](backprop.ipynb)\n", 33 | " - [numerical-stability-and-init](numerical-stability-and-init.ipynb)\n", 34 | " - [environment](environment.ipynb)\n", 35 | " - [kaggle-house-price](kaggle-house-price.ipynb)\n", 36 | ":end_tab:\n" 37 | ] 38 | } 39 | ], 40 | "metadata": { 41 | "kernelspec": { 42 | "display_name": "conda_pytorch_p36", 43 | "name": "conda_pytorch_p36" 44 | }, 45 | "language_info": { 46 | "name": "python" 47 | }, 48 | "required_libs": [] 49 | }, 50 | "nbformat": 4, 51 | "nbformat_minor": 5 52 | } -------------------------------------------------------------------------------- /chapter_natural-language-processing-applications/finetuning-bert.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2731ad59", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 针对序列级和词元级应用微调BERT\n", 11 | ":label:`sec_finetuning-bert`\n", 12 | "\n", 13 | "在本章的前几节中,我们为自然语言处理应用设计了不同的模型,例如基于循环神经网络、卷积神经网络、注意力和多层感知机。这些模型在有空间或时间限制的情况下是有帮助的,但是,为每个自然语言处理任务精心设计一个特定的模型实际上是不可行的。在 :numref:`sec_bert`中,我们介绍了一个名为BERT的预训练模型,该模型可以对广泛的自然语言处理任务进行最少的架构更改。一方面,在提出时,BERT改进了各种自然语言处理任务的技术水平。另一方面,正如在 :numref:`sec_bert-pretraining`中指出的那样,原始BERT模型的两个版本分别带有1.1亿和3.4亿个参数。因此,当有足够的计算资源时,我们可以考虑为下游自然语言处理应用微调BERT。\n", 14 | "\n", 15 | "下面,我们将自然语言处理应用的子集概括为序列级和词元级。在序列层次上,介绍了在单文本分类任务和文本对分类(或回归)任务中,如何将文本输入的BERT表示转换为输出标签。在词元级别,我们将简要介绍新的应用,如文本标注和问答,并说明BERT如何表示它们的输入并转换为输出标签。在微调期间,不同应用之间的BERT所需的“最小架构更改”是额外的全连接层。在下游应用的监督学习期间,额外层的参数是从零开始学习的,而预训练BERT模型中的所有参数都是微调的。\n", 16 | "\n", 17 | "## 单文本分类\n", 18 | "\n", 19 | "*单文本分类*将单个文本序列作为输入,并输出其分类结果。\n", 20 | "除了我们在这一章中探讨的情感分析之外,语言可接受性语料库(Corpus of Linguistic Acceptability,COLA)也是一个单文本分类的数据集,它的要求判断给定的句子在语法上是否可以接受。 :cite:`Warstadt.Singh.Bowman.2019`。例如,“I should study.”是可以接受的,但是“I should studying.”不是可以接受的。\n", 21 | "\n", 22 | "![微调BERT用于单文本分类应用,如情感分析和测试语言可接受性(这里假设输入的单个文本有六个词元)](../img/bert-one-seq.svg)\n", 23 | ":label:`fig_bert-one-seq`\n", 24 | "\n", 25 | " 
:numref:`sec_bert`描述了BERT的输入表示。BERT输入序列明确地表示单个文本和文本对,其中特殊分类标记“<cls>”用于序列分类,而特殊分类标记“<sep>”标记单个文本的结束或分隔成对文本。如 :numref:`fig_bert-one-seq`所示,在单文本分类应用中,特殊分类标记“<cls>”的BERT表示对整个输入文本序列的信息进行编码。作为输入单个文本的表示,它将被送入到由全连接(稠密)层组成的小多层感知机中,以输出所有离散标签值的分布。\n", 26 | "\n", 27 | "## 文本对分类或回归\n", 28 | "\n", 29 | "在本章中,我们还研究了自然语言推断。它属于*文本对分类*,这是一种对文本进行分类的应用类型。\n", 30 | "\n", 31 | "以一对文本作为输入但输出连续值,*语义文本相似度*是一个流行的“文本对回归”任务。\n", 32 | "这项任务评估句子的语义相似度。例如,在语义文本相似度基准数据集(Semantic Textual Similarity Benchmark)中,句子对的相似度得分是从0(无语义重叠)到5(语义等价)的分数区间 :cite:`Cer.Diab.Agirre.ea.2017`。我们的目标是预测这些分数。来自语义文本相似性基准数据集的样本包括(句子1,句子2,相似性得分):\n", 33 | "\n", 34 | "* \"A plane is taking off.\"(“一架飞机正在起飞。”),\"An air plane is taking off.\"(“一架飞机正在起飞。”),5.000分;\n", 35 | "* \"A woman is eating something.\"(“一个女人在吃东西。”),\"A woman is eating meat.\"(“一个女人在吃肉。”),3.000分;\n", 36 | "* \"A woman is dancing.\"(一个女人在跳舞。),\"A man is talking.\"(“一个人在说话。”),0.000分。\n", 37 | "\n", 38 | "![文本对分类或回归应用的BERT微调,如自然语言推断和语义文本相似性(假设输入文本对分别有两个词元和三个词元)](../img/bert-two-seqs.svg)\n", 39 | ":label:`fig_bert-two-seqs`\n", 40 | "\n", 41 | "与 :numref:`fig_bert-one-seq`中的单文本分类相比, :numref:`fig_bert-two-seqs`中的文本对分类的BERT微调在输入表示上有所不同。对于文本对回归任务(如语义文本相似性),可以应用细微的更改,例如输出连续的标签值和使用均方损失:它们在回归中很常见。\n", 42 | "\n", 43 | "## 文本标注\n", 44 | "\n", 45 | "现在让我们考虑词元级任务,比如*文本标注*(text tagging),其中每个词元都被分配了一个标签。在文本标注任务中,*词性标注*为每个单词分配词性标记(例如,形容词和限定词)。\n", 46 | "根据单词在句子中的作用。如,在Penn树库II标注集中,句子“John Smith‘s car is new”应该被标记为“NNP(名词,专有单数)NNP POS(所有格结尾)NN(名词,单数或质量)VB(动词,基本形式)JJ(形容词)”。\n", 47 | "\n", 48 | "![文本标记应用的BERT微调,如词性标记。假设输入的单个文本有六个词元。](../img/bert-tagging.svg)\n", 49 | ":label:`fig_bert-tagging`\n", 50 | "\n", 51 | " :numref:`fig_bert-tagging`中说明了文本标记应用的BERT微调。与 :numref:`fig_bert-one-seq`相比,唯一的区别在于,在文本标注中,输入文本的*每个词元*的BERT表示被送到相同的额外全连接层中,以输出词元的标签,例如词性标签。\n", 52 | "\n", 53 | "## 问答\n", 54 | "\n", 55 | "作为另一个词元级应用,*问答*反映阅读理解能力。\n", 56 | "例如,斯坦福问答数据集(Stanford Question Answering Dataset,SQuAD v1.1)由阅读段落和问题组成,其中每个问题的答案只是段落中的一段文本(文本片段) :cite:`Rajpurkar.Zhang.Lopyrev.ea.2016`。举个例子,考虑一段话:“Some experts report that a mask's efficacy is inconclusive.However,mask makers insist that their products,such as N95 respirator masks,can guard against the virus.”(“一些专家报告说面罩的功效是不确定的。然而,口罩制造商坚持他们的产品,如N95口罩,可以预防病毒。”)还有一个问题“Who say that N95 respirator masks can guard against the virus?”(“谁说N95口罩可以预防病毒?”)。答案应该是文章中的文本片段“mask makers”(“口罩制造商”)。因此,SQuAD v1.1的目标是在给定问题和段落的情况下预测段落中文本片段的开始和结束。\n", 57 | "\n", 58 | "![对问答进行BERT微调(假设输入文本对分别有两个和三个词元)](../img/bert-qa.svg)\n", 59 | ":label:`fig_bert-qa`\n", 60 | "\n", 61 | "为了微调BERT进行问答,在BERT的输入中,将问题和段落分别作为第一个和第二个文本序列。为了预测文本片段开始的位置,相同的额外的全连接层将把来自位置$i$的任何词元的BERT表示转换成标量分数$s_i$。文章中所有词元的分数还通过softmax转换成概率分布,从而为文章中的每个词元位置$i$分配作为文本片段开始的概率$p_i$。预测文本片段的结束与上面相同,只是其额外的全连接层中的参数与用于预测开始位置的参数无关。当预测结束时,位置$i$的词元由相同的全连接层变换成标量分数$e_i$。 :numref:`fig_bert-qa`描述了用于问答的微调BERT。\n", 62 | "\n", 63 | "对于问答,监督学习的训练目标就像最大化真实值的开始和结束位置的对数似然一样简单。当预测片段时,我们可以计算从位置$i$到位置$j$的有效片段的分数$s_i + e_j$($i \\leq j$),并输出分数最高的跨度。\n", 64 | "\n", 65 | "## 小结\n", 66 | "\n", 67 | "* 对于序列级和词元级自然语言处理应用,BERT只需要最小的架构改变(额外的全连接层),如单个文本分类(例如,情感分析和测试语言可接受性)、文本对分类或回归(例如,自然语言推断和语义文本相似性)、文本标记(例如,词性标记)和问答。\n", 68 | "* 在下游应用的监督学习期间,额外层的参数是从零开始学习的,而预训练BERT模型中的所有参数都是微调的。\n", 69 | "\n", 70 | "## 练习\n", 71 | "\n", 72 | "1. 让我们为新闻文章设计一个搜索引擎算法。当系统接收到查询(例如,“冠状病毒爆发期间的石油行业”)时,它应该返回与该查询最相关的新闻文章的排序列表。假设我们有一个巨大的新闻文章池和大量的查询。为了简化问题,假设为每个查询标记了最相关的文章。如何在算法设计中应用负采样(见 :numref:`subsec_negative-sampling`)和BERT?\n", 73 | "1. 我们如何利用BERT来训练语言模型?\n", 74 | "1. 
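"作为参考,下面给出 :numref:`fig_bert-one-seq`中额外输出层的一个最小示意。这里假设`encoded_X`是预训练BERT编码器对输入序列的输出(实际应由预训练模型给出),批量大小、序列长度、隐藏单元数与类别数均为随意选取:\n",
"\n",
"```python\n",
"import torch\n",
"from torch import nn\n",
"\n",
"# 假设的BERT编码器输出:(批量大小, 序列长度, 隐藏单元数)\n",
"batch_size, seq_len, num_hiddens, num_classes = 2, 8, 768, 3\n",
"encoded_X = torch.randn(batch_size, seq_len, num_hiddens)\n",
"\n",
"# 单文本分类:取“<cls>”词元(位置0)的表示,送入新增的全连接输出层\n",
"output_layer = nn.Linear(num_hiddens, num_classes)  # 该层参数从零开始学习\n",
"logits = output_layer(encoded_X[:, 0, :])\n",
"print(logits.shape)  # torch.Size([2, 3])\n",
"```\n",
"\n",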
我们能在机器翻译中利用BERT吗?\n", 75 | "\n", 76 | "[Discussions](https://discuss.d2l.ai/t/5729)\n" 77 | ] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "conda_pytorch_p36", 83 | "name": "conda_pytorch_p36" 84 | }, 85 | "language_info": { 86 | "name": "python" 87 | }, 88 | "required_libs": [] 89 | }, 90 | "nbformat": 4, 91 | "nbformat_minor": 5 92 | } -------------------------------------------------------------------------------- /chapter_natural-language-processing-applications/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "cd1572d4", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 自然语言处理:应用\n", 11 | ":label:`chap_nlp_app`\n", 12 | "\n", 13 | "前面我们学习了如何在文本序列中表示词元,\n", 14 | "并在 :numref:`chap_nlp_pretrain`中训练了词元的表示。\n", 15 | "这样的预训练文本表示可以通过不同模型架构,放入不同的下游自然语言处理任务。\n", 16 | "\n", 17 | "前一章我们提及到一些自然语言处理应用,这些应用没有预训练,只是为了解释深度学习架构。\n", 18 | "例如,在 :numref:`chap_rnn`中,\n", 19 | "我们依赖循环神经网络设计语言模型来生成类似中篇小说的文本。\n", 20 | "在 :numref:`chap_modern_rnn`和 :numref:`chap_attention`中,\n", 21 | "我们还设计了基于循环神经网络和注意力机制的机器翻译模型。\n", 22 | "\n", 23 | "然而,本书并不打算全面涵盖所有此类应用。\n", 24 | "相反,我们的重点是*如何应用深度语言表征学习来解决自然语言处理问题*。\n", 25 | "在给定预训练的文本表示的情况下,\n", 26 | "本章将探讨两种流行且具有代表性的下游自然语言处理任务:\n", 27 | "情感分析和自然语言推断,它们分别分析单个文本和文本对之间的关系。\n", 28 | "\n", 29 | "![预训练文本表示可以通过不同模型架构,放入不同的下游自然语言处理应用(本章重点介绍如何为不同的下游应用设计模型)](../img/nlp-map-app.svg)\n", 30 | ":label:`fig_nlp-map-app`\n", 31 | "\n", 32 | "如 :numref:`fig_nlp-map-app`所述,\n", 33 | "本章将重点描述然后使用不同类型的深度学习架构\n", 34 | "(如多层感知机、卷积神经网络、循环神经网络和注意力)\n", 35 | "设计自然语言处理模型。\n", 36 | "尽管在 :numref:`fig_nlp-map-app`中,\n", 37 | "可以将任何预训练的文本表示与任何应用的架构相结合,\n", 38 | "但我们选择了一些具有代表性的组合。\n", 39 | "具体来说,我们将探索基于循环神经网络和卷积神经网络的流行架构进行情感分析。\n", 40 | "对于自然语言推断,我们选择注意力和多层感知机来演示如何分析文本对。\n", 41 | "最后,我们介绍了如何为广泛的自然语言处理应用,\n", 42 | "如在序列级(单文本分类和文本对分类)和词元级(文本标注和问答)上\n", 43 | "对预训练BERT模型进行微调。\n", 44 | "作为一个具体的经验案例,我们将针对自然语言推断对BERT进行微调。\n", 45 | "\n", 46 | "正如我们在 :numref:`sec_bert`中介绍的那样,\n", 47 | "对于广泛的自然语言处理应用,BERT只需要最少的架构更改。\n", 48 | "然而,这一好处是以微调下游应用的大量BERT参数为代价的。\n", 49 | "当空间或时间有限时,基于多层感知机、卷积神经网络、循环神经网络\n", 50 | "和注意力的精心构建的模型更具可行性。\n", 51 | "下面,我们从情感分析应用开始,分别解读基于循环神经网络和卷积神经网络的模型设计。\n", 52 | "\n", 53 | ":begin_tab:toc\n", 54 | " - [sentiment-analysis-and-dataset](sentiment-analysis-and-dataset.ipynb)\n", 55 | " - [sentiment-analysis-rnn](sentiment-analysis-rnn.ipynb)\n", 56 | " - [sentiment-analysis-cnn](sentiment-analysis-cnn.ipynb)\n", 57 | " - [natural-language-inference-and-dataset](natural-language-inference-and-dataset.ipynb)\n", 58 | " - [natural-language-inference-attention](natural-language-inference-attention.ipynb)\n", 59 | " - [finetuning-bert](finetuning-bert.ipynb)\n", 60 | " - [natural-language-inference-bert](natural-language-inference-bert.ipynb)\n", 61 | ":end_tab:\n" 62 | ] 63 | } 64 | ], 65 | "metadata": { 66 | "kernelspec": { 67 | "display_name": "conda_pytorch_p36", 68 | "name": "conda_pytorch_p36" 69 | }, 70 | "language_info": { 71 | "name": "python" 72 | }, 73 | "required_libs": [] 74 | }, 75 | "nbformat": 4, 76 | "nbformat_minor": 5 77 | } -------------------------------------------------------------------------------- /chapter_natural-language-processing-pretraining/approx-training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4eb98fe2", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 近似训练\n", 11 | 
":label:`sec_approx_train`\n", 12 | "\n", 13 | "回想一下我们在 :numref:`sec_word2vec`中的讨论。跳元模型的主要思想是使用softmax运算来计算基于给定的中心词$w_c$生成上下文字$w_o$的条件概率(如 :eqref:`eq_skip-gram-softmax`),对应的对数损失在 :eqref:`eq_skip-gram-log`给出。\n", 14 | "\n", 15 | "由于softmax操作的性质,上下文词可以是词表$\\mathcal{V}$中的任意项, :eqref:`eq_skip-gram-log`包含与整个词表大小一样多的项的求和。因此, :eqref:`eq_skip-gram-grad`中跳元模型的梯度计算和 :eqref:`eq_cbow-gradient`中的连续词袋模型的梯度计算都包含求和。不幸的是,在一个词典上(通常有几十万或数百万个单词)求和的梯度的计算成本是巨大的!\n", 16 | "\n", 17 | "为了降低上述计算复杂度,本节将介绍两种近似训练方法:*负采样*和*分层softmax*。\n", 18 | "由于跳元模型和连续词袋模型的相似性,我们将以跳元模型为例来描述这两种近似训练方法。\n", 19 | "\n", 20 | "## 负采样\n", 21 | ":label:`subsec_negative-sampling`\n", 22 | "\n", 23 | "负采样修改了原目标函数。给定中心词$w_c$的上下文窗口,任意上下文词$w_o$来自该上下文窗口的被认为是由下式建模概率的事件:\n", 24 | "\n", 25 | "$$P(D=1\\mid w_c, w_o) = \\sigma(\\mathbf{u}_o^\\top \\mathbf{v}_c),$$\n", 26 | "\n", 27 | "其中$\\sigma$使用了sigmoid激活函数的定义:\n", 28 | "\n", 29 | "$$\\sigma(x) = \\frac{1}{1+\\exp(-x)}.$$\n", 30 | ":eqlabel:`eq_sigma-f`\n", 31 | "\n", 32 | "让我们从最大化文本序列中所有这些事件的联合概率开始训练词嵌入。具体而言,给定长度为$T$的文本序列,以$w^{(t)}$表示时间步$t$的词,并使上下文窗口为$m$,考虑最大化联合概率:\n", 33 | "\n", 34 | "$$ \\prod_{t=1}^{T} \\prod_{-m \\leq j \\leq m,\\ j \\neq 0} P(D=1\\mid w^{(t)}, w^{(t+j)}).$$\n", 35 | ":eqlabel:`eq-negative-sample-pos`\n", 36 | "\n", 37 | "然而, :eqref:`eq-negative-sample-pos`只考虑那些正样本的事件。仅当所有词向量都等于无穷大时, :eqref:`eq-negative-sample-pos`中的联合概率才最大化为1。当然,这样的结果毫无意义。为了使目标函数更有意义,*负采样*添加从预定义分布中采样的负样本。\n", 38 | "\n", 39 | "用$S$表示上下文词$w_o$来自中心词$w_c$的上下文窗口的事件。对于这个涉及$w_o$的事件,从预定义分布$P(w)$中采样$K$个不是来自这个上下文窗口*噪声词*。用$N_k$表示噪声词$w_k$($k=1, \\ldots, K$)不是来自$w_c$的上下文窗口的事件。假设正例和负例$S, N_1, \\ldots, N_K$的这些事件是相互独立的。负采样将 :eqref:`eq-negative-sample-pos`中的联合概率(仅涉及正例)重写为\n", 40 | "\n", 41 | "$$ \\prod_{t=1}^{T} \\prod_{-m \\leq j \\leq m,\\ j \\neq 0} P(w^{(t+j)} \\mid w^{(t)}),$$\n", 42 | "\n", 43 | "通过事件$S, N_1, \\ldots, N_K$近似条件概率:\n", 44 | "\n", 45 | "$$ P(w^{(t+j)} \\mid w^{(t)}) =P(D=1\\mid w^{(t)}, w^{(t+j)})\\prod_{k=1,\\ w_k \\sim P(w)}^K P(D=0\\mid w^{(t)}, w_k).$$\n", 46 | ":eqlabel:`eq-negative-sample-conditional-prob`\n", 47 | "\n", 48 | "分别用$i_t$和$h_k$表示词$w^{(t)}$和噪声词$w_k$在文本序列的时间步$t$处的索引。 :eqref:`eq-negative-sample-conditional-prob`中关于条件概率的对数损失为:\n", 49 | "\n", 50 | "$$\n", 51 | "\\begin{aligned}\n", 52 | "-\\log P(w^{(t+j)} \\mid w^{(t)})\n", 53 | "=& -\\log P(D=1\\mid w^{(t)}, w^{(t+j)}) - \\sum_{k=1,\\ w_k \\sim P(w)}^K \\log P(D=0\\mid w^{(t)}, w_k)\\\\\n", 54 | "=&- \\log\\, \\sigma\\left(\\mathbf{u}_{i_{t+j}}^\\top \\mathbf{v}_{i_t}\\right) - \\sum_{k=1,\\ w_k \\sim P(w)}^K \\log\\left(1-\\sigma\\left(\\mathbf{u}_{h_k}^\\top \\mathbf{v}_{i_t}\\right)\\right)\\\\\n", 55 | "=&- \\log\\, \\sigma\\left(\\mathbf{u}_{i_{t+j}}^\\top \\mathbf{v}_{i_t}\\right) - \\sum_{k=1,\\ w_k \\sim P(w)}^K \\log\\sigma\\left(-\\mathbf{u}_{h_k}^\\top \\mathbf{v}_{i_t}\\right).\n", 56 | "\\end{aligned}\n", 57 | "$$\n", 58 | "\n", 59 | "我们可以看到,现在每个训练步的梯度计算成本与词表大小无关,而是线性依赖于$K$。当将超参数$K$设置为较小的值时,在负采样的每个训练步处的梯度的计算成本较小。\n", 60 | "\n", 61 | "## 层序Softmax\n", 62 | "\n", 63 | "作为另一种近似训练方法,*层序Softmax*(hierarchical softmax)使用二叉树( :numref:`fig_hi_softmax`中说明的数据结构),其中树的每个叶节点表示词表$\\mathcal{V}$中的一个词。\n", 64 | "\n", 65 | "![用于近似训练的分层softmax,其中树的每个叶节点表示词表中的一个词](../img/hi-softmax.svg)\n", 66 | ":label:`fig_hi_softmax`\n", 67 | "\n", 68 | "用$L(w)$表示二叉树中表示字$w$的从根节点到叶节点的路径上的节点数(包括两端)。设$n(w,j)$为该路径上的$j^\\mathrm{th}$节点,其上下文字向量为$\\mathbf{u}_{n(w, j)}$。例如, :numref:`fig_hi_softmax`中的$L(w_3) = 4$。分层softmax将 :eqref:`eq_skip-gram-softmax`中的条件概率近似为\n", 69 | "\n", 70 | "$$P(w_o \\mid w_c) = \\prod_{j=1}^{L(w_o)-1} \\sigma\\left( [\\![ n(w_o, j+1) = 
\\text{leftChild}(n(w_o, j)) ]\\!] \\cdot \\mathbf{u}_{n(w_o, j)}^\\top \\mathbf{v}_c\\right),$$\n", 71 | "\n", 72 | "其中函数$\\sigma$在 :eqref:`eq_sigma-f`中定义,$\\text{leftChild}(n)$是节点$n$的左子节点:如果$x$为真,$[\\![x]\\!] = 1$;否则$[\\![x]\\!] = -1$。\n", 73 | "\n", 74 | "为了说明,让我们计算 :numref:`fig_hi_softmax`中给定词$w_c$生成词$w_3$的条件概率。这需要$w_c$的词向量$\\mathbf{v}_c$和从根到$w_3$的路径( :numref:`fig_hi_softmax`中加粗的路径)上的非叶节点向量之间的点积,该路径依次向左、向右和向左遍历:\n", 75 | "\n", 76 | "$$P(w_3 \\mid w_c) = \\sigma(\\mathbf{u}_{n(w_3, 1)}^\\top \\mathbf{v}_c) \\cdot \\sigma(-\\mathbf{u}_{n(w_3, 2)}^\\top \\mathbf{v}_c) \\cdot \\sigma(\\mathbf{u}_{n(w_3, 3)}^\\top \\mathbf{v}_c).$$\n", 77 | "\n", 78 | "由$\\sigma(x)+\\sigma(-x) = 1$,它认为基于任意词$w_c$生成词表$\\mathcal{V}$中所有词的条件概率总和为1:\n", 79 | "\n", 80 | "$$\\sum_{w \\in \\mathcal{V}} P(w \\mid w_c) = 1.$$\n", 81 | ":eqlabel:`eq_hi-softmax-sum-one`\n", 82 | "\n", 83 | "幸运的是,由于二叉树结构,$L(w_o)-1$大约与$\\mathcal{O}(\\text{log}_2|\\mathcal{V}|)$是一个数量级。当词表大小$\\mathcal{V}$很大时,与没有近似训练的相比,使用分层softmax的每个训练步的计算代价显著降低。\n", 84 | "\n", 85 | "## 小结\n", 86 | "\n", 87 | "* 负采样通过考虑相互独立的事件来构造损失函数,这些事件同时涉及正例和负例。训练的计算量与每一步的噪声词数成线性关系。\n", 88 | "* 分层softmax使用二叉树中从根节点到叶节点的路径构造损失函数。训练的计算成本取决于词表大小的对数。\n", 89 | "\n", 90 | "## 练习\n", 91 | "\n", 92 | "1. 如何在负采样中对噪声词进行采样?\n", 93 | "1. 验证 :eqref:`eq_hi-softmax-sum-one`是否有效。\n", 94 | "1. 如何分别使用负采样和分层softmax训练连续词袋模型?\n", 95 | "\n", 96 | "[Discussions](https://discuss.d2l.ai/t/5741)\n" 97 | ] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "conda_pytorch_p36", 103 | "name": "conda_pytorch_p36" 104 | }, 105 | "language_info": { 106 | "name": "python" 107 | }, 108 | "required_libs": [] 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 5 112 | } -------------------------------------------------------------------------------- /chapter_natural-language-processing-pretraining/glove.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1aa10e3e", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 全局向量的词嵌入(GloVe)\n", 11 | ":label:`sec_glove`\n", 12 | "\n", 13 | "上下文窗口内的词共现可以携带丰富的语义信息。例如,在一个大型语料库中,“固体”比“气体”更有可能与“冰”共现,但“气体”一词与“蒸汽”的共现频率可能比与“冰”的共现频率更高。此外,可以预先计算此类共现的全局语料库统计数据:这可以提高训练效率。为了利用整个语料库中的统计信息进行词嵌入,让我们首先回顾 :numref:`subsec_skip-gram`中的跳元模型,但是使用全局语料库统计(如共现计数)来解释它。\n", 14 | "\n", 15 | "## 带全局语料统计的跳元模型\n", 16 | ":label:`subsec_skipgram-global`\n", 17 | "\n", 18 | "用$q_{ij}$表示词$w_j$的条件概率$P(w_j\\mid w_i)$,在跳元模型中给定词$w_i$,我们有:\n", 19 | "\n", 20 | "$$q_{ij}=\\frac{\\exp(\\mathbf{u}_j^\\top \\mathbf{v}_i)}{ \\sum_{k \\in \\mathcal{V}} \\text{exp}(\\mathbf{u}_k^\\top \\mathbf{v}_i)},$$\n", 21 | "\n", 22 | "其中,对于任意索引$i$,向量$\\mathbf{v}_i$和$\\mathbf{u}_i$分别表示词$w_i$作为中心词和上下文词,且$\\mathcal{V} = \\{0, 1, \\ldots, |\\mathcal{V}|-1\\}$是词表的索引集。\n", 23 | "\n", 24 | "考虑词$w_i$可能在语料库中出现多次。在整个语料库中,所有以$w_i$为中心词的上下文词形成一个词索引的*多重集*$\\mathcal{C}_i$,该索引允许同一元素的多个实例。对于任何元素,其实例数称为其*重数*。举例说明,假设词$w_i$在语料库中出现两次,并且在两个上下文窗口中以$w_i$为其中心词的上下文词索引是$k, j, m, k$和$k, l, k, j$。因此,多重集$\\mathcal{C}_i = \\{j, j, k, k, k, k, l, m\\}$,其中元素$j, k, l, m$的重数分别为2、4、1、1。\n", 25 | "\n", 26 | "现在,让我们将多重集$\\mathcal{C}_i$中的元素$j$的重数表示为$x_{ij}$。这是词$w_j$(作为上下文词)和词$w_i$(作为中心词)在整个语料库的同一上下文窗口中的全局共现计数。使用这样的全局语料库统计,跳元模型的损失函数等价于:\n", 27 | "\n", 28 | "$$-\\sum_{i\\in\\mathcal{V}}\\sum_{j\\in\\mathcal{V}} x_{ij} \\log\\,q_{ij}.$$\n", 29 | ":eqlabel:`eq_skipgram-x_ij`\n", 30 | "\n", 31 | "我们用$x_i$表示上下文窗口中的所有上下文词的数量,其中$w_i$作为它们的中心词出现,这相当于$|\\mathcal{C}_i|$。设$p_{ij}$为用于生成上下文词$w_j$的条件概率$x_{ij}/x_i$。给定中心词$w_i$, 
:eqref:`eq_skipgram-x_ij`可以重写为:\n", 32 | "\n", 33 | "$$-\\sum_{i\\in\\mathcal{V}} x_i \\sum_{j\\in\\mathcal{V}} p_{ij} \\log\\,q_{ij}.$$\n", 34 | ":eqlabel:`eq_skipgram-p_ij`\n", 35 | "\n", 36 | "在 :eqref:`eq_skipgram-p_ij`中,$-\\sum_{j\\in\\mathcal{V}} p_{ij} \\log\\,q_{ij}$计算全局语料统计的条件分布$p_{ij}$和模型预测的条件分布$q_{ij}$的交叉熵。如上所述,这一损失也按$x_i$加权。在 :eqref:`eq_skipgram-p_ij`中最小化损失函数将使预测的条件分布接近全局语料库统计中的条件分布。\n", 37 | "\n", 38 | "虽然交叉熵损失函数通常用于测量概率分布之间的距离,但在这里可能不是一个好的选择。一方面,正如我们在 :numref:`sec_approx_train`中提到的,规范化$q_{ij}$的代价在于整个词表的求和,这在计算上可能非常昂贵。另一方面,来自大型语料库的大量罕见事件往往被交叉熵损失建模,从而赋予过多的权重。\n", 39 | "\n", 40 | "## GloVe模型\n", 41 | "\n", 42 | "有鉴于此,*GloVe*模型基于平方损失 :cite:`Pennington.Socher.Manning.2014`对跳元模型做了三个修改:\n", 43 | "\n", 44 | "1. 使用变量$p'_{ij}=x_{ij}$和$q'_{ij}=\\exp(\\mathbf{u}_j^\\top \\mathbf{v}_i)$\n", 45 | "而非概率分布,并取两者的对数。所以平方损失项是$\\left(\\log\\,p'_{ij} - \\log\\,q'_{ij}\\right)^2 = \\left(\\mathbf{u}_j^\\top \\mathbf{v}_i - \\log\\,x_{ij}\\right)^2$。\n", 46 | "2. 为每个词$w_i$添加两个标量模型参数:中心词偏置$b_i$和上下文词偏置$c_i$。\n", 47 | "3. 用权重函数$h(x_{ij})$替换每个损失项的权重,其中$h(x)$在$[0, 1]$的间隔内递增。\n", 48 | "\n", 49 | "整合代码,训练GloVe是为了尽量降低以下损失函数:\n", 50 | "\n", 51 | "$$\\sum_{i\\in\\mathcal{V}} \\sum_{j\\in\\mathcal{V}} h(x_{ij}) \\left(\\mathbf{u}_j^\\top \\mathbf{v}_i + b_i + c_j - \\log\\,x_{ij}\\right)^2.$$\n", 52 | ":eqlabel:`eq_glove-loss`\n", 53 | "\n", 54 | "对于权重函数,建议的选择是:当$x < c$(例如,$c = 100$)时,$h(x) = (x/c) ^\\alpha$(例如$\\alpha = 0.75$);否则$h(x) = 1$。在这种情况下,由于$h(0)=0$,为了提高计算效率,可以省略任意$x_{ij}=0$的平方损失项。例如,当使用小批量随机梯度下降进行训练时,在每次迭代中,我们随机抽样一小批量*非零*的$x_{ij}$来计算梯度并更新模型参数。注意,这些非零的$x_{ij}$是预先计算的全局语料库统计数据;因此,该模型GloVe被称为*全局向量*。\n", 55 | "\n", 56 | "应该强调的是,当词$w_i$出现在词$w_j$的上下文窗口时,词$w_j$也出现在词$w_i$的上下文窗口。因此,$x_{ij}=x_{ji}$。与拟合非对称条件概率$p_{ij}$的word2vec不同,GloVe拟合对称概率$\\log \\, x_{ij}$。因此,在GloVe模型中,任意词的中心词向量和上下文词向量在数学上是等价的。但在实际应用中,由于初始值不同,同一个词经过训练后,在这两个向量中可能得到不同的值:GloVe将它们相加作为输出向量。\n", 57 | "\n", 58 | "## 从条件概率比值理解GloVe模型\n", 59 | "\n", 60 | "我们也可以从另一个角度来理解GloVe模型。使用 :numref:`subsec_skipgram-global`中的相同符号,设$p_{ij} \\stackrel{\\mathrm{def}}{=} P(w_j \\mid w_i)$为生成上下文词$w_j$的条件概率,给定$w_i$作为语料库中的中心词。 :numref:`tab_glove`根据大量语料库的统计数据,列出了给定单词“ice”和“steam”的共现概率及其比值。\n", 61 | "\n", 62 | "大型语料库中的词-词共现概率及其比值(根据 :cite:`Pennington.Socher.Manning.2014`中的表1改编)\n", 63 | "\n", 64 | "|$w_k$=|solid|gas|water|fashion|\n", 65 | "|:--|:-|:-|:-|:-|\n", 66 | "|$p_1=P(w_k\\mid \\text{ice})$|0.00019|0.000066|0.003|0.000017|\n", 67 | "|$p_2=P(w_k\\mid\\text{steam})$|0.000022|0.00078|0.0022|0.000018|\n", 68 | "|$p_1/p_2$|8.9|0.085|1.36|0.96|\n", 69 | ":label:`tab_glove`\n", 70 | "\n", 71 | "从 :numref:`tab_glove`中,我们可以观察到以下几点:\n", 72 | "\n", 73 | "* 对于与“ice”相关但与“steam”无关的单词$w_k$,例如$w_k=\\text{solid}$,我们预计会有更大的共现概率比值,例如8.9。\n", 74 | "* 对于与“steam”相关但与“ice”无关的单词$w_k$,例如$w_k=\\text{gas}$,我们预计较小的共现概率比值,例如0.085。\n", 75 | "* 对于同时与“ice”和“steam”相关的单词$w_k$,例如$w_k=\\text{water}$,我们预计其共现概率的比值接近1,例如1.36.\n", 76 | "* 对于与“ice”和“steam”都不相关的单词$w_k$,例如$w_k=\\text{fashion}$,我们预计共现概率的比值接近1,例如0.96.\n", 77 | "\n", 78 | "由此可见,共现概率的比值能够直观地表达词与词之间的关系。因此,我们可以设计三个词向量的函数来拟合这个比值。对于共现概率${p_{ij}}/{p_{ik}}$的比值,其中$w_i$是中心词,$w_j$和$w_k$是上下文词,我们希望使用某个函数$f$来拟合该比值:\n", 79 | "\n", 80 | "$$f(\\mathbf{u}_j, \\mathbf{u}_k, {\\mathbf{v}}_i) \\approx \\frac{p_{ij}}{p_{ik}}.$$\n", 81 | ":eqlabel:`eq_glove-f`\n", 82 | "\n", 83 | "在$f$的许多可能的设计中,我们只在以下几点中选择了一个合理的选择。因为共现概率的比值是标量,所以我们要求$f$是标量函数,例如$f(\\mathbf{u}_j, \\mathbf{u}_k, {\\mathbf{v}}_i) = f\\left((\\mathbf{u}_j - \\mathbf{u}_k)^\\top {\\mathbf{v}}_i\\right)$。在 :eqref:`eq_glove-f`中交换词索引$j$和$k$,它必须保持$f(x)f(-x)=1$,所以一种可能性是$f(x)=\\exp(x)$,即:\n", 84 | "\n", 85 | 
"$$f(\\mathbf{u}_j, \\mathbf{u}_k, {\\mathbf{v}}_i) = \\frac{\\exp\\left(\\mathbf{u}_j^\\top {\\mathbf{v}}_i\\right)}{\\exp\\left(\\mathbf{u}_k^\\top {\\mathbf{v}}_i\\right)} \\approx \\frac{p_{ij}}{p_{ik}}.$$\n", 86 | "\n", 87 | "现在让我们选择$\\exp\\left(\\mathbf{u}_j^\\top {\\mathbf{v}}_i\\right) \\approx \\alpha p_{ij}$,其中$\\alpha$是常数。从$p_{ij}=x_{ij}/x_i$开始,取两边的对数得到$\\mathbf{u}_j^\\top {\\mathbf{v}}_i \\approx \\log\\,\\alpha + \\log\\,x_{ij} - \\log\\,x_i$。我们可以使用附加的偏置项来拟合$- \\log\\, \\alpha + \\log\\, x_i$,如中心词偏置$b_i$和上下文词偏置$c_j$:\n", 88 | "\n", 89 | "$$\\mathbf{u}_j^\\top \\mathbf{v}_i + b_i + c_j \\approx \\log\\, x_{ij}.$$\n", 90 | ":eqlabel:`eq_glove-square`\n", 91 | "\n", 92 | "通过对 :eqref:`eq_glove-square`的加权平方误差的度量,得到了 :eqref:`eq_glove-loss`的GloVe损失函数。\n", 93 | "\n", 94 | "## 小结\n", 95 | "\n", 96 | "* 诸如词-词共现计数的全局语料库统计可以来解释跳元模型。\n", 97 | "* 交叉熵损失可能不是衡量两种概率分布差异的好选择,特别是对于大型语料库。GloVe使用平方损失来拟合预先计算的全局语料库统计数据。\n", 98 | "* 对于GloVe中的任意词,中心词向量和上下文词向量在数学上是等价的。\n", 99 | "* GloVe可以从词-词共现概率的比率来解释。\n", 100 | "\n", 101 | "## 练习\n", 102 | "\n", 103 | "1. 如果词$w_i$和$w_j$在同一上下文窗口中同时出现,我们如何使用它们在文本序列中的距离来重新设计计算条件概率$p_{ij}$的方法?提示:参见GloVe论文 :cite:`Pennington.Socher.Manning.2014`的第4.2节。\n", 104 | "1. 对于任何一个词,它的中心词偏置和上下文偏置在数学上是等价的吗?为什么?\n", 105 | "\n", 106 | "[Discussions](https://discuss.d2l.ai/t/5736)\n" 107 | ] 108 | } 109 | ], 110 | "metadata": { 111 | "kernelspec": { 112 | "display_name": "conda_pytorch_p36", 113 | "name": "conda_pytorch_p36" 114 | }, 115 | "language_info": { 116 | "name": "python" 117 | }, 118 | "required_libs": [] 119 | }, 120 | "nbformat": 4, 121 | "nbformat_minor": 5 122 | } -------------------------------------------------------------------------------- /chapter_natural-language-processing-pretraining/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a820fc25", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 自然语言处理:预训练\n", 11 | ":label:`chap_nlp_pretrain`\n", 12 | "\n", 13 | "人与人之间需要交流。\n", 14 | "出于人类这种基本需要,每天都有大量的书面文本产生。\n", 15 | "比如,社交媒体、聊天应用、电子邮件、产品评论、新闻文章、\n", 16 | "研究论文和书籍中的丰富文本,\n", 17 | "使计算机能够理解它们以提供帮助或基于人类语言做出决策变得至关重要。\n", 18 | "\n", 19 | "*自然语言处理*是指研究使用自然语言的计算机和人类之间的交互。\n", 20 | "在实践中,使用自然语言处理技术来处理和分析文本数据是非常常见的,\n", 21 | "例如 :numref:`sec_language_model`的语言模型\n", 22 | "和 :numref:`sec_machine_translation`的机器翻译模型。\n", 23 | "\n", 24 | "要理解文本,我们可以从学习它的表示开始。\n", 25 | "利用来自大型语料库的现有文本序列,\n", 26 | "*自监督学习*(self-supervised learning)\n", 27 | "已被广泛用于预训练文本表示,\n", 28 | "例如通过使用周围文本的其它部分来预测文本的隐藏部分。\n", 29 | "通过这种方式,模型可以通过有监督地从*海量*文本数据中学习,而不需要*昂贵*的标签标注!\n", 30 | "\n", 31 | "本章我们将看到:当将每个单词或子词视为单个词元时,\n", 32 | "可以在大型语料库上使用word2vec、GloVe或子词嵌入模型预先训练每个词元的词元。\n", 33 | "经过预训练后,每个词元的表示可以是一个向量。\n", 34 | "但是,无论上下文是什么,它都保持不变。\n", 35 | "例如,“bank”(可以译作银行或者河岸)的向量表示在\n", 36 | "“go to the bank to deposit some money”(去银行存点钱)\n", 37 | "和“go to the bank to sit down”(去河岸坐下来)中是相同的。\n", 38 | "因此,许多较新的预训练模型使相同词元的表示适应于不同的上下文,\n", 39 | "其中包括基于Transformer编码器的更深的自监督模型BERT。\n", 40 | "在本章中,我们将重点讨论如何预训练文本的这种表示,\n", 41 | "如 :numref:`fig_nlp-map-pretrain`中所强调的那样。\n", 42 | "\n", 43 | "![预训练好的文本表示可以放入各种深度学习架构,应用于不同自然语言处理任务(本章主要研究上游文本的预训练)](../img/nlp-map-pretrain.svg)\n", 44 | ":label:`fig_nlp-map-pretrain`\n", 45 | "\n", 46 | " :numref:`fig_nlp-map-pretrain`显示了\n", 47 | "预训练好的文本表示可以放入各种深度学习架构,应用于不同自然语言处理任务。\n", 48 | "我们将在 :numref:`chap_nlp_app`中介绍它们。\n", 49 | "\n", 50 | ":begin_tab:toc\n", 51 | " - [word2vec](word2vec.ipynb)\n", 52 | " - [approx-training](approx-training.ipynb)\n", 53 | " - 
[word-embedding-dataset](word-embedding-dataset.ipynb)\n", 54 | " - [word2vec-pretraining](word2vec-pretraining.ipynb)\n", 55 | " - [glove](glove.ipynb)\n", 56 | " - [subword-embedding](subword-embedding.ipynb)\n", 57 | " - [similarity-analogy](similarity-analogy.ipynb)\n", 58 | " - [bert](bert.ipynb)\n", 59 | " - [bert-dataset](bert-dataset.ipynb)\n", 60 | " - [bert-pretraining](bert-pretraining.ipynb)\n", 61 | ":end_tab:\n" 62 | ] 63 | } 64 | ], 65 | "metadata": { 66 | "kernelspec": { 67 | "display_name": "conda_pytorch_p36", 68 | "name": "conda_pytorch_p36" 69 | }, 70 | "language_info": { 71 | "name": "python" 72 | }, 73 | "required_libs": [] 74 | }, 75 | "nbformat": 4, 76 | "nbformat_minor": 5 77 | } -------------------------------------------------------------------------------- /chapter_natural-language-processing-pretraining/word2vec.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "9594aa73", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 词嵌入(word2vec)\n", 11 | ":label:`sec_word2vec`\n", 12 | "\n", 13 | "自然语言是用来表达人脑思维的复杂系统。\n", 14 | "在这个系统中,词是意义的基本单元。顾名思义,\n", 15 | "*词向量*是用于表示单词意义的向量,\n", 16 | "并且还可以被认为是单词的特征向量或表示。\n", 17 | "将单词映射到实向量的技术称为*词嵌入*。\n", 18 | "近年来,词嵌入逐渐成为自然语言处理的基础知识。\n", 19 | "\n", 20 | "## 为何独热向量是一个糟糕的选择\n", 21 | "\n", 22 | "在 :numref:`sec_rnn_scratch`中,我们使用独热向量来表示词(字符就是单词)。假设词典中不同词的数量(词典大小)为$N$,每个词对应一个从$0$到$N−1$的不同整数(索引)。为了得到索引为$i$的任意词的独热向量表示,我们创建了一个全为0的长度为$N$的向量,并将位置$i$的元素设置为1。这样,每个词都被表示为一个长度为$N$的向量,可以直接由神经网络使用。\n", 23 | "\n", 24 | "虽然独热向量很容易构建,但它们通常不是一个好的选择。一个主要原因是独热向量不能准确表达不同词之间的相似度,比如我们经常使用的“余弦相似度”。对于向量$\\mathbf{x}, \\mathbf{y} \\in \\mathbb{R}^d$,它们的余弦相似度是它们之间角度的余弦:\n", 25 | "\n", 26 | "$$\\frac{\\mathbf{x}^\\top \\mathbf{y}}{\\|\\mathbf{x}\\| \\|\\mathbf{y}\\|} \\in [-1, 1].$$\n", 27 | "\n", 28 | "由于任意两个不同词的独热向量之间的余弦相似度为0,所以独热向量不能编码词之间的相似性。\n", 29 | "\n", 30 | "## 自监督的word2vec\n", 31 | "\n", 32 | "[word2vec](https://code.google.com/archive/p/word2vec/)工具是为了解决上述问题而提出的。它将每个词映射到一个固定长度的向量,这些向量能更好地表达不同词之间的相似性和类比关系。word2vec工具包含两个模型,即*跳元模型*(skip-gram) :cite:`Mikolov.Sutskever.Chen.ea.2013`和*连续词袋*(CBOW) :cite:`Mikolov.Chen.Corrado.ea.2013`。对于在语义上有意义的表示,它们的训练依赖于条件概率,条件概率可以被看作使用语料库中一些词来预测另一些单词。由于是不带标签的数据,因此跳元模型和连续词袋都是自监督模型。\n", 33 | "\n", 34 | "下面,我们将介绍这两种模式及其训练方法。\n", 35 | "\n", 36 | "## 跳元模型(Skip-Gram)\n", 37 | ":label:`subsec_skip-gram`\n", 38 | "\n", 39 | "跳元模型假设一个词可以用来在文本序列中生成其周围的单词。以文本序列“the”“man”“loves”“his”“son”为例。假设*中心词*选择“loves”,并将上下文窗口设置为2,如图 :numref:`fig_skip_gram`所示,给定中心词“loves”,跳元模型考虑生成*上下文词*“the”“man”“him”“son”的条件概率:\n", 40 | "\n", 41 | "$$P(\\textrm{\"the\"},\\textrm{\"man\"},\\textrm{\"his\"},\\textrm{\"son\"}\\mid\\textrm{\"loves\"}).$$\n", 42 | "\n", 43 | "假设上下文词是在给定中心词的情况下独立生成的(即条件独立性)。在这种情况下,上述条件概率可以重写为:\n", 44 | "\n", 45 | "$$P(\\textrm{\"the\"}\\mid\\textrm{\"loves\"})\\cdot P(\\textrm{\"man\"}\\mid\\textrm{\"loves\"})\\cdot P(\\textrm{\"his\"}\\mid\\textrm{\"loves\"})\\cdot P(\\textrm{\"son\"}\\mid\\textrm{\"loves\"}).$$\n", 46 | "\n", 47 | "![跳元模型考虑了在给定中心词的情况下生成周围上下文词的条件概率](../img/skip-gram.svg)\n", 48 | ":label:`fig_skip_gram`\n", 49 | "\n", 50 | "在跳元模型中,每个词都有两个$d$维向量表示,用于计算条件概率。更具体地说,对于词典中索引为$i$的任何词,分别用$\\mathbf{v}_i\\in\\mathbb{R}^d$和$\\mathbf{u}_i\\in\\mathbb{R}^d$表示其用作*中心词*和*上下文词*时的两个向量。给定中心词$w_c$(词典中的索引$c$),生成任何上下文词$w_o$(词典中的索引$o$)的条件概率可以通过对向量点积的softmax操作来建模:\n", 51 | "\n", 52 | "$$P(w_o \\mid w_c) = \\frac{\\text{exp}(\\mathbf{u}_o^\\top \\mathbf{v}_c)}{ \\sum_{i \\in \\mathcal{V}} \\text{exp}(\\mathbf{u}_i^\\top 
\\mathbf{v}_c)},$$\n", 53 | ":eqlabel:`eq_skip-gram-softmax`\n", 54 | "\n", 55 | "其中词表索引集$\\mathcal{V} = \\{0, 1, \\ldots, |\\mathcal{V}|-1\\}$。给定长度为$T$的文本序列,其中时间步$t$处的词表示为$w^{(t)}$。假设上下文词是在给定任何中心词的情况下独立生成的。对于上下文窗口$m$,跳元模型的似然函数是在给定任何中心词的情况下生成所有上下文词的概率:\n", 56 | "\n", 57 | "$$ \\prod_{t=1}^{T} \\prod_{-m \\leq j \\leq m,\\ j \\neq 0} P(w^{(t+j)} \\mid w^{(t)}),$$\n", 58 | "\n", 59 | "其中可以省略小于$1$或大于$T$的任何时间步。\n", 60 | "\n", 61 | "### 训练\n", 62 | "\n", 63 | "跳元模型参数是词表中每个词的中心词向量和上下文词向量。在训练中,我们通过最大化似然函数(即极大似然估计)来学习模型参数。这相当于最小化以下损失函数:\n", 64 | "\n", 65 | "$$ - \\sum_{t=1}^{T} \\sum_{-m \\leq j \\leq m,\\ j \\neq 0} \\text{log}\\, P(w^{(t+j)} \\mid w^{(t)}).$$\n", 66 | "\n", 67 | "当使用随机梯度下降来最小化损失时,在每次迭代中可以随机抽样一个较短的子序列来计算该子序列的(随机)梯度,以更新模型参数。为了计算该(随机)梯度,我们需要获得对数条件概率关于中心词向量和上下文词向量的梯度。通常,根据 :eqref:`eq_skip-gram-softmax`,涉及中心词$w_c$和上下文词$w_o$的对数条件概率为:\n", 68 | "\n", 69 | "$$\\log P(w_o \\mid w_c) =\\mathbf{u}_o^\\top \\mathbf{v}_c - \\log\\left(\\sum_{i \\in \\mathcal{V}} \\text{exp}(\\mathbf{u}_i^\\top \\mathbf{v}_c)\\right).$$\n", 70 | ":eqlabel:`eq_skip-gram-log`\n", 71 | "\n", 72 | "通过微分,我们可以获得其相对于中心词向量$\\mathbf{v}_c$的梯度为\n", 73 | "\n", 74 | "$$\\begin{aligned}\\frac{\\partial \\text{log}\\, P(w_o \\mid w_c)}{\\partial \\mathbf{v}_c}&= \\mathbf{u}_o - \\frac{\\sum_{j \\in \\mathcal{V}} \\exp(\\mathbf{u}_j^\\top \\mathbf{v}_c)\\mathbf{u}_j}{\\sum_{i \\in \\mathcal{V}} \\exp(\\mathbf{u}_i^\\top \\mathbf{v}_c)}\\\\&= \\mathbf{u}_o - \\sum_{j \\in \\mathcal{V}} \\left(\\frac{\\text{exp}(\\mathbf{u}_j^\\top \\mathbf{v}_c)}{ \\sum_{i \\in \\mathcal{V}} \\text{exp}(\\mathbf{u}_i^\\top \\mathbf{v}_c)}\\right) \\mathbf{u}_j\\\\&= \\mathbf{u}_o - \\sum_{j \\in \\mathcal{V}} P(w_j \\mid w_c) \\mathbf{u}_j.\\end{aligned}$$\n", 75 | ":eqlabel:`eq_skip-gram-grad`\n", 76 | "\n", 77 | "注意, :eqref:`eq_skip-gram-grad`中的计算需要词典中以$w_c$为中心词的所有词的条件概率。其他词向量的梯度可以以相同的方式获得。\n", 78 | "\n", 79 | "对词典中索引为$i$的词进行训练后,得到$\\mathbf{v}_i$(作为中心词)和$\\mathbf{u}_i$(作为上下文词)两个词向量。在自然语言处理应用中,跳元模型的中心词向量通常用作词表示。\n", 80 | "\n", 81 | "## 连续词袋(CBOW)模型\n", 82 | "\n", 83 | "*连续词袋*(CBOW)模型类似于跳元模型。与跳元模型的主要区别在于,连续词袋模型假设中心词是基于其在文本序列中的周围上下文词生成的。例如,在文本序列“the”“man”“loves”“his”“son”中,在“loves”为中心词且上下文窗口为2的情况下,连续词袋模型考虑基于上下文词“the”“man”“him”“son”(如 :numref:`fig_cbow`所示)生成中心词“loves”的条件概率,即:\n", 84 | "\n", 85 | "$$P(\\textrm{\"loves\"}\\mid\\textrm{\"the\"},\\textrm{\"man\"},\\textrm{\"his\"},\\textrm{\"son\"}).$$\n", 86 | "\n", 87 | "![连续词袋模型考虑了给定周围上下文词生成中心词条件概率](../img/cbow.svg)\n", 88 | ":label:`fig_cbow`\n", 89 | "\n", 90 | "\n", 91 | "由于连续词袋模型中存在多个上下文词,因此在计算条件概率时对这些上下文词向量进行平均。具体地说,对于字典中索引$i$的任意词,分别用$\\mathbf{v}_i\\in\\mathbb{R}^d$和$\\mathbf{u}_i\\in\\mathbb{R}^d$表示用作*上下文*词和*中心*词的两个向量(符号与跳元模型中相反)。给定上下文词$w_{o_1}, \\ldots, w_{o_{2m}}$(在词表中索引是$o_1, \\ldots, o_{2m}$)生成任意中心词$w_c$(在词表中索引是$c$)的条件概率可以由以下公式建模:\n", 92 | "\n", 93 | "$$P(w_c \\mid w_{o_1}, \\ldots, w_{o_{2m}}) = \\frac{\\text{exp}\\left(\\frac{1}{2m}\\mathbf{u}_c^\\top (\\mathbf{v}_{o_1} + \\ldots, + \\mathbf{v}_{o_{2m}}) \\right)}{ \\sum_{i \\in \\mathcal{V}} \\text{exp}\\left(\\frac{1}{2m}\\mathbf{u}_i^\\top (\\mathbf{v}_{o_1} + \\ldots, + \\mathbf{v}_{o_{2m}}) \\right)}.$$\n", 94 | ":eqlabel:`fig_cbow-full`\n", 95 | "\n", 96 | "为了简洁起见,我们设为$\\mathcal{W}_o= \\{w_{o_1}, \\ldots, w_{o_{2m}}\\}$和$\\bar{\\mathbf{v}}_o = \\left(\\mathbf{v}_{o_1} + \\ldots, + \\mathbf{v}_{o_{2m}} \\right)/(2m)$。那么 :eqref:`fig_cbow-full`可以简化为:\n", 97 | "\n", 98 | "$$P(w_c \\mid \\mathcal{W}_o) = \\frac{\\exp\\left(\\mathbf{u}_c^\\top \\bar{\\mathbf{v}}_o\\right)}{\\sum_{i \\in \\mathcal{V}} \\exp\\left(\\mathbf{u}_i^\\top 
\\bar{\\mathbf{v}}_o\\right)}.$$\n", 99 | "\n", 100 | "给定长度为$T$的文本序列,其中时间步$t$处的词表示为$w^{(t)}$。对于上下文窗口$m$,连续词袋模型的似然函数是在给定其上下文词的情况下生成所有中心词的概率:\n", 101 | "\n", 102 | "$$ \\prod_{t=1}^{T} P(w^{(t)} \\mid w^{(t-m)}, \\ldots, w^{(t-1)}, w^{(t+1)}, \\ldots, w^{(t+m)}).$$\n", 103 | "\n", 104 | "### 训练\n", 105 | "\n", 106 | "训练连续词袋模型与训练跳元模型几乎是一样的。连续词袋模型的最大似然估计等价于最小化以下损失函数:\n", 107 | "\n", 108 | "$$ -\\sum_{t=1}^T \\text{log}\\, P(w^{(t)} \\mid w^{(t-m)}, \\ldots, w^{(t-1)}, w^{(t+1)}, \\ldots, w^{(t+m)}).$$\n", 109 | "\n", 110 | "请注意,\n", 111 | "\n", 112 | "$$\\log\\,P(w_c \\mid \\mathcal{W}_o) = \\mathbf{u}_c^\\top \\bar{\\mathbf{v}}_o - \\log\\,\\left(\\sum_{i \\in \\mathcal{V}} \\exp\\left(\\mathbf{u}_i^\\top \\bar{\\mathbf{v}}_o\\right)\\right).$$\n", 113 | "\n", 114 | "通过微分,我们可以获得其关于任意上下文词向量$\\mathbf{v}_{o_i}$($i = 1, \\ldots, 2m$)的梯度,如下:\n", 115 | "\n", 116 | "$$\\frac{\\partial \\log\\, P(w_c \\mid \\mathcal{W}_o)}{\\partial \\mathbf{v}_{o_i}} = \\frac{1}{2m} \\left(\\mathbf{u}_c - \\sum_{j \\in \\mathcal{V}} \\frac{\\exp(\\mathbf{u}_j^\\top \\bar{\\mathbf{v}}_o)\\mathbf{u}_j}{ \\sum_{i \\in \\mathcal{V}} \\text{exp}(\\mathbf{u}_i^\\top \\bar{\\mathbf{v}}_o)} \\right) = \\frac{1}{2m}\\left(\\mathbf{u}_c - \\sum_{j \\in \\mathcal{V}} P(w_j \\mid \\mathcal{W}_o) \\mathbf{u}_j \\right).$$\n", 117 | ":eqlabel:`eq_cbow-gradient`\n", 118 | "\n", 119 | "其他词向量的梯度可以以相同的方式获得。与跳元模型不同,连续词袋模型通常使用上下文词向量作为词表示。\n", 120 | "\n", 121 | "## 小结\n", 122 | "\n", 123 | "* 词向量是用于表示单词意义的向量,也可以看作词的特征向量。将词映射到实向量的技术称为词嵌入。\n", 124 | "* word2vec工具包含跳元模型和连续词袋模型。\n", 125 | "* 跳元模型假设一个单词可用于在文本序列中,生成其周围的单词;而连续词袋模型假设基于上下文词来生成中心单词。\n", 126 | "\n", 127 | "## 练习\n", 128 | "\n", 129 | "1. 计算每个梯度的计算复杂度是多少?如果词表很大,会有什么问题呢?\n", 130 | "1. 英语中的一些固定短语由多个单词组成,例如“new york”。如何训练它们的词向量?提示:查看word2vec论文的第四节 :cite:`Mikolov.Sutskever.Chen.ea.2013`。\n", 131 | "1. 
让我们以跳元模型为例来思考word2vec设计。跳元模型中两个词向量的点积与余弦相似度之间有什么关系?对于语义相似的一对词,为什么它们的词向量(由跳元模型训练)的余弦相似度可能很高?\n", 132 | "\n", 133 | "[Discussions](https://discuss.d2l.ai/t/5744)\n" 134 | ] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "conda_pytorch_p36", 140 | "name": "conda_pytorch_p36" 141 | }, 142 | "language_info": { 143 | "name": "python" 144 | }, 145 | "required_libs": [] 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 5 149 | } -------------------------------------------------------------------------------- /chapter_notation/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "26b7e5a4", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 符号\n", 11 | ":label:`chap_notation`\n", 12 | "\n", 13 | "本书中使用的符号概述如下。\n", 14 | "\n", 15 | "## 数字\n", 16 | "\n", 17 | "* $x$:标量\n", 18 | "* $\\mathbf{x}$:向量\n", 19 | "* $\\mathbf{X}$:矩阵\n", 20 | "* $\\mathsf{X}$:张量\n", 21 | "* $\\mathbf{I}$:单位矩阵\n", 22 | "* $x_i$, $[\\mathbf{x}]_i$:向量$\\mathbf{x}$第$i$个元素\n", 23 | "* $x_{ij}$, $[\\mathbf{X}]_{ij}$:矩阵$\\mathbf{X}$第$i$行第$j$列的元素\n", 24 | "\n", 25 | "## 集合论\n", 26 | "\n", 27 | "* $\\mathcal{X}$: 集合\n", 28 | "* $\\mathbb{Z}$: 整数集合\n", 29 | "* $\\mathbb{R}$: 实数集合\n", 30 | "* $\\mathbb{R}^n$: $n$维实数向量集合\n", 31 | "* $\\mathbb{R}^{a\\times b}$: 包含$a$行和$b$列的实数矩阵集合\n", 32 | "* $\\mathcal{A}\\cup\\mathcal{B}$: 集合$\\mathcal{A}$和$\\mathcal{B}$的并集\n", 33 | "* $\\mathcal{A}\\cap\\mathcal{B}$:集合$\\mathcal{A}$和$\\mathcal{B}$的交集\n", 34 | "* $\\mathcal{A}\\setminus\\mathcal{B}$:集合$\\mathcal{A}$与集合$\\mathcal{B}$相减,$\\mathcal{B}$关于$\\mathcal{A}$的相对补集\n", 35 | "\n", 36 | "## 函数和运算符\n", 37 | "\n", 38 | "* $f(\\cdot)$:函数\n", 39 | "* $\\log(\\cdot)$:自然对数\n", 40 | "* $\\exp(\\cdot)$: 指数函数\n", 41 | "* $\\mathbf{1}_\\mathcal{X}$: 指示函数\n", 42 | "* $\\mathbf{(\\cdot)}^\\top$: 向量或矩阵的转置\n", 43 | "* $\\mathbf{X}^{-1}$: 矩阵的逆\n", 44 | "* $\\odot$: 按元素相乘\n", 45 | "* $[\\cdot, \\cdot]$:连结\n", 46 | "* $\\lvert \\mathcal{X} \\rvert$:集合的基数\n", 47 | "* $\\|\\cdot\\|_p$: :$L_p$ 正则\n", 48 | "* $\\|\\cdot\\|$: $L_2$ 正则\n", 49 | "* $\\langle \\mathbf{x}, \\mathbf{y} \\rangle$:向量$\\mathbf{x}$和$\\mathbf{y}$的点积\n", 50 | "* $\\sum$: 连加\n", 51 | "* $\\prod$: 连乘\n", 52 | "* $\\stackrel{\\mathrm{def}}{=}$:定义\n", 53 | "\n", 54 | "## 微积分\n", 55 | "\n", 56 | "* $\\frac{dy}{dx}$:$y$关于$x$的导数\n", 57 | "* $\\frac{\\partial y}{\\partial x}$:$y$关于$x$的偏导数\n", 58 | "* $\\nabla_{\\mathbf{x}} y$:$y$关于$\\mathbf{x}$的梯度\n", 59 | "* $\\int_a^b f(x) \\;dx$: $f$在$a$到$b$区间上关于$x$的定积分\n", 60 | "* $\\int f(x) \\;dx$: $f$关于$x$的不定积分\n", 61 | "\n", 62 | "## 概率与信息论\n", 63 | "\n", 64 | "* $P(\\cdot)$:概率分布\n", 65 | "* $z \\sim P$: 随机变量$z$具有概率分布$P$\n", 66 | "* $P(X \\mid Y)$:$X\\mid Y$的条件概率\n", 67 | "* $p(x)$: 概率密度函数\n", 68 | "* ${E}_{x} [f(x)]$: 函数$f$对$x$的数学期望\n", 69 | "* $X \\perp Y$: 随机变量$X$和$Y$是独立的\n", 70 | "* $X \\perp Y \\mid Z$: 随机变量$X$和$Y$在给定随机变量$Z$的条件下是独立的\n", 71 | "* $\\mathrm{Var}(X)$: 随机变量$X$的方差\n", 72 | "* $\\sigma_X$: 随机变量$X$的标准差\n", 73 | "* $\\mathrm{Cov}(X, Y)$: 随机变量$X$和$Y$的协方差\n", 74 | "* $\\rho(X, Y)$: 随机变量$X$和$Y$的相关性\n", 75 | "* $H(X)$: 随机变量$X$的熵\n", 76 | "* $D_{\\mathrm{KL}}(P\\|Q)$: $P$和$Q$的KL-散度\n", 77 | "\n", 78 | "## 复杂度\n", 79 | "\n", 80 | "* $\\mathcal{O}$:大O标记\n", 81 | "\n", 82 | "[Discussions](https://discuss.d2l.ai/t/2089)\n" 83 | ] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "conda_pytorch_p36", 89 | "name": "conda_pytorch_p36" 90 | }, 91 | "language_info": { 92 | "name": 
"python" 93 | }, 94 | "required_libs": [] 95 | }, 96 | "nbformat": 4, 97 | "nbformat_minor": 5 98 | } -------------------------------------------------------------------------------- /chapter_optimization/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "18ed6c9f", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 优化算法\n", 11 | ":label:`chap_optimization`\n", 12 | "\n", 13 | "截止到目前,本书已经使用了许多优化算法来训练深度学习模型。优化算法使我们能够继续更新模型参数,并使损失函数的值最小化。这就像在训练集上评估一样。事实上,任何满足于将优化视为黑盒装置,以在简单的设置中最小化目标函数的人,都可能会知道存在着一系列此类“咒语”(名称如“SGD”和“Adam”)。\n", 14 | "\n", 15 | "但是,为了做得更好,还需要更深入的知识。优化算法对于深度学习非常重要。一方面,训练复杂的深度学习模型可能需要数小时、几天甚至数周。优化算法的性能直接影响模型的训练效率。另一方面,了解不同优化算法的原则及其超参数的作用将使我们能够以有针对性的方式调整超参数,以提高深度学习模型的性能。\n", 16 | "\n", 17 | "在本章中,我们深入探讨常见的深度学习优化算法。深度学习中出现的几乎所有优化问题都是*非凸*的。尽管如此,在*凸问题*背景下设计和分析算法是非常有启发性的。正是出于这个原因,本章包括了凸优化的入门,以及凸目标函数上非常简单的随机梯度下降算法的证明。\n", 18 | "\n", 19 | ":begin_tab:toc\n", 20 | " - [optimization-intro](optimization-intro.ipynb)\n", 21 | " - [convexity](convexity.ipynb)\n", 22 | " - [gd](gd.ipynb)\n", 23 | " - [sgd](sgd.ipynb)\n", 24 | " - [minibatch-sgd](minibatch-sgd.ipynb)\n", 25 | " - [momentum](momentum.ipynb)\n", 26 | " - [adagrad](adagrad.ipynb)\n", 27 | " - [rmsprop](rmsprop.ipynb)\n", 28 | " - [adadelta](adadelta.ipynb)\n", 29 | " - [adam](adam.ipynb)\n", 30 | " - [lr-scheduler](lr-scheduler.ipynb)\n", 31 | ":end_tab:\n" 32 | ] 33 | } 34 | ], 35 | "metadata": { 36 | "kernelspec": { 37 | "display_name": "conda_pytorch_p36", 38 | "name": "conda_pytorch_p36" 39 | }, 40 | "language_info": { 41 | "name": "python" 42 | }, 43 | "required_libs": [] 44 | }, 45 | "nbformat": 4, 46 | "nbformat_minor": 5 47 | } -------------------------------------------------------------------------------- /chapter_preliminaries/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fc08a2aa", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 预备知识\n", 11 | ":label:`chap_preliminaries`\n", 12 | "\n", 13 | "要学习深度学习,首先需要先掌握一些基本技能。\n", 14 | "所有机器学习方法都涉及从数据中提取信息。\n", 15 | "因此,我们先学习一些关于数据的实用技能,包括存储、操作和预处理数据。\n", 16 | "\n", 17 | "机器学习通常需要处理大型数据集。\n", 18 | "我们可以将某些数据集视为一个表,其中表的行对应样本,列对应属性。\n", 19 | "线性代数为人们提供了一些用来处理表格数据的方法。\n", 20 | "我们不会太深究细节,而是将重点放在矩阵运算的基本原理及其实现上。\n", 21 | "\n", 22 | "深度学习是关于优化的学习。\n", 23 | "对于一个带有参数的模型,我们想要找到其中能拟合数据的最好模型。\n", 24 | "在算法的每个步骤中,决定以何种方式调整参数需要一点微积分知识。\n", 25 | "本章将简要介绍这些知识。\n", 26 | "幸运的是,`autograd`包会自动计算微分,本章也将介绍它。\n", 27 | "\n", 28 | "机器学习还涉及如何做出预测:给定观察到的信息,某些未知属性可能的值是多少?\n", 29 | "要在不确定的情况下进行严格的推断,我们需要借用概率语言。\n", 30 | "\n", 31 | "最后,官方文档提供了本书之外的大量描述和示例。\n", 32 | "在本章的结尾,我们将展示如何在官方文档中查找所需信息。\n", 33 | "\n", 34 | "本书对读者数学基础无过分要求,只要可以正确理解深度学习所需的数学知识即可。\n", 35 | "但这并不意味着本书中不涉及数学方面的内容,本章会快速介绍一些基本且常用的数学知识,\n", 36 | "以便读者能够理解书中的大部分数学内容。\n", 37 | "如果读者想要深入理解全部数学内容,可以进一步学习本书数学附录中给出的数学基础知识。\n", 38 | "\n", 39 | ":begin_tab:toc\n", 40 | " - [ndarray](ndarray.ipynb)\n", 41 | " - [pandas](pandas.ipynb)\n", 42 | " - [linear-algebra](linear-algebra.ipynb)\n", 43 | " - [calculus](calculus.ipynb)\n", 44 | " - [autograd](autograd.ipynb)\n", 45 | " - [probability](probability.ipynb)\n", 46 | " - [lookup-api](lookup-api.ipynb)\n", 47 | ":end_tab:\n" 48 | ] 49 | } 50 | ], 51 | "metadata": { 52 | "kernelspec": { 53 | "display_name": "conda_pytorch_p36", 54 | "name": "conda_pytorch_p36" 55 | }, 56 | "language_info": { 57 | "name": "python" 58 | }, 59 | "required_libs": 
[] 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 5 63 | } -------------------------------------------------------------------------------- /chapter_preliminaries/lookup-api.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "01132d59", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 查阅文档\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "b7f72d17", 16 | "metadata": { 17 | "origin_pos": 2, 18 | "tab": [ 19 | "pytorch" 20 | ] 21 | }, 22 | "source": [ 23 | "由于篇幅限制,本书不可能介绍每一个PyTorch函数和类。\n", 24 | "API文档、其他教程和示例提供了本书之外的大量文档。\n", 25 | "本节提供了一些查看PyTorch API的指导。\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "id": "97173144", 31 | "metadata": { 32 | "origin_pos": 4 33 | }, 34 | "source": [ 35 | "## 查找模块中的所有函数和类\n", 36 | "\n", 37 | "为了知道模块中可以调用哪些函数和类,可以调用`dir`函数。\n", 38 | "例如,我们可以(**查询随机数生成模块中的所有属性:**)\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 1, 44 | "id": "8f7f4d63", 45 | "metadata": { 46 | "execution": { 47 | "iopub.execute_input": "2023-08-18T07:05:30.519062Z", 48 | "iopub.status.busy": "2023-08-18T07:05:30.518501Z", 49 | "iopub.status.idle": "2023-08-18T07:05:31.469749Z", 50 | "shell.execute_reply": "2023-08-18T07:05:31.468858Z" 51 | }, 52 | "origin_pos": 6, 53 | "tab": [ 54 | "pytorch" 55 | ] 56 | }, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "['AbsTransform', 'AffineTransform', 'Bernoulli', 'Beta', 'Binomial', 'CatTransform', 'Categorical', 'Cauchy', 'Chi2', 'ComposeTransform', 'ContinuousBernoulli', 'CorrCholeskyTransform', 'CumulativeDistributionTransform', 'Dirichlet', 'Distribution', 'ExpTransform', 'Exponential', 'ExponentialFamily', 'FisherSnedecor', 'Gamma', 'Geometric', 'Gumbel', 'HalfCauchy', 'HalfNormal', 'Independent', 'IndependentTransform', 'Kumaraswamy', 'LKJCholesky', 'Laplace', 'LogNormal', 'LogisticNormal', 'LowRankMultivariateNormal', 'LowerCholeskyTransform', 'MixtureSameFamily', 'Multinomial', 'MultivariateNormal', 'NegativeBinomial', 'Normal', 'OneHotCategorical', 'OneHotCategoricalStraightThrough', 'Pareto', 'Poisson', 'PowerTransform', 'RelaxedBernoulli', 'RelaxedOneHotCategorical', 'ReshapeTransform', 'SigmoidTransform', 'SoftmaxTransform', 'SoftplusTransform', 'StackTransform', 'StickBreakingTransform', 'StudentT', 'TanhTransform', 'Transform', 'TransformedDistribution', 'Uniform', 'VonMises', 'Weibull', 'Wishart', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'bernoulli', 'beta', 'biject_to', 'binomial', 'categorical', 'cauchy', 'chi2', 'constraint_registry', 'constraints', 'continuous_bernoulli', 'dirichlet', 'distribution', 'exp_family', 'exponential', 'fishersnedecor', 'gamma', 'geometric', 'gumbel', 'half_cauchy', 'half_normal', 'identity_transform', 'independent', 'kl', 'kl_divergence', 'kumaraswamy', 'laplace', 'lkj_cholesky', 'log_normal', 'logistic_normal', 'lowrank_multivariate_normal', 'mixture_same_family', 'multinomial', 'multivariate_normal', 'negative_binomial', 'normal', 'one_hot_categorical', 'pareto', 'poisson', 'register_kl', 'relaxed_bernoulli', 'relaxed_categorical', 'studentT', 'transform_to', 'transformed_distribution', 'transforms', 'uniform', 'utils', 'von_mises', 'weibull', 'wishart']\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "import torch\n", 68 | "\n", 69 | "print(dir(torch.distributions))" 70 | ] 71 | }, 72 | { 
73 | "cell_type": "markdown", 74 | "id": "a6e589e9", 75 | "metadata": { 76 | "origin_pos": 9 77 | }, 78 | "source": [ 79 | "通常可以忽略以“`__`”(双下划线)开始和结束的函数,它们是Python中的特殊对象,\n", 80 | "或以单个“`_`”(单下划线)开始的函数,它们通常是内部函数。\n", 81 | "根据剩余的函数名或属性名,我们可能会猜测这个模块提供了各种生成随机数的方法,\n", 82 | "包括从均匀分布(`uniform`)、正态分布(`normal`)和多项分布(`multinomial`)中采样。\n", 83 | "\n", 84 | "## 查找特定函数和类的用法\n", 85 | "\n", 86 | "有关如何使用给定函数或类的更具体说明,可以调用`help`函数。\n", 87 | "例如,我们来[**查看张量`ones`函数的用法。**]\n" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 2, 93 | "id": "a16494ed", 94 | "metadata": { 95 | "execution": { 96 | "iopub.execute_input": "2023-08-18T07:05:31.473606Z", 97 | "iopub.status.busy": "2023-08-18T07:05:31.472946Z", 98 | "iopub.status.idle": "2023-08-18T07:05:31.477780Z", 99 | "shell.execute_reply": "2023-08-18T07:05:31.476938Z" 100 | }, 101 | "origin_pos": 11, 102 | "tab": [ 103 | "pytorch" 104 | ] 105 | }, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "Help on built-in function ones in module torch:\n", 112 | "\n", 113 | "ones(...)\n", 114 | " ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor\n", 115 | " \n", 116 | " Returns a tensor filled with the scalar value `1`, with the shape defined\n", 117 | " by the variable argument :attr:`size`.\n", 118 | " \n", 119 | " Args:\n", 120 | " size (int...): a sequence of integers defining the shape of the output tensor.\n", 121 | " Can be a variable number of arguments or a collection like a list or tuple.\n", 122 | " \n", 123 | " Keyword arguments:\n", 124 | " out (Tensor, optional): the output tensor.\n", 125 | " dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.\n", 126 | " Default: if ``None``, uses a global default (see :func:`torch.set_default_tensor_type`).\n", 127 | " layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.\n", 128 | " Default: ``torch.strided``.\n", 129 | " device (:class:`torch.device`, optional): the desired device of returned tensor.\n", 130 | " Default: if ``None``, uses the current device for the default tensor type\n", 131 | " (see :func:`torch.set_default_tensor_type`). :attr:`device` will be the CPU\n", 132 | " for CPU tensor types and the current CUDA device for CUDA tensor types.\n", 133 | " requires_grad (bool, optional): If autograd should record operations on the\n", 134 | " returned tensor. 
Default: ``False``.\n", 135 | " \n", 136 | " Example::\n", 137 | " \n", 138 | " >>> torch.ones(2, 3)\n", 139 | " tensor([[ 1., 1., 1.],\n", 140 | " [ 1., 1., 1.]])\n", 141 | " \n", 142 | " >>> torch.ones(5)\n", 143 | " tensor([ 1., 1., 1., 1., 1.])\n", 144 | "\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "help(torch.ones)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "id": "903c096e", 155 | "metadata": { 156 | "origin_pos": 14 157 | }, 158 | "source": [ 159 | "从文档中,我们可以看到`ones`函数创建一个具有指定形状的新张量,并将所有元素值设置为1。\n", 160 | "下面来[**运行一个快速测试**]来确认这一解释:\n" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 3, 166 | "id": "7870b2f5", 167 | "metadata": { 168 | "execution": { 169 | "iopub.execute_input": "2023-08-18T07:05:31.481310Z", 170 | "iopub.status.busy": "2023-08-18T07:05:31.480685Z", 171 | "iopub.status.idle": "2023-08-18T07:05:31.490398Z", 172 | "shell.execute_reply": "2023-08-18T07:05:31.489581Z" 173 | }, 174 | "origin_pos": 16, 175 | "tab": [ 176 | "pytorch" 177 | ] 178 | }, 179 | "outputs": [ 180 | { 181 | "data": { 182 | "text/plain": [ 183 | "tensor([1., 1., 1., 1.])" 184 | ] 185 | }, 186 | "execution_count": 3, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "torch.ones(4)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "id": "dd4f531d", 198 | "metadata": { 199 | "origin_pos": 19 200 | }, 201 | "source": [ 202 | "在Jupyter记事本中,我们可以使用`?`指令在另一个浏览器窗口中显示文档。\n", 203 | "例如,`list?`指令将创建与`help(list)`指令几乎相同的内容,并在新的浏览器窗口中显示它。\n", 204 | "此外,如果我们使用两个问号,如`list??`,将显示实现该函数的Python代码。\n", 205 | "\n", 206 | "## 小结\n", 207 | "\n", 208 | "* 官方文档提供了本书之外的大量描述和示例。\n", 209 | "* 可以通过调用`dir`和`help`函数或在Jupyter记事本中使用`?`和`??`查看API的用法文档。\n", 210 | "\n", 211 | "## 练习\n", 212 | "\n", 213 | "1. 
在深度学习框架中查找任何函数或类的文档。请尝试在这个框架的官方网站上找到文档。\n" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "id": "197b3dc7", 219 | "metadata": { 220 | "origin_pos": 21, 221 | "tab": [ 222 | "pytorch" 223 | ] 224 | }, 225 | "source": [ 226 | "[Discussions](https://discuss.d2l.ai/t/1765)\n" 227 | ] 228 | } 229 | ], 230 | "metadata": { 231 | "kernelspec": { 232 | "display_name": "conda_pytorch_p36", 233 | "name": "conda_pytorch_p36" 234 | }, 235 | "language_info": { 236 | "name": "python" 237 | }, 238 | "required_libs": [] 239 | }, 240 | "nbformat": 4, 241 | "nbformat_minor": 5 242 | } -------------------------------------------------------------------------------- /chapter_preliminaries/pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ab73852c", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 数据预处理\n", 11 | ":label:`sec_pandas`\n", 12 | "\n", 13 | "为了能用深度学习来解决现实世界的问题,我们经常从预处理原始数据开始,\n", 14 | "而不是从那些准备好的张量格式数据开始。\n", 15 | "在Python中常用的数据分析工具中,我们通常使用`pandas`软件包。\n", 16 | "像庞大的Python生态系统中的许多其他扩展包一样,`pandas`可以与张量兼容。\n", 17 | "本节我们将简要介绍使用`pandas`预处理原始数据,并将原始数据转换为张量格式的步骤。\n", 18 | "后面的章节将介绍更多的数据预处理技术。\n", 19 | "\n", 20 | "## 读取数据集\n", 21 | "\n", 22 | "举一个例子,我们首先(**创建一个人工数据集,并存储在CSV(逗号分隔值)文件**)\n", 23 | "`../data/house_tiny.csv`中。\n", 24 | "以其他格式存储的数据也可以通过类似的方式进行处理。\n", 25 | "下面我们将数据集按行写入CSV文件中。\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "id": "ee72fd16", 32 | "metadata": { 33 | "execution": { 34 | "iopub.execute_input": "2023-08-18T07:03:38.903209Z", 35 | "iopub.status.busy": "2023-08-18T07:03:38.902351Z", 36 | "iopub.status.idle": "2023-08-18T07:03:38.918117Z", 37 | "shell.execute_reply": "2023-08-18T07:03:38.916775Z" 38 | }, 39 | "origin_pos": 1, 40 | "tab": [ 41 | "pytorch" 42 | ] 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "import os\n", 47 | "\n", 48 | "os.makedirs(os.path.join('..', 'data'), exist_ok=True)\n", 49 | "data_file = os.path.join('..', 'data', 'house_tiny.csv')\n", 50 | "with open(data_file, 'w') as f:\n", 51 | " f.write('NumRooms,Alley,Price\\n') # 列名\n", 52 | " f.write('NA,Pave,127500\\n') # 每行表示一个数据样本\n", 53 | " f.write('2,NA,106000\\n')\n", 54 | " f.write('4,NA,178100\\n')\n", 55 | " f.write('NA,NA,140000\\n')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "id": "f5be7568", 61 | "metadata": { 62 | "origin_pos": 2 63 | }, 64 | "source": [ 65 | "要[**从创建的CSV文件中加载原始数据集**],我们导入`pandas`包并调用`read_csv`函数。该数据集有四行三列。其中每行描述了房间数量(“NumRooms”)、巷子类型(“Alley”)和房屋价格(“Price”)。\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 2, 71 | "id": "5fb16e52", 72 | "metadata": { 73 | "execution": { 74 | "iopub.execute_input": "2023-08-18T07:03:38.923957Z", 75 | "iopub.status.busy": "2023-08-18T07:03:38.923101Z", 76 | "iopub.status.idle": "2023-08-18T07:03:39.372116Z", 77 | "shell.execute_reply": "2023-08-18T07:03:39.371151Z" 78 | }, 79 | "origin_pos": 3, 80 | "tab": [ 81 | "pytorch" 82 | ] 83 | }, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | " NumRooms Alley Price\n", 90 | "0 NaN Pave 127500\n", 91 | "1 2.0 NaN 106000\n", 92 | "2 4.0 NaN 178100\n", 93 | "3 NaN NaN 140000\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "# 如果没有安装pandas,只需取消对以下行的注释来安装pandas\n", 99 | "# !pip install pandas\n", 100 | "import pandas as pd\n", 101 | "\n", 102 | "data = pd.read_csv(data_file)\n", 103 | "print(data)" 104 | ] 105 | }, 106 | { 107 | "cell_type": 
"markdown", 108 | "id": "30188bf5", 109 | "metadata": { 110 | "origin_pos": 4 111 | }, 112 | "source": [ 113 | "## 处理缺失值\n", 114 | "\n", 115 | "注意,“NaN”项代表缺失值。\n", 116 | "[**为了处理缺失的数据,典型的方法包括*插值法*和*删除法*,**]\n", 117 | "其中插值法用一个替代值弥补缺失值,而删除法则直接忽略缺失值。\n", 118 | "在(**这里,我们将考虑插值法**)。\n", 119 | "\n", 120 | "通过位置索引`iloc`,我们将`data`分成`inputs`和`outputs`,\n", 121 | "其中前者为`data`的前两列,而后者为`data`的最后一列。\n", 122 | "对于`inputs`中缺少的数值,我们用同一列的均值替换“NaN”项。\n" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 3, 128 | "id": "d460a301", 129 | "metadata": { 130 | "execution": { 131 | "iopub.execute_input": "2023-08-18T07:03:39.375828Z", 132 | "iopub.status.busy": "2023-08-18T07:03:39.375535Z", 133 | "iopub.status.idle": "2023-08-18T07:03:39.389220Z", 134 | "shell.execute_reply": "2023-08-18T07:03:39.387998Z" 135 | }, 136 | "origin_pos": 5, 137 | "tab": [ 138 | "pytorch" 139 | ] 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | " NumRooms Alley\n", 147 | "0 3.0 Pave\n", 148 | "1 2.0 NaN\n", 149 | "2 4.0 NaN\n", 150 | "3 3.0 NaN\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "inputs, outputs = data.iloc[:, 0:2], data.iloc[:, 2]\n", 156 | "inputs = inputs.fillna(inputs.mean())\n", 157 | "print(inputs)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "id": "eae762a4", 163 | "metadata": { 164 | "origin_pos": 6 165 | }, 166 | "source": [ 167 | "[**对于`inputs`中的类别值或离散值,我们将“NaN”视为一个类别。**]\n", 168 | "由于“巷子类型”(“Alley”)列只接受两种类型的类别值“Pave”和“NaN”,\n", 169 | "`pandas`可以自动将此列转换为两列“Alley_Pave”和“Alley_nan”。\n", 170 | "巷子类型为“Pave”的行会将“Alley_Pave”的值设置为1,“Alley_nan”的值设置为0。\n", 171 | "缺少巷子类型的行会将“Alley_Pave”和“Alley_nan”分别设置为0和1。\n" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 4, 177 | "id": "09ab8738", 178 | "metadata": { 179 | "execution": { 180 | "iopub.execute_input": "2023-08-18T07:03:39.394176Z", 181 | "iopub.status.busy": "2023-08-18T07:03:39.393444Z", 182 | "iopub.status.idle": "2023-08-18T07:03:39.409892Z", 183 | "shell.execute_reply": "2023-08-18T07:03:39.408559Z" 184 | }, 185 | "origin_pos": 7, 186 | "tab": [ 187 | "pytorch" 188 | ] 189 | }, 190 | "outputs": [ 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | " NumRooms Alley_Pave Alley_nan\n", 196 | "0 3.0 1 0\n", 197 | "1 2.0 0 1\n", 198 | "2 4.0 0 1\n", 199 | "3 3.0 0 1\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "inputs = pd.get_dummies(inputs, dummy_na=True)\n", 205 | "print(inputs)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "id": "ea1dd875", 211 | "metadata": { 212 | "origin_pos": 8 213 | }, 214 | "source": [ 215 | "## 转换为张量格式\n", 216 | "\n", 217 | "[**现在`inputs`和`outputs`中的所有条目都是数值类型,它们可以转换为张量格式。**]\n", 218 | "当数据采用张量格式后,可以通过在 :numref:`sec_ndarray`中引入的那些张量函数来进一步操作。\n" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 5, 224 | "id": "4f551c6d", 225 | "metadata": { 226 | "execution": { 227 | "iopub.execute_input": "2023-08-18T07:03:39.414531Z", 228 | "iopub.status.busy": "2023-08-18T07:03:39.413831Z", 229 | "iopub.status.idle": "2023-08-18T07:03:40.467689Z", 230 | "shell.execute_reply": "2023-08-18T07:03:40.466637Z" 231 | }, 232 | "origin_pos": 10, 233 | "tab": [ 234 | "pytorch" 235 | ] 236 | }, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "(tensor([[3., 1., 0.],\n", 242 | " [2., 0., 1.],\n", 243 | " [4., 0., 1.],\n", 244 | " [3., 0., 1.]], dtype=torch.float64),\n", 245 | " tensor([127500., 106000., 178100., 140000.], 
dtype=torch.float64))" 246 | ] 247 | }, 248 | "execution_count": 5, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "import torch\n", 255 | "\n", 256 | "X = torch.tensor(inputs.to_numpy(dtype=float))\n", 257 | "y = torch.tensor(outputs.to_numpy(dtype=float))\n", 258 | "X, y" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "id": "dbcbca0d", 264 | "metadata": { 265 | "origin_pos": 13 266 | }, 267 | "source": [ 268 | "## 小结\n", 269 | "\n", 270 | "* `pandas`软件包是Python中常用的数据分析工具中,`pandas`可以与张量兼容。\n", 271 | "* 用`pandas`处理缺失的数据时,我们可根据情况选择用插值法和删除法。\n", 272 | "\n", 273 | "## 练习\n", 274 | "\n", 275 | "创建包含更多行和列的原始数据集。\n", 276 | "\n", 277 | "1. 删除缺失值最多的列。\n", 278 | "2. 将预处理后的数据集转换为张量格式。\n" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "id": "7b8c6c96", 284 | "metadata": { 285 | "origin_pos": 15, 286 | "tab": [ 287 | "pytorch" 288 | ] 289 | }, 290 | "source": [ 291 | "[Discussions](https://discuss.d2l.ai/t/1750)\n" 292 | ] 293 | } 294 | ], 295 | "metadata": { 296 | "kernelspec": { 297 | "display_name": "conda_pytorch_p36", 298 | "name": "conda_pytorch_p36" 299 | }, 300 | "language_info": { 301 | "name": "python" 302 | }, 303 | "required_libs": [] 304 | }, 305 | "nbformat": 4, 306 | "nbformat_minor": 5 307 | } -------------------------------------------------------------------------------- /chapter_recurrent-modern/beam-search.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "06969ee4", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 束搜索\n", 11 | ":label:`sec_beam-search`\n", 12 | "\n", 13 | "在 :numref:`sec_seq2seq`中,我们逐个预测输出序列,\n", 14 | "直到预测序列中出现特定的序列结束词元“<eos>”。\n", 15 | "本节将首先介绍*贪心搜索*(greedy search)策略,\n", 16 | "并探讨其存在的问题,然后对比其他替代策略:\n", 17 | "*穷举搜索*(exhaustive search)和*束搜索*(beam search)。\n", 18 | "\n", 19 | "在正式介绍贪心搜索之前,我们使用与 :numref:`sec_seq2seq`中\n", 20 | "相同的数学符号定义搜索问题。\n", 21 | "在任意时间步$t'$,解码器输出$y_{t'}$的概率取决于\n", 22 | "时间步$t'$之前的输出子序列$y_1, \\ldots, y_{t'-1}$\n", 23 | "和对输入序列的信息进行编码得到的上下文变量$\\mathbf{c}$。\n", 24 | "为了量化计算代价,用$\\mathcal{Y}$表示输出词表,\n", 25 | "其中包含“<eos>”,\n", 26 | "所以这个词汇集合的基数$\\left|\\mathcal{Y}\\right|$就是词表的大小。\n", 27 | "我们还将输出序列的最大词元数指定为$T'$。\n", 28 | "因此,我们的目标是从所有$\\mathcal{O}(\\left|\\mathcal{Y}\\right|^{T'})$个\n", 29 | "可能的输出序列中寻找理想的输出。\n", 30 | "当然,对于所有输出序列,在“<eos>”之后的部分(非本句)\n", 31 | "将在实际输出中丢弃。\n", 32 | "\n", 33 | "## 贪心搜索\n", 34 | "\n", 35 | "首先,让我们看看一个简单的策略:*贪心搜索*,\n", 36 | "该策略已用于 :numref:`sec_seq2seq`的序列预测。\n", 37 | "对于输出序列的每一时间步$t'$,\n", 38 | "我们都将基于贪心搜索从$\\mathcal{Y}$中找到具有最高条件概率的词元,即:\n", 39 | "\n", 40 | "$$y_{t'} = \\operatorname*{argmax}_{y \\in \\mathcal{Y}} P(y \\mid y_1, \\ldots, y_{t'-1}, \\mathbf{c})$$\n", 41 | "\n", 42 | "一旦输出序列包含了“<eos>”或者达到其最大长度$T'$,则输出完成。\n", 43 | "\n", 44 | "![在每个时间步,贪心搜索选择具有最高条件概率的词元](../img/s2s-prob1.svg)\n", 45 | ":label:`fig_s2s-prob1`\n", 46 | "\n", 47 | "如 :numref:`fig_s2s-prob1`中,\n", 48 | "假设输出中有四个词元“A”“B”“C”和“<eos>”。\n", 49 | "每个时间步下的四个数字分别表示在该时间步\n", 50 | "生成“A”“B”“C”和“<eos>”的条件概率。\n", 51 | "在每个时间步,贪心搜索选择具有最高条件概率的词元。\n", 52 | "因此,将在 :numref:`fig_s2s-prob1`中\n", 53 | "预测输出序列“A”“B”“C”和“<eos>”。\n", 54 | "这个输出序列的条件概率是\n", 55 | "$0.5\\times0.4\\times0.4\\times0.6 = 0.048$。\n", 56 | "\n", 57 | "那么贪心搜索存在的问题是什么呢?\n", 58 | "现实中,*最优序列*(optimal sequence)应该是最大化\n", 59 | "$\\prod_{t'=1}^{T'} P(y_{t'} \\mid y_1, \\ldots, y_{t'-1}, \\mathbf{c})$\n", 60 | "值的输出序列,这是基于输入序列生成输出序列的条件概率。\n", 61 | "然而,贪心搜索无法保证得到最优序列。\n", 62 | "\n", 63 | 
"![在时间步2,选择具有第二高条件概率的词元“C”(而非最高条件概率的词元)](../img/s2s-prob2.svg)\n", 64 | ":label:`fig_s2s-prob2`\n", 65 | "\n", 66 | " :numref:`fig_s2s-prob2`中的另一个例子阐述了这个问题。\n", 67 | "与 :numref:`fig_s2s-prob1`不同,在时间步$2$中,\n", 68 | "我们选择 :numref:`fig_s2s-prob2`中的词元“C”,\n", 69 | "它具有*第二*高的条件概率。\n", 70 | "由于时间步$3$所基于的时间步$1$和$2$处的输出子序列已从\n", 71 | " :numref:`fig_s2s-prob1`中的“A”和“B”改变为\n", 72 | " :numref:`fig_s2s-prob2`中的“A”和“C”,\n", 73 | "因此时间步$3$处的每个词元的条件概率也在 :numref:`fig_s2s-prob2`中改变。\n", 74 | "假设我们在时间步$3$选择词元“B”,\n", 75 | "于是当前的时间步$4$基于前三个时间步的输出子序列“A”“C”和“B”为条件,\n", 76 | "这与 :numref:`fig_s2s-prob1`中的“A”“B”和“C”不同。\n", 77 | "因此,在 :numref:`fig_s2s-prob2`中的时间步$4$生成\n", 78 | "每个词元的条件概率也不同于 :numref:`fig_s2s-prob1`中的条件概率。\n", 79 | "结果, :numref:`fig_s2s-prob2`中的输出序列\n", 80 | "“A”“C”“B”和“<eos>”的条件概率为\n", 81 | "$0.5\\times0.3 \\times0.6\\times0.6=0.054$,\n", 82 | "这大于 :numref:`fig_s2s-prob1`中的贪心搜索的条件概率。\n", 83 | "这个例子说明:贪心搜索获得的输出序列\n", 84 | "“A”“B”“C”和“<eos>”\n", 85 | "不一定是最佳序列。\n", 86 | "\n", 87 | "## 穷举搜索\n", 88 | "\n", 89 | "如果目标是获得最优序列,\n", 90 | "我们可以考虑使用*穷举搜索*(exhaustive search):\n", 91 | "穷举地列举所有可能的输出序列及其条件概率,\n", 92 | "然后计算输出条件概率最高的一个。\n", 93 | "\n", 94 | "虽然我们可以使用穷举搜索来获得最优序列,\n", 95 | "但其计算量$\\mathcal{O}(\\left|\\mathcal{Y}\\right|^{T'})$可能高的惊人。\n", 96 | "例如,当$|\\mathcal{Y}|=10000$和$T'=10$时,\n", 97 | "我们需要评估$10000^{10} = 10^{40}$序列,\n", 98 | "这是一个极大的数,现有的计算机几乎不可能计算它。\n", 99 | "然而,贪心搜索的计算量\n", 100 | "$\\mathcal{O}(\\left|\\mathcal{Y}\\right|T')$\n", 101 | "通它要显著地小于穷举搜索。\n", 102 | "例如,当$|\\mathcal{Y}|=10000$和$T'=10$时,\n", 103 | "我们只需要评估$10000\\times10=10^5$个序列。\n", 104 | "\n", 105 | "## 束搜索\n", 106 | "\n", 107 | "那么该选取哪种序列搜索策略呢?\n", 108 | "如果精度最重要,则显然是穷举搜索。\n", 109 | "如果计算成本最重要,则显然是贪心搜索。\n", 110 | "而束搜索的实际应用则介于这两个极端之间。\n", 111 | "\n", 112 | "*束搜索*(beam search)是贪心搜索的一个改进版本。\n", 113 | "它有一个超参数,名为*束宽*(beam size)$k$。\n", 114 | "在时间步$1$,我们选择具有最高条件概率的$k$个词元。\n", 115 | "这$k$个词元将分别是$k$个候选输出序列的第一个词元。\n", 116 | "在随后的每个时间步,基于上一时间步的$k$个候选输出序列,\n", 117 | "我们将继续从$k\\left|\\mathcal{Y}\\right|$个可能的选择中\n", 118 | "挑出具有最高条件概率的$k$个候选输出序列。\n", 119 | "\n", 120 | "![束搜索过程(束宽:2,输出序列的最大长度:3)。候选输出序列是$A$、$C$、$AB$、$CE$、$ABD$和$CED$](../img/beam-search.svg)\n", 121 | ":label:`fig_beam-search`\n", 122 | "\n", 123 | " :numref:`fig_beam-search`演示了束搜索的过程。\n", 124 | "假设输出的词表只包含五个元素:\n", 125 | "$\\mathcal{Y} = \\{A, B, C, D, E\\}$,\n", 126 | "其中有一个是“<eos>”。\n", 127 | "设置束宽为$2$,输出序列的最大长度为$3$。\n", 128 | "在时间步$1$,假设具有最高条件概率\n", 129 | "$P(y_1 \\mid \\mathbf{c})$的词元是$A$和$C$。\n", 130 | "在时间步$2$,我们计算所有$y_2 \\in \\mathcal{Y}$为:\n", 131 | "\n", 132 | "$$\\begin{aligned}P(A, y_2 \\mid \\mathbf{c}) = P(A \\mid \\mathbf{c})P(y_2 \\mid A, \\mathbf{c}),\\\\ P(C, y_2 \\mid \\mathbf{c}) = P(C \\mid \\mathbf{c})P(y_2 \\mid C, \\mathbf{c}),\\end{aligned}$$ \n", 133 | "\n", 134 | "从这十个值中选择最大的两个,\n", 135 | "比如$P(A, B \\mid \\mathbf{c})$和$P(C, E \\mid \\mathbf{c})$。\n", 136 | "然后在时间步$3$,我们计算所有$y_3 \\in \\mathcal{Y}$为:\n", 137 | "\n", 138 | "$$\\begin{aligned}P(A, B, y_3 \\mid \\mathbf{c}) = P(A, B \\mid \\mathbf{c})P(y_3 \\mid A, B, \\mathbf{c}),\\\\P(C, E, y_3 \\mid \\mathbf{c}) = P(C, E \\mid \\mathbf{c})P(y_3 \\mid C, E, \\mathbf{c}),\\end{aligned}$$ \n", 139 | "\n", 140 | "从这十个值中选择最大的两个,\n", 141 | "即$P(A, B, D \\mid \\mathbf{c})$和$P(C, E, D \\mid \\mathbf{c})$,\n", 142 | "我们会得到六个候选输出序列:\n", 143 | "(1)$A$;(2)$C$;(3)$A,B$;(4)$C,E$;(5)$A,B,D$;(6)$C,E,D$。\n", 144 | "\n", 145 | "最后,基于这六个序列(例如,丢弃包括“<eos>”和之后的部分),\n", 146 | "我们获得最终候选输出序列集合。\n", 147 | "然后我们选择其中条件概率乘积最高的序列作为输出序列:\n", 148 | "\n", 149 | "$$ \\frac{1}{L^\\alpha} \\log P(y_1, \\ldots, y_{L}\\mid \\mathbf{c}) 
= \\frac{1}{L^\\alpha} \\sum_{t'=1}^L \\log P(y_{t'} \\mid y_1, \\ldots, y_{t'-1}, \\mathbf{c}),$$\n", 150 | ":eqlabel:`eq_beam-search-score`\n", 151 | "\n", 152 | "其中$L$是最终候选序列的长度,\n", 153 | "$\\alpha$通常设置为$0.75$。\n", 154 | "因为一个较长的序列在 :eqref:`eq_beam-search-score`\n", 155 | "的求和中会有更多的对数项,\n", 156 | "因此分母中的$L^\\alpha$用于惩罚长序列。\n", 157 | "\n", 158 | "束搜索的计算量为$\\mathcal{O}(k\\left|\\mathcal{Y}\\right|T')$,\n", 159 | "这个结果介于贪心搜索和穷举搜索之间。\n", 160 | "实际上,贪心搜索可以看作一种束宽为$1$的特殊类型的束搜索。\n", 161 | "通过灵活地选择束宽,束搜索可以在正确率和计算代价之间进行权衡。\n", 162 | "\n", 163 | "## 小结\n", 164 | "\n", 165 | "* 序列搜索策略包括贪心搜索、穷举搜索和束搜索。\n", 166 | "* 贪心搜索所选取序列的计算量最小,但精度相对较低。\n", 167 | "* 穷举搜索所选取序列的精度最高,但计算量最大。\n", 168 | "* 束搜索通过灵活选择束宽,在正确率和计算代价之间进行权衡。\n", 169 | "\n", 170 | "## 练习\n", 171 | "\n", 172 | "1. 我们可以把穷举搜索看作一种特殊的束搜索吗?为什么?\n", 173 | "1. 在 :numref:`sec_seq2seq`的机器翻译问题中应用束搜索。\n", 174 | " 束宽是如何影响预测的速度和结果的?\n", 175 | "1. 在 :numref:`sec_rnn_scratch`中,我们基于用户提供的前缀,\n", 176 | " 通过使用语言模型来生成文本。这个例子中使用了哪种搜索策略?可以改进吗?\n", 177 | "\n", 178 | "[Discussions](https://discuss.d2l.ai/t/5768)\n" 179 | ] 180 | } 181 | ], 182 | "metadata": { 183 | "kernelspec": { 184 | "display_name": "conda_pytorch_p36", 185 | "name": "conda_pytorch_p36" 186 | }, 187 | "language_info": { 188 | "name": "python" 189 | }, 190 | "required_libs": [] 191 | }, 192 | "nbformat": 4, 193 | "nbformat_minor": 5 194 | } -------------------------------------------------------------------------------- /chapter_recurrent-modern/encoder-decoder.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "962e28eb", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 编码器-解码器架构\n", 11 | ":label:`sec_encoder-decoder`\n", 12 | "\n", 13 | "正如我们在 :numref:`sec_machine_translation`中所讨论的,\n", 14 | "机器翻译是序列转换模型的一个核心问题,\n", 15 | "其输入和输出都是长度可变的序列。\n", 16 | "为了处理这种类型的输入和输出,\n", 17 | "我们可以设计一个包含两个主要组件的架构:\n", 18 | "第一个组件是一个*编码器*(encoder):\n", 19 | "它接受一个长度可变的序列作为输入,\n", 20 | "并将其转换为具有固定形状的编码状态。\n", 21 | "第二个组件是*解码器*(decoder):\n", 22 | "它将固定形状的编码状态映射到长度可变的序列。\n", 23 | "这被称为*编码器-解码器*(encoder-decoder)架构,\n", 24 | "如 :numref:`fig_encoder_decoder` 所示。\n", 25 | "\n", 26 | "![编码器-解码器架构](../img/encoder-decoder.svg)\n", 27 | ":label:`fig_encoder_decoder`\n", 28 | "\n", 29 | "我们以英语到法语的机器翻译为例:\n", 30 | "给定一个英文的输入序列:“They”“are”“watching”“.”。\n", 31 | "首先,这种“编码器-解码器”架构将长度可变的输入序列编码成一个“状态”,\n", 32 | "然后对该状态进行解码,\n", 33 | "一个词元接着一个词元地生成翻译后的序列作为输出:\n", 34 | "“Ils”“regordent”“.”。\n", 35 | "由于“编码器-解码器”架构是形成后续章节中不同序列转换模型的基础,\n", 36 | "因此本节将把这个架构转换为接口方便后面的代码实现。\n", 37 | "\n", 38 | "## (**编码器**)\n", 39 | "\n", 40 | "在编码器接口中,我们只指定长度可变的序列作为编码器的输入`X`。\n", 41 | "任何继承这个`Encoder`基类的模型将完成代码实现。\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 1, 47 | "id": "17f77c60", 48 | "metadata": { 49 | "execution": { 50 | "iopub.execute_input": "2023-08-18T07:05:48.406295Z", 51 | "iopub.status.busy": "2023-08-18T07:05:48.405469Z", 52 | "iopub.status.idle": "2023-08-18T07:05:49.653322Z", 53 | "shell.execute_reply": "2023-08-18T07:05:49.651979Z" 54 | }, 55 | "origin_pos": 2, 56 | "tab": [ 57 | "pytorch" 58 | ] 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "from torch import nn\n", 63 | "\n", 64 | "\n", 65 | "#@save\n", 66 | "class Encoder(nn.Module):\n", 67 | " \"\"\"编码器-解码器架构的基本编码器接口\"\"\"\n", 68 | " def __init__(self, **kwargs):\n", 69 | " super(Encoder, self).__init__(**kwargs)\n", 70 | "\n", 71 | " def forward(self, X, *args):\n", 72 | " raise NotImplementedError" 73 | ] 74 | }, 75 | { 76 | 
"cell_type": "markdown", 77 | "id": "de7f0caf", 78 | "metadata": { 79 | "origin_pos": 5 80 | }, 81 | "source": [ 82 | "## [**解码器**]\n", 83 | "\n", 84 | "在下面的解码器接口中,我们新增一个`init_state`函数,\n", 85 | "用于将编码器的输出(`enc_outputs`)转换为编码后的状态。\n", 86 | "注意,此步骤可能需要额外的输入,例如:输入序列的有效长度,\n", 87 | "这在 :numref:`subsec_mt_data_loading`中进行了解释。\n", 88 | "为了逐个地生成长度可变的词元序列,\n", 89 | "解码器在每个时间步都会将输入\n", 90 | "(例如:在前一时间步生成的词元)和编码后的状态\n", 91 | "映射成当前时间步的输出词元。\n" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 2, 97 | "id": "5c7a6471", 98 | "metadata": { 99 | "execution": { 100 | "iopub.execute_input": "2023-08-18T07:05:49.659889Z", 101 | "iopub.status.busy": "2023-08-18T07:05:49.659020Z", 102 | "iopub.status.idle": "2023-08-18T07:05:49.666360Z", 103 | "shell.execute_reply": "2023-08-18T07:05:49.665230Z" 104 | }, 105 | "origin_pos": 7, 106 | "tab": [ 107 | "pytorch" 108 | ] 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "#@save\n", 113 | "class Decoder(nn.Module):\n", 114 | " \"\"\"编码器-解码器架构的基本解码器接口\"\"\"\n", 115 | " def __init__(self, **kwargs):\n", 116 | " super(Decoder, self).__init__(**kwargs)\n", 117 | "\n", 118 | " def init_state(self, enc_outputs, *args):\n", 119 | " raise NotImplementedError\n", 120 | "\n", 121 | " def forward(self, X, state):\n", 122 | " raise NotImplementedError" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "6e0548de", 128 | "metadata": { 129 | "origin_pos": 10 130 | }, 131 | "source": [ 132 | "## [**合并编码器和解码器**]\n", 133 | "\n", 134 | "总而言之,“编码器-解码器”架构包含了一个编码器和一个解码器,\n", 135 | "并且还拥有可选的额外的参数。\n", 136 | "在前向传播中,编码器的输出用于生成编码状态,\n", 137 | "这个状态又被解码器作为其输入的一部分。\n" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 3, 143 | "id": "53fb0929", 144 | "metadata": { 145 | "execution": { 146 | "iopub.execute_input": "2023-08-18T07:05:49.671685Z", 147 | "iopub.status.busy": "2023-08-18T07:05:49.670944Z", 148 | "iopub.status.idle": "2023-08-18T07:05:49.678831Z", 149 | "shell.execute_reply": "2023-08-18T07:05:49.677718Z" 150 | }, 151 | "origin_pos": 12, 152 | "tab": [ 153 | "pytorch" 154 | ] 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "#@save\n", 159 | "class EncoderDecoder(nn.Module):\n", 160 | " \"\"\"编码器-解码器架构的基类\"\"\"\n", 161 | " def __init__(self, encoder, decoder, **kwargs):\n", 162 | " super(EncoderDecoder, self).__init__(**kwargs)\n", 163 | " self.encoder = encoder\n", 164 | " self.decoder = decoder\n", 165 | "\n", 166 | " def forward(self, enc_X, dec_X, *args):\n", 167 | " enc_outputs = self.encoder(enc_X, *args)\n", 168 | " dec_state = self.decoder.init_state(enc_outputs, *args)\n", 169 | " return self.decoder(dec_X, dec_state)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "id": "dce5eb8e", 175 | "metadata": { 176 | "origin_pos": 15 177 | }, 178 | "source": [ 179 | "“编码器-解码器”体系架构中的术语*状态*\n", 180 | "会启发人们使用具有状态的神经网络来实现该架构。\n", 181 | "在下一节中,我们将学习如何应用循环神经网络,\n", 182 | "来设计基于“编码器-解码器”架构的序列转换模型。\n", 183 | "\n", 184 | "## 小结\n", 185 | "\n", 186 | "* “编码器-解码器”架构可以将长度可变的序列作为输入和输出,因此适用于机器翻译等序列转换问题。\n", 187 | "* 编码器将长度可变的序列作为输入,并将其转换为具有固定形状的编码状态。\n", 188 | "* 解码器将具有固定形状的编码状态映射为长度可变的序列。\n", 189 | "\n", 190 | "## 练习\n", 191 | "\n", 192 | "1. 假设我们使用神经网络来实现“编码器-解码器”架构,那么编码器和解码器必须是同一类型的神经网络吗?\n", 193 | "1. 
除了机器翻译,还有其它可以适用于”编码器-解码器“架构的应用吗?\n" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "id": "99846b42", 199 | "metadata": { 200 | "origin_pos": 17, 201 | "tab": [ 202 | "pytorch" 203 | ] 204 | }, 205 | "source": [ 206 | "[Discussions](https://discuss.d2l.ai/t/2779)\n" 207 | ] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "conda_pytorch_p36", 213 | "name": "conda_pytorch_p36" 214 | }, 215 | "language_info": { 216 | "name": "python" 217 | }, 218 | "required_libs": [] 219 | }, 220 | "nbformat": 4, 221 | "nbformat_minor": 5 222 | } -------------------------------------------------------------------------------- /chapter_recurrent-modern/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "729b9613", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 现代循环神经网络\n", 11 | ":label:`chap_modern_rnn`\n", 12 | "\n", 13 | "前一章中我们介绍了循环神经网络的基础知识,\n", 14 | "这种网络可以更好地处理序列数据。\n", 15 | "我们在文本数据上实现了基于循环神经网络的语言模型,\n", 16 | "但是对于当今各种各样的序列学习问题,这些技术可能并不够用。\n", 17 | "\n", 18 | "例如,循环神经网络在实践中一个常见问题是数值不稳定性。\n", 19 | "尽管我们已经应用了梯度裁剪等技巧来缓解这个问题,\n", 20 | "但是仍需要通过设计更复杂的序列模型来进一步处理它。\n", 21 | "具体来说,我们将引入两个广泛使用的网络,\n", 22 | "即*门控循环单元*(gated recurrent units,GRU)和\n", 23 | "*长短期记忆网络*(long short-term memory,LSTM)。\n", 24 | "然后,我们将基于一个单向隐藏层来扩展循环神经网络架构。\n", 25 | "我们将描述具有多个隐藏层的深层架构,\n", 26 | "并讨论基于前向和后向循环计算的双向设计。\n", 27 | "现代循环网络经常采用这种扩展。\n", 28 | "在解释这些循环神经网络的变体时,\n", 29 | "我们将继续考虑 :numref:`chap_rnn`中的语言建模问题。\n", 30 | "\n", 31 | "事实上,语言建模只揭示了序列学习能力的冰山一角。\n", 32 | "在各种序列学习问题中,如自动语音识别、文本到语音转换和机器翻译,\n", 33 | "输入和输出都是任意长度的序列。\n", 34 | "为了阐述如何拟合这种类型的数据,\n", 35 | "我们将以机器翻译为例介绍基于循环神经网络的\n", 36 | "“编码器-解码器”架构和束搜索,并用它们来生成序列。\n", 37 | "\n", 38 | ":begin_tab:toc\n", 39 | " - [gru](gru.ipynb)\n", 40 | " - [lstm](lstm.ipynb)\n", 41 | " - [deep-rnn](deep-rnn.ipynb)\n", 42 | " - [bi-rnn](bi-rnn.ipynb)\n", 43 | " - [machine-translation-and-dataset](machine-translation-and-dataset.ipynb)\n", 44 | " - [encoder-decoder](encoder-decoder.ipynb)\n", 45 | " - [seq2seq](seq2seq.ipynb)\n", 46 | " - [beam-search](beam-search.ipynb)\n", 47 | ":end_tab:\n" 48 | ] 49 | } 50 | ], 51 | "metadata": { 52 | "kernelspec": { 53 | "display_name": "conda_pytorch_p36", 54 | "name": "conda_pytorch_p36" 55 | }, 56 | "language_info": { 57 | "name": "python" 58 | }, 59 | "required_libs": [] 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 5 63 | } -------------------------------------------------------------------------------- /chapter_recurrent-neural-networks/index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3fa3d90d", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "# 循环神经网络\n", 11 | ":label:`chap_rnn`\n", 12 | "\n", 13 | "到目前为止,我们遇到过两种类型的数据:表格数据和图像数据。\n", 14 | "对于图像数据,我们设计了专门的卷积神经网络架构来为这类特殊的数据结构建模。\n", 15 | "换句话说,如果我们拥有一张图像,我们需要有效地利用其像素位置,\n", 16 | "假若我们对图像中的像素位置进行重排,就会对图像中内容的推断造成极大的困难。\n", 17 | "\n", 18 | "最重要的是,到目前为止我们默认数据都来自于某种分布,\n", 19 | "并且所有样本都是独立同分布的\n", 20 | "(independently and identically distributed,i.i.d.)。\n", 21 | "然而,大多数的数据并非如此。\n", 22 | "例如,文章中的单词是按顺序写的,如果顺序被随机地重排,就很难理解文章原始的意思。\n", 23 | "同样,视频中的图像帧、对话中的音频信号以及网站上的浏览行为都是有顺序的。\n", 24 | "因此,针对此类数据而设计特定模型,可能效果会更好。\n", 25 | "\n", 26 | "另一个问题来自这样一个事实:\n", 27 | "我们不仅仅可以接收一个序列作为输入,而是还可能期望继续猜测这个序列的后续。\n", 28 | "例如,一个任务可以是继续预测$2, 4, 6, 8, 10, \\ldots$。\n", 29 | "这在时间序列分析中是相当常见的,可以用来预测股市的波动、\n", 30 | 
"患者的体温曲线或者赛车所需的加速度。\n", 31 | "同理,我们需要能够处理这些数据的特定模型。\n", 32 | "\n", 33 | "简言之,如果说卷积神经网络可以有效地处理空间信息,\n", 34 | "那么本章的*循环神经网络*(recurrent neural network,RNN)则可以更好地处理序列信息。\n", 35 | "循环神经网络通过引入状态变量存储过去的信息和当前的输入,从而可以确定当前的输出。\n", 36 | "\n", 37 | "许多使用循环网络的例子都是基于文本数据的,因此我们将在本章中重点介绍语言模型。\n", 38 | "在对序列数据进行更详细的回顾之后,我们将介绍文本预处理的实用技术。\n", 39 | "然后,我们将讨论语言模型的基本概念,并将此讨论作为循环神经网络设计的灵感。\n", 40 | "最后,我们描述了循环神经网络的梯度计算方法,以探讨训练此类网络时可能遇到的问题。\n", 41 | "\n", 42 | ":begin_tab:toc\n", 43 | " - [sequence](sequence.ipynb)\n", 44 | " - [text-preprocessing](text-preprocessing.ipynb)\n", 45 | " - [language-models-and-dataset](language-models-and-dataset.ipynb)\n", 46 | " - [rnn](rnn.ipynb)\n", 47 | " - [rnn-scratch](rnn-scratch.ipynb)\n", 48 | " - [rnn-concise](rnn-concise.ipynb)\n", 49 | " - [bptt](bptt.ipynb)\n", 50 | ":end_tab:\n" 51 | ] 52 | } 53 | ], 54 | "metadata": { 55 | "kernelspec": { 56 | "display_name": "conda_pytorch_p36", 57 | "name": "conda_pytorch_p36" 58 | }, 59 | "language_info": { 60 | "name": "python" 61 | }, 62 | "required_libs": [] 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 5 66 | } -------------------------------------------------------------------------------- /chapter_references/zreferences.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8d4d58c7", 6 | "metadata": { 7 | "origin_pos": 0 8 | }, 9 | "source": [ 10 | "```eval_rst\n", 11 | ".. only:: html\n", 12 | "\n", 13 | " 参考文献\n", 14 | " ==========\n", 15 | "```\n", 16 | "\n", 17 | ":bibliography:`../d2l.bib`\n" 18 | ] 19 | } 20 | ], 21 | "metadata": { 22 | "kernelspec": { 23 | "display_name": "conda_pytorch_p36", 24 | "name": "conda_pytorch_p36" 25 | }, 26 | "language_info": { 27 | "name": "python" 28 | }, 29 | "required_libs": [] 30 | }, 31 | "nbformat": 4, 32 | "nbformat_minor": 5 33 | } -------------------------------------------------------------------------------- /img/add_norm.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | Produced by OmniGraffle 7.18.5\n2021-11-02 17:34:31 +0000 22 | 23 | Canvas 1 24 | 25 | 26 | Layer 1 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 层规范化 39 | 40 | 41 | 42 | 43 | 44 | 45 | + 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /img/autumn-oak.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/autumn-oak.jpg -------------------------------------------------------------------------------- /img/aws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/aws.png -------------------------------------------------------------------------------- /img/banana.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/banana.jpg -------------------------------------------------------------------------------- /img/capacity_vs_error.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 
| 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | Produced by OmniGraffle 7.18.5\n2021-11-02 17:31:20 +0000 22 | 23 | Canvas 1 24 | 25 | Layer 1 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 损失 35 | 36 | 37 | 38 | 39 | 模型复杂度 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 泛化损失 51 | 52 | 53 | 54 | 55 | 训练损失 56 | 57 | 58 | 59 | 60 | 欠拟合 61 | 62 | 63 | 64 | 65 | 过拟合 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 最佳 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /img/cat-dog-pixels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/cat-dog-pixels.png -------------------------------------------------------------------------------- /img/cat1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/cat1.jpg -------------------------------------------------------------------------------- /img/cat2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/cat2.jpg -------------------------------------------------------------------------------- /img/cat3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/cat3.jpg -------------------------------------------------------------------------------- /img/catdog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/catdog.jpg -------------------------------------------------------------------------------- /img/chmod.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/chmod.png -------------------------------------------------------------------------------- /img/colab-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/colab-2.png -------------------------------------------------------------------------------- /img/colab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/colab.png -------------------------------------------------------------------------------- /img/connect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/connect.png -------------------------------------------------------------------------------- /img/cuda101.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/cuda101.png 
-------------------------------------------------------------------------------- /img/death-cap.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/death-cap.jpg -------------------------------------------------------------------------------- /img/deeplearning-amazon.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/deeplearning-amazon.jpg -------------------------------------------------------------------------------- /img/disk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/disk.png -------------------------------------------------------------------------------- /img/dog1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/dog1.jpg -------------------------------------------------------------------------------- /img/dog2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/dog2.jpg -------------------------------------------------------------------------------- /img/ec2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/ec2.png -------------------------------------------------------------------------------- /img/edit-file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/edit-file.png -------------------------------------------------------------------------------- /img/eye-book.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/eye-book.png -------------------------------------------------------------------------------- /img/eye-coffee.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/eye-coffee.png -------------------------------------------------------------------------------- /img/filters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/filters.png -------------------------------------------------------------------------------- /img/frontends.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/frontends.png -------------------------------------------------------------------------------- /img/frontends/Canvas 1.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 
4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | Produced by OmniGraffle 7.18.5\n2021-11-02 17:39:31 +0000 17 | 18 | Canvas 1 19 | 20 | Layer 1 21 | 22 | 23 | 语言前端 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 后端框架 89 | (调度器、核心等) 90 | 91 | 92 | 93 | 94 | 计算设备 95 | 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /img/frontends/image10.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/frontends/image10.tiff -------------------------------------------------------------------------------- /img/frontends/image2.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/frontends/image2.tiff -------------------------------------------------------------------------------- /img/frontends/image3.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/frontends/image3.tiff -------------------------------------------------------------------------------- /img/frontends/image4.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/frontends/image4.tiff -------------------------------------------------------------------------------- /img/frontends/image5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/frontends/image5.pdf -------------------------------------------------------------------------------- /img/frontends/image8.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/frontends/image8.tiff -------------------------------------------------------------------------------- /img/ftse100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/ftse100.png -------------------------------------------------------------------------------- /img/git-clone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/git-clone.png -------------------------------------------------------------------------------- /img/git-createpr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/git-createpr.png -------------------------------------------------------------------------------- /img/git-fork.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/git-fork.png -------------------------------------------------------------------------------- /img/git-forked.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/git-forked.png -------------------------------------------------------------------------------- /img/git-newpr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/git-newpr.png -------------------------------------------------------------------------------- /img/house-pricing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/house-pricing.png -------------------------------------------------------------------------------- /img/jupyter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/jupyter.png -------------------------------------------------------------------------------- /img/jupyter00.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/jupyter00.png -------------------------------------------------------------------------------- /img/jupyter01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/jupyter01.png -------------------------------------------------------------------------------- /img/jupyter02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/jupyter02.png -------------------------------------------------------------------------------- /img/jupyter03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/jupyter03.png -------------------------------------------------------------------------------- /img/jupyter04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/jupyter04.png -------------------------------------------------------------------------------- /img/jupyter05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/jupyter05.png -------------------------------------------------------------------------------- /img/jupyter06.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/jupyter06.png -------------------------------------------------------------------------------- /img/kaggle-cifar10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/kaggle-cifar10.png -------------------------------------------------------------------------------- /img/kaggle-dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/kaggle-dog.jpg -------------------------------------------------------------------------------- /img/kaggle-submit2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/kaggle-submit2.png -------------------------------------------------------------------------------- /img/kaggle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/kaggle.png -------------------------------------------------------------------------------- /img/keypair.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/keypair.png -------------------------------------------------------------------------------- /img/koebel.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/koebel.jpg -------------------------------------------------------------------------------- /img/latencynumbers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/latencynumbers.png -------------------------------------------------------------------------------- /img/launching.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/launching.png -------------------------------------------------------------------------------- /img/limits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/limits.png -------------------------------------------------------------------------------- /img/neural-style.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/neural-style.jpg -------------------------------------------------------------------------------- /img/nonconvex.svg: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /img/p2x.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/p2x.png -------------------------------------------------------------------------------- /img/pacman.svg: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /img/pikachu.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/pikachu.jpg -------------------------------------------------------------------------------- /img/polygon-circle.svg: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /img/popvssoda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/popvssoda.png -------------------------------------------------------------------------------- /img/projections.svg: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /img/rainier.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/rainier.jpg -------------------------------------------------------------------------------- /img/sagemaker-create-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/sagemaker-create-2.png -------------------------------------------------------------------------------- /img/sagemaker-create-3-pytorch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/sagemaker-create-3-pytorch.png -------------------------------------------------------------------------------- /img/sagemaker-create-3-tensorflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/sagemaker-create-3-tensorflow.png -------------------------------------------------------------------------------- /img/sagemaker-create-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/sagemaker-create-3.png -------------------------------------------------------------------------------- /img/sagemaker-create.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/sagemaker-create.png
-------------------------------------------------------------------------------- /img/sagemaker-open.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/sagemaker-open.png -------------------------------------------------------------------------------- /img/sagemaker-stop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/sagemaker-stop.png -------------------------------------------------------------------------------- /img/sagemaker-terminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/sagemaker-terminal.png -------------------------------------------------------------------------------- /img/sagemaker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/sagemaker.png -------------------------------------------------------------------------------- /img/space-division-3d.svg: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /img/speech.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/speech.png -------------------------------------------------------------------------------- /img/stackedanimals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/stackedanimals.png -------------------------------------------------------------------------------- /img/sub-area.svg: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /img/tensorcore.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/tensorcore.jpg -------------------------------------------------------------------------------- /img/turing-processing-block.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/turing-processing-block.png -------------------------------------------------------------------------------- /img/turing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/turing.png -------------------------------------------------------------------------------- /img/ubuntu-new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/ubuntu-new.png -------------------------------------------------------------------------------- /img/waldo-mask.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/waldo-mask.jpg -------------------------------------------------------------------------------- /img/where-wally-walker-books.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2l-ai/d2l-zh-pytorch-sagemaker/a0adc5f9374d02f34f2ecaf5a790bcc17d945c95/img/where-wally-walker-books.jpg --------------------------------------------------------------------------------
/index.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bc53b743",
   "metadata": {
    "origin_pos": 0
   },
   "source": [
    "《动手学深度学习》\n",
    "========================\n",
    "\n",
    "```eval_rst\n",
    ".. raw:: html\n",
    "   :file: frontpage.html\n",
    "```\n",
    "\n",
    ":begin_tab:toc\n",
    " - [chapter_preface/index](chapter_preface/index.ipynb)\n",
    " - [chapter_installation/index](chapter_installation/index.ipynb)\n",
    " - [chapter_notation/index](chapter_notation/index.ipynb)\n",
    ":end_tab:\n",
    "\n",
    ":begin_tab:toc\n",
    " - [chapter_introduction/index](chapter_introduction/index.ipynb)\n",
    " - [chapter_preliminaries/index](chapter_preliminaries/index.ipynb)\n",
    " - [chapter_linear-networks/index](chapter_linear-networks/index.ipynb)\n",
    " - [chapter_multilayer-perceptrons/index](chapter_multilayer-perceptrons/index.ipynb)\n",
    " - [chapter_deep-learning-computation/index](chapter_deep-learning-computation/index.ipynb)\n",
    " - [chapter_convolutional-neural-networks/index](chapter_convolutional-neural-networks/index.ipynb)\n",
    " - [chapter_convolutional-modern/index](chapter_convolutional-modern/index.ipynb)\n",
    " - [chapter_recurrent-neural-networks/index](chapter_recurrent-neural-networks/index.ipynb)\n",
    " - [chapter_recurrent-modern/index](chapter_recurrent-modern/index.ipynb)\n",
    " - [chapter_attention-mechanisms/index](chapter_attention-mechanisms/index.ipynb)\n",
    " - [chapter_optimization/index](chapter_optimization/index.ipynb)\n",
    " - [chapter_computational-performance/index](chapter_computational-performance/index.ipynb)\n",
    " - [chapter_computer-vision/index](chapter_computer-vision/index.ipynb)\n",
    " - [chapter_natural-language-processing-pretraining/index](chapter_natural-language-processing-pretraining/index.ipynb)\n",
    " - [chapter_natural-language-processing-applications/index](chapter_natural-language-processing-applications/index.ipynb)\n",
    " - [chapter_appendix-tools-for-deep-learning/index](chapter_appendix-tools-for-deep-learning/index.ipynb)\n",
    ":end_tab:\n",
    "\n",
    ":begin_tab:toc\n",
    " - [chapter_references/zreferences](chapter_references/zreferences.ipynb)\n",
    ":end_tab:\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_pytorch_p36",
   "name": "conda_pytorch_p36"
  },
  "language_info": {
   "name": "python"
  },
  "required_libs": []
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages
import d2l

requirements = [
    'jupyter==1.0.0',
    'numpy==1.21.5',
    'matplotlib==3.5.1',
    'requests==2.25.1',
    'pandas==1.2.4'
]

setup(
    name='d2l',
    version=d2l.__version__,
    python_requires='>=3.5',
    author='D2L Developers',
    author_email='d2l.devs@gmail.com',
    url='https://d2l.ai',
    description='Dive into Deep Learning',
    license='MIT-0',
    packages=find_packages(),
    zip_safe=True,
    install_requires=requirements,
)
--------------------------------------------------------------------------------