├── .gitignore ├── README.md ├── gradient-notes.pdf ├── gradient-notes.tex ├── notes1.pdf ├── notes1.tex ├── notes1 ├── fig │ ├── CBOW.png │ ├── Skip-Gram.png │ ├── manwoman.jpg │ ├── sigmoid.png │ └── tree.png └── reference.bib ├── notes2.pdf ├── notes2.tex ├── notes2 ├── Resources │ └── ImageBlocks.pptx └── fig │ ├── DataSize.png │ ├── IntrinsicEval.png │ ├── LinearBoundary.png │ ├── LinearBoundary2.png │ ├── NonlinearBoundary.png │ ├── TrainTime.png │ ├── hyperparam.png │ ├── pretraining.png │ ├── retraining.png │ └── window.png ├── notes3.pdf ├── notes3.tex ├── notes3 ├── Resources │ ├── ImageBlocks.pptx │ └── NNet.pptx └── fig │ ├── 421nnet.png │ ├── ErrorSignal.png │ ├── ErrorSignal2.png │ ├── ErrorSignal3.png │ ├── Error_Surf.png │ ├── NonlinearBoundary.png │ ├── SimpleFF.png │ ├── SingleLayerNeuralNetwork.png │ ├── dropout.png │ ├── graph_hardtanh.png │ ├── graph_leaky.png │ ├── graph_relu.png │ ├── graph_sigmoid.png │ ├── graph_softsign.png │ ├── graph_tanh.png │ └── sigmoidneuron.png ├── notes4.pdf ├── notes4.tex ├── notes4 ├── fig │ ├── dep_tree.png │ ├── nn.tex │ └── transitions.png └── reference.bib ├── notes5.pdf ├── notes5.tex ├── notes5 ├── fig │ ├── GRU.png │ ├── LSTM.png │ ├── bengio_03.png │ ├── birnn.pdf │ ├── cliping.pdf │ ├── deepbirnn.pdf │ ├── en_decoder.png │ ├── nn.tex │ ├── rnn.pdf │ ├── rnn_loop.pdf │ ├── rnn_node.pdf │ ├── rnn_translate.pdf │ └── two_layer.pdf └── reference.bib ├── notes6.pdf ├── notes6.tex ├── notes6 ├── BPE.png ├── BiEncoder.png ├── Decoder.png ├── Encoder.png ├── alignment.png ├── candidate_list.png ├── google_example.png ├── hybrid.png ├── longsentences.png ├── partition.png └── pointer.png ├── notes7.pdf ├── notes7.tex ├── notes7 ├── Resources │ ├── ImageBlocks.pptx │ └── NNet.pptx └── fig │ ├── 2d_convolution.png │ ├── CNN-alternates.png │ ├── ConstituencyParsing.png │ ├── denpendency.png │ ├── img1.png │ ├── img10.png │ ├── img11.png │ ├── img2.png │ ├── img3.png │ ├── img4.png │ ├── img5.png │ ├── img6.png │ ├── img7.png │ ├── img8.png │ ├── img9.png │ ├── narrow_vs_wide.png │ ├── nonsense.png │ ├── single-conv-complete.png │ └── single-conv.png ├── notes8.pdf ├── notes8.tex ├── notes8 └── fig │ ├── BiGRU.png │ └── DMN.png ├── review-differential-calculus.pdf ├── review-differential-calculus.tex ├── sty ├── code_snippet.sty └── kbordermatrix.sty ├── tensorflow.pdf ├── tensorflow.tex ├── tensorflow ├── fig │ └── tensorFlow.png ├── nonsense.svg └── reference.bib └── update_overleaf.sh /.gitignore: -------------------------------------------------------------------------------- 1 | ## Core latex/pdflatex auxiliary files: 2 | *.aux 3 | *.lof 4 | *.log 5 | *.lot 6 | *.fls 7 | *.out 8 | *.toc 9 | *.fmt 10 | *.fot 11 | *.cb 12 | *.cb2 13 | 14 | ## Intermediate documents: 15 | *.dvi 16 | *-converted-to.* 17 | # these rules might exclude image files for figures etc. 
18 | *.ps 19 | *.eps 20 | 21 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 22 | *.bbl 23 | *.bcf 24 | *.blg 25 | *-blx.aux 26 | *-blx.bib 27 | *.brf 28 | *.run.xml 29 | 30 | ## Build tool auxiliary files: 31 | *.fdb_latexmk 32 | *.synctex 33 | *.synctex.gz 34 | *.synctex.gz(busy) 35 | *.pdfsync 36 | 37 | ## Auxiliary and intermediate files from other packages: 38 | # algorithms 39 | *.alg 40 | *.loa 41 | 42 | # achemso 43 | acs-*.bib 44 | 45 | # amsthm 46 | *.thm 47 | 48 | # beamer 49 | *.nav 50 | *.snm 51 | *.vrb 52 | 53 | # cprotect 54 | *.cpt 55 | 56 | # fixme 57 | *.lox 58 | 59 | #(r)(e)ledmac/(r)(e)ledpar 60 | *.end 61 | *.?end 62 | *.[1-9] 63 | *.[1-9][0-9] 64 | *.[1-9][0-9][0-9] 65 | *.[1-9]R 66 | *.[1-9][0-9]R 67 | *.[1-9][0-9][0-9]R 68 | *.eledsec[1-9] 69 | *.eledsec[1-9]R 70 | *.eledsec[1-9][0-9] 71 | *.eledsec[1-9][0-9]R 72 | *.eledsec[1-9][0-9][0-9] 73 | *.eledsec[1-9][0-9][0-9]R 74 | 75 | # glossaries 76 | *.acn 77 | *.acr 78 | *.glg 79 | *.glo 80 | *.gls 81 | *.glsdefs 82 | 83 | # gnuplottex 84 | *-gnuplottex-* 85 | 86 | # hyperref 87 | *.brf 88 | 89 | # knitr 90 | *-concordance.tex 91 | # TODO Comment the next line if you want to keep your tikz graphics files 92 | *.tikz 93 | *-tikzDictionary 94 | 95 | # listings 96 | *.lol 97 | 98 | # makeidx 99 | *.idx 100 | *.ilg 101 | *.ind 102 | *.ist 103 | 104 | # minitoc 105 | *.maf 106 | *.mlf 107 | *.mlt 108 | *.mtc 109 | *.mtc[0-9] 110 | *.mtc[1-9][0-9] 111 | 112 | # minted 113 | _minted* 114 | *.pyg 115 | 116 | # morewrites 117 | *.mw 118 | 119 | # mylatexformat 120 | *.fmt 121 | 122 | # nomencl 123 | *.nlo 124 | 125 | # sagetex 126 | *.sagetex.sage 127 | *.sagetex.py 128 | *.sagetex.scmd 129 | 130 | # sympy 131 | *.sout 132 | *.sympy 133 | sympy-plots-for-*.tex/ 134 | 135 | # pdfcomment 136 | *.upa 137 | *.upb 138 | 139 | # pythontex 140 | *.pytxcode 141 | pythontex-files-*/ 142 | 143 | # thmtools 144 | *.loe 145 | 146 | # TikZ & PGF 147 | *.dpth 148 | *.md5 149 | *.auxlock 150 | 151 | # todonotes 152 | *.tdo 153 | 154 | # xindy 155 | *.xdy 156 | 157 | # xypic precompiled matrices 158 | *.xyc 159 | 160 | # endfloat 161 | *.ttt 162 | *.fff 163 | 164 | # Latexian 165 | TSWLatexianTemp* 166 | 167 | ## Editors: 168 | # WinEdt 169 | *.bak 170 | *.sav 171 | 172 | # Texpad 173 | .texpadtmp 174 | 175 | # Kile 176 | *.backup 177 | 178 | # KBibTeX 179 | *~[0-9]* 180 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Course notes for CS224N Winter17 2 | 3 | Submit pull requests / open issues to help fix typos! 
4 | -------------------------------------------------------------------------------- /gradient-notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/gradient-notes.pdf -------------------------------------------------------------------------------- /gradient-notes.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article}[11pt] 2 | \usepackage{sectsty} 3 | \usepackage{enumerate} 4 | \usepackage{bm} 5 | \usepackage{amsmath, amsthm, amssymb} 6 | \usepackage[usenames,dvipsnames]{color} 7 | 8 | \newcommand{\bW} { \bm{W} } 9 | \newcommand{\bU} { \bm{U} } 10 | \newcommand{\bb} { \bm{b} } 11 | \newcommand{\bz} { \bm{z} } 12 | \newcommand{\bmf} { \bm{f} } 13 | \newcommand{\bx} { \bm{x} } 14 | \newcommand{\by} { \bm{y} } 15 | \newcommand{\yhat} { \bm{\hat{y}} } 16 | \newcommand{\btheta} { \bm{\theta} } 17 | \newcommand{\bg} { \bm{g} } 18 | \newcommand{\bh} { \bm{h} } 19 | \newcommand{\bL} { \bm{L} } 20 | \newcommand{\bI} { \bm{I} } 21 | \newcommand{\bdelta} { \bm{\delta} } 22 | \newcommand{\smx} { \text{softmax} } 23 | \newcommand{\relu} { \text{ReLU} } 24 | \newcommand{\sgn} { \text{sgn} } 25 | \newcommand{\diag} { \text{diag} } 26 | 27 | \newcommand{\todo}[1] { \color{red}[TODO: #1]\color{black} } 28 | 29 | 30 | \newcommand{\alns}[1] { 31 | \begin{align*} #1 \end{align*} 32 | } 33 | \newcommand{\pd}[2] { 34 | \frac{\partial #1}{\partial #2} 35 | } 36 | 37 | \setlength{\parindent}{0pt} 38 | \sectionfont{\fontsize{12}{12}\selectfont} 39 | \subsectionfont{\fontsize{11}{0}{\vspace{-10pt}}\selectfont} 40 | \title{Computing Neural Network Gradients} 41 | \date{} 42 | \author{Kevin Clark} 43 | 44 | \begin{document} 45 | \maketitle 46 | \vspace{-5mm} 47 | 48 | 49 | \section{Introduction} 50 | The purpose of these notes is to demonstrate how to quickly compute neural network gradients. This will hopefully help you with question 3 of Assignment 2 (if you haven't already done it) and with the midterm (which will have at least one significant gradient computation question). It is \textbf{not} meant to provide an intuition for how backpropagation works -- for that I recommend going over lecture 5\footnote{http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture5.pdf} and the cs231 course notes\footnote{http://cs231n.github.io/optimization-2/} on backpropagation. 51 | \section{Vectorized Gradients} 52 | While it is a good exercise to compute the gradient of a neural network with respect to a single parameter (e.g., a single element in a weight matrix), in practice this tends to be quite slow. Instead, it is more efficient to keep everything in matrix/vector form. The basic building block of vectorized gradients is the {\it Jacobian Matrix}. Suppose we have a function $\bmf: \mathbb{R}^n \to \mathbb{R}^m$ that maps a vector of length $n$ to a vector of length $m$: $\bmf(\bx) = [f_1(x_1, ..., x_n), f_2(x_1, ..., x_n), ..., f_m(x_1, ..., x_n)]$. Then its Jacobian is. 53 | 54 | \alns{ 55 | \pd{\bmf}{\bx} = \begin{bmatrix} 56 | \pd{f_1}{x_1} & \dots & \pd{f_1}{x_n} \\ 57 | \vdots & \ddots & \vdots \\ 58 | \pd{f_m}{x_1} & \dots & \pd{f_m}{x_n} \\ 59 | \end{bmatrix} 60 | } 61 | That is, $( \pd{\bmf}{\bx} )_{ij} = \pd{f_i}{x_j}$ (which is just a standard non-vector derivative). 
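As a quick concrete example, if $\bmf(\bx) = [x_1 + 2x_2,\ x_1 x_2]$ (so $n = m = 2$), its Jacobian is
\alns{
\pd{\bmf}{\bx} = \begin{bmatrix}
1 & 2 \\
x_2 & x_1 \\
\end{bmatrix}
}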
The Jacobian matrix will be useful for us because we can apply the chain rule to a vector-valued function just by multiplying Jacobians. \\ 62 | 63 | As a little illustration of this, suppose we have a function $\bmf(x) = [f_1(x), f_2(x)]$ taking a scalar to a vector of size 2 and a function $\bg(\by) = [g_1(y_1, y_2), g_2(y_1, y_2)]$ taking a vector of size two to a vector of size two. Now let's compose them to get $\bg(x) = [g_1(f_1(x), f_2(x)), g_2(f_1(x), f_2(x))]$. Using the regular chain rule, we can compute the derivative of $\bg$ as the Jacobian 64 | \alns{ 65 | \pd{\bg}{x} = \begin{bmatrix} 66 | \pd{}{x}g_1(f_1(x), f_2(x)) \\ 67 | \pd{}{x}g_2(f_1(x), f_2(x)) \\ 68 | \end{bmatrix} = \begin{bmatrix} 69 | \pd{g_1}{f_1}\pd{f_1}{x} + \pd{g_1}{f_2}\pd{f_2}{x} \\ 70 | \pd{g_2}{f_1}\pd{f_1}{x} + \pd{g_2}{f_2}\pd{f_2}{x} \\ 71 | \end{bmatrix} 72 | } 73 | And we see this is the same as multiplying the two Jacobians: 74 | \alns{ 75 | \pd{\bg}{x} = \pd{\bg}{\bmf}\pd{\bmf}{x} = \begin{bmatrix} 76 | \pd{g_1}{f_1} & \pd{g_1}{f_2} \\ 77 | \pd{g_2}{f_1} & \pd{g_2}{f_2} \\ 78 | \end{bmatrix} 79 | \begin{bmatrix} 80 | \pd{f_1}{x} \\ 81 | \pd{f_2}{x} \\ 82 | \end{bmatrix} 83 | } 84 | 85 | \section{Useful Identities} 86 | This section will now go over how to compute the Jacobian for several simple functions. It will provide some useful identities you can apply when taking neural network gradients. \\ 87 | 88 | \begin{enumerate}[(1)] 89 | 90 | %1. ------------------------------------------------------------------ 91 | 92 | \item \textbf{Matrix times column vector with respect to the column vector} ($\bz = \bW \bx$, what is $\pd{\bz}{\bx}$?) \\ 93 | 94 | Suppose $\bW \in \mathbb{R}^{n \times m}$. Then we can think of $\bz$ as a function of $\bx$ taking an $m$-dimensional vector to an $n$-dimensional vector. So its Jacobian will be $n \times m$. Note that 95 | \alns{ 96 | z_i &= \sum_{k=1}^m W_{ik} x_{k} 97 | } 98 | So an entry $(\pd{\bz}{\bx})_{ij}$ of the Jacobian will be 99 | \alns{ 100 | (\pd{\bz}{\bx})_{ij} = \pd{z_i}{x_j} &= \pd{}{x_j}\sum_{k=1}^m W_{ik} x_{k} = \sum_{k=1}^m W_{ik} \pd{}{x_j}x_{k} = W_{ij} 101 | } 102 | because $\pd{}{x_j}x_{k} = 1$ if $k = j$ and 0 if otherwise. So we see that $\boxed{\pd{\bz}{\bx} = \bW}$ 103 | 104 | %2. ------------------------------------------------------------------ 105 | 106 | \item \textbf{Row vector times matrix with respect to the row vector} \\ ($\bz = \bx \bW$, what is $\pd{\bz}{\bx}$?) \\ 107 | 108 | %Similarly to (1), we have 109 | %\alns{ 110 | % z_i &= \sum_{k=1}^m W_{ki} x_{k} 111 | %} 112 | %so with a computation similar to (1) we get 113 | %\alns{ 114 | % \pd{z_i}{x_j} = W_{ji} 115 | %} 116 | %which means that $\boxed{\pd{\bz}{\bx} = \bW^T}$ 117 | A computation similar to (1) shows that $\boxed{\pd{\bz}{\bx} = \bW^T}$. 118 | 119 | \item \textbf{A vector with itself}\\($\bz = \bx$, what is $\pd{\bz}{\bx}$? ) \\ 120 | We have $z_i = x_i$. So 121 | \alns{ 122 | (\pd{\bz}{\bx})_{ij} = \pd{z_i}{x_j} = \pd{}{x_j}x_i = \begin{cases} 123 | 1 \phantom{abc} \text{if $i = j$} \\ 124 | 0 \phantom{abc} \text{if otherwise} 125 | \end{cases} 126 | } 127 | So we see that the Jacobian $\pd{\bz}{\bx}$ is a diagonal matrix where the entry at $(i, i)$ is 1. This is just the identity matrix: $\boxed{\pd{\bz}{\bx} = \bI}$. When applying the chain rule, this term will disappear because a matrix multiplied by the identity matrix does not change. 128 | 129 | \item \textbf{An elementwise function applied a vector}\\ ($\bz = f(\bx)$, what is $\pd{\bz}{\bx}$? 
) \\ 130 | If $f$ is being applied elementwise, we have $z_i = f(x_i)$. So 131 | \alns{ 132 | (\pd{\bz}{\bx})_{ij} = \pd{z_i}{x_j} = \pd{}{x_j}f(x_i) = \begin{cases} 133 | f'(x_i) \phantom{abc} \text{if $i = j$} \\ 134 | 0 \phantom{abc} \text{if otherwise} 135 | \end{cases} 136 | } 137 | So we see that the Jacobian $\pd{\bz}{\bx}$ is a diagonal matrix where the entry at $(i, i)$ is the derivative of $f$ applied to $x_i$. We can write this as $\boxed{\pd{\bz}{\bx} = \diag(f'(\bx))}$. Since multiplication by a diagonal matrix is the same as doing elementwise multiplication by the diagonal, we could also write $\boxed{\circ f'(\bx)}$ when applying the chain rule. 138 | 139 | \item \textbf{Matrix times column vector with respect to the matrix} \\ ($\bz = \bW \bx$, $\bdelta = \pd{J}{\bz}$ what is $\pd{J}{\bW} = \pd{J}{\bz} \pd{\bz}{\bW} = \bdelta \pd{\bz}{\bW}$?) \\ 140 | 141 | This is a bit more complicated than the other identities. The reason for including $\pd{J}{\bz}$% = \bdelta$ 142 | in the above problem formulation will become clear in a moment. 143 | 144 | First suppose we have a loss function $J$ (a scalar) and are computing its gradient with respect to a matrix $\bW \in \mathbb{R}^{n \times m}$. Then we could think of $J$ as a function of $\bW$ taking $nm$ inputs (the entries of $\bW$) to a single output ($J$). This means the Jacobian $\pd{J}{\bW}$ would be a $1 \times nm$ vector. But in practice this is not a very useful way of arranging the gradient. It would be much nicer if the derivatives were in a $n \times m$ matrix like this: 145 | \alns{ 146 | \pd{J}{\bW} = \begin{bmatrix} 147 | \pd{J}{W_{11}} & \dots & \pd{J}{W_{1m}} \\ 148 | \vdots & \ddots & \vdots \\ 149 | \pd{J}{W_{n1}} & \dots & \pd{J}{W_{nm}} \\ 150 | \end{bmatrix} 151 | } 152 | Since this matrix has the same shape as $\bW$, we could just subtract it (times the learning rate) from $\bW$ when doing gradient descent. So (in a slight abuse of notation) let's find this matrix as $\pd{J}{\bW}$ instead. 153 | 154 | %This way of arranging the gradients becomes complicated when computing $\pd{\bz}{\bW}$. Unlike $J$, $\bz$ is vector-valued. So if we are trying to rearrange the gradients like with $\pd{J}{\bW}$, $\pd{\bz}{\bW}$ would become an $n \times m \times m$ tensor! 155 | 156 | I think the easiest way of computing this matrix is by finding the gradient for a single weight $W_{ij}$. We have 157 | \alns{ 158 | z_k &= \sum_{l=1}^m W_{kl} x_{l} \\ 159 | \pd{z_k}{W_{ij}} &= \sum_{l=1}^m x_l \pd{}{W_{ij}}W_{kl} 160 | } 161 | Note that $\pd{}{W_{ij}}W_{kl} = 1$ if $i = k$ and $j = l$ and 0 if otherwise. So if $k \neq i$ everything in the sum is zero and the gradient is zero. Otherwise, the only nonzero element of the sum is when $l = j$, so we just get $x_j$. Thus we find $\pd{z_k}{W_{ij}} = x_j$ if $k = i$ and 0 if otherwise. Another way of writing this is 162 | \alns{ 163 | \pd{\bz}{W_{ij}} = \begin{bmatrix} 0 \\ \vdots \\ 0 \\ x_j \\ 0 \\ \vdots \\ 0 \end{bmatrix} 164 | \gets \text{$i$th element} 165 | } 166 | Now let's compute $\pd{J}{W_{ij}}$ 167 | \alns{ 168 | \pd{J}{W_{ij}} = \pd{J}{\bz}\pd{\bz}{W_{ij}} = \bdelta \pd{\bz}{W_{ij}} = \sum_{k=1}^m \delta_k \pd{z_k}{W_{ij}} = \delta_i x_j 169 | } 170 | (the only nonzero term in the sum is $\delta_i \pd{z_i}{W_{ij}}$). To get $\pd{J}{\bW}$ we want a matrix where entry $(i, j)$ is $\delta_i x_j$. 
This matrix is equal to the outer product $\boxed{\pd{J}{\bW} = \bdelta^T \bx}$ 171 | 172 | \item \textbf{Row vector times matrix with respect to the matrix} \\ ($\bz = \bx \bW$, $\bdelta = \pd{J}{\bz}$ what is $\pd{J}{\bW} = \bdelta \pd{\bz}{\bW}$?) \\ 173 | A similar computation to (5) shows that $\boxed{\pd{J}{\bW} = \bx^T \bdelta }$. 174 | 175 | \item \textbf{Cross-entropy loss with respect to logits} ($\yhat = \text{softmax}(\btheta)$, $J = CE(\by, \yhat)$, what is $\pd{J}{\btheta}$?) \\ 176 | 177 | You did this in Assignment 1! The gradient is $\boxed{\pd{J}{\btheta} = \yhat - \by}$ 178 | 179 | %I think the easiest way of dealing with this is avoiding the issue by taking the derivative with respect to a single row of $\bW$ $\bW_i$. Then the Jacobian $\pd{z_i}{\bW_{i}}$ will be a matrix instead of a tensor. 180 | %\alns{ 181 | % z_i &= \sum_{k=1}^m W_{ik} x_{k} \\ 182 | % \pd{z_i}{W_{ij}} &= x_j% \to \pd{z_i}{\bW_{i}} = \bx 183 | %} 184 | %Since entry $i, j$ of the Jacobian is $x_j$ we have 185 | %\alns{ 186 | % \pd{\bz}{\bW_i} = \begin{bmatrix} \bx \\ \bx \\ \vdots \\ \bx \end{bmatrix} 187 | %} 188 | %Now let's think about $\pd{J}{\bW_i}$ 189 | 190 | % vector times another vector vector element wise multiplied with another vector norm of a vector with respect to the vector 191 | 192 | \end{enumerate} 193 | 194 | These identities will be enough to let you quickly compute the gradients for many neural networks. However, it's important to know how to compute Jacobians for other functions as well in case they show up. Some examples if you want practice: dot product of two vectors, elementwise product of two vectors, 2-norm of a vector. Feel free to use these identities on the midterm and assignments. 195 | 196 | \section{Example: 1-Layer Neural Network with Embeddings} 197 | This section provides an example of computing the gradients of a full neural network. 198 | In particular, we are going to compute the gradients of the dependency parser you are building in Assignment 2. First let's write out the forward pass of the model. 199 | \alns{ 200 | \bx &= [\bL_{w_0}, \bL_{w_1}, ..., \bL_{w_{m - 1} }] \\ 201 | \bz &= \bx \bW + \bb_1 \\ 202 | \bh &= \relu(\bz) \\ 203 | \btheta &= \bh \bU + \bb_2 \\ 204 | \yhat &= \smx(\btheta) \\ 205 | J &= CE(\by, \yhat) 206 | } 207 | It helps to break up the model into the simplest parts possible, so note that we defined $\bz$ and $\btheta$ to split up the activation functions from the linear transformations in the network's layers. The dimensions of the model's parameters are 208 | \[ 209 | \bL \in \mathbb{R}^{|V| \times d} \quad\quad 210 | \bb_1 \in \mathbb{R}^{1 \times D_h} \quad\quad 211 | \bW \in \mathbb{R}^{md \times D_h} \quad\quad 212 | \bb_2 \in \mathbb{R}^{1 \times N_c} \quad\quad 213 | \bU \in \mathbb{R}^{D_h \times N_c} \quad\quad 214 | \] 215 | where $|V|$ is the vocabulary size, $d$ is the size of our word vectors, $m$ is the number of features, $D_h$ is the size of our hidden layer, and $N_c$ is the number of classes. \\ 216 | 217 | In this example, we will compute all the gradients: 218 | 219 | \[ 220 | \frac{\partial J}{\partial \bU} \quad\quad 221 | \frac{\partial J}{\partial \bb_2} \quad\quad 222 | \frac{\partial J}{\partial \bW} \quad\quad 223 | \frac{\partial J}{\partial \bb_1} \quad\quad 224 | \frac{\partial J}{\partial \bL_{w_i}} \quad\quad 225 | \] 226 | 227 | To start with, recall that $\relu(x) = \max(x, 0)$.
This means 228 | \begin{align*} 229 | \relu'(x) = \begin{cases} 230 | 1 \phantom{abc} \text{if $x > 0$} \\ 231 | 0 \phantom{abc} \text{if otherwise} 232 | \end{cases} 233 | = \sgn(\relu(x)) 234 | \end{align*} 235 | where $\sgn$ is the signum function. Note that as you did in Assignment 1 with sigmoid, we are able to write the derivative of the activation in terms of the activation itself. \\ 236 | 237 | Now let's write out the chain rule for $\pd{J}{\bU}$ and $\pd{J}{\bb_2}$: 238 | \alns{ 239 | \pd{J}{\bU} &= \pd{J}{\yhat}\pd{\yhat}{\btheta}\pd{\btheta}{\bU} \\ 240 | \pd{J}{\bb_2} &= \pd{J}{\yhat}\pd{\yhat}{\btheta}\pd{\btheta}{\bb_2} 241 | } 242 | Notice that $\pd{J}{\yhat}\pd{\yhat}{\btheta} = \pd{J}{\btheta}$ is present in both gradients. This makes the math a bit cumbersome. Even worse, if we're implementing the model without automatic differentiation, computing $\pd{J}{\btheta}$ twice will be inefficient. So it will help us to define some variables to represent the intermediate derivatives: 243 | \alns{ 244 | \bdelta_1 = \pd{J}{\btheta} \quad\quad 245 | \bdelta_2 = \pd{J}{\bz} 246 | } 247 | We can compute them as follows: 248 | \alns{ 249 | \bdelta_1 &= \pd{J}{\btheta} = \yhat - \by & \text{this is just identity (7)}\\ 250 | \bdelta_2 &= \pd{J}{\bz} = \pd{J}{\btheta}\pd{\btheta}{\bh}\pd{\bh}{\bz} & \text{using the chain rule}\\ 251 | &= \bdelta_1\pd{\btheta}{\bh}\pd{\bh}{\bz} & \text{substituting in $\bdelta_1$}\\ 252 | &= \bdelta_1\ \bU^T \pd{\bh}{\bz} & \text{using identity (2)} \\ 253 | &= \bdelta_1\ \bU^T \circ \relu'(\bz) & \text{using identity (4)} \\ 254 | &= \bdelta_1\ \bU^T \circ \sgn(\bh) & \text{we computed this earlier} \\ 255 | } 256 | A good way of checking our work is by looking at the dimensions of the terms in the derivative: 257 | \alns{ 258 | &\pd{J}{\bz} \qquad = \qquad \bdelta_1 \qquad\qquad\qquad \bU^T \qquad \circ \qquad \sgn(\bh) \\ 259 | (1 &\times D_h) \hspace{9mm} (1 \times N_c) \hspace{10mm} (N_c \times D_h) \hspace{12mm} (D_h) 260 | } 261 | We see that the dimensions of all the terms in the gradient match up (i.e., the number of columns in a term equals the number of rows in the next term). 262 | This will always be the case if we computed our gradients correctly. \\ 263 | 264 | %\todo{something about checking dimensions} 265 | Now we can use the error terms to compute our gradients: 266 | \alns{ 267 | \pd{J}{\bU} &= \pd{J}{\btheta}\pd{\btheta}{\bU} = \bdelta_1\pd{\btheta}{\bU} = \bh^T \bdelta_1 & \text{using identity (6)} \\ 268 | \pd{J}{\bb_2} &= \pd{J}{\btheta}\pd{\btheta}{\bb_2} = \bdelta_1\pd{\btheta}{\bb_2} =\bdelta_1 & \text{using identity (3)} \\ 269 | \pd{J}{\bW} &= \pd{J}{\btheta}\pd{\bz}{\bW} = \bdelta_2\pd{\bz}{\bW} = \bx^T \bdelta_2 & \text{using identity (6)} \\ 270 | \pd{J}{\bb_1} &= \pd{J}{\btheta}\pd{\bz}{\bb_1} = \bdelta_2\pd{\bz}{\bb_1} = \bdelta_2 & \text{using identity (3)} \\ 271 | \pd{J}{\bL_{w_i}} &= \pd{J}{\bz}\pd{\bz}{\bL_{w_i}} = \bdelta_2 \pd{\bz}{\bL_{w_i}} 272 | %\pd{J}{\bL_{w_i}} &= \pd{J}{\btheta}\pd{\bz}{\bx}\pd{\bx}{\bL_{w_i}} = \bdelta_2 \bW^T\pd{\bx}{\bL_{w_i}} & \text{Using identity (2)} 273 | } 274 | All that's left is to compute $\pd{\bz}{\bL_{w_i}}$. 
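Before finishing, it is worth sanity-checking the boxed results numerically. Below is a minimal NumPy sketch (not part of the assignment code; the sizes and values are made up) that computes $\pd{J}{\bU}$, $\pd{J}{\bb_2}$, $\pd{J}{\bW}$, and $\pd{J}{\bb_1}$ with the identities above and compares one entry of $\pd{J}{\bW}$ against a centered finite difference:
\begin{verbatim}
import numpy as np

np.random.seed(0)
md, Dh, Nc = 6, 4, 3                      # input size (m*d), hidden size, classes
x = np.random.randn(1, md)                # row vector of concatenated embeddings
W, b1 = np.random.randn(md, Dh), np.random.randn(1, Dh)
U, b2 = np.random.randn(Dh, Nc), np.random.randn(1, Nc)
y = 1                                     # index of the correct class

def loss(W, b1, U, b2):
    z = x.dot(W) + b1
    h = np.maximum(z, 0)                  # ReLU
    theta = h.dot(U) + b2
    yhat = np.exp(theta) / np.exp(theta).sum()
    return -np.log(yhat[0, y])            # cross-entropy for a one-hot y

z = x.dot(W) + b1; h = np.maximum(z, 0)
theta = h.dot(U) + b2
yhat = np.exp(theta) / np.exp(theta).sum()
delta1 = yhat.copy(); delta1[0, y] -= 1   # delta1 = yhat - y
delta2 = delta1.dot(U.T) * np.sign(h)     # delta2 = delta1 U^T o sgn(h)
dU, db2 = h.T.dot(delta1), delta1
dW, db1 = x.T.dot(delta2), delta2

eps = 1e-5                                # numerical check of one entry of dW
Wp, Wm = W.copy(), W.copy(); Wp[2, 1] += eps; Wm[2, 1] -= eps
numeric = (loss(Wp, b1, U, b2) - loss(Wm, b1, U, b2)) / (2 * eps)
print(abs(numeric - dW[2, 1]))            # should be tiny (around 1e-9)
\end{verbatim}
With those checks in hand, we return to the remaining term $\pd{\bz}{\bL_{w_i}}$.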
It helps to split up $\bW$ by rows like this: 275 | \alns{ 276 | \bx \bW &= [\bL_{w_0}, \bL_{w_1}, ..., \bL_{w_{m - 1} }] \bW 277 | = [\bL_{w_0}, \bL_{w_1}, ..., \bL_{w_{m - 1} }] 278 | \begin{bmatrix} \bW_{0:d} \\ \bW_{d:2d} \\ \vdots \\ \bW_{(m - 1)d:md} \end{bmatrix} \\ 279 | &= \bL_{w_0} \bW_{0:d} + \bL_{w_1} \bW_{d:2d} + \dots + \bL_{w_{m - 1}}\bW_{(m - 1)d:md} 280 | =\sum_{j = 0}^{m - 1} \bL_{w_j} \bW_{dj:d(j + 1)} 281 | } 282 | When we compute $\pd{\bz}{\bL_{w_i}}$, only the $i$th term in this sum is nonzero, so we get 283 | \alns{ 284 | \pd{\bz}{\bL_{w_i}} = \pd{}{\bL_{w_i}} = \bL_{w_i } \bW_{di:d(i + 1)} = (\bW_{di:d(i + 1)})^T 285 | % = (\bW^T)_{\cdot, di:d(i + 1)} 286 | } 287 | using identity (2). 288 | 289 | 290 | 291 | \end{document} 292 | -------------------------------------------------------------------------------- /notes1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes1.pdf -------------------------------------------------------------------------------- /notes1/fig/CBOW.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes1/fig/CBOW.png -------------------------------------------------------------------------------- /notes1/fig/Skip-Gram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes1/fig/Skip-Gram.png -------------------------------------------------------------------------------- /notes1/fig/manwoman.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes1/fig/manwoman.jpg -------------------------------------------------------------------------------- /notes1/fig/sigmoid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes1/fig/sigmoid.png -------------------------------------------------------------------------------- /notes1/fig/tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes1/fig/tree.png -------------------------------------------------------------------------------- /notes1/reference.bib: -------------------------------------------------------------------------------- 1 | @incollection{Rumelhart:1988:LRB:65669.104451, 2 | author = {Rumelhart, David E. and Hinton, Geoffrey E. and Williams, Ronald J.}, 3 | chapter = {Learning Representations by Back-propagating Errors}, 4 | title = {Neurocomputing: Foundations of Research}, 5 | editor = {Anderson, James A. 
and Rosenfeld, Edward}, 6 | year = {1988}, 7 | isbn = {0-262-01097-6}, 8 | pages = {696--699}, 9 | numpages = {4}, 10 | url = {http://dl.acm.org/citation.cfm?id=65669.104451}, 11 | acmid = {104451}, 12 | publisher = {MIT Press}, 13 | address = {Cambridge, MA, USA}, 14 | } 15 | 16 | @article{Bengio:2003:NPL:944919.944966, 17 | author = {Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Janvin, Christian}, 18 | title = {A Neural Probabilistic Language Model}, 19 | journal = {J. Mach. Learn. Res.}, 20 | issue_date = {3/1/2003}, 21 | volume = {3}, 22 | month = mar, 23 | year = {2003}, 24 | issn = {1532-4435}, 25 | pages = {1137--1155}, 26 | numpages = {19}, 27 | url = {http://dl.acm.org/citation.cfm?id=944919.944966}, 28 | acmid = {944966}, 29 | publisher = {JMLR.org}, 30 | } 31 | 32 | @article{DBLP:journals/corr/abs-1103-0398, 33 | author = {Ronan Collobert and 34 | Jason Weston and 35 | L{\'{e}}on Bottou and 36 | Michael Karlen and 37 | Koray Kavukcuoglu and 38 | Pavel P. Kuksa}, 39 | title = {Natural Language Processing (almost) from Scratch}, 40 | journal = {CoRR}, 41 | volume = {abs/1103.0398}, 42 | year = {2011}, 43 | url = {http://arxiv.org/abs/1103.0398}, 44 | timestamp = {Mon, 05 Dec 2011 18:04:25 +0100}, 45 | biburl = {http://dblp.uni-trier.de/rec/bib/journals/corr/abs-1103-0398}, 46 | bibsource = {dblp computer science bibliography, http://dblp.org} 47 | } 48 | @article{DBLP:journals/corr/abs-1301-3781, 49 | author = {Tomas Mikolov and 50 | Kai Chen and 51 | Greg Corrado and 52 | Jeffrey Dean}, 53 | title = {Efficient Estimation of Word Representations in Vector Space}, 54 | journal = {CoRR}, 55 | volume = {abs/1301.3781}, 56 | year = {2013}, 57 | url = {http://arxiv.org/abs/1301.3781}, 58 | timestamp = {Thu, 07 May 2015 20:02:01 +0200}, 59 | biburl = {http://dblp.uni-trier.de/rec/bib/journals/corr/abs-1301-3781}, 60 | bibsource = {dblp computer science bibliography, http://dblp.org} 61 | } 62 | 63 | @article{DBLP:journals/corr/Rong14, 64 | author = {Xin Rong}, 65 | title = {word2vec Parameter Learning Explained}, 66 | journal = {CoRR}, 67 | volume = {abs/1411.2738}, 68 | year = {2014}, 69 | url = {http://arxiv.org/abs/1411.2738}, 70 | timestamp = {Mon, 01 Dec 2014 14:32:13 +0100}, 71 | biburl = {http://dblp.uni-trier.de/rec/bib/journals/corr/Rong14}, 72 | bibsource = {dblp computer science bibliography, http://dblp.org} 73 | } 74 | homebrowsesearchabout 75 | -------------------------------------------------------------------------------- /notes2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes2.pdf -------------------------------------------------------------------------------- /notes2.tex: -------------------------------------------------------------------------------- 1 | \documentclass{tufte-handout} 2 | 3 | \title{CS224n: Natural Language Processing with Deep Learning 4 | \thanks{Course Instructors: Christopher Manning, Richard Socher} \\ 5 | \Large Lecture Notes: Part II\thanks{Authors: Rohit Mundra, Emma Peng, Richard Socher, Ajay Sohmshetty}} 6 | 7 | 8 | \date{Winter 2017} % without \date command, current date is supplied 9 | 10 | %\geometry{showframe} % display margins for debugging page layout 11 | 12 | \usepackage{graphicx} % allow embedded images 13 | \setkeys{Gin}{width=\linewidth,totalheight=\textheight,keepaspectratio} 14 | \graphicspath{{notes2/fig/}} % set of paths to search for images 15 | 
\usepackage{amsmath} % extended mathematics 16 | \usepackage{amstext} % extended text 17 | \usepackage{booktabs} % book-quality tables 18 | \usepackage{units} % non-stacked fractions and better unit spacing 19 | \usepackage{multicol} % multiple column layout facilities 20 | \usepackage{lipsum} % filler text 21 | \usepackage{fancyvrb} % extended verbatim environments 22 | \usepackage{placeins} 23 | \fvset{fontsize=\normalsize}% default font size for fancy-verbatim environments 24 | 25 | % Standardize command font styles and environments 26 | \newcommand{\doccmd}[1]{\texttt{\textbackslash#1}}% command name -- adds backslash automatically 27 | \newcommand{\docopt}[1]{\ensuremath{\langle}\textrm{\textit{#1}}\ensuremath{\rangle}}% optional command argument 28 | \newcommand{\docarg}[1]{\textrm{\textit{#1}}}% (required) command argument 29 | \newcommand{\docenv}[1]{\textsf{#1}}% environment name 30 | \newcommand{\docpkg}[1]{\texttt{#1}}% package name 31 | \newcommand{\doccls}[1]{\texttt{#1}}% document class name 32 | \newcommand{\docclsopt}[1]{\texttt{#1}}% document class option name 33 | \newenvironment{docspec}{\begin{quote}\noindent}{\end{quote}}% command specification environment 34 | \newcommand{\argmin}{\operatornamewithlimits{argmin}} 35 | \newcommand{\argmax}{\operatornamewithlimits{argmax}} 36 | \newcommand{\textunderscript}[1]{$_{\text{#1}}$} 37 | 38 | \setcounter{secnumdepth}{3} 39 | 40 | \begin{document} 41 | 42 | \maketitle% this prints the handout title, author, and date 43 | 44 | %\printclassoptions 45 | 46 | 47 | \textbf{Keyphrases: Global Vectors for Word Representation (GloVe). Intrinsic and extrinsic evaluations. Effect of hyperparameters on analogy evaluation tasks. Correlation of human judgment with word vector distances. Dealing with ambiguity in word using contexts. Window classification.} 48 | 49 | This set of notes first introduces the GloVe model for training word vectors. Then it extends our discussion of word vectors (interchangeably called word embeddings) by seeing how they can be evaluated intrinsically and extrinsically. As we proceed, we discuss the example of word analogies as an intrinsic evaluation technique and how it can be used to tune word embedding techniques. We then discuss training model weights/parameters and word vectors for extrinsic tasks. Lastly we motivate artificial neural networks as a class of models for natural language processing tasks. 50 | 51 | \section[GloVe]{Global Vectors for Word Representation (GloVe)\footnote{This section is based on the GloVe paper by Pennington et al.: \\ Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation}} 52 | 53 | \subsection{Comparison with Previous Methods} 54 | So far, we have looked at two main classes of methods to find word embeddings. The first set are count-based and rely on matrix factorization (e.g. LSA, HAL). While these methods effectively leverage global statistical information, they are primarily used to capture word similarities and do poorly on tasks such as word analogy, indicating a sub-optimal vector space structure. 55 | The other set of methods are shallow window-based (e.g. the skip-gram and the CBOW models), which learn word embeddings by making predictions in local context windows. These models demonstrate the capacity to capture complex linguistic patterns beyond word similarity, but fail to make use of the global co-occurrence statistics. 
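To make the idea of global co-occurrence statistics concrete, here is a small sketch of how the counts $X_{ij}$ used in the next subsection can be collected (a toy corpus and a plain, unweighted window; practical implementations often also down-weight distant context words):
\begin{verbatim}
from collections import defaultdict

corpus = [["all", "that", "glitters", "is", "not", "gold"],
          ["all", "is", "well", "that", "ends", "well"]]
window = 2                          # symmetric context window size
X = defaultdict(float)              # X[(center, context)] = co-occurrence count

for sentence in corpus:
    for i, center in enumerate(sentence):
        lo, hi = max(0, i - window), min(len(sentence), i + window + 1)
        for j in range(lo, hi):
            if j != i:
                X[(center, sentence[j])] += 1

print(X[("that", "well")])          # 2.0: "well" occurs twice within 2 words of "that"
\end{verbatim}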
56 | 57 | \marginnote{\textbf{GloVe:} 58 | \begin{itemize} 59 | \item Using global statistics to predict the probability of word $j$ appearing in the context of word $i$ with a least squares objective 60 | \end{itemize} 61 | } 62 | 63 | In comparison, GloVe consists of a weighted least squares model that trains on global word-word co-occurrence counts and thus makes efficient use of statistics. The model produces a word vector space with meaningful sub-structure. It shows state-of-the-art performance on the word analogy task, and outperforms other current methods on several word similarity tasks. 64 | 65 | \subsection{Co-occurrence Matrix} 66 | 67 | \marginnote{\textbf{Co-occurrence Matrix:} 68 | \begin{itemize} 69 | \item $X$: word-word co-occurrence matrix 70 | \item $X_{ij}$: number of times word $j$ occurs in the context of word $i$ 71 | \item $X_i = \sum_k X_{ik}$: the number of times any word $k$ appears in the context of word $i$ 72 | \item $P_{ij} = P(w_j | w_i) = \frac{X_{ij}}{X_i}$: the probability of word $j$ appearing in the context of word $i$ 73 | \end{itemize} 74 | } 75 | 76 | Let $X$ denote the word-word co-occurrence matrix, where $X_{ij}$ indicates the number of times word $j$ occurs in the context of word $i$. Let $X_i = \sum_k X_{ik}$ be the number of times any word $k$ appears in the context of word $i$. Finally, let $P_{ij} = P(w_j | w_i) = \frac{X_{ij}}{X_i}$ be the probability of word $j$ appearing in the context of word $i$. 77 | 78 | Populating this matrix requires a single pass through the entire corpus to collect the statistics. For large corpora, this pass can be computationally expensive, but it is a one-time up-front cost. 79 | 80 | \subsection{Least Squares Objective} 81 | Recall that for the skip-gram model, we use softmax to compute the probability of word $j$ appearing in the context of word $i$: 82 | 83 | \[ 84 | Q_{ij} = \frac{\exp(\vec{u}_j^T \vec{v}_i)}{\sum_{w=1}^W \exp(\vec{u}_w^T \vec{v}_i)} 85 | \] 86 | 87 | Training proceeds in an on-line, stochastic fashion, but the implied global cross-entropy loss can be calculated as: 88 | 89 | \[ 90 | J = -\sum_{i \in corpus} \sum_{j \in context(i)} \log Q_{ij} 91 | \] 92 | 93 | As the same words $i$ and $j$ can appear multiple times in the corpus, it is more efficient to first group together the same values for $i$ and $j$: 94 | 95 | \[ 96 | J = -\sum_{i=1}^W \sum_{j=1}^W X_{ij} \log Q_{ij} 97 | \] 98 | 99 | where the co-occurrence counts are given by the co-occurrence matrix $X$. One significant drawback of the cross-entropy loss is that it requires the distribution $Q$ to be properly normalized, which involves the expensive summation over the entire vocabulary. Instead, we use a least squares objective in which the normalization factors in $P$ and $Q$ are discarded: 100 | 101 | \[ 102 | \hat{J} = \sum_{i=1}^W \sum_{j=1}^W X_i (\hat{P}_{ij} - \hat{Q}_{ij})^2 103 | \] 104 | 105 | where $\hat{P}_{ij} = X_{ij}$ and $\hat{Q}_{ij} = \exp(\vec{u}_j^T \vec{v}_i)$ are the unnormalized distributions. This formulation introduces a new problem -- $X_{ij}$ often takes on very large values and makes the optimization difficult. An effective change is to minimize the squared error of the logarithms of $\hat{P}$ and $\hat{Q}$: 106 | 107 | \begin{align*} 108 | \hat{J} & = \sum_{i=1}^W \sum_{j=1}^W X_i (\log(\hat{P}_{ij}) - \log(\hat{Q}_{ij}))^2 \\ 109 | & = \sum_{i=1}^W \sum_{j=1}^W X_i (\vec{u}_j^T \vec{v}_i - \log X_{ij})^2 110 | \end{align*} 111 | 112 | Another observation is that the weighting factor $X_i$ is not guaranteed to be optimal.
Instead, we introduce a more general weighting function, which we are free to take to depend on the context word as well: 113 | 114 | \[ 115 | \hat{J} = \sum_{i=1}^W \sum_{j=1}^W f(X_{ij}) (\vec{u}_j^T \vec{v}_i - \log X_{ij})^2 116 | \] 117 | 118 | \subsection{Conclusion} 119 | In conclusion, the GloVe model efficiently leverages global statistical information by training only on the nonzero elements in a word-word co-occurrence matrix, and produces a vector space with meaningful sub-structure. It consistently outperforms $word2vec$ on the word analogy task, given the same corpus, vocabulary, window size, and training time. It achieves better results faster, and also obtains the best results irrespective of speed. 120 | 121 | 122 | \section{Evaluation of Word Vectors}\label{sec:eval-wv} 123 | 124 | So far, we have discussed methods such as the \textit{Word2Vec} and \textit{GloVe} methods to train and discover latent vector representations of natural language words in a semantic space. In this section, we discuss how we can quantitatively evaluate the quality of word vectors produced by such techniques. 125 | 126 | \subsection{Intrinsic Evaluation}\label{sec:intrinsic} 127 | Intrinsic evaluation of word vectors is the evaluation of a set of word vectors generated by an embedding technique (such as Word2Vec or GloVe) on specific intermediate subtasks (such as analogy completion). These subtasks are typically simple and fast to compute and thereby allow us to help understand the system used to generate the word vectors. An intrinsic evaluation should typically return to us a number that indicates the performance of those word vectors on the evaluation subtask. 128 | 129 | \begin{marginfigure}% 130 | \includegraphics[width=\linewidth]{IntrinsicEval} 131 | \caption{The left subsystem (red) being expensive to train is modified by substituting with a simpler subsystem (green) for intrinsic evaluation.} 132 | \label{fig:IntrinsicEval} 133 | \end{marginfigure} 134 | 135 | \textbf{Motivation:} Let us consider an example where our final goal is to create a question answering system which uses word vectors as inputs. One approach of doing so would be to train a machine learning system that: 136 | \begin {enumerate} 137 | \item Takes words as inputs 138 | \item Converts them to word vectors 139 | \item Uses word vectors as inputs for an elaborate machine learning system 140 | \item Maps the output word vectors by this system back to natural language words 141 | \item Produces words as answers 142 | \end{enumerate} 143 | 144 | \marginnote{\textbf{Intrinsic evaluation:} 145 | \begin{itemize} 146 | \item Evaluation on a specific, intermediate task 147 | \item Fast to compute performance 148 | \item Helps understand subsystem 149 | \item Needs positive correlation with real task to determine usefulness 150 | \end{itemize} 151 | } 152 | 153 | Of course, in the process of making such a state-of-the-art question-answering system, we will need to create optimal word-vector representations since they are used in downstream subsystems (such as deep neural networks). To do this in practice, we will need to tune many hyperparameters in the Word2Vec subsystem (such as the dimension of the word vector representation). 
While the idealistic approach is to retrain the entire system after any parametric changes in the Word2Vec subsystem, this is impractical from an engineering standpoint because the machine learning system (in step 3) is typically a deep neural network with millions of parameters that takes very long to train. In such a situation, we would want to come up with a simple intrinsic evaluation technique which can provide a measure of "goodness" of the word to word vector subsystem. Obviously, a requirement is that the intrinsic evaluation has a positive correlation with the final task performance. 154 | 155 | \subsection{Extrinsic Evaluation}\label{sec:extrinsic} 156 | 157 | \marginnote{\textbf{Extrinsic evaluation:} 158 | \begin{itemize} 159 | \item Is the evaluation on a real task 160 | \item Can be slow to compute performance 161 | \item Unclear if subsystem is the problem, other subsystems, or internal interactions 162 | \item If replacing subsystem improves performance, the change is likely good 163 | \end{itemize} 164 | } 165 | 166 | Extrinsic evaluation of word vectors is the evaluation of a set of word vectors generated by an embedding technique on the real task at hand. These tasks are typically elaborate and slow to compute. Using our example from above, the system which allows for the evaluation of answers from questions is the extrinsic evaluation system. Typically, optimizing over an underperforming extrinsic evaluation system does not allow us to determine which specific subsystem is at fault and this motivates the need for intrinsic evaluation. 167 | 168 | \subsection{Intrinsic Evaluation Example: Word Vector Analogies} 169 | A popular choice for intrinsic evaluation of word vectors is its performance in completing word vector analogies. In a word vector analogy, we are given an incomplete analogy of the form:\\ 170 | \centerline{a : b : : c : ?} 171 | The intrinsic evaluation system then identifies the word vector which maximizes the cosine similarity: 172 | $$ d = \argmax_i \frac{(x_b - x_a + x_c)^Tx_i} {\|x_b - x_a + x_c\|}$$ 173 | This metric has an intuitive interpretation. Ideally, we want $x_b - x_a = x_d - x_c$ (For instance, queen -- king = actress -- actor). This implies that we want $x_b - x_a + x_c = x_d$. Thus we identify the vector $x_d$ which maximizes the normalized dot-product between the two word 174 | vectors (i.e. cosine similarity). 175 | 176 | Using intrinsic evaluation techniques such as word-vector analogies should be handled with care (keeping in mind various aspects of the corpus used for pre-training). 
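As a concrete illustration of the $\argmax$ above, here is a minimal sketch of analogy completion (the vocabulary and vectors are made up, and the three query words are excluded from the candidates, as is standard practice):
\begin{verbatim}
import numpy as np

vocab = ["king", "queen", "man", "woman", "apple"]   # toy vocabulary
idx = {w: i for i, w in enumerate(vocab)}
E = np.random.randn(len(vocab), 50)                  # toy word vectors, one row per word
E = E / np.linalg.norm(E, axis=1, keepdims=True)     # unit rows => dot product = cosine

def analogy(a, b, c):
    """Complete 'a : b :: c : ?' by maximizing cosine similarity."""
    query = E[idx[b]] - E[idx[a]] + E[idx[c]]
    scores = E.dot(query / np.linalg.norm(query))    # cosine similarity with every word
    scores[[idx[a], idx[b], idx[c]]] = -np.inf       # exclude the query words themselves
    return vocab[int(np.argmax(scores))]

print(analogy("man", "king", "woman"))               # "queen", with well-trained vectors
\end{verbatim}
Whether the word returned is actually the intended answer depends heavily on the corpus the vectors were trained on.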
For instance, consider analogies of the form:\\ 177 | \centerline{City 1 : State containing City 1 : : City 2 : State containing City 2} 178 | 179 | \begin{table}[ht] 180 | \centering 181 | \fontfamily{ppl}\selectfont 182 | \begin{tabular}{ll} 183 | \toprule 184 | Input & Result Produced \\ 185 | \midrule 186 | Chicago : Illinois : : Houston & Texas\\ 187 | Chicago : Illinois : : Philadelphia & Pennsylvania \\ 188 | Chicago : Illinois : : Phoenix & Arizona\\ 189 | Chicago : Illinois : : Dallas & Texas\\ 190 | Chicago : Illinois : : Jacksonville & Florida \\ 191 | Chicago : Illinois : : Indianapolis & Indiana \\ 192 | Chicago : Illinois : : Austin & Texas\\ 193 | Chicago : Illinois : : Detroit & Michigan\\ 194 | Chicago : Illinois : : Memphis & Tennessee \\ 195 | Chicago : Illinois : : Boston & Massachusetts\\ 196 | \bottomrule 197 | \end{tabular} 198 | \caption{Here are \textbf{semantic} word vector analogies (intrinsic evaluation) that may suffer from different cities having the same name} 199 | \label{tab:normaltab} 200 | \end{table} 201 | 202 | In many cases above, there are multiple cities/towns/villages with the same name across the US. Thus, many states would qualify as the right answer. For instance, there are at least 10 places in the US called Phoenix and thus, Arizona need not be the only correct response. Let us now consider analogies of the form:\\ 203 | \centerline{Capital City 1 : Country 1 : : Capital City 2 : Country 2} 204 | 205 | \begin{table}[ht] 206 | \centering 207 | \fontfamily{ppl}\selectfont 208 | \begin{tabular}{ll} 209 | \toprule 210 | Input & Result Produced \\ 211 | \midrule 212 | Abuja : Nigeria : : Accra & Ghana\\ 213 | Abuja : Nigeria : : Algiers & Algeria\\ 214 | Abuja : Nigeria : : Amman & Jordan\\ 215 | Abuja : Nigeria : : Ankara & Turkey\\ 216 | Abuja : Nigeria : : Antananarivo & Madagascar \\ 217 | Abuja : Nigeria : : Apia & Samoa\\ 218 | Abuja : Nigeria : : Ashgabat & Turkmenistan \\ 219 | Abuja : Nigeria : : Asmara & Eritrea\\ 220 | Abuja : Nigeria : : Astana & Kazakhstan\\ 221 | \bottomrule 222 | \end{tabular} 223 | \caption{Here are \textbf{semantic} word vector analogies (intrinsic evaluation) that may suffer from countries having different capitals at different points in time} 224 | \label{tab:normaltab} 225 | \end{table} 226 | 227 | In many of the cases above, the resulting city produced by this task has only been the capital in the recent past. For instance, prior to 1997 the capital of Kazakhstan was Almaty. Thus, we can anticipate other issues if our corpus is dated. 228 | 229 | The previous two examples demonstrated semantic testing using word vectors. We can also test syntax using word vector analogies. 
The following intrinsic evaluation tests the word vectors' ability to capture the notion of superlative adjectives: 230 | 231 | \begin{table}[ht] 232 | \centering 233 | \fontfamily{ppl}\selectfont 234 | \begin{tabular}{ll} 235 | \toprule 236 | Input & Result Produced \\ 237 | \midrule 238 | bad : worst : : big & biggest\\ 239 | bad : worst : : bright & brightest \\ 240 | bad : worst : : cold & coldest \\ 241 | bad : worst : : cool & coolest \\ 242 | bad : worst : : dark & darkest \\ 243 | bad : worst : : easy & easiest \\ 244 | bad : worst : : fast & fastest\\ 245 | bad : worst : : good & best\\ 246 | bad : worst : : great & greatest\\ 247 | \bottomrule 248 | \end{tabular} 249 | \caption{Here are \textbf{syntactic} word vector analogies (intrinsic evaluation) that test the notion of superlative adjectives} 250 | \label{tab:normaltab} 251 | \end{table} 252 | 253 | Similarly, the intrinsic evaluation shown below tests the word vectors' ability to capture the notion of past tense: 254 | 255 | \begin{table}[ht] 256 | \centering 257 | \fontfamily{ppl}\selectfont 258 | \begin{tabular}{ll} 259 | \toprule 260 | Input & Result Produced \\ 261 | \midrule 262 | dancing : danced : : decreasing & decreased \\ 263 | dancing : danced : : describing & described \\ 264 | dancing : danced : : enhancing & enhanced \\ 265 | dancing : danced : : falling & fell\\ 266 | dancing : danced : : feeding & fed\\ 267 | dancing : danced : : flying & flew\\ 268 | dancing : danced : : generating & generated \\ 269 | dancing : danced : : going & went\\ 270 | dancing : danced : : hiding & hid\\ 271 | dancing : danced : : hitting & hit\\ 272 | \bottomrule 273 | \end{tabular} 274 | \caption{Here are \textbf{syntactic} word vector analogies (intrinsic evaluation) that test the notion of past tense} 275 | \label{tab:normaltab} 276 | \end{table} 277 | 278 | \subsection{Intrinsic Evaluation Tuning Example: Analogy Evaluations} 279 | 280 | \marginnote{Some parameters we might consider tuning for a word embedding technique on intrinsic evaluation tasks are: 281 | \begin{itemize} 282 | \item Dimension of word vectors 283 | \item Corpus size 284 | \item Corpus souce/type 285 | \item Context window size 286 | \item Context symmetry 287 | \end{itemize} 288 | Can you think of other hyperparameters tunable at this stage? 289 | } 290 | 291 | We now explore some of the hyperparameters in word vector embedding techniques (such as Word2Vec and GloVe) that can be tuned using an intrinsic evaluation system (such as an analogy completion system). 
Let us first see how different methods for creating word-vector embeddings have performed (in recent research work) under the same hyperparameters on an analogy evaluation task: 292 | 293 | \begin{table}[ht] 294 | \centering 295 | \fontfamily{ppl}\selectfont 296 | \begin{tabular}{lll | lll} 297 | \toprule 298 | Model & Dimension & Size & Semantics & Syntax & Total \\ 299 | \midrule 300 | ivLBL & 100 & 1.5B & 55.9 & 50.1 & 53.2 \\ 301 | HPCA & 100 & 1.6B & 4.2 & 16.4 & 10.8\\ 302 | GloVE & 100 & 1.6B & 67.5 & 54.3 & 60.3\\ 303 | \hline 304 | SG & 300 & 1B & 61 & 61 & 61\\ 305 | CBOW & 300 & 1.6B & 16.1 & 52.6 & 36.1\\ 306 | vLBL & 300 & 1.5B & 54.2 & 64.8 & 60.0\\ 307 | ivLBL & 300 & 1.5B & 65.2 & 63.0 & 64.0\\ 308 | GloVe & 300 & 1.6B & 80.8 & 61.5 & 70.3\\ 309 | \hline 310 | SVD & 300 & 6B & 6.3 & 8.1 & 7.3\\ 311 | SVD-S & 300 & 6B & 36.7 & 46.6 & 42.1\\ 312 | SVD-L & 300 & 6B & 56.6 & 63.0 & 60.1\\ 313 | CBOW & 300 &6B & 63.6 & 67.4 & 65.7\\ 314 | SG & 300 & 6B & 73.0 & 66.0 & 69.1\\ 315 | GloVe & 300 & 6B & 77.4 & 67.0 & 71.7\\ 316 | \hline 317 | CBOW & 1000 & 6B & 57.3 & 68.9 & 63.7\\ 318 | SG & 1000 & 6B & 66.1 & 65.1 & 65.6\\ 319 | SVD-L & 300 & 42B & 38.4 & 58.2 & 49.2\\ 320 | GloVe & 300 & 42B & 81.9 & 69.3 & 75.0\\ 321 | \bottomrule 322 | \end{tabular} 323 | \caption{Here we compare the performance of different models under the use of different hyperparameters and datasets} 324 | \label{tab:normaltab} 325 | \end{table} 326 | 327 | \FloatBarrier 328 | 329 | Inspecting the above table, we can make 3 primary observations: 330 | 331 | \begin{itemize} 332 | \item \textbf{Performance is heavily dependent on the model used for word embedding:}\\ This is an expected result since different methods try embedding words to vectors using fundamentally different properties (such as co-occurrence count, singular vectors, etc.) 333 | 334 | \marginnote{\textbf{Implementation Tip:} A window size of 8 around each center word typically works well for GloVe embeddings} 335 | 336 | \begin{marginfigure}% 337 | \includegraphics[width=\linewidth]{TrainTime} 338 | \caption{Here we see how training time improves training performance and helps squeeze the last few performance.} 339 | \label{fig:IntrinsicEval} 340 | \end{marginfigure} 341 | 342 | 343 | \item \textbf{Performance increases with larger corpus sizes:}\\ This happens because of the experience an embedding technique gains with more examples it sees. For instance, an analogy completion example will produce incorrect results if it has not encountered the test words previously. 344 | 345 | 346 | \item \textbf{Performance is lower for extremely low as well as for extremely high dimensional word vectors:}\\ Lower dimensional word vectors are not able to capture the different meanings of the different words in the corpus. This can be viewed as a high bias problem where our model complexity is too low. For instance, let us consider the words "king", "queen", "man", "woman". Intuitively, we would need to use two dimensions such as "gender" and "leadership" to encode these into 2-bit word vectors. Any lower would fail to capture semantic differences between the four words and any more may capture noise in the corpus that doesn't help in generalization -- this is also known as the high variance problem. 347 | \end{itemize} 348 | 349 | Figure~\ref{fig:DataSize} demonstrates how accuracy has been shown to improve with larger corpus. 
350 | 351 | \begin{figure*}% 352 | \includegraphics[width = 12cm]{DataSize} 353 | \caption{Here we see how performance improves with data size.} 354 | \label{fig:DataSize} 355 | \end{figure*} 356 | 357 | Figure~\ref{fig:hyperparam} demonstrates how other hyperparameters have been shown to affect the accuracies using GloVe. 358 | 359 | \begin{figure*}% 360 | \includegraphics[width = 15cm]{hyperparam} 361 | \caption{We see how accuracies vary with vector dimension and context window size for GloVe} 362 | \label{fig:hyperparam} 363 | \end{figure*} 364 | 365 | \subsection{Intrinsic Evaluation Example: Correlation Evaluation} 366 | Another simple way to evaluate the quality of word vectors is by asking humans to assess the similarity between two words on a fixed scale (say 0-10) and then comparing this with the cosine similarity between the corresponding word vectors. This has been done on various datasets that contain human judgement survey data. 367 | 368 | \begin{table}[ht] 369 | \centering 370 | \fontfamily{ppl}\selectfont 371 | \begin{tabular}{ll | lllll} 372 | \toprule 373 | Model & Size & WS353 & MC & RG & SCWS & RW \\ 374 | \midrule 375 | SVD & 6B & 35.3 & 35.1 & 42.5 & 38.3 & 25.6\\ 376 | SVD-S & 6B & 56.5 & 71.5 & 71.0 & 53.6 & 34.7\\ 377 | SVD-L & 6B & 65.7 & 72.7 & 75.1 & 56.5 & 37.0\\ 378 | CBOW & 6B & 57.2 & 65.6 & 68.2 & 57.0 & 32.5\\ 379 | SG & 6B & 62.8 & 65.2 & 69.7 & 58.1 & 37.2\\ 380 | GloVe & 6B & 65.8 & 72.7 & 77.8 & 53.9 & 38.1\\ 381 | \hline 382 | SVD-L & 42B & 74.0 & 76.4 & 74.1 & 58.3 & 39.9\\ 383 | GloVe & 42B & 75.9 & 83.6 & 82.9 & 59.6 & 47.8\\ 384 | \hline 385 | CBOW & 100B & 68.4 & 79.6 & 75.4 & 59.4 & 45.5\\ 386 | \bottomrule 387 | \end{tabular} 388 | \caption{Here we see the correlations between of word vector similarities using different embedding techniques with different human judgment datasets} 389 | \label{tab:normaltab} 390 | \end{table} 391 | 392 | \subsection{Further Reading: Dealing With Ambiguity} 393 | One might wonder how we handle the situation where we want to capture the same word with different vectors for its different uses in natural language. For instance, "run" is both a noun and a verb and is used and interpreted differently based on the context. \textsc{Improving Word Representations Via Global Context And Multiple Word Prototypes (Huang et al, 2012)} describes how such cases can also be handled in NLP. The essence of the method is the following: 394 | 395 | \begin{enumerate} 396 | \item Gather fixed size context windows of all occurrences of the word (for instance, 5 before and 5 after) 397 | \item Each context is represented by a weighted average of the context words' vectors (using idf-weighting) 398 | \item Apply spherical k-means to cluster these context representations. 399 | \item Finally, each word occurrence is re-labeled to its associated cluster and is used to train the word representation for that cluster. 400 | \end{enumerate} 401 | 402 | For a more rigorous treatment on this topic, one should refer to the original paper. 403 | 404 | \section{Training for Extrinsic Tasks} 405 | 406 | We have so far focused on intrinsic tasks and emphasized their importance in developing a good word embedding technique. Of course, the end goal of most real-world problems is to use the resulting word vectors for some other extrinsic task. Here we discuss the general approach for handling extrinsic tasks. 407 | 408 | \subsection{Problem Formulation} 409 | Most NLP extrinsic tasks can be formulated as classification tasks. 
For instance, given a sentence, we can classify the sentence to have positive, negative or neutral sentiment. Similarly, in named-entity recognition (NER), given a context and a central word, we want to classify the central word to be one of many classes. For the input, "Jim bought 300 shares of Acme Corp. in 2006", we would like a classified output "[Jim]\textunderscript{Person} bought 300 shares of [Acme Corp.]\textunderscript{Organization} in [2006]\textunderscript{Time}." 410 | 411 | \begin{marginfigure}% 412 | \includegraphics[width = \linewidth]{LinearBoundary} 413 | \caption{We can classify word vectors using simple linear decision boundaries such as the one shown here (2-D word vectors) using techniques such as logistic regression and SVMs} 414 | \label{fig:LinearBoundary} 415 | \end{marginfigure} 416 | 417 | For such problems, we typically begin with a training set of the form: 418 | $$\{x^{(i)},y^{(i)}\}_1^N$$ 419 | where $x^{(i)}$ is a $d$-dimensional word vector generated by some word embedding technique and $y^{(i)}$ is a $C$-dimensional one-hot vector which indicates the labels we wish to eventually predict (sentiments, other words, named entities, buy/sell decisions, etc.). 420 | 421 | In typical machine learning tasks, we usually hold input data and target labels fixed and train weights using optimization techniques (such as gradient descent, L-BFGS, Newton's method, etc.). In NLP applications however, we introduce the idea of retraining the input word vectors when we train for extrinsic tasks. Let us discuss when and why we should consider doing this. 422 | 423 | \marginnote{\textbf{Implementation Tip:} Word vector retraining should be considered for large training datasets. For small datasets, retraining word vectors will likely worsen performance.} 424 | 425 | \subsection{Retraining Word Vectors} 426 | 427 | As we have discussed so far, the word vectors we use for extrinsic tasks are initialized by optimizing them over a simpler intrinsic task. In many cases, these pretrained word vectors are a good proxy for optimal word vectors for the extrinsic task and they perform well at the extrinsic task. However, it is also possible that the pretrained word vectors could be trained further (i.e. retrained) using the extrinsic task this time to perform better. However, retraining word vectors can be risky. 428 | 429 | \begin{marginfigure}% 430 | \includegraphics[width = \linewidth]{pretraining} 431 | \caption{Here, we see that the words "Telly", "TV", and "Television" are classified correctly before retraining. "Telly" and "TV" are present in the extrinsic task training set while "Television" is only present in the test set.} 432 | \label{fig:pretraining} 433 | \end{marginfigure} 434 | 435 | If we retrain word vectors using the extrinsic task, we need to ensure that the training set is large enough to cover most words from the vocabulary. This is because Word2Vec or GloVe produce semantically related words to be located in the same part of the word space. When we retrain these words over a small set of the vocabulary, these words are shifted in the word space and as a result, the performance over the final task could actually reduce. Let us explore this idea further using an example. Consider the pretrained vectors to be in a two dimensional space as shown in Figure~\ref{fig:pretraining}. Here, we see that the word vectors are classified correctly on some extrinsic classification task. 
Now, if we retrain only two of those vectors because of a limited training set size, then we see in Figure~\ref{fig:retraining} that one of the words gets misclassified because the boundary shifts as a result of word vector updates. 436 | 437 | Thus, word vectors should not be retrained if the training data set is small. If the training set is large, retraining may improve performance. 438 | 439 | \begin{marginfigure}% 440 | \includegraphics[width = \linewidth]{retraining} 441 | \caption{Here, we see that the words "Telly" and "TV" are classified correctly after training, but "Television" is not since it was not present in the training set.} 442 | \label{fig:retraining} 443 | \end{marginfigure} 444 | 445 | \subsection{Softmax Classification and Regularization} 446 | 447 | Let us consider using the Softmax classification function which has the form: 448 | $$p(y_j = 1|x) = \frac{\exp(W_{j\cdot}x)}{\sum_{c=1}^C\exp(W_{c\cdot}x)}$$ 449 | Here, we calculate the probability of word vector $x$ being in class $j$. Using the cross-entropy loss function, we calculate the loss of such a training example as: 450 | $$-\sum_{j=1}^{C}y_j\log(p(y_j = 1|x)) = -\sum_{j=1}^{C}y_j\log \bigg(\frac{\exp(W_{j\cdot}x)}{\sum_{c=1}^C\exp(W_{c\cdot}x)}\bigg)$$ 451 | Of course, the above summation will be a sum over $(C-1)$ zero values since $y_j$ is $1$ only at a single index (at least for now), implying that $x$ belongs to only $1$ correct class. Thus, let us define $k$ to be the index of the correct class. We can now simplify our loss to: 452 | $$-\log \bigg(\frac{\exp(W_{k\cdot}x)}{\sum_{c=1}^C\exp(W_{c\cdot}x)}\bigg)$$ 453 | We can then extend the above loss to a dataset of $N$ points: 454 | $$-\sum_{i = 1}^N\log \bigg(\frac{\exp(W_{k{(i)}\cdot}x^{(i)})}{\sum_{c=1}^C\exp(W_{c\cdot}x^{(i)})}\bigg)$$ 455 | The only difference above is that $k(i)$ is now a function that returns the correct class index for example $x^{(i)}$. 456 | 457 | Let us now try to estimate the number of parameters that would be updated if we consider training both the model weights ($W$) as well as the word vectors ($x$). We know that a simple linear decision boundary would require a model that takes in at least one $d$-dimensional input word vector and produces a distribution over $C$ classes. Thus, to update the model weights, we would be updating $C\cdot d$ parameters. If we update the word vectors for every word in the vocabulary $V$ as well, then we would be updating as many as $|V|$ word vectors, each of which is $d$-dimensional. Thus, the total number of parameters would be as many as $C\cdot d + |V|\cdot d$ for a simple linear classifier: 458 | 459 | $$\nabla_{\theta} J(\theta) = \left[ \begin{array}{c} \nabla_{W_{\cdot 1}} \\ \vdots \\ \nabla_{W_{\cdot d}} \\ \nabla_{x_{aardvark}} \\ \vdots \\ \nabla_{x_{zebra}} \end{array} \right] $$ 460 | 461 | This is an extremely large number of parameters considering how simple the model's decision boundary is; such a large number of parameters is highly prone to overfitting. 462 | 463 | To reduce overfitting risk, we introduce a regularization term which poses the Bayesian belief that the parameters ($\theta$) should be small in magnitude (i.e. close to zero): 464 | $$-\sum_{i = 1}^N\log \bigg(\frac{\exp(W_{k{(i)}\cdot}x^{(i)})}{\sum_{c=1}^C\exp(W_{c\cdot}x^{(i)})}\bigg) + \lambda \sum_{k=1}^{C\cdot d + |V|\cdot d} \theta_k^2$$ 465 | 466 | Minimizing the above cost function reduces the likelihood of the parameters taking on extremely large values just to fit the training set well and may improve generalization if the relative objective weight $\lambda$ is tuned well. The idea of regularization becomes even more of a requirement once we explore more complex models (such as neural networks) which have far more parameters.
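The following is a minimal NumPy sketch of this regularized objective and its gradients, differentiating with respect to both the classifier weights and the input word vectors (the latter is exactly what word vector retraining updates). The array names \texttt{W}, \texttt{X}, and \texttt{y} are illustrative assumptions, and the penalty here is applied only to the word vectors that appear in the batch rather than to all $|V|\cdot d$ parameters.

\begin{verbatim}
import numpy as np

def regularized_softmax_loss(W, X, y, lam=1e-4):
    """Cross-entropy loss with an L2 penalty; W is (C x d), X is (N x d)
    looked-up word vectors, y holds the correct class index k(i) per example."""
    scores = X @ W.T                                 # (N, C) class scores W x
    scores -= scores.max(axis=1, keepdims=True)      # numerical stability
    probs = np.exp(scores)
    probs /= probs.sum(axis=1, keepdims=True)        # softmax p(y_j = 1 | x)
    N = X.shape[0]
    loss = -np.log(probs[np.arange(N), y]).sum()     # sum over the N data points
    loss += lam * (np.sum(W ** 2) + np.sum(X ** 2))  # regularization term

    dscores = probs.copy()
    dscores[np.arange(N), y] -= 1.0                  # gradient of loss w.r.t. scores
    dW = dscores.T @ X + 2 * lam * W                 # update for the model weights
    dX = dscores @ W + 2 * lam * X                   # update for the word vectors
    return loss, dW, dX
\end{verbatim}

Whether the returned \texttt{dX} is actually applied to the embedding matrix is exactly the retraining decision discussed above: apply it only when the extrinsic training set is large.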
467 | 468 | \subsection{Window Classification} 469 | 470 | \begin{marginfigure}% 471 | \includegraphics[width = \linewidth]{window} 472 | \caption{Here, we see a central word with a symmetric window of length 2. Such context may help disambiguate between the place Paris and the name Paris.} 473 | \label{fig:window} 474 | \end{marginfigure} 475 | 476 | So far we have primarily explored the idea of predicting in extrinsic tasks using a single word vector $x$. In reality, this is hardly done because of the nature of natural languages. Natural languages tend to use the same word for very different meanings and we typically need to know the context of the word usage to discriminate between meanings. For instance, if you were asked to explain to someone what "to sanction" meant, you would immediately realize that depending on the context "to sanction" could mean "to permit" or "to punish". In most situations, we tend to use a sequence of words as input to the model. A sequence is a central word vector preceded and succeeded by context word vectors. The number of words in the context is also known as the context window size and varies depending on the problem being solved. Generally, narrower window sizes lead to better performance in syntactic tests while wider windows lead to better performance in semantic tests. 477 | 478 | \marginnote{Generally, narrower window sizes lead to better performance in syntactic tests while wider windows lead to better performance in semantic tests.} 479 | 480 | In order to modify the previously discussed Softmax model to use windows of words for classification, we would simply substitute $x^{(i)}$ with $x_{window}^{(i)}$ in the following manner: 481 | 482 | $$x_{window}^{(i)} = \left[ \begin{array}{c} x^{(i-2)} \\ x^{(i-1)} \\ x^{(i)} \\ x^{(i+1)} \\ x^{(i+2)} \end{array} \right] $$ 483 | 484 | As a result, when we evaluate the gradient of the loss with respect to the words, we will receive gradients for the word vectors: 485 | 486 | $$\delta_{window} = \left[ \begin{array}{c} \nabla_{x^{(i-2)}} \\ \nabla_{x^{(i-1)}} \\ \nabla_{x^{(i)}} \\ \nabla_{x^{(i+1)}} \\ \nabla_{x^{(i+2)}} \end{array} \right] $$ 487 | 488 | The gradient will of course need to be distributed to update the corresponding word vectors in implementation. 489 | 490 | \subsection{Non-linear Classifiers} 491 | 492 | \begin{marginfigure}% 493 | \includegraphics[width = \linewidth]{LinearBoundary2} 494 | \caption{Here, we see that many examples are wrongly classified even though the best linear decision boundary is chosen. This is because linear decision boundaries have limited model capacity for this dataset.} 495 | \label{fig:LinearBoundary2} 496 | \end{marginfigure} 497 | 498 | We now introduce the need for non-linear classification models such as neural networks. We see in Figure~\ref{fig:LinearBoundary2} that a linear classifier misclassifies many datapoints.
Using a non-linear decision boundary as shown in Figure~\ref{fig:NonlinearBoundary}, we manage to classify all training points accurately. Although oversimplified, this is a classic case demonstrating the need for non-linear decision boundaries. In the next set of notes, we study neural networks as a class of non-linear models that have performed particularly well in deep learning applications. 499 | 500 | \begin{marginfigure}% 501 | \includegraphics[width = \linewidth]{NonlinearBoundary} 502 | \caption{Here, we see that the non-linear decision boundary allows for much better classification of datapoints.} 503 | \label{fig:NonlinearBoundary} 504 | \end{marginfigure} 505 | 506 | \end{document} 507 | -------------------------------------------------------------------------------- /notes2/Resources/ImageBlocks.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes2/Resources/ImageBlocks.pptx -------------------------------------------------------------------------------- /notes2/fig/DataSize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes2/fig/DataSize.png -------------------------------------------------------------------------------- /notes2/fig/IntrinsicEval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes2/fig/IntrinsicEval.png -------------------------------------------------------------------------------- /notes2/fig/LinearBoundary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes2/fig/LinearBoundary.png -------------------------------------------------------------------------------- /notes2/fig/LinearBoundary2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes2/fig/LinearBoundary2.png -------------------------------------------------------------------------------- /notes2/fig/NonlinearBoundary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes2/fig/NonlinearBoundary.png -------------------------------------------------------------------------------- /notes2/fig/TrainTime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes2/fig/TrainTime.png -------------------------------------------------------------------------------- /notes2/fig/hyperparam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes2/fig/hyperparam.png -------------------------------------------------------------------------------- /notes2/fig/pretraining.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes2/fig/pretraining.png -------------------------------------------------------------------------------- /notes2/fig/retraining.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes2/fig/retraining.png -------------------------------------------------------------------------------- /notes2/fig/window.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes2/fig/window.png -------------------------------------------------------------------------------- /notes3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3.pdf -------------------------------------------------------------------------------- /notes3/Resources/ImageBlocks.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/Resources/ImageBlocks.pptx -------------------------------------------------------------------------------- /notes3/Resources/NNet.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/Resources/NNet.pptx -------------------------------------------------------------------------------- /notes3/fig/421nnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/fig/421nnet.png -------------------------------------------------------------------------------- /notes3/fig/ErrorSignal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/fig/ErrorSignal.png -------------------------------------------------------------------------------- /notes3/fig/ErrorSignal2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/fig/ErrorSignal2.png -------------------------------------------------------------------------------- /notes3/fig/ErrorSignal3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/fig/ErrorSignal3.png -------------------------------------------------------------------------------- /notes3/fig/Error_Surf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/fig/Error_Surf.png -------------------------------------------------------------------------------- /notes3/fig/NonlinearBoundary.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/fig/NonlinearBoundary.png -------------------------------------------------------------------------------- /notes3/fig/SimpleFF.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/fig/SimpleFF.png -------------------------------------------------------------------------------- /notes3/fig/SingleLayerNeuralNetwork.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/fig/SingleLayerNeuralNetwork.png -------------------------------------------------------------------------------- /notes3/fig/dropout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/fig/dropout.png -------------------------------------------------------------------------------- /notes3/fig/graph_hardtanh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/fig/graph_hardtanh.png -------------------------------------------------------------------------------- /notes3/fig/graph_leaky.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/fig/graph_leaky.png -------------------------------------------------------------------------------- /notes3/fig/graph_relu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/fig/graph_relu.png -------------------------------------------------------------------------------- /notes3/fig/graph_sigmoid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/fig/graph_sigmoid.png -------------------------------------------------------------------------------- /notes3/fig/graph_softsign.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/fig/graph_softsign.png -------------------------------------------------------------------------------- /notes3/fig/graph_tanh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/fig/graph_tanh.png -------------------------------------------------------------------------------- /notes3/fig/sigmoidneuron.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes3/fig/sigmoidneuron.png 
-------------------------------------------------------------------------------- /notes4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes4.pdf -------------------------------------------------------------------------------- /notes4.tex: -------------------------------------------------------------------------------- 1 | \documentclass{tufte-handout} 2 | 3 | \title{CS224n: Natural Language Processing with Deep Learning 4 | \thanks{Course Instructors: Christopher Manning, Richard Socher} \\ 5 | \Large Lecture Notes: Part IV\thanks{Authors: Lisa Wang, Juhi Naik, and Shayne Longpre}} 6 | 7 | \date{Winter 2017} % without \date command, current date is supplied 8 | 9 | %\geometry{showframe} % display margins for debugging page layout 10 | 11 | \usepackage{graphicx} % allow embedded images 12 | \setkeys{Gin}{width=\linewidth,totalheight=\textheight,keepaspectratio} 13 | \graphicspath{{notes4/fig/}} % set of paths to search for images 14 | \usepackage{amsmath} % extended mathematics 15 | \usepackage{amstext} % extended text 16 | \usepackage{booktabs} % book-quality tables 17 | \usepackage{units} % non-stacked fractions and better unit spacing 18 | \usepackage{multicol} % multiple column layout facilities 19 | \usepackage{lipsum} % filler text 20 | \usepackage{fancyvrb} % extended verbatim environments 21 | \usepackage{placeins} 22 | \fvset{fontsize=\normalsize}% default font size for fancy-verbatim environments 23 | \usepackage[normalem]{ulem} 24 | \usepackage{algpseudocode} 25 | \usepackage{algorithm} 26 | 27 | 28 | % tikz package 29 | \usepackage{tikz} 30 | \usetikzlibrary{patterns, shapes,calc,positioning,arrows,mindmap,matrix} 31 | \usetikzlibrary{decorations.pathreplacing} 32 | 33 | % Standardize command font styles and environments 34 | \newcommand{\doccmd}[1]{\texttt{\textbackslash#1}}% command name -- adds backslash automatically 35 | \newcommand{\docopt}[1]{\ensuremath{\langle}\textrm{\textit{#1}}\ensuremath{\rangle}}% optional command argument 36 | \newcommand{\docarg}[1]{\textrm{\textit{#1}}}% (required) command argument 37 | \newcommand{\docenv}[1]{\textsf{#1}}% environment name 38 | \newcommand{\docpkg}[1]{\texttt{#1}}% package name 39 | \newcommand{\doccls}[1]{\texttt{#1}}% document class name 40 | \newcommand{\docclsopt}[1]{\texttt{#1}}% document class option name 41 | \newenvironment{docspec}{\begin{quote}\noindent}{\end{quote}}% command specification environment 42 | \newcommand{\argmin}{\operatornamewithlimits{argmin}} 43 | \newcommand{\argmax}{\operatornamewithlimits{argmax}} 44 | \newcommand{\textunderscript}[1]{$_{\text{#1}}$} 45 | 46 | \setcounter{secnumdepth}{3} 47 | 48 | \begin{document} 49 | 50 | \maketitle% this prints the handout title, author, and date 51 | 52 | \textbf{Keyphrases: Dependency Parsing.} 53 | %\printclassoptions 54 | \section{Dependency Grammar and Dependency Structure} 55 | Parse trees in NLP, analogous to those in compilers, are used to analyze the syntactic structure of sentences. There are two main types of structures used - constituency structures and dependency structures. 56 | 57 | Constituency Grammar uses phrase structure grammar to organize words into nested constituents. This will be covered in more detail in following chapters. We now focus on Dependency Parsing. 58 | 59 | Dependency structure of sentences shows which words depend on (modify or are arguments of) which other words. 
These binary asymmetric relations between the words are called dependencies and are depicted as arrows going from the \textbf{head} (or governor, superior, regent) to the \textbf{dependent} (or modifier, inferior, subordinate). Usually these dependencies form a tree structure. They are often typed with the name of grammatical relations (subject, prepositional object, apposition, etc.). An example of such a dependency tree is shown in Figure~\ref{fig:dep_tree}. Sometimes a fake \textsc{ROOT} node is added as the head to the whole tree so that every word is a dependent of exactly one node. 60 | 61 | \begin{marginfigure} 62 | \centering 63 | \includegraphics[width=\linewidth]{dep_tree.png} 64 | \caption {Dependency tree for the sentence "Bills on ports and immigration were submitted by Senator Brownback, Republican of Kansas"} 65 | \label{fig:dep_tree} 66 | \end{marginfigure} 67 | 68 | \subsection{Dependency Parsing} 69 | 70 | Dependency parsing is the task of analyzing the syntactic dependency structure of a given input sentence $S$. The output of a dependency parser is a dependency tree where the words of the input sentence are connected by typed dependency relations. Formally, the dependency parsing problem asks to create a mapping from the input sentence with words $S=w_0w_1...w_n$ (where $w_0$ is the $\textsc{ROOT}$) to its dependency tree graph $G$. 71 | Many different variations of dependency-based methods have been developed in recent years, including neural network-based methods, which we will describe later. \\ 72 | % they all have in common that they do not make any assumptions of the specific dependency types (e.g. grammatical functions). \\ 73 | To be precise, there are two subproblems in dependency parsing (adapted from Kuebler et al., chapter 1.2): 74 | \begin{enumerate} 75 | \item \textit{Learning:} 76 | Given a training set $D$ of sentences annotated with dependency graphs, induce a parsing model $M$ that can be used to parse new sentences. 77 | \item \textit{Parsing:} 78 | Given a parsing model $M$ and a sentence $S$, derive the optimal dependency graph $D$ for $S$ according to $M$. 79 | \end{enumerate} 80 | 81 | 82 | % copy over to references 83 | % @article{kubler2009dependency, 84 | % title={Dependency parsing}, 85 | % author={K{\"u}bler, Sandra and McDonald, Ryan and Nivre, Joakim}, 86 | % journal={Synthesis Lectures on Human Language Technologies}, 87 | % volume={1}, 88 | % number={1}, 89 | % pages={1--127}, 90 | % year={2009}, 91 | % publisher={Morgan \& Claypool Publishers} 92 | % } 93 | 94 | \subsection{Transition-Based Dependency Parsing} 95 | Transition-based dependency parsing relies on a state machine which defines the possible transitions to create the mapping from the input sentence to the dependency tree. 96 | The \textit{learning problem} is to induce a model which can predict the next transition in the state machine based on the transition history. The \textit{parsing problem} is to construct the optimal sequence of transitions for the input sentence, given the previously induced model. 97 | Most transition-based systems do not make use of a formal grammar. 98 | 99 | \subsection{Greedy Deterministic Transition-Based Parsing} 100 | % using a greedy deterministic parsing algorithm. 101 | This system was introduced by Nivre in 2003 and was radically different from other methods in use at that time.\\ 102 | This transition system is a state machine, which consists of \textit{states} and \textit{transitions} between those states. 
The model induces a sequence of transitions from some \textit{initial} state to one of several \textit{terminal} states. 103 | 104 | \textbf{States: }\\ 105 | For any sentence $S=w_0w_1...w_n$, a state can be described with a triple $c=(\sigma, \beta, A)$: 106 | \begin{enumerate} 107 | \item a stack $\sigma$ of words $w_i$ from $S$, 108 | \item a buffer $\beta$ of words $w_i$ from $S$, 109 | \item a set of dependency arcs $A$ of the form $(w_i, r, w_j)$, where $w_i, w_j$ are from $S$, and $r$ describes a dependency relation. 110 | \end{enumerate} 111 | It follows that for any sentence $S=w_0w_1...w_n$, 112 | \begin{enumerate} 113 | \item an \textit{initial} state $c_0$ is of the form $([ w_0]_{\sigma}, [w_1, ...,w_n]_{\beta}, \emptyset)$ (only the $\textsc{ROOT}$ is on the stack $\sigma$, all other words are in the buffer $\beta$ and no actions have been chosen yet), 114 | \item a terminal state has the form $(\sigma, []_{\beta}, A)$. 115 | \end{enumerate} 116 | 117 | \textbf{Transitions:} \\ 118 | % add image of transitions 119 | 120 | \begin{marginfigure} 121 | \centering 122 | \includegraphics[width=\linewidth]{transitions} 123 | \caption{Transitions for Dependency Parsing.} 124 | \label{fig:transitions} 125 | \end{marginfigure} 126 | 127 | There are three types of transitions between states: 128 | \begin{enumerate} 129 | \item $\textsc{Shift}$: Remove the first word in the buffer and push it on top of the stack. (Pre-condition: buffer has to be non-empty.) 130 | % \item $\textsc{Left-Arc}_r$: Add a dependency arc $(w_j, r, w_i)$ to the arc set $A$, where $w_i$ is the word on the top of the stack and $w_j$ is the first word in the buffer. Pop $w_i$ of the stack. (Pre-condition: both the stack and the buffer have to be non-empty and $w_i$ cannot be the $\textsc{ROOT}$. ) 131 | % \item $\textsc{Right-Arc}_r$: Add a dependency arc $(w_i, r, w_j)$ to the arc set $A$, where $w_i$ is the word on the top of the stack and $w_j$ is the first word in the buffer. Pop $w_i$ of the stack. Replace $w_j$ with $w_i$ at the front of the buffer. (Pre-condition: both the stack and the buffer have to be non-empty.) 132 | \item $\textsc{Left-Arc}_r$: Add a dependency arc $(w_j, r, w_i)$ to the arc set $A$, where $w_i$ is the word second to the top of the stack and $w_j$ is the word at the top of the stack. Remove $w_i$ from the stack. (Pre-condition: the stack needs to contain at least two items and $w_i$ cannot be the $\textsc{ROOT}$.) 133 | \item $\textsc{Right-Arc}_r$: Add a dependency arc $(w_i, r, w_j)$ to the arc set $A$, where $w_i$ is the word second to the top of the stack and $w_j$ is the word at the top of the stack. Remove $w_j$ from the stack. (Pre-condition: The stack needs to contain at least two items.) 134 | \end{enumerate} 135 | A more formal definition of these three transitions is presented in Figure~\ref{fig:transitions}; a minimal sketch of them as operations on $(\sigma, \beta, A)$ follows below.
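Concretely, and purely as an illustration (not the course's reference implementation), the three transitions can be written as operations on a stack and buffer of word indices (index 0 standing for \textsc{ROOT}) and a set of (head, relation, dependent) arcs:

\begin{verbatim}
def shift(stack, buffer, arcs):
    # SHIFT: move the first word of the buffer onto the top of the stack.
    assert buffer, "pre-condition: buffer must be non-empty"
    stack.append(buffer.pop(0))

def left_arc(stack, buffer, arcs, r):
    # LEFT-ARC_r: head w_j is on top of the stack, dependent w_i is second
    # to the top; add (w_j, r, w_i) and remove w_i from the stack.
    assert len(stack) >= 2 and stack[-2] != 0, "pre-condition violated"
    arcs.add((stack[-1], r, stack[-2]))
    del stack[-2]

def right_arc(stack, buffer, arcs, r):
    # RIGHT-ARC_r: head w_i is second to the top, dependent w_j is on top;
    # add (w_i, r, w_j) and remove w_j from the stack.
    assert len(stack) >= 2, "pre-condition violated"
    arcs.add((stack[-2], r, stack[-1]))
    stack.pop()

# Parsing starts from ([ROOT], [w_1 ... w_n], {}) and applies transitions
# until the terminal state (an empty buffer) is reached.
stack, buffer, arcs = [0], [1, 2, 3], set()
shift(stack, buffer, arcs)               # stack = [0, 1], buffer = [2, 3]
shift(stack, buffer, arcs)               # stack = [0, 1, 2], buffer = [3]
left_arc(stack, buffer, arcs, "nsubj")   # arcs = {(2, "nsubj", 1)}, stack = [0, 2]
\end{verbatim}

A learned model only has to choose which of these transitions (and which relation $r$) to apply next; this is precisely the prediction problem the neural parser described below addresses.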
136 | 137 | \subsection{Neural Dependency Parsing} 138 | 139 | While there are many deep models for dependency parsing, this section focuses specifically on greedy, transition-based neural dependency parsers. This class of model has demonstrated comparable performance and significantly better efficiency than traditional feature-based discriminative dependency parsers. The primary distinction from previous models is the reliance on dense rather than sparse feature representations. 140 | 141 | The model we will describe employs the arc-standard system for transitions, as presented in section 1.3. Ultimately, the aim of the model is to predict a transition sequence from some initial configuration $c$ to a terminal configuration, in which the dependency parse tree is encoded. As the model is greedy, it attempts to correctly predict one transition $T\in\{\textsc{shift},\textsc{Left-Arc}_r,\textsc{Right-Arc}_r\}$ at a time, based on features extracted from the current configuration $c=(\sigma, \beta, A)$. Recall that $\sigma$ is the stack, $\beta$ the buffer, and $A$ the set of dependency arcs for a given sentence. 142 | \par% or empty line in the source code 143 | \bigskip 144 | \textbf{Feature Selection:} 145 | \par% or empty line in the source code 146 | \bigskip 147 | 148 | Depending on the desired complexity of the model, there is flexibility in defining the input to the neural network. The features for a given sentence $S$ generally include some subset of: 149 | 150 | \begin{enumerate} 151 | \item $S_{word}$: Vector representations for some of the words in $S$ (and their dependents) at the top of the stack $\sigma$ and buffer $\beta$. 152 | \item $S_{tag}$: Part-of-Speech (POS) tags for some of the words in $S$. POS tags comprise a small, discrete set: $\mathcal{P}=\{NN,NNP,NNS,DT,JJ,...\}$ 153 | \item $S_{label}$: The arc-labels for some of the words in $S$. The arc-labels comprise a small, discrete set, describing the dependency relation: $\mathcal{L}=\{amod,tmod,nsubj,csubj,dobj,...\}$ 154 | \end{enumerate} 155 | For each feature type, we will have a corresponding embedding matrix, mapping from the feature's one-hot encoding to a $d$-dimensional dense vector representation. The full embedding matrix for $S_{word}$ is $E^w \in \mathbb{R}^{d \times N_w}$ where $N_w$ is the dictionary/vocabulary size. Correspondingly, the POS and label embedding matrices are $E^t \in \mathbb{R}^{d \times N_t}$ and $E^l \in \mathbb{R}^{d \times N_l}$ where $N_t$ and $N_l$ are the number of distinct POS tags and arc labels. \\ 156 | Lastly, let the number of chosen elements from each set of features be denoted as $n_{word}$, $n_{tag}$, and $n_{label}$ respectively. \\ 157 | 158 | \par% or empty line in the source code 159 | \bigskip 160 | \textbf{Feature Selection Example:} 161 | \par% or empty line in the source code 162 | \bigskip 163 | 164 | As an example, consider the following choices for $S_{word}$, $S_{tag}$, and $S_{label}$. 165 | \begin{enumerate} 166 | \item $S_{word}$: The top 3 words on the stack and buffer: $s_1, s_2, s_3, b_1, b_2, b_3$. The first and second leftmost / rightmost children of the top two words on the stack: $lc_1(s_i), rc_1(s_i), lc_2(s_i), rc_2(s_i)$, $i = 1, 2$. The leftmost of leftmost / rightmost of rightmost children of the top two words on the stack: $lc_1(lc_1(s_i)), rc_1(rc_1(s_i))$, $i = 1, 2$. In total $S_{word}$ contains $n_{word} = 18$ elements. 167 | \item $S_{tag}$: The corresponding POS tags for the elements of $S_{word}$ ($n_{tag} = 18$). 168 | \item $S_{label}$: The corresponding arc labels of words, excluding those 6 words on the stack/buffer ($n_{label} = 12$). 169 | \end{enumerate} 170 | Note that we use a special $\textsc{Null}$ token for non-existent elements: when the stack and buffer are empty or dependents have not been assigned yet. For a given sentence example, we select the words, POS tags and arc labels given the schematic defined above, extract their corresponding dense feature representations produced from the embedding matrices $E^w$, $E^t$, and $E^l$, and concatenate these vectors into our inputs $[x^{w},x^{t},x^{l}]$.
At training time we backpropagate into the dense vector representations, as well as the parameters at later layers. 171 | 172 | \par% or empty line in the source code 173 | \bigskip 174 | \textbf{Feedforward Neural Network Model:} 175 | \par% or empty line in the source code 176 | \bigskip 177 | 178 | The network contains an input layer $[x^{w},x^{t},x^{l}]$, a hidden layer, and a final softmax layer with a cross-entropy loss function. We can either define a single weight matrix in the hidden layer, to operate on a concatenation of $[x^{w},x^{t},x^{l}]$, or we can use three weight matrices $[W_{1}^{w},W_{1}^{t},W_{1}^{l}]$, one for each input type, as shown in Figure ~\ref{fig:model_arch}. We then apply a non-linear function and use one more affine layer $[W_{2}]$ so that there are an equivalent number of softmax probabilities to the number of possible transitions (the output dimension). 179 | 180 | \begin{figure*}[!ht] 181 | \input{notes4/fig/nn.tex} 182 | \caption{The neural network architecture for greedy, transition-based dependency parsing.} 183 | \label{fig:model_arch} 184 | \end{figure*} 185 | 186 | Note that in Figure ~\ref{fig:model_arch}, $f(x)=x^3$ is the non-linear function used. 187 | 188 | \par% or empty line in the source code 189 | \bigskip 190 | For a more complete explanation of a greedy transition-based neural dependency parser, refer to "A Fast and Accurate Dependency Parser using Neural Networks" under Further Reading. 191 | 192 | \par% or empty line in the source code 193 | \bigskip 194 | 195 | \textbf{Further reading:} \\ 196 | 197 | Danqi Chen, and Christopher D. Manning. "A Fast and Accurate Dependency Parser using Neural Networks." EMNLP. 2014. \\ 198 | 199 | Kuebler, Sandra, Ryan McDonald, and Joakim Nivre. ``Dependency parsing.'' Synthesis Lectures on Human Language Technologies 1.1 (2009): 1-127. 
200 | 201 | \end{document} 202 | -------------------------------------------------------------------------------- /notes4/fig/dep_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes4/fig/dep_tree.png -------------------------------------------------------------------------------- /notes4/fig/nn.tex: -------------------------------------------------------------------------------- 1 | % \ifx \allfiles \undefined 2 | 3 | % \documentclass{article} 4 | 5 | % \usepackage{tikz} 6 | % \usetikzlibrary{shapes,calc,positioning,arrows,mindmap,matrix} 7 | % \usetikzlibrary{decorations.pathreplacing} 8 | 9 | % \begin{document} 10 | % \fi 11 | 12 | \def\layersep{1.2cm} 13 | \def\numHidden{6} 14 | \def\numOutput{4} 15 | \def\dx{3.3} 16 | \def\dy{2.2} 17 | \tikzset{ 18 | treenode/.style = {align=center, inner sep=4pt, text centered,font=\sffamily}, 19 | node/.style = {treenode, minimum width=1.3em, text height=1em}, 20 | line/.style = {very thick, dashed, rounded corners, fill=orange!15!white, fill opacity=0.2} 21 | } 22 | 23 | % \tikzset{ 24 | 25 | % line/.style = {very thick, dashed, fill=orange!30!white, fill opacity=0.2} 26 | % } 27 | 28 | \begin{tikzpicture}[scale=0.8,shorten >=1pt,->,draw=black!50, node distance=\layersep] 29 | 30 | \tikzstyle{neuron}=[circle,fill=black!25,minimum size=13pt,inner sep=0pt] 31 | \tikzstyle{annot} = [text width=15em, text centered] 32 | 33 | \foreach \name / \y in {1,...,3} 34 | \node[neuron,fill=red!50] (I-\name) at (\y/2, 0) {}; 35 | 36 | \node[annot] (cdot-1) at (3/2+0.8,0) {$\cdots$}; 37 | 38 | \foreach \name / \y in {4,...,6} 39 | \node[neuron,fill=red!50] (I-\name) at (\y/2+1, 0) {}; 40 | 41 | \foreach \name / \y in {7,...,9} 42 | \node[neuron,fill=blue!50,postaction={pattern=north east lines}] (I-\name) at (\y/2+1.5, 0) {}; 43 | 44 | \node[annot] (cdot-2) at (9/2+2.3,0) {$\cdots$}; 45 | 46 | \foreach \name / \y in {10,...,12} 47 | \node[neuron,fill=blue!50,postaction={pattern=north east lines}] (I-\name) at (\y/2+2.5, 0) {}; 48 | 49 | \foreach \name / \y in {13,...,15} 50 | \node[neuron,fill=orange!50,postaction={pattern=north west lines}] (I-\name) at (\y/2+3, 0) {}; 51 | 52 | \foreach \name / \y in {1,...,3} 53 | \node[neuron,fill=gray!50] (H-\name) at (\y*0.8+2.4,\layersep) {}; 54 | 55 | \node[annot] (cdot-3) at (5.5,\layersep) {$\cdots$}; 56 | 57 | \foreach \name / \y in {4,...,6} 58 | \node[neuron,fill=gray!50] (H-\name) at (\y*0.8+3.1,\layersep) {}; 59 | 60 | 61 | \foreach \name / \y in {1,...,2} 62 | \node[neuron,fill=black!50] (S-\name) at (\y*0.8+3.1,\layersep*2) {}; 63 | 64 | \node[annot] (cdot-4) at (5.5,\layersep*2) {$\cdots$}; 65 | 66 | \foreach \name / \y in {3,...,4} 67 | \node[neuron,fill=black!50] (S-\name) at (\y*0.8+3.8,\layersep*2) {}; 68 | 69 | 70 | \foreach \source in {2,5,8,11,14} 71 | \foreach \dest in {2, 5} 72 | \path (I-\source) edge (H-\dest); 73 | 74 | \foreach \source in {2, 5} 75 | \foreach \dest in {1,...,\numOutput} 76 | \path (H-\source) edge (S-\dest); 77 | 78 | 79 | \node[annot] at (-3,0) {\textbf{Input layer}: $[x^w, x^t, x^l]$}; 80 | \node[annot] at (-3,\layersep){\textbf{Hidden layer}: \\ $h = (W^w_1 x^w + W^t_1 x^t + W^l_1 x^l + b_1)^3$}; 81 | \node[annot] at (-3,\layersep*2){\textbf{Softmax layer}: \\ $p = \texttt{softmax}(W_2 h)$}; 82 | 83 | \draw [line] ($(I-1.south west)+(-0.2, -0.2)$) rectangle ($(I-3.north east)+( 0.2, 0.2)$); 84 | \draw [line] ($(I-4.south 
west)+(-0.2, -0.2)$) rectangle ($(I-6.north east)+( 0.2, 0.2)$); 85 | \draw [line] ($(I-7.south west)+(-0.2, -0.2)$) rectangle ($(I-9.north east)+( 0.2, 0.2)$); 86 | \draw [line] ($(I-10.south west)+(-0.2, -0.2)$) rectangle ($(I-12.north east)+( 0.2, 0.2)$); 87 | \draw [line] ($(I-13.south west)+(-0.2, -0.2)$) rectangle ($(I-15.north east)+( 0.2, 0.2)$); 88 | 89 | \draw [line,solid] ($(I-1.south west)+(-0.3, -0.3)$) rectangle ($(I-15.north east)+( 0.3, 0.3)$); 90 | 91 | \draw [line,solid] ($(H-1.south west)+(-0.2, -0.2)$) rectangle ($(H-\numHidden.north east)+( 0.2, 0.2)$); 92 | \draw [line,solid] ($(S-1.south west)+(-0.3, -0.3)$) rectangle ($(S-\numOutput.north east)+( 0.3, 0.3)$); 93 | 94 | \draw [decorate,decoration={brace,amplitude=5pt,mirror,raise=3pt}] ($(I-1.south west)+(-0.3, -0.3)$) -- ($(I-6.south east)+(0.3, -0.3)$) node [black,midway,yshift=-0.6cm] {words}; 95 | 96 | \draw [decorate,decoration={brace,amplitude=5pt,mirror,raise=3pt}] ($(I-7.south west)+(-0.3, -0.3)$) -- ($(I-12.south east)+(0.3, -0.3)$) node [black,midway,yshift=-0.6cm] {POS tags}; 97 | 98 | \draw [decorate,decoration={brace,amplitude=5pt,mirror,raise=3pt}] ($(I-13.south west)+(-0.3, -0.3)$) -- ($(I-15.south east)+(0.3, -0.3)$) node [black,midway,yshift=-0.6cm] {arc labels}; 99 | 100 | 101 | \node [node] at (-2+\dx,-5+\dy) (1) {ROOT}; 102 | 103 | \node [node] at (0+\dx,-5+\dy) (2) {has\_VBZ} 104 | child[level distance=1.25cm] 105 | { 106 | node [node,xshift=-1cm,font=\sffamily] (4) {He\_PRP} 107 | edge from parentnode[left, xshift=1.3cm, yshift=-0.2cm] {nsubj} 108 | }; 109 | 110 | \node [node] at (0+\dx,-5+\dy) (2) {has\_VBZ}; 111 | \node [node] at (2+\dx,-5+\dy) (3) {good\_JJ}; 112 | \node [node] at (5+\dx,-5+\dy) (4) {control\_NN}; 113 | \node [node] at (7+\dx,-5+\dy) (5) {.\_.}; 114 | 115 | \node [node] at (0+\dx,-4+\dy) {Stack}; 116 | \node [node] at (6+\dx,-4+\dy) {Buffer}; 117 | 118 | % \path (3) edge (I-5); 119 | % \path (3) edge (I-8); 120 | 121 | % \draw [dashed, thick, ->] (2+\dx,-4.8+\dy) -- (3.5,-0.2); 122 | % \draw [dashed, thick, ->] (2.5+\dx,-4.8+\dy) -- (5.5,-0.2); 123 | % \draw [dashed, thick, ->] (0.5+\dx,-5.8+\dy) -- (10,-0.2); 124 | 125 | \node[annot] at (-3,-5+\dy){\textbf{Configuration}}; 126 | 127 | \draw [very thick, fill=orange!30!white, fill opacity=0.2] ($(1.south west)+(-0.1, -0.1)$) rectangle ($(3.north east)+( 0.1, 0.1)$); 128 | \draw [very thick, fill=orange!30!white, fill opacity=0.2] ($(4.south west)+(-0.1, -0.1)$) rectangle ($(5.north east)+( 0.1, 0.1)$); 129 | 130 | \end{tikzpicture} 131 | % End of code 132 | 133 | % \ifx \allfiles \undefined 134 | % \end{document} 135 | % \fi 136 | -------------------------------------------------------------------------------- /notes4/fig/transitions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes4/fig/transitions.png -------------------------------------------------------------------------------- /notes4/reference.bib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes4/reference.bib -------------------------------------------------------------------------------- /notes5.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes5.pdf -------------------------------------------------------------------------------- /notes5.tex: -------------------------------------------------------------------------------- 1 | \documentclass{tufte-handout} 2 | 3 | \title{CS224n: Natural Language Processing with Deep Learning 4 | \thanks{Course Instructors: Christopher Manning, Richard Socher} \\ 5 | \Large Lecture Notes: Part V\thanks{Authors: Milad Mohammadi, Rohit Mundra, Richard Socher, Lisa Wang}} 6 | 7 | \date{Winter 2017} % without \date command, current date is supplied 8 | 9 | %\geometry{showframe} % display margins for debugging page layout 10 | 11 | \usepackage{graphicx} % allow embedded images 12 | \setkeys{Gin}{width=\linewidth,totalheight=\textheight,keepaspectratio} 13 | \graphicspath{{notes5/fig/}} % set of paths to search for images 14 | \usepackage{amsmath} % extended mathematics 15 | \usepackage{amstext} % extended text 16 | \usepackage{booktabs} % book-quality tables 17 | \usepackage{units} % non-stacked fractions and better unit spacing 18 | \usepackage{multicol} % multiple column layout facilities 19 | \usepackage{lipsum} % filler text 20 | \usepackage{fancyvrb} % extended verbatim environments 21 | \usepackage{placeins} 22 | \fvset{fontsize=\normalsize}% default font size for fancy-verbatim environments 23 | \usepackage[normalem]{ulem} 24 | \usepackage{algpseudocode} 25 | \usepackage{algorithm} 26 | 27 | 28 | % tikz package 29 | \usepackage{tikz} 30 | \usetikzlibrary{patterns, shapes,calc,positioning,arrows,mindmap,matrix} 31 | \usetikzlibrary{decorations.pathreplacing} 32 | 33 | % Standardize command font styles and environments 34 | \newcommand{\doccmd}[1]{\texttt{\textbackslash#1}}% command name -- adds backslash automatically 35 | \newcommand{\docopt}[1]{\ensuremath{\langle}\textrm{\textit{#1}}\ensuremath{\rangle}}% optional command argument 36 | \newcommand{\docarg}[1]{\textrm{\textit{#1}}}% (required) command argument 37 | \newcommand{\docenv}[1]{\textsf{#1}}% environment name 38 | \newcommand{\docpkg}[1]{\texttt{#1}}% package name 39 | \newcommand{\doccls}[1]{\texttt{#1}}% document class name 40 | \newcommand{\docclsopt}[1]{\texttt{#1}}% document class option name 41 | \newenvironment{docspec}{\begin{quote}\noindent}{\end{quote}}% command specification environment 42 | \newcommand{\argmin}{\operatornamewithlimits{argmin}} 43 | \newcommand{\argmax}{\operatornamewithlimits{argmax}} 44 | \newcommand{\textunderscript}[1]{$_{\text{#1}}$} 45 | 46 | \setcounter{secnumdepth}{3} 47 | 48 | \begin{document} 49 | 50 | \maketitle% this prints the handout title, author, and date 51 | 52 | \textbf{Keyphrases: Language Models. RNN. Bi-directional RNN. Deep RNN. GRU. LSTM.} 53 | 54 | \section{Language Models} 55 | Language models compute the probability of occurrence of a number of words in a particular sequence. The probability of a sequence of $m$ words $\{w_1, ..., w_m \}$ is denoted as $P(w_1,...,w_m)$. 
Since the number of words coming before a word, $w_i$, varies depending on its location in the input document, $P(w_1,...,w_m)$ is usually conditioned on a window of $n$ previous words rather than all previous words: 56 | 57 | \begin{equation} 58 | P(w_1,...,w_m) = \prod_{i=1}^{i=m} P(w_{i} | w_1, ..., w_{i-1}) \approx \prod_{i=1}^{i=m} P(w_{i} | w_{i-n}, ..., w_{i-1}) 59 | \label{eqn:nat_model} 60 | \end{equation} 61 | 62 | Equation~\ref{eqn:nat_model} is especially useful for speech and translation systems when determining whether a word sequence is an accurate translation of an input sentence. In existing language translation systems, for each phrase / sentence translation, the software generates a number of alternative word sequences (e.g. \textit{\{I have, I had, I has, me have, me had\}}) and scores them to identify the most likely translation sequence. 63 | 64 | In machine translation, the model chooses the best word ordering for an input phrase by assigning a \textit{goodness} score to each output word sequence alternative. To do so, the model may choose between different word ordering or word choice alternatives. It would achieve this objective by running all word sequence candidates through a probability function that assigns each a score. The sequence with the highest score is the output of the translation. For example, the machine would give a higher score to \textit{"the cat is small"} compared to \textit{"small the is cat"}, and a higher score to \textit{"walking home after school"} compared to \textit{"walking house after school"}. To compute these probabilities, the count of each n-gram would be compared against the frequency of each word. For instance, if the model takes bi-grams, the frequency of each bi-gram, calculated via combining a word with its previous word, would be divided by the frequency of the corresponding uni-gram. Equations~\ref{eqn:bigram} and \ref{eqn:trigram} show this relationship for bigram and trigram models. 65 | 66 | \begin{equation} 67 | p(w_2 | w_1) = \dfrac {count(w_1,w_2)}{count(w_1)} 68 | \label{eqn:bigram} 69 | \end{equation} 70 | \begin{equation} 71 | p(w_3 | w_1, w_2) = \dfrac {count(w_1,w_2,w_3)}{count(w_1, w_2)} 72 | \label{eqn:trigram} 73 | \end{equation}
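As a quick illustration of Equations~\ref{eqn:bigram} and \ref{eqn:trigram}, the following sketch estimates these conditional probabilities from raw counts over a toy tokenized corpus (the corpus and function name are illustrative assumptions; for simplicity, prefix counts ignore sentence-final positions, and a real system would also need smoothing for unseen n-grams).

\begin{verbatim}
from collections import Counter

def ngram_probability(corpus, *words):
    """MLE estimate p(w_n | w_1..w_{n-1}) = count(w_1..w_n) / count(w_1..w_{n-1})."""
    n = len(words)
    ngrams, prefixes = Counter(), Counter()
    for sent in corpus:
        for i in range(len(sent) - n + 1):
            ngrams[tuple(sent[i:i + n])] += 1
            prefixes[tuple(sent[i:i + n - 1])] += 1
    prefix_count = prefixes[tuple(words[:-1])]
    return ngrams[tuple(words)] / prefix_count if prefix_count else 0.0

corpus = [["the", "cat", "is", "small"], ["the", "cat", "sat"]]
print(ngram_probability(corpus, "the", "cat"))        # bigram:  count(the,cat)/count(the)      = 1.0
print(ngram_probability(corpus, "the", "cat", "is"))  # trigram: count(the,cat,is)/count(the,cat) = 0.5
\end{verbatim}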
74 | 75 | The relationship in Equation~\ref{eqn:trigram} focuses on making predictions based on a fixed window of context (i.e. the $n$ previous words). In some cases, the window of past consecutive $n$ words may not be sufficient to capture the context. For instance, consider a case where an article discusses the history of Spain and France and somewhere later in the text, it reads "The two countries went on a battle"; clearly the information presented in this sentence alone is not sufficient to identify the name of the two countries. Bengio et al. introduced the first large-scale deep learning model for natural language processing that enables capturing this type of context via \textit{learning a distributed representation of words}; Figure~\ref{fig:bengio_03} shows the corresponding neural network architecture. In this model, input word vectors are used by both the hidden layer and the output layer. 76 | Equation~\ref{eqn:bengio_eqn} shows the parameters of the $\operatorname{softmax()}$ function consisting of the standard $\operatorname{tanh()}$ function (i.e. the hidden layer) as well as the linear function, $W^{(3)}x+b^{(3)}$, that captures all the previous $n$ input word vectors. 77 | 78 | \begin{marginfigure} 79 | \centering 80 | \includegraphics[width=\linewidth]{bengio_03.png} 81 | \caption {The first deep neural network architecture model for NLP presented by Bengio et al.} 82 | \label{fig:bengio_03} 83 | \end{marginfigure} 84 | 85 | \begin{equation} 86 | \hat{y} = softmax (W^{(2)} tanh(W^{(1)}x+b^{(1)})+W^{(3)}x+b^{(3)}) 87 | \label{eqn:bengio_eqn} 88 | \end{equation} 89 | Note that the weight matrix $W^{(1)}$ is applied to the word vectors (solid green arrows in Figure~\ref{fig:bengio_03}), $W^{(2)}$ is applied to the hidden layer (also solid green arrow) and $W^{(3)}$ is applied to the word vectors (dashed green arrows). 90 | 91 | In all conventional language models, the memory requirements of the system grow exponentially with the window size $n$, making it nearly impossible to model large word windows without running out of memory. 92 | 93 | \section{Recurrent Neural Networks (RNN)} 94 | Unlike the conventional language models, where only a finite window of previous words would be considered for conditioning the language model, Recurrent Neural Networks (RNN) are capable of conditioning the model on \textit{all} previous words in the corpus. 95 | 96 | Figure~\ref{fig:rnn} introduces the RNN architecture where each rectangular box is a hidden layer at a time-step, $t$. Each such layer holds a number of neurons, each of which performs a linear matrix operation on its inputs followed by a non-linear operation (e.g. $\operatorname{tanh()}$). At each time-step, the output of the previous step along with the next word vector in the document, $x_t$, are inputs to the hidden layer to produce a prediction output $\hat{y}$ and output features $h_t$ (Equations~\ref{eqn:h_t} and \ref{eqn:y}). The inputs and outputs of each single neuron are illustrated in Figure~\ref{fig:rnn_node}. 97 | \begin{equation} 98 | h_t = \sigma (W^{(hh)}h_{t-1} + W^{(hx)}x_{[t]}) 99 | \label{eqn:h_t} 100 | \end{equation} 101 | 102 | \begin{equation} 103 | \hat{y}_t = softmax(W^{(S)}h_t) 104 | \label{eqn:y} 105 | \end{equation} 106 | 107 | \begin{marginfigure} 108 | \centering 109 | \includegraphics[width=\linewidth]{rnn.pdf} 110 | \caption {A Recurrent Neural Network (RNN). Three time-steps are shown.} 111 | \label{fig:rnn} 112 | \end{marginfigure} 113 | 114 | Below are the details associated with each parameter in the network: 115 | \begin{itemize} 116 | \item $x_1, ..., x_{t-1}, x_t, x_{t+1}, ... x_{T}$: the word vectors corresponding to a corpus with T words. 117 | \item $h_t = \sigma(W^{(hh)} h_{t-1} + W^{(hx)} x_{t})$: the relationship to compute the hidden layer output features at each time-step $t$ 118 | \begin{itemize} 119 | \item $x_{t} \in \mathbb{R}^{d}$: input word vector at time $t$. 120 | \item $W^{hx} \in \mathbb{R}^{D_h \times d}$: weights matrix used to condition the input word vector, $x_t$ 121 | \item $W^{hh} \in \mathbb{R}^{D_h \times D_h}$: weights matrix used to condition the output of the previous time-step, $h_{t-1}$ 122 | \item $h_{t-1} \in \mathbb{R}^{D_h}$: output of the non-linear function at the previous time-step, $t-1$. $h_0 \in \mathbb{R}^{D_h}$ is an initialization vector for the hidden layer at time-step $t = 0$. 123 | \item $\sigma ()$: the non-linearity function (sigmoid here) 124 | \end{itemize} 125 | \item $\hat{y}_t = softmax (W^{(S)}h_t)$: the output probability distribution over the vocabulary at each time-step $t$. Essentially, $\hat{y}_t$ is the next predicted word given the document context score so far (i.e. $h_{t-1}$) and the last observed word vector $x_t$. Here, $W^{(S)} \in \mathbb{R}^{|V| \times D_h}$ and $\hat{y} \in \mathbb{R}^{|V|}$ where $|V|$ is the vocabulary size. 126 | \end{itemize}
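The following is a minimal NumPy sketch of the forward pass defined by Equations~\ref{eqn:h_t} and \ref{eqn:y}; the toy dimensions and randomly initialized weights are illustrative assumptions only.

\begin{verbatim}
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def softmax(z):
    z = z - z.max()          # numerical stability
    e = np.exp(z)
    return e / e.sum()

def rnn_forward(xs, W_hh, W_hx, W_S, h0):
    """Run h_t = sigmoid(W_hh h_{t-1} + W_hx x_t) and y_t = softmax(W_S h_t)."""
    h, y_hats = h0, []
    for x_t in xs:                              # one time-step per word vector
        h = sigmoid(W_hh @ h + W_hx @ x_t)      # hidden features h_t
        y_hats.append(softmax(W_S @ h))         # distribution over the vocabulary
    return y_hats, h

d, D_h, V = 4, 8, 10                            # word vector, hidden, vocabulary sizes
rng = np.random.default_rng(0)
xs = [rng.standard_normal(d) for _ in range(3)] # three word vectors x_1, x_2, x_3
W_hh = rng.standard_normal((D_h, D_h))
W_hx = rng.standard_normal((D_h, d))
W_S = rng.standard_normal((V, D_h))
y_hats, h_T = rnn_forward(xs, W_hh, W_hx, W_S, np.zeros(D_h))
print(y_hats[-1].shape)                         # (10,): probability of each next word
\end{verbatim}

Note that the same $W^{(hh)}$, $W^{(hx)}$, and $W^{(S)}$ are reused at every time-step, which is why the parameter count does not grow with the length of the corpus.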
127 | 128 | The loss function used in RNNs is often the cross entropy error introduced in earlier notes. Equation~\ref{eqn:rnn_loss} shows this function as the sum over the entire vocabulary at time-step $t$. 129 | \begin{equation} 130 | J^{(t)}(\theta) = - \sum_{j=1}^{|V|} y_{t,j} \times log (\hat{y}_{t,j}) 131 | \label {eqn:rnn_loss} 132 | \end{equation} 133 | 134 | The cross entropy error over a corpus of size $T$ is: 135 | \begin{equation} 136 | J = \dfrac{1}{T} \sum_{t=1}^{T} J^{(t)}(\theta) = - \dfrac{1}{T} \sum_{t=1}^{T} \sum_{j=1}^{|V|} y_{t,j} \times log (\hat{y}_{t,j}) 137 | \label {eqn:rnn_loss_T} 138 | \end{equation} 139 | 140 | \begin{marginfigure} 141 | \centering 142 | \includegraphics[width=\linewidth]{rnn_node.pdf} 143 | \caption {The inputs and outputs to a neuron of an RNN} 144 | \label{fig:rnn_node} 145 | \end{marginfigure} 146 | 147 | 148 | Equation~\ref{eqn:perplexity} is called the \textit{perplexity} relationship; it is simply 2 raised to the power of the cross entropy error in Equation~\ref{eqn:rnn_loss_T}, which is itself an average negative log probability. Perplexity is a measure of confusion, where lower values imply more confidence in predicting the next word in the sequence (compared to the ground truth outcome). 149 | 150 | \begin{equation} 151 | Perplexity = 2^{J} 152 | \label{eqn:perplexity} 153 | \end{equation} 154 | 155 | The amount of memory required to run a layer of RNN is proportional to the number of words in the corpus. For instance, a sentence with $k$ words would have $k$ word vectors to be stored in memory. Also, the RNN shares the same weight matrices across all time-steps. While the size of $W$ could be very large, it does not scale with the size of the corpus (unlike the traditional language models). For an RNN with a hidden layer of 1000 units, $W^{(hh)}$ would be $1000 \times 1000$ regardless of the corpus size. 156 | 157 | Figure~\ref{fig:rnn_loop} is an alternative representation of RNNs used in some publications. It represents the RNN hidden layer as a loop. 158 | 159 | \begin{marginfigure} 160 | \centering 161 | \includegraphics[width=\linewidth]{rnn_loop.pdf} 162 | \caption {The illustration of an RNN as a loop over time-steps} 163 | \label{fig:rnn_loop} 164 | \end{marginfigure} 165 | 166 | %\subsection{RNN Training} 167 | %Training RNN is specifically difficult because ? 168 | 169 | \subsection{Vanishing Gradient \& Gradient Explosion Problems} 170 | Recurrent neural networks propagate weight matrices from one time-step to the next. Recall that the goal of an RNN implementation is to enable propagating context information through faraway time-steps. For example, consider the following two sentences: 171 | 172 | \null 173 | \centering 174 | \uline{Sentence 1} 175 | 176 | "Jane walked into the room. John walked in too. Jane said hi to \_\_\_" 177 | 178 | \null 179 | \uline{Sentence 2} 180 | 181 | "Jane walked into the room. John walked in too. It was late in the day, and everyone was walking home after a long day at work. Jane said hi to \_\_\_" 182 | 183 | \null 184 | \justify 185 | In both sentences, given their context, one can tell the answer to both blank spots is most likely "John". It is important that the RNN predicts the next word as "John", the second person who has appeared several time-steps back in both contexts.
Ideally, this should be possible given what we know about RNNs so far. In practice, however, it turns out RNNs are more likely to correctly predict the blank spot in Sentence 1 than in Sentence 2. This is because, during the back-propagation phase, the contribution of gradient values gradually vanishes as they propagate to earlier time-steps. Thus, for long sentences, the probability that "John" would be recognized as the next word reduces with the size of the context. Below, we discuss the mathematical reasoning behind the vanishing gradient problem. 186 | 187 | Consider Equations~\ref{eqn:h_t} and \ref{eqn:y} at a time-step $t$; to compute the RNN error, $dE/dW$, we sum the error at each time-step. That is, $dE_t/dW$ for every time-step, $t$, is computed and accumulated. 188 | \begin{equation} 189 | \dfrac{\partial E}{\partial W} = \sum_{t=1}^{T}\dfrac{\partial E_t}{\partial W} 190 | \label{eqn:bp_rnn_error} 191 | \end{equation} 192 | 193 | The error for each time-step is computed through applying the chain rule differentiation to Equations~\ref{eqn:y} and \ref{eqn:h_t}; Equation~\ref{eqn:bp_rnn_chain} shows the corresponding differentiation. Notice that $\partial h_t/\partial h_k$ refers to the partial derivative of $h_t$ with respect to the hidden state at an earlier time-step $k$; the sum runs over \textit{all} such previous time-steps. 194 | \begin{equation} 195 | \dfrac{\partial E_t}{\partial W} = \sum_{k=1}^{t} \dfrac{\partial E_t}{\partial y_t} \dfrac{\partial y_t}{\partial h_t} \dfrac{\partial h_t}{\partial h_k} \dfrac{\partial h_k}{\partial W} 196 | \label{eqn:bp_rnn_chain} 197 | \end{equation} 198 | 199 | Equation~\ref{eqn:bp_rnn_k} shows the relationship to compute each $\partial h_t/\partial h_k$; this is simply a chain rule differentiation over all hidden layers within the $[k, t]$ time interval. 200 | \begin{equation} 201 | \dfrac{\partial h_t}{\partial h_k} = \prod_{j=k+1}^{t}\dfrac{\partial h_j}{\partial h_{j-1}} = \prod_{j=k+1}^{t}W^T \times diag [f'(h_{j-1})] 202 | \label{eqn:bp_rnn_k} 203 | \end{equation} 204 | 205 | Because $h \in \mathbb{R}^{D_n}$, each $\partial h_j/\partial h_{j-1}$ is the Jacobian matrix for $h$: 206 | \begin{equation} 207 | \dfrac{\partial h_j}{\partial h_{j-1}} = {[\dfrac{\partial h_{j}}{\partial h_{j-1,1}} ... \dfrac{\partial h_{j}}{\partial h_{j-1,D_n}}]} = 208 | \begin{bmatrix} 209 | \dfrac{\partial h_{j,1}}{\partial h_{j-1,1}} & . & . & . & \dfrac{\partial h_{j,1}}{\partial h_{j-1,D_n}} \\ 210 | . & . & & & . \\ 211 | . & & . & & . \\ 212 | . & & & . & . \\ 213 | \dfrac{\partial h_{j,D_n}}{\partial h_{j-1,1}} & . & . & . & \dfrac{\partial h_{j,D_n}}{\partial h_{j-1,D_n}} \\ 214 | \end{bmatrix} 215 | \label{eqn:bp_rnn_jaocb} 216 | \end{equation} 217 | 218 | Putting Equations~\ref{eqn:bp_rnn_error}, \ref{eqn:bp_rnn_chain}, \ref{eqn:bp_rnn_k} together, we have the following relationship. 219 | \begin{equation} 220 | \dfrac{\partial E}{\partial W} = \sum_{t=1}^{T}\sum_{k=1}^{t} \dfrac{\partial E_t}{\partial y_t} \dfrac{\partial y_t}{\partial h_t} (\prod_{j=k+1}^{t}\dfrac{\partial h_j}{\partial h_{j-1}}) \dfrac{\partial h_k}{\partial W} 221 | \end{equation} 222 | 223 | Equation~\ref{eqn:bp_rnn_k_norm} shows the norm of the Jacobian matrix relationship in Equation~\ref{eqn:bp_rnn_jaocb}. Here, $\beta_W$ and $\beta_h$ represent the upper bound values for the two matrix norms. The norm of the partial gradient at each time-step, $t$, is therefore calculated through the relationship shown in Equation~\ref{eqn:bp_rnn_k_norm}. 224 | 225 | \begin{equation} 226 | \parallel \dfrac{\partial h_j}{\partial h_{j-1}} \parallel \leq \parallel W^T\parallel \parallel diag [f'(h_{j-1})]\parallel \leq \beta_W \beta_h 227 | \label{eqn:bp_rnn_k_norm} 228 | \end{equation} 229 | 230 | The norm of both matrices is calculated through taking their L2-norm. The norm of $f'(h_{j-1})$ can only be as large as 1 given the sigmoid non-linearity function (in fact, the sigmoid's derivative is at most $\frac{1}{4}$). 231 | 232 | \begin{equation} 233 | \parallel \dfrac{\partial h_t}{\partial h_k} \parallel = \parallel \prod_{j=k+1}^{t} \dfrac{\partial h_j}{\partial h_{j-1}}\parallel \leq (\beta_W \beta_h)^{t-k} 234 | \label{eqn:bp_rnn_k_norm_total} 235 | \end{equation} 236 | 237 | The exponential term $(\beta_W \beta_h)^{t-k}$ can easily become a very small or large number when $\beta_W \beta_h$ is much smaller or larger than 1 and $t-k$ is sufficiently large. Recall that a large $t-k$ corresponds to the contribution of a faraway word to the error at time-step $t$. When the gradient vanishes early on, the contribution of faraway words to predicting the next word at time-step $t$ diminishes.
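As a purely numerical illustration of this bound (the values of $\beta_W \beta_h$ below are arbitrary stand-ins), note how quickly the factor $(\beta_W \beta_h)^{t-k}$ collapses towards zero or blows up as the gap $t-k$ grows:

\begin{verbatim}
# Illustrative only: the bound (beta_W * beta_h)^(t-k) vanishes when the
# product is below 1 and explodes when it is above 1.
for beta in (0.9, 1.1):            # stand-ins for beta_W * beta_h
    for gap in (10, 30, 50):       # gap = t - k
        print(f"beta={beta}, t-k={gap}: bound={beta ** gap:.2e}")
\end{verbatim}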
224 |
225 | \begin{equation}
226 | \parallel \dfrac{\partial h_j}{\partial h_{j-1}} \parallel \leq \parallel W^T\parallel \parallel diag [f'(h_{j-1})]\parallel \leq \beta_W \beta_h
227 | \label{eqn:bp_rnn_k_norm}
228 | \end{equation}
229 |
230 | The norms of both matrices are computed by taking their L2-norm. The norm of $diag[f'(h_{j-1})]$ is bounded because the derivative of the sigmoid (or $\tanh$) non-linearity is itself bounded.
231 |
232 | \begin{equation}
233 | \parallel \dfrac{\partial h_t}{\partial h_k} \parallel = \parallel \prod_{j=k+1}^{t} \dfrac{\partial h_j}{\partial h_{j-1}}\parallel \leq (\beta_W \beta_h)^{t-k}
234 | \label{eqn:bp_rnn_k_norm_total}
235 | \end{equation}
236 |
237 | The exponential term $(\beta_W \beta_h)^{t-k}$ can easily become a very small or large number when $\beta_W \beta_h$ is much smaller or larger than 1 and $t-k$ is sufficiently large. Recall that a large $t-k$ corresponds to the contribution of faraway words to the cross entropy error at time-step $t$. The contribution of faraway words to predicting the next word at time-step $t$ diminishes when the gradient vanishes early on.
238 |
239 | During experimentation, once the gradient value grows extremely large, it causes an overflow (i.e., NaN), which is easily detectable at runtime; this issue is called the \textit{Gradient Explosion Problem}. When the gradient value goes to zero, however, it can go undetected while drastically reducing the learning quality of the model for far-away words in the corpus; this issue is called the \textit{Vanishing Gradient Problem}.
240 |
241 | To gain practical intuition about the vanishing gradient problem, you may visit the following \uline{\href{http://cs224d.stanford.edu/notebooks/vanishing_grad_example.html}{example website}}.
242 |
243 | \subsection{Solution to the Exploding \& Vanishing Gradients}
244 | Now that we have gained intuition about the nature of the vanishing gradients problem and how it manifests itself in deep neural networks, let us focus on simple and practical heuristics to solve these problems.
245 |
246 | To solve the problem of exploding gradients, Tomas Mikolov first introduced a simple heuristic solution that \textit{clips} gradients whenever they explode. That is, whenever the gradient norm reaches a certain threshold, the gradient is scaled back down so that its norm equals the threshold, as shown in Algorithm~\ref{alg:clip}.
247 |
248 | \begin{algorithm}
249 | \begin{algorithmic}
250 | \State $\hat{g} \gets \dfrac{\partial E}{\partial W}$
251 | \If {$\parallel \hat{g} \parallel \geq threshold$}
252 | \State $\hat{g} \gets \dfrac{threshold}{\parallel \hat{g} \parallel} \hat{g}$
253 | \EndIf
254 | \end{algorithmic}
255 | \caption{Pseudo-code for norm clipping of the gradients whenever they explode}
256 | \label{alg:clip}
257 | \end{algorithm}
258 |
259 | Figure~\ref{fig:clipping} visualizes the effect of gradient clipping. It shows the error surface of a small recurrent neural network with respect to its $W$ matrix and its bias terms, $b$. The model consists of a single recurrent unit running through a small number of time-steps; the solid arrows illustrate the training progress on each gradient descent step. When the gradient descent step hits the high error wall in the objective function, the parameters are pushed off to a far-away location on the error surface. The clipping model produces the dashed line: it instead pulls the update back to a point close to the original region of the error surface.
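As a concrete illustration of Algorithm~\ref{alg:clip}, the following short NumPy sketch (ours, not part of the original notes; the function and variable names are purely illustrative) applies the same rescaling rule to a gradient vector:

\begin{verbatim}
import numpy as np

def clip_gradient(grad, threshold=5.0):
    # Rescale the gradient so its L2-norm never exceeds the threshold,
    # preserving its direction (the norm-clipping heuristic above).
    norm = np.linalg.norm(grad)
    if norm >= threshold:
        grad = (threshold / norm) * grad
    return grad

# Example: a gradient of norm 100 is pulled back to norm 5.
g = clip_gradient(np.array([60.0, 80.0]), threshold=5.0)
\end{verbatim}

For instance, with a threshold of $5$ and $\parallel \hat{g} \parallel = 100$, the update becomes $\hat{g} \gets \frac{5}{100}\, \hat{g}$: the direction of the gradient is preserved while its norm is reduced to exactly the threshold.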
260 |
261 | \begin{marginfigure}
262 | \centering
263 | \includegraphics[width=\linewidth]{cliping.pdf}
264 | \caption {Gradient explosion clipping visualization}
265 | \label{fig:clipping}
266 | \end{marginfigure}
267 |
268 | To solve the problem of vanishing gradients, we introduce two techniques. The first technique is, instead of initializing $W^{(hh)}$ randomly, to start off from an identity matrix initialization.
269 | %TODO the initialization idea requires more writing
270 |
271 | The second technique is to use the Rectified Linear Units (ReLU) instead of the sigmoid function. The derivative of the ReLU is either 0 or 1. This way, gradients flow through the neurons whose derivative is 1 without getting attenuated while propagating back through time-steps.
272 |
273 |
274 | \subsection{Deep Bidirectional RNNs}
275 | So far, we have focused on RNNs that use past words to predict the next word in the sequence. It is possible to make predictions based on future words by having the RNN model read through the corpus backwards. Irsoy et al. show a bi-directional deep neural network; at each time-step, $t$, this network maintains two hidden layers, one for the left-to-right propagation and another for the right-to-left propagation. To maintain two hidden layers at any time, this network consumes twice as much memory space for its weight and bias parameters. The final classification result, $\hat{y}_t$, is generated by combining the scores produced by both RNN hidden layers. Figure~\ref{fig:birnn} shows the bi-directional network architecture, and Equations~\ref{eqn:rnn_right} and \ref{eqn:rnn_left} show the mathematical formulation behind setting up the bi-directional RNN hidden layer. The only difference between these two relationships is in the direction of recursing through the corpus. Equation~\ref{eqn:birnn_classifier} shows the classification relationship used for predicting the next word via summarizing past and future word representations.
276 |
277 | \begin{marginfigure}
278 | \centering
279 | \includegraphics[width=\linewidth]{birnn.pdf}
280 | \caption {A bi-directional RNN model}
281 | \label{fig:birnn}
282 | \end{marginfigure}
283 |
284 | \begin{equation}
285 | \overrightarrow{h}_t = f(\overrightarrow{W} x_t + \overrightarrow{V} \overrightarrow{h}_{t-1} + \overrightarrow{b})
286 | \label{eqn:rnn_right}
287 | \end{equation}
288 | \begin{equation}
289 | \overleftarrow{h}_t = f(\overleftarrow{W} x_t + \overleftarrow{V} \overleftarrow{h}_{t+1} + \overleftarrow{b})
290 | \label{eqn:rnn_left}
291 | \end{equation}
292 | \begin{equation}
293 | \hat{y}_t = g(U h_t + c) = g(U [\overrightarrow{h}_t; \overleftarrow{h}_t] + c)
294 | \label{eqn:birnn_classifier}
295 | \end{equation}
296 |
297 | Figure~\ref{fig:deepbirnn} shows a multi-layer bi-directional RNN where each lower layer feeds the next layer. As shown in this figure, in this network architecture, at time-step $t$ each intermediate neuron receives one input from the previous time-step (in the same RNN layer) and two inputs from the previous RNN hidden layer: one comes from the left-to-right RNN and the other from the right-to-left RNN.
298 |
299 | To construct a Deep RNN with $L$ layers, the above relationships are modified to the relationships in Equations~\ref{eqn:d_rnn_right} and \ref{eqn:d_rnn_left} where the input to each intermediate neuron at level $i$ is the output of the RNN at layer $i-1$ at the same time-step, $t$.
The output, $\hat{y}$, at each time-step is the result of propagating the input through all hidden layers (Equation~\ref{eqn:d_birnn_classifier}).
300 |
301 | \begin{marginfigure}
302 | \centering
303 | \includegraphics[width=\linewidth]{deepbirnn.pdf}
304 | \caption {A deep bi-directional RNN with three RNN layers.}
305 | \label{fig:deepbirnn}
306 | \end{marginfigure}
307 |
308 | \begin{equation}
309 | \overrightarrow{h}_t^{(i)} = f(\overrightarrow{W}^{(i)} h_t^{(i-1)} + \overrightarrow{V}^{(i)} \overrightarrow{h}_{t-1}^{(i)} + \overrightarrow{b}^{(i)})
310 | \label{eqn:d_rnn_right}
311 | \end{equation}
312 | \begin{equation}
313 | \overleftarrow{h}_t^{(i)} = f(\overleftarrow{W}^{(i)} h_t^{(i-1)} + \overleftarrow{V}^{(i)} \overleftarrow{h}_{t+1}^{(i)} + \overleftarrow{b}^{(i)})
314 | \label{eqn:d_rnn_left}
315 | \end{equation}
316 | \begin{equation}
317 | \hat{y}_t = g(U h_t + c) = g(U [\overrightarrow{h}_t^{(L)}; \overleftarrow{h}_t^{(L)}] + c)
318 | \label{eqn:d_birnn_classifier}
319 | \end{equation}
320 |
321 | \subsection{Application: RNN Translation Model}
322 | Traditional translation models are quite complex; they consist of numerous machine learning algorithms applied to different stages of the language translation pipeline. In this section, we discuss the potential for adopting RNNs as a replacement for traditional translation modules. Consider the RNN example model shown in Figure~\ref{fig:rnn_translate}; here, the German phrase \textit{Echt dicke Kiste} is translated to \textit{Awesome sauce}. The first three hidden layer time-steps \textit{encode} the German language words into some language word features ($h_3$). The last two time-steps \textit{decode} $h_3$ into English word outputs. Equation~\ref{eqn:encoder} shows the relationship for the Encoder stage and Equations~\ref{eqn:decoder_h} and \ref{eqn:decoder_y} show the equations for the Decoder stage.
323 |
324 | \begin{marginfigure}
325 | \centering
326 | \includegraphics[width=\linewidth]{rnn_translate.pdf}
327 | \caption {An RNN-based translation model. The first three RNN hidden layers belong to the source language model encoder, and the last two belong to the destination language model decoder.}
328 | \label{fig:rnn_translate}
329 | \end{marginfigure}
330 |
331 | \begin{equation}
332 | h_t = \phi (h_{t-1}, x_t) = f (W^{(hh)} h_{t-1} + W^{(hx)} x_t)
333 | \label{eqn:encoder}
334 | \end{equation}
335 |
336 | \begin{equation}
337 | h_t = \phi (h_{t-1}) = f (W^{(hh)} h_{t-1})
338 | \label{eqn:decoder_h}
339 | \end{equation}
340 | \begin{equation}
341 | y_t = softmax (W^{(S)}h_t)
342 | \label{eqn:decoder_y}
343 | \end{equation}
344 |
345 | One may naively assume this RNN model, along with the maximum-likelihood (cross-entropy) objective shown in Equation~\ref{eqn:decoder_ce}, can produce high-accuracy translation results. In practice, however, several extensions must be added to the model to improve its translation accuracy.
346 |
347 | \begin{equation}
348 | \max_{\theta} \dfrac {1}{N} \sum_{n=1}^{N} \log p_{\theta} (y^{(n)}|x^{(n)})
349 | \label{eqn:decoder_ce}
350 | \end{equation}
351 |
352 | \paragraph{Extension I:} train different RNN weights for encoding and decoding. This decouples the two units and allows for more accurate predictions from each of the two RNN modules. This means the $\phi()$ functions in Equations~\ref{eqn:encoder} and \ref{eqn:decoder_h} would have different $W^{(hh)}$ matrices.
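For concreteness, one way to write this decoupling (the subscripts below are our own illustrative notation, not from the original formulation) is to give the encoder and the decoder their own recurrence matrices, which are trained independently:

\begin{align*}
h_{t} &= f(W_{enc}^{(hh)} h_{t-1} + W_{enc}^{(hx)} x_{t}) &~\text{(encoder)}\\
h_{t} &= f(W_{dec}^{(hh)} h_{t-1}) &~\text{(decoder)}
\end{align*}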
353 |
354 | \paragraph{Extension II:} compute every hidden state in the decoder using three different inputs:
355 |
356 | \begin{itemize}
357 | \item The previous hidden state, $h_{t-1}$ (standard)
358 | \item The last hidden state of the encoder ($c = h_T$ in Figure~\ref{fig:en_decoder})
359 | \item The previously predicted output word, $\hat{y}_{t-1}$
360 | \end{itemize}
361 |
362 | \begin{marginfigure}
363 | \centering
364 | \includegraphics[width=\linewidth]{en_decoder.png}
365 | \caption {Language model with three inputs to each decoder neuron: ($h_{t-1}, c, y_{t-1}$)}
366 | \label{fig:en_decoder}
367 | \end{marginfigure}
368 |
369 | Combining the above three inputs transforms the $\phi$ function in the decoder of Equation~\ref{eqn:decoder_h} to the one in Equation~\ref{eqn:decoder_hs}. Figure~\ref{fig:en_decoder} illustrates this model.
370 |
371 | \begin{equation}
372 | h_{t} = \phi (h_{t-1}, c, y_{t-1})
373 | \label{eqn:decoder_hs}
374 | \end{equation}
375 |
376 | \paragraph{Extension III:} train deep recurrent neural networks using multiple RNN layers as discussed earlier in this chapter. Deeper layers often improve prediction accuracy due to their higher learning capacity. Of course, this implies that a large training corpus must be used to train the model.
377 |
378 | \paragraph{Extension IV:} train bi-directional encoders to improve accuracy, similar to what was discussed earlier in this chapter.
379 |
380 | \paragraph{Extension V:} given a word sequence \textit{A B C} in German whose translation is \textit{X Y} in English, instead of training the RNN using \textit{A B C} $\to$ \textit{X Y}, train it using \textit{C B A} $\to$ \textit{X Y}. The intuition behind this technique is that \textit{A} is most likely to be translated to \textit{X}, and reversing the input places \textit{A} closest to the decoder when it begins generating \textit{X}. Thus, given the vanishing gradient problem discussed earlier, reversing the order of the input words can help reduce the error rate in generating the output phrase.
381 |
382 | \section{Gated Recurrent Units}\label{sec:grus}
383 | Beyond the extensions discussed so far, RNNs have been found to perform better with the use of more complex units for activation. So far, we have discussed methods that transition from hidden state $h_{t-1}$ to $h_{t}$ using an affine transformation and a point-wise nonlinearity. Here, we discuss the use of a gated activation function, thereby modifying the RNN architecture. What motivates this? Well, although RNNs can theoretically capture long-term dependencies, they are very hard to actually train to do this. Gated recurrent units are designed in a manner to have more persistent memory, thereby making it easier for RNNs to capture long-term dependencies. Let us see mathematically how a GRU uses $h_{t-1}$ and $x_{t}$ to generate the next hidden state $h_{t}$. We will then dive into the intuition of this architecture.
384 | \begin{align*}
385 | z_{t} &= \sigma(W^{(z)}x_{t} + U^{(z)}h_{t-1})&~\text{(Update gate)}\\
386 | r_{t} &= \sigma(W^{(r)}x_{t} + U^{(r)}h_{t-1})&~\text{(Reset gate)}\\
387 | \tilde{h}_{t} &= \operatorname{tanh}(r_{t}\circ Uh_{t-1} + Wx_{t} )&~\text{(New memory)}\\
388 | h_{t} &= (1 - z_{t}) \circ \tilde{h}_{t} + z_{t} \circ h_{t-1}&~\text{(Hidden state)}
389 | \end{align*}
390 | The above equations can be thought of as a GRU's four fundamental operational stages, and they have intuitive interpretations that make this model much more intellectually satisfying (see Figure~\ref{fig:GRU}):
391 | \begin{enumerate}
392 | \item \textbf{New memory generation:} A new memory $\tilde{h}_{t}$ is the consolidation of a new input word $x_{t}$ with the past hidden state $h_{t-1}$. Anthropomorphically, this stage is the one that knows the recipe for combining a newly observed word with the past hidden state $h_{t-1}$ to summarize this new word in light of the contextual past as the vector $\tilde{h}_{t}$.
393 | \item \textbf{Reset Gate:} The reset signal $r_{t}$ is responsible for determining how important $h_{t-1}$ is to the summarization $\tilde{h}_{t}$. The reset gate has the ability to completely diminish the contribution of the past hidden state if it finds that $h_{t-1}$ is irrelevant to the computation of the new memory.
394 | \item \textbf{Update Gate:} The update signal $z_{t}$ is responsible for determining how much of $h_{t-1}$ should be carried forward to the next state. For instance, if $z_{t} \approx 1$, then $h_{t-1}$ is almost entirely copied out to $h_{t}$. Conversely, if $z_{t} \approx 0$, then mostly the new memory $\tilde{h}_{t}$ is forwarded to the next hidden state.
395 | \item \textbf{Hidden state:} The hidden state $h_{t}$ is finally generated using the past hidden input $h_{t-1}$ and the newly generated memory $\tilde{h}_{t}$, with the advice of the update gate.
396 | \end{enumerate}
397 |
398 | \begin{figure*}%
399 | \includegraphics[width = 15cm]{GRU}
400 | \caption{The detailed internals of a GRU}
401 | \label{fig:GRU}
402 | \end{figure*}
403 |
404 | It is important to note that to train a GRU, we need to learn all the different parameters: $W, U, W^{(r)}, U^{(r)}, W^{(z)}, U^{(z)}$. These follow the same backpropagation procedure we have seen in the past.
405 |
406 | \section{Long Short-Term Memories}\label{sec:lstm}
407 |
408 | Long Short-Term Memories (LSTMs) are another type of complex activation unit that differs a little from GRUs. The motivation for using these is similar to that for GRUs; however, the architecture of such units does differ.
Let us first take a look at the mathematical formulation of LSTM units before diving into the intuition behind this design:
409 | \begin{align*}
410 | i_{t} &= \sigma(W^{(i)}x_{t} + U^{(i)}h_{t-1})&~\text{(Input gate)}\\
411 | f_{t} &= \sigma(W^{(f)}x_{t} + U^{(f)}h_{t-1})&~\text{(Forget gate)}\\
412 | o_{t} &= \sigma(W^{(o)}x_{t} + U^{(o)}h_{t-1})&~\text{(Output/Exposure gate)}\\
413 | \tilde{c}_{t} &= \operatorname{tanh}(W^{(c)}x_{t} + U^{(c)}h_{t-1})&~\text{(New memory cell)}\\
414 | c_{t} &= f_{t} \circ c_{t-1} + i_{t} \circ \tilde{c}_{t}&~\text{(Final memory cell)}\\
415 | h_{t} &= o_{t} \circ \operatorname{tanh}(c_{t})
416 | \end{align*}
417 |
418 | \begin{figure*}%
419 | \includegraphics[width = 15cm]{LSTM}
420 | \caption{The detailed internals of an LSTM}
421 | \label{fig:LSTM}
422 | \end{figure*}
423 |
424 | We can gain intuition about the structure of an LSTM by thinking of its architecture as the following stages:
425 | \begin{enumerate}
426 | \item \textbf{New memory generation:} This stage is analogous to the new memory generation stage we saw in GRUs. We essentially use the input word $x_{t}$ and the past hidden state $h_{t-1}$ to generate a new memory $\tilde{c}_{t}$ which includes aspects of the new word $x_{t}$.
427 | \item \textbf{Input Gate:} We see that the new memory generation stage doesn't check whether the new word is even important before generating the new memory -- this is exactly the input gate's function. The input gate uses the input word and the past hidden state to determine whether or not the input is worth preserving, and its output is used to gate the new memory. It thus produces $i_{t}$ as an indicator of this information.
428 | \item \textbf{Forget Gate:} This gate is similar to the input gate except that it does not make a determination of the usefulness of the input word -- instead it makes an assessment of whether the past memory cell is useful for the computation of the current memory cell. Thus, the forget gate looks at the input word and the past hidden state and produces $f_{t}$.
429 | \item \textbf{Final memory generation:} This stage first takes the advice of the forget gate $f_{t}$ and accordingly forgets the past memory $c_{t-1}$. Similarly, it takes the advice of the input gate $i_{t}$ and accordingly gates the new memory $\tilde{c}_{t}$. It then sums these two results to produce the final memory $c_{t}$.
430 | \item \textbf{Output/Exposure Gate:} This is a gate that does not explicitly exist in GRUs. Its purpose is to separate the final memory from the hidden state. The final memory $c_{t}$ contains a lot of information that is not necessarily required to be saved in the hidden state. Hidden states are used in every single gate of an LSTM, and thus this gate assesses which parts of the memory $c_{t}$ need to be exposed/present in the hidden state $h_{t}$. The signal it produces to indicate this is $o_{t}$, and it is used to gate the point-wise tanh of the memory.
431 | \end{enumerate} 432 | 433 | \end{document} 434 | -------------------------------------------------------------------------------- /notes5/fig/GRU.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes5/fig/GRU.png -------------------------------------------------------------------------------- /notes5/fig/LSTM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes5/fig/LSTM.png -------------------------------------------------------------------------------- /notes5/fig/bengio_03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes5/fig/bengio_03.png -------------------------------------------------------------------------------- /notes5/fig/birnn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes5/fig/birnn.pdf -------------------------------------------------------------------------------- /notes5/fig/cliping.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes5/fig/cliping.pdf -------------------------------------------------------------------------------- /notes5/fig/deepbirnn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes5/fig/deepbirnn.pdf -------------------------------------------------------------------------------- /notes5/fig/en_decoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes5/fig/en_decoder.png -------------------------------------------------------------------------------- /notes5/fig/nn.tex: -------------------------------------------------------------------------------- 1 | % \ifx \allfiles \undefined 2 | 3 | % \documentclass{article} 4 | 5 | % \usepackage{tikz} 6 | % \usetikzlibrary{shapes,calc,positioning,arrows,mindmap,matrix} 7 | % \usetikzlibrary{decorations.pathreplacing} 8 | 9 | % \begin{document} 10 | % \fi 11 | 12 | \def\layersep{1.2cm} 13 | \def\numHidden{6} 14 | \def\numOutput{4} 15 | \def\dx{3.3} 16 | \def\dy{2.2} 17 | \tikzset{ 18 | treenode/.style = {align=center, inner sep=4pt, text centered,font=\sffamily}, 19 | node/.style = {treenode, minimum width=1.3em, text height=1em}, 20 | line/.style = {very thick, dashed, rounded corners, fill=orange!15!white, fill opacity=0.2} 21 | } 22 | 23 | % \tikzset{ 24 | 25 | % line/.style = {very thick, dashed, fill=orange!30!white, fill opacity=0.2} 26 | % } 27 | 28 | \begin{tikzpicture}[scale=0.8,shorten >=1pt,->,draw=black!50, node distance=\layersep] 29 | 30 | \tikzstyle{neuron}=[circle,fill=black!25,minimum size=13pt,inner sep=0pt] 31 | \tikzstyle{annot} = [text width=15em, text centered] 32 | 33 | \foreach \name / \y in {1,...,3} 34 | \node[neuron,fill=red!50] (I-\name) at (\y/2, 0) 
{}; 35 | 36 | \node[annot] (cdot-1) at (3/2+0.8,0) {$\cdots$}; 37 | 38 | \foreach \name / \y in {4,...,6} 39 | \node[neuron,fill=red!50] (I-\name) at (\y/2+1, 0) {}; 40 | 41 | \foreach \name / \y in {7,...,9} 42 | \node[neuron,fill=blue!50,postaction={pattern=north east lines}] (I-\name) at (\y/2+1.5, 0) {}; 43 | 44 | \node[annot] (cdot-2) at (9/2+2.3,0) {$\cdots$}; 45 | 46 | \foreach \name / \y in {10,...,12} 47 | \node[neuron,fill=blue!50,postaction={pattern=north east lines}] (I-\name) at (\y/2+2.5, 0) {}; 48 | 49 | \foreach \name / \y in {13,...,15} 50 | \node[neuron,fill=orange!50,postaction={pattern=north west lines}] (I-\name) at (\y/2+3, 0) {}; 51 | 52 | \foreach \name / \y in {1,...,3} 53 | \node[neuron,fill=gray!50] (H-\name) at (\y*0.8+2.4,\layersep) {}; 54 | 55 | \node[annot] (cdot-3) at (5.5,\layersep) {$\cdots$}; 56 | 57 | \foreach \name / \y in {4,...,6} 58 | \node[neuron,fill=gray!50] (H-\name) at (\y*0.8+3.1,\layersep) {}; 59 | 60 | 61 | \foreach \name / \y in {1,...,2} 62 | \node[neuron,fill=black!50] (S-\name) at (\y*0.8+3.1,\layersep*2) {}; 63 | 64 | \node[annot] (cdot-4) at (5.5,\layersep*2) {$\cdots$}; 65 | 66 | \foreach \name / \y in {3,...,4} 67 | \node[neuron,fill=black!50] (S-\name) at (\y*0.8+3.8,\layersep*2) {}; 68 | 69 | 70 | \foreach \source in {2,5,8,11,14} 71 | \foreach \dest in {2, 5} 72 | \path (I-\source) edge (H-\dest); 73 | 74 | \foreach \source in {2, 5} 75 | \foreach \dest in {1,...,\numOutput} 76 | \path (H-\source) edge (S-\dest); 77 | 78 | 79 | \node[annot] at (-3,0) {\textbf{Input layer}: $[x^w, x^t, x^l]$}; 80 | \node[annot] at (-3,\layersep){\textbf{Hidden layer}: \\ $h = (W^w_1 x^w + W^t_1 x^t + W^l_1 x^l + b_1)^3$}; 81 | \node[annot] at (-3,\layersep*2){\textbf{Softmax layer}: \\ $p = \texttt{softmax}(W_2 h)$}; 82 | 83 | \draw [line] ($(I-1.south west)+(-0.2, -0.2)$) rectangle ($(I-3.north east)+( 0.2, 0.2)$); 84 | \draw [line] ($(I-4.south west)+(-0.2, -0.2)$) rectangle ($(I-6.north east)+( 0.2, 0.2)$); 85 | \draw [line] ($(I-7.south west)+(-0.2, -0.2)$) rectangle ($(I-9.north east)+( 0.2, 0.2)$); 86 | \draw [line] ($(I-10.south west)+(-0.2, -0.2)$) rectangle ($(I-12.north east)+( 0.2, 0.2)$); 87 | \draw [line] ($(I-13.south west)+(-0.2, -0.2)$) rectangle ($(I-15.north east)+( 0.2, 0.2)$); 88 | 89 | \draw [line,solid] ($(I-1.south west)+(-0.3, -0.3)$) rectangle ($(I-15.north east)+( 0.3, 0.3)$); 90 | 91 | \draw [line,solid] ($(H-1.south west)+(-0.2, -0.2)$) rectangle ($(H-\numHidden.north east)+( 0.2, 0.2)$); 92 | \draw [line,solid] ($(S-1.south west)+(-0.3, -0.3)$) rectangle ($(S-\numOutput.north east)+( 0.3, 0.3)$); 93 | 94 | \draw [decorate,decoration={brace,amplitude=5pt,mirror,raise=3pt}] ($(I-1.south west)+(-0.3, -0.3)$) -- ($(I-6.south east)+(0.3, -0.3)$) node [black,midway,yshift=-0.6cm] {words}; 95 | 96 | \draw [decorate,decoration={brace,amplitude=5pt,mirror,raise=3pt}] ($(I-7.south west)+(-0.3, -0.3)$) -- ($(I-12.south east)+(0.3, -0.3)$) node [black,midway,yshift=-0.6cm] {POS tags}; 97 | 98 | \draw [decorate,decoration={brace,amplitude=5pt,mirror,raise=3pt}] ($(I-13.south west)+(-0.3, -0.3)$) -- ($(I-15.south east)+(0.3, -0.3)$) node [black,midway,yshift=-0.6cm] {arc labels}; 99 | 100 | 101 | \node [node] at (-2+\dx,-5+\dy) (1) {ROOT}; 102 | 103 | \node [node] at (0+\dx,-5+\dy) (2) {has\_VBZ} 104 | child[level distance=1.25cm] 105 | { 106 | node [node,xshift=-1cm,font=\sffamily] (4) {He\_PRP} 107 | edge from parentnode[left, xshift=1.3cm, yshift=-0.2cm] {nsubj} 108 | }; 109 | 110 | \node [node] at (0+\dx,-5+\dy) (2) {has\_VBZ}; 
111 | \node [node] at (2+\dx,-5+\dy) (3) {good\_JJ}; 112 | \node [node] at (5+\dx,-5+\dy) (4) {control\_NN}; 113 | \node [node] at (7+\dx,-5+\dy) (5) {.\_.}; 114 | 115 | \node [node] at (0+\dx,-4+\dy) {Stack}; 116 | \node [node] at (6+\dx,-4+\dy) {Buffer}; 117 | 118 | % \path (3) edge (I-5); 119 | % \path (3) edge (I-8); 120 | 121 | % \draw [dashed, thick, ->] (2+\dx,-4.8+\dy) -- (3.5,-0.2); 122 | % \draw [dashed, thick, ->] (2.5+\dx,-4.8+\dy) -- (5.5,-0.2); 123 | % \draw [dashed, thick, ->] (0.5+\dx,-5.8+\dy) -- (10,-0.2); 124 | 125 | \node[annot] at (-3,-5+\dy){\textbf{Configuration}}; 126 | 127 | \draw [very thick, fill=orange!30!white, fill opacity=0.2] ($(1.south west)+(-0.1, -0.1)$) rectangle ($(3.north east)+( 0.1, 0.1)$); 128 | \draw [very thick, fill=orange!30!white, fill opacity=0.2] ($(4.south west)+(-0.1, -0.1)$) rectangle ($(5.north east)+( 0.1, 0.1)$); 129 | 130 | \end{tikzpicture} 131 | % End of code 132 | 133 | % \ifx \allfiles \undefined 134 | % \end{document} 135 | % \fi 136 | -------------------------------------------------------------------------------- /notes5/fig/rnn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes5/fig/rnn.pdf -------------------------------------------------------------------------------- /notes5/fig/rnn_loop.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes5/fig/rnn_loop.pdf -------------------------------------------------------------------------------- /notes5/fig/rnn_node.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes5/fig/rnn_node.pdf -------------------------------------------------------------------------------- /notes5/fig/rnn_translate.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes5/fig/rnn_translate.pdf -------------------------------------------------------------------------------- /notes5/fig/two_layer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes5/fig/two_layer.pdf -------------------------------------------------------------------------------- /notes5/reference.bib: -------------------------------------------------------------------------------- 1 | @article{bengio2003neural, 2 | title={A neural probabilistic language model}, 3 | author={Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Janvin, Christian}, 4 | journal={The Journal of Machine Learning Research}, 5 | volume={3}, 6 | pages={1137--1155}, 7 | year={2003}, 8 | publisher={JMLR. 
org} 9 | } 10 | -------------------------------------------------------------------------------- /notes6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes6.pdf -------------------------------------------------------------------------------- /notes6/BPE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes6/BPE.png -------------------------------------------------------------------------------- /notes6/BiEncoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes6/BiEncoder.png -------------------------------------------------------------------------------- /notes6/Decoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes6/Decoder.png -------------------------------------------------------------------------------- /notes6/Encoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes6/Encoder.png -------------------------------------------------------------------------------- /notes6/alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes6/alignment.png -------------------------------------------------------------------------------- /notes6/candidate_list.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes6/candidate_list.png -------------------------------------------------------------------------------- /notes6/google_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes6/google_example.png -------------------------------------------------------------------------------- /notes6/hybrid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes6/hybrid.png -------------------------------------------------------------------------------- /notes6/longsentences.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes6/longsentences.png -------------------------------------------------------------------------------- /notes6/partition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes6/partition.png -------------------------------------------------------------------------------- 
/notes6/pointer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes6/pointer.png -------------------------------------------------------------------------------- /notes7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7.pdf -------------------------------------------------------------------------------- /notes7/Resources/ImageBlocks.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/Resources/ImageBlocks.pptx -------------------------------------------------------------------------------- /notes7/Resources/NNet.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/Resources/NNet.pptx -------------------------------------------------------------------------------- /notes7/fig/2d_convolution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/2d_convolution.png -------------------------------------------------------------------------------- /notes7/fig/CNN-alternates.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/CNN-alternates.png -------------------------------------------------------------------------------- /notes7/fig/ConstituencyParsing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/ConstituencyParsing.png -------------------------------------------------------------------------------- /notes7/fig/denpendency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/denpendency.png -------------------------------------------------------------------------------- /notes7/fig/img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/img1.png -------------------------------------------------------------------------------- /notes7/fig/img10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/img10.png -------------------------------------------------------------------------------- /notes7/fig/img11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/img11.png -------------------------------------------------------------------------------- 
/notes7/fig/img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/img2.png -------------------------------------------------------------------------------- /notes7/fig/img3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/img3.png -------------------------------------------------------------------------------- /notes7/fig/img4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/img4.png -------------------------------------------------------------------------------- /notes7/fig/img5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/img5.png -------------------------------------------------------------------------------- /notes7/fig/img6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/img6.png -------------------------------------------------------------------------------- /notes7/fig/img7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/img7.png -------------------------------------------------------------------------------- /notes7/fig/img8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/img8.png -------------------------------------------------------------------------------- /notes7/fig/img9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/img9.png -------------------------------------------------------------------------------- /notes7/fig/narrow_vs_wide.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/narrow_vs_wide.png -------------------------------------------------------------------------------- /notes7/fig/nonsense.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/nonsense.png -------------------------------------------------------------------------------- /notes7/fig/single-conv-complete.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/single-conv-complete.png -------------------------------------------------------------------------------- /notes7/fig/single-conv.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes7/fig/single-conv.png -------------------------------------------------------------------------------- /notes8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes8.pdf -------------------------------------------------------------------------------- /notes8.tex: -------------------------------------------------------------------------------- 1 | \documentclass{tufte-handout} 2 | 3 | 4 | \title{CS224n: Deep Learning for NLP\thanks{Course Instructor: Richard Socher} \\ 5 | \Large Lecture Notes: Part VIII \thanks{Authors:}} 6 | 7 | \date{Winter 2017} % without \date command, current date is supplied 8 | 9 | %\geometry{showframe} % display margins for debugging page layout 10 | 11 | \usepackage{graphicx} % allow embedded images 12 | \setkeys{Gin}{width=\linewidth,totalheight=\textheight,keepaspectratio} 13 | \graphicspath{{notes8/fig/}} % set of paths to search for images 14 | \usepackage{amsmath} % extended mathematics 15 | \usepackage{amstext} % extended text 16 | \usepackage{booktabs} % book-quality tables 17 | \usepackage{units} % non-stacked fractions and better unit spacing 18 | \usepackage{multicol} % multiple column layout facilities 19 | \usepackage{lipsum} % filler text 20 | \usepackage{fancyvrb} % extended verbatim environments 21 | \usepackage{placeins} 22 | \usepackage{mdframed}% http://ctan.org/pkg/mdframed 23 | \fvset{fontsize=\normalsize}% default font size for fancy-verbatim environments 24 | 25 | % Standardize command font styles and environments 26 | \newcommand{\doccmd}[1]{\texttt{\textbackslash#1}}% command name -- adds backslash automatically 27 | \newcommand{\docopt}[1]{\ensuremath{\langle}\textrm{\textit{#1}}\ensuremath{\rangle}}% optional command argument 28 | \newcommand{\docarg}[1]{\textrm{\textit{#1}}}% (required) command argument 29 | \newcommand{\docenv}[1]{\textsf{#1}}% environment name 30 | \newcommand{\docpkg}[1]{\texttt{#1}}% package name 31 | \newcommand{\doccls}[1]{\texttt{#1}}% document class name 32 | \newcommand{\docclsopt}[1]{\texttt{#1}}% document class option name 33 | \newenvironment{docspec}{\begin{quote}\noindent}{\end{quote}}% command specification environment 34 | \newcommand{\argmin}{\operatornamewithlimits{argmin}} 35 | \newcommand{\argmax}{\operatornamewithlimits{argmax}} 36 | \newcommand{\norm}[1]{\left\lVert#1\right\rVert} 37 | \newcommand{\textunderscript}[1]{$_{\text{#1}}$} 38 | \allowdisplaybreaks 39 | \setcounter{secnumdepth}{3} 40 | 41 | \newmdtheoremenv[outerlinewidth=2,leftmargin=40,rightmargin=40,% 42 | backgroundcolor=lightgray,outerlinecolor=blue,innertopmargin=10pt,% 43 | splittopskip=\topskip,skipbelow=\baselineskip,% 44 | skipabove=\baselineskip,ntheorem,roundcorner=5pt]{theorem}{Snippet}[section] 45 | 46 | \begin{document} 47 | 48 | \maketitle% this prints the handout title, author, and date 49 | 50 | %\printclassoptions 51 | 52 | 53 | \textbf{Keyphrases: Coreference Resolution, Dynamic Memory Networks for Question Answering over Text and Images} 54 | 55 | %\section{Coreference Resolution} 56 | % from http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture15.pdf 57 | % put your figures in notes8/fig/ 58 | 59 | 60 | \section{Dynamic Memory Networks for Question Answering over Text 
and Images}
61 | % from http://cs224d.stanford.edu/lectures/CS224d-Lecture17.pdf
62 | % put your figures in notes8/fig/
63 |
64 | The idea of a QA system is to extract information (sometimes passages, or spans of words) directly from documents, conversations, online searches, etc., that meets the user's information needs. Rather than making the user read through an entire document, a QA system prefers to give a short, concise answer. Nowadays, a QA system can be combined very easily with other NLP systems like chatbots, and some QA systems even go beyond searching text documents and can extract information from a collection of pictures.
65 |
66 | There are many types of questions, and the simplest of them is factoid question answering, which covers questions such as ``The symbol for mercuric oxide is?'' or ``Which NFL team represented the AFC at Super Bowl 50?''. There are of course other types, such as mathematical questions (``2+3=?'') and logical questions that require extensive reasoning (and no background information). However, we can argue that information-seeking factoid questions are the most common questions in people's daily lives.
67 |
68 | In fact, most NLP problems can be considered question-answering problems; the paradigm is simple: we issue a query, and the machine provides a response. By reading through a document, or a set of instructions, an intelligent system should be able to answer a wide variety of questions. We can ask for the POS tags of a sentence, or we can ask the system to respond in a different language. So naturally, we would like to design a model that can be used for general QA.
69 |
70 | In order to achieve this goal, we face two major obstacles. The first is that many NLP tasks use different architectures, such as TreeLSTM (Tai et al., 2015) for sentiment analysis, Memory Network (Weston et al., 2015) for question answering, and Bi-directional LSTM-CRF (Huang et al., 2015) for part-of-speech tagging. The second is that full multi-task learning tends to be very difficult, and transfer learning remains a major obstacle for current neural network architectures across artificial intelligence domains (computer vision, reinforcement learning, etc.).
71 |
72 | We can tackle the first problem with a shared architecture for NLP: the Dynamic Memory Network (DMN), an architecture designed for general QA tasks. QA is difficult, partially because reading a long paragraph is difficult. Even humans are not able to store a long document in their working memory.
73 |
74 | \begin{marginfigure}%
75 | \includegraphics[width=\linewidth]{DMN}
76 | \caption{A graphical illustration of the Dynamic Memory Network. }
77 | \label{fig:DMN}
78 | \end{marginfigure}
79 |
80 | \subsection{Input Module}
81 |
82 | The Dynamic Memory Network is divided into modules. First, we look at the input module. The input module takes as input a sequence of $T_I$ words and outputs a sequence of $T_C$ fact representations. If the output is a list of words, we have $T_C = T_I$, and if the output is a list of sentences, we have $T_C$ as the number of sentences and $T_I$ as the number of words in the sentences. We use a simple GRU to read the sentences in, i.e., the hidden state is $h_t = \mathrm{GRU}(x_t, h_{t-1})$ where $x_t = L[w_t]$, $L$ is the embedding matrix, and $w_t$ is the word at time $t$. We further improve it by using a Bi-GRU, as shown in Figure \ref{fig:BiGRU}.
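As a minimal sketch of the bi-directional reading (our notation; the exact way the two directions are combined may differ from the original implementation), a forward and a backward GRU can be run over the word vectors and their hidden states combined per position, for example by summation, to form the fact representations $c_t$ consumed by the episodic memory module:

\begin{align*}
\overrightarrow{h}_t &= \mathrm{GRU}(x_t, \overrightarrow{h}_{t-1})\\
\overleftarrow{h}_t &= \mathrm{GRU}(x_t, \overleftarrow{h}_{t+1})\\
c_t &= \overrightarrow{h}_t + \overleftarrow{h}_t
\end{align*}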
83 |
84 | \begin{marginfigure}%
85 | \includegraphics[width=\linewidth]{BiGRU}
86 | \caption{A graphical illustration of the bi-directional GRU used in the input module. }
87 | \label{fig:BiGRU}
88 | \end{marginfigure}
89 |
90 |
91 | \subsection{Question Module}
92 |
93 | We also use a standard GRU to read in the question (using the embedding matrix $L$: $q_t = \mathrm{GRU}(L[w_t^Q], q_{t-1})$), but the output of the question module is an encoded representation of the question.
94 |
95 | \subsection{Episodic Memory Module}
96 |
97 | One of the distinctive features of the dynamic memory network is the episodic memory module, which runs over the input sequence multiple times, each time paying attention to a different subset of facts from the input.
98 |
99 | It accomplishes this using a Bi-GRU that takes as input the sentence-level representations passed in from the input module and produces an episodic memory representation.
100 |
101 | We denote the episodic memory representation as $m^i$ and the episode representation (output by the attention mechanism) as $e^i$. The episodic memory representation is initialized using $m^0 = q$, and proceeds using the GRU: $m^i = \mathrm{GRU}(e^i, m^{i-1})$. The episode representation is updated using the hidden state outputs from the input module as follows, where $g$ is the attention mechanism:
102 | \begin{align*}
103 | h_t^i &= g_t^i \mathrm{GRU}(c_t, h^i_{t-1}) + (1 - g_t^i) h_{t-1}^i \\
104 | e^i &= h_{T_C}^i
105 | \end{align*}
106 |
107 | The attention vector $g$ may be computed in a number of ways, but in the original DMN paper (Kumar et al., 2016), the following formulation was found to work best:
108 | \begin{align*}
109 | g_t^i &= G(c_t, m^{i-1}, q) \\
110 | G(c, m, q) &= \sigma(W^{(2)} \tanh (W^{(1)} z(c,m,q) + b^{(1)}) + b^{(2)})\\
111 | z(c,m,q) &= [c, m, q, c \circ q, c \circ m , |c-q|, |c-m|, c^TW^{(b)}q, c^TW^{(b)}m]
112 | \end{align*}
113 |
114 | In this way, gates in this module are activated if the sentence is relevant to the question or memory. If, in the $i$th pass, the summary is not sufficient to answer the question, we can repeat the sequence over the input in the $(i+1)$th pass. For example, consider the question ``Where is the football?'' and the input sentences ``John kicked the football'' and ``John was in the field.'' In this example, \textit{John} and \textit{football} could be linked in one pass and then \textit{John} and \textit{field} could be linked in the second pass, allowing the network to perform a transitive inference based on the two pieces of information.
115 |
116 | \subsection{Answer Module}
117 |
118 | The answer module is a simple GRU decoder that takes in the outputs of the question module and the episodic memory module, and outputs a word (or, in general, a computational result). It works as follows:
119 | \begin{align*}
120 | y_t &= \mathrm{softmax}(W^{(a)}a_t) \\
121 | a_t &= \mathrm{GRU}([y_{t-1}, q], a_{t-1})
122 | \end{align*}
123 |
124 | \subsection{Experiments}
125 |
126 | Through the experiments we can see that the DMN is able to outperform MemNN on bAbI question answering tasks, and it can outperform other architectures for sentiment analysis and part-of-speech tagging. How many episodes are needed in the episodic memory? The answer is that the harder the task is, the more passes are required. Multiple passes also allow the network to truly comprehend the sentence by paying attention only to the parts relevant to the final task, instead of reacting to just the information from the word embedding.
127 |
128 | The key idea is to modularize the system: you can allow different types of input by changing the input module. For example, if we replace the input module with a convolutional neural network-based module, then this architecture can handle a task called visual question answering (VQA). It is also able to outperform other models in this task.
129 |
130 | \subsection{Summary}
131 |
132 | The zeal to search for a general architecture that would solve all problems has slightly faded since 2015, but the desire to train on one domain and generalize to other domains has increased. To comprehend more advanced modules for question answering, readers can refer to the dynamic coattention network (DCN).
133 |
134 |
135 | %\section{Speech Processing}
136 | %In this class, we have only looked at natural language in the form of text thus far. Speech is another medium of natural language. In the following section, $$
137 |
138 | \end{document}
--------------------------------------------------------------------------------
/notes8/fig/BiGRU.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes8/fig/BiGRU.png
--------------------------------------------------------------------------------
/notes8/fig/DMN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/notes8/fig/DMN.png
--------------------------------------------------------------------------------
/review-differential-calculus.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/review-differential-calculus.pdf
--------------------------------------------------------------------------------
/review-differential-calculus.tex:
--------------------------------------------------------------------------------
1 | \documentclass{tufte-handout}
2 |
3 | \title{Review of differential calculus theory \thanks{Author: Guillaume Genthial}}
4 |
5 |
6 | \date{Winter 2017} % without \date command, current date is supplied
7 |
8 | %\geometry{showframe} % display margins for debugging page layout
9 |
10 | \usepackage{graphicx} % allow embedded images
11 | \setkeys{Gin}{width=\linewidth,totalheight=\textheight,keepaspectratio}
12 | \graphicspath{{}} % set of paths to search for images
13 | \usepackage{amsmath} % extended mathematics
14 | \usepackage{amstext} % extended text
15 | \usepackage{booktabs} % book-quality tables
16 | \usepackage{units} % non-stacked fractions and better unit spacing
17 | \usepackage{multicol} % multiple column layout facilities
18 | \usepackage{lipsum} % filler text
19 | \usepackage{fancyvrb} % extended verbatim environments
20 | \usepackage{placeins}
21 | \fvset{fontsize=\normalsize}% default font size for fancy-verbatim environments
22 | \usepackage[normalem]{ulem}
23 | \usepackage{algpseudocode}
24 | \usepackage{algorithm}
25 |
26 |
27 | % tikz package
28 | \usepackage{tikz}
29 | \usetikzlibrary{patterns, shapes,calc,positioning,arrows,mindmap,matrix}
30 | \usetikzlibrary{decorations.pathreplacing}
31 |
32 | % Standardize command font styles and environments
33 | \newcommand{\doccmd}[1]{\texttt{\textbackslash#1}}% command name -- adds backslash automatically
34 |
\newcommand{\docopt}[1]{\ensuremath{\langle}\textrm{\textit{#1}}\ensuremath{\rangle}}% optional command argument 35 | \newcommand{\docarg}[1]{\textrm{\textit{#1}}}% (required) command argument 36 | \newcommand{\docenv}[1]{\textsf{#1}}% environment name 37 | \newcommand{\docpkg}[1]{\texttt{#1}}% package name 38 | \newcommand{\doccls}[1]{\texttt{#1}}% document class name 39 | \newcommand{\docclsopt}[1]{\texttt{#1}}% document class option name 40 | \newenvironment{docspec}{\begin{quote}\noindent}{\end{quote}}% command specification environment 41 | \newcommand{\argmin}{\operatornamewithlimits{argmin}} 42 | \newcommand{\argmax}{\operatornamewithlimits{argmax}} 43 | \newcommand{\textunderscript}[1]{$_{\text{#1}}$} 44 | \newcommand{\ud}{\mathrm{d}} 45 | 46 | \setcounter{secnumdepth}{3} 47 | \begin{document} 48 | \maketitle 49 | 50 | \textbf{Keywords}: 51 | \noindent Differential, Gradients, partial derivatives, Jacobian, chain-rule 52 | 53 | \bigskip 54 | 55 | \textbf{This note is optional and is aimed at students who wish to have a deeper understanding of differential calculus}. It defines and explains the links between derivatives, gradients, jacobians, etc. First, we go through definitions and examples for $ f : \mathbb{R}^n \mapsto \mathbb{R} $. Then we introduce the Jacobian and generalize to higher dimension. Finally, we introduce the chain-rule. 56 | 57 | %\tableofcontents 58 | 59 | \section{Introduction} 60 | 61 | 62 | We use derivatives all the time, but we forget what they mean. In general, we have in mind that for a function $ f : \mathbb{R} \mapsto \mathbb{R} $, we have something like 63 | 64 | $$ f(x+h) - f(x) \approx f'(x)h $$ 65 | 66 | Some people use different notation, especially when dealing with higher dimensions, and there usually is a lot of confusion between the following notations 67 | 68 | \begin{align*} 69 | f'(x)\\ 70 | \frac{\ud f}{\ud x}\\ 71 | \frac{\partial f}{\partial x}\\ 72 | \nabla_x f 73 | \end{align*} 74 | 75 | \marginnote{\textbf{Scalar-product and dot-product}\\ 76 | Given two vectors $ a $ and $ b $, 77 | \begin{itemize} 78 | \item \textbf{scalar-product} $ \langle a | b \rangle = \sum_{i=1}^n a_i b_i $ 79 | \item \textbf{dot-product} $ a^T \cdot b = \langle a | b \rangle = \sum_{i=1}^n a_i b_i$ 80 | \end{itemize}} 81 | However, these notations refer to different mathematical objects, and the confusion can lead to mistakes. This paper recalls some notions about these objects. 82 | 83 | \newpage 84 | 85 | \section{Theory for $ f : \mathbb{R}^n \mapsto \mathbb{R} $} 86 | \subsection{Differential} 87 | \marginnote{\textbf{Notation}\\ 88 | $ \ud_x f $ is a \textbf{linear form} $ \mathbb{R}^n \mapsto \mathbb{R} $\\ 89 | This is the best \textbf{linear approximation} of the function $ f$} 90 | 91 | \textbf{Formal definition} 92 | 93 | Let's consider a function $ f : \mathbb{R}^n \mapsto \mathbb{R} $ defined on $ \mathbb{R}^n $ with the scalar product $ \langle \cdot | \cdot \rangle $. 
We suppose that this function is \textbf{differentiable}, which means that for $ x \in \mathbb{R}^n $ (fixed) and a small variation $ h $ (which can change) we can write:
94 |
95 | \marginnote{$ \ud_x f $ is called the \textbf{differential} of $ f $ in $ x $}
96 |
97 | \begin{equation}
98 | f(x + h) = f(x) + \ud_x f (h) + o_{h \rightarrow 0}(h)
99 | \label{eq:diff}
100 | \end{equation}
101 |
102 | \marginnote{ $ o_{h \rightarrow 0}(h) $ (Landau notation) is equivalent to the existence of a function $ \epsilon(h) $ such that $ \lim\limits_{h \rightarrow 0} \epsilon(h) = 0 $}
103 |
104 | and $ \ud_x f : \mathbb{R}^n \mapsto \mathbb{R} $ is a linear form, which means that $ \forall u, v \in \mathbb{R}^n $, we have $ \ud_x f(u + v) = \ud_x f(u) + \ud_x f(v)$.
105 |
106 | \textbf{Example}
107 |
108 | Let $ f : \mathbb{R}^2 \mapsto \mathbb{R} $ such that $ f(\begin{pmatrix}
109 | x_1 \\ x_2
110 | \end{pmatrix}) = 3x_1 + x_2^2 $. Let's pick $ \begin{pmatrix}
111 | a \\
112 | b
113 | \end{pmatrix}\in \mathbb{R}^2 $ and $ h = \begin{pmatrix}
114 | h_1 \\
115 | h_2
116 | \end{pmatrix}\in \mathbb{R}^2 $. We have
117 |
118 | \begin{align*}
119 | f(\begin{pmatrix}
120 | a + h_1\\
121 | b + h_2
122 | \end{pmatrix}) &= 3(a + h_1) + (b + h_2)^2\\
123 | &= 3 a + 3h_1 + b^2 + 2 b h_2 + h_2^2\\
124 | &= 3 a + b^2 + 3 h_1 + 2 b h_2 + h_2^2\\
125 | &= f(a, b) + 3 h_1 + 2 b h_2 + o(h)
126 | \end{align*}
127 | \marginnote{$ h_2^2 \leq h\cdot h = o_{h\rightarrow 0}(h) $}
128 |
129 | Then, $ \ud_{\begin{pmatrix}
130 | a \\
131 | b
132 | \end{pmatrix}}f (\begin{pmatrix}
133 | h_1\\
134 | h_2
135 | \end{pmatrix}) = 3 h_1 + 2 b h_2 $
136 |
137 | \subsection{Link with the gradients}
138 | \marginnote{\textbf{Notation} for $ x \in \mathbb{R}^n $, the gradient is usually written $ \nabla_x f \in \mathbb{R}^n $ $$ $$}
139 |
140 |
141 |
142 | \textbf{Formal definition}
143 |
144 | It can be shown that for all linear forms $ a : \mathbb{R}^n \mapsto \mathbb{R} $, there exists a vector $ u_a \in \mathbb{R}^n $ such that $ \forall h \in \mathbb{R}^n $
145 |
146 | \marginnote{The dual of a vector space $ E^* $ is isomorphic to $ E $\\
147 | See Riesz representation theorem}
148 |
149 | $$ a(h) = \langle u_a | h \rangle $$
150 |
151 | In particular, for the \textbf{differential} $ \ud_x f $, we can find a vector $ u \in \mathbb{R}^n $ such that
152 |
153 | $$ \ud_x f (h) = \langle u | h \rangle $$.
154 |
155 | \marginnote{The gradient has the \textbf{same shape} as $ x $ $$ $$}
156 |
157 | We can thus define the \textbf{gradient} of $ f $ in $ x $
158 |
159 | $$ \nabla_x f := u $$
160 |
161 | Then, as a conclusion, we can rewrite equation \ref{eq:diff}
162 |
163 | \marginnote{\textbf{Gradients} and \textbf{differential} of a function are conceptually very different. The \textbf{gradient} is a vector, while the \textbf{differential} is a function}
164 |
165 | \begin{align}
166 | f(x + h) &= f(x) + \ud_x f (h) + o_{h \rightarrow 0}(h)\\
167 | &= f(x) + \langle \nabla_x f | h \rangle + o_{h \rightarrow 0}(h)
168 | \label{eq:grad}
169 | \end{align}
170 |
171 | \textbf{Example}
172 | \label{example:gradient}
173 |
174 | Same example as before, $ f : \mathbb{R}^2 \mapsto \mathbb{R} $ such that $ f(\begin{pmatrix}
175 | x_1 \\
176 | x_2
177 | \end{pmatrix}) = 3x_1 + x_2^2 $.
We showed that 178 | 179 | $$ \ud_{\begin{pmatrix} 180 | a \\ 181 | b 182 | \end{pmatrix}}f (\begin{pmatrix} 183 | h_1\\ 184 | h_2 185 | \end{pmatrix}) = 3 h_1 + 2 b h_2 $$ 186 | 187 | We can rewrite this as 188 | 189 | $$ \ud_{\begin{pmatrix} 190 | a \\ 191 | b 192 | \end{pmatrix}}f (\begin{pmatrix} 193 | h_1\\ 194 | h_2 195 | \end{pmatrix}) = \langle \begin{pmatrix} 196 | 3\\ 197 | 2b 198 | \end{pmatrix} | \begin{pmatrix} 199 | h_1 \\ 200 | h_2 201 | \end{pmatrix} \rangle $$ 202 | 203 | and thus our gradient is 204 | 205 | $$ \nabla_{\begin{pmatrix} 206 | a \\ 207 | b 208 | \end{pmatrix}}f = \begin{pmatrix} 209 | 3 \\ 210 | 2b 211 | \end{pmatrix} $$ 212 | 213 | \subsection{Partial derivatives} 214 | \marginnote{\textbf{Notation}\\ 215 | Partial derivatives are usually written $ \frac{\partial f}{\partial x} $ but you may also see $ \partial_{x} f $ or $ f'_x$ 216 | \begin{itemize} 217 | \item $ \frac{\partial f}{\partial x_i} $ is a \textbf{function} $ \mathbb{R}^n \mapsto \mathbb{R} $ 218 | \item $ \frac{\partial f}{\partial x} = (\frac{\partial f}{\partial x_1}, \ldots, \frac{\partial f}{\partial x_n})^T $ is a \textbf{function} $ \mathbb{R}^n \mapsto \mathbb{R}^n $. 219 | \item $ \frac{\partial f}{\partial x_i}(x) \in \mathbb{R} $ 220 | \item $ \frac{\partial f}{\partial x} (x) = (\frac{\partial f}{\partial x_1}(x), \ldots, \frac{\partial f}{\partial x_n}(x))^T \in \mathbb{R}^n $ 221 | \end{itemize} 222 | } 223 | \textbf{Formal definition} 224 | 225 | Now, let's consider an orthonormal basis $ (e_1, \ldots, e_n ) $ of $ \mathbb{R}^n $. Let's define the partial derivative 226 | 227 | $$ \frac{\partial f}{\partial x_i}(x) := \lim\limits_{h \rightarrow 0} \frac{f(x_1, \ldots, x_{i-1}, x_i + h, x_{i+1}, \ldots, x_n) - f(x_1,\ldots, x_n)}{h} 228 | $$ 229 | 230 | Note that the partial derivative $ \frac{\partial f}{\partial x_i}(x) \in \mathbb{R} $ and that it is defined \emph{with respect to the $i$-th component and evaluated in $ x $.} 231 | 232 | \textbf{Example} 233 | 234 | Same example as before, $ f : \mathbb{R}^2 \mapsto \mathbb{R} $ such that $ f(x_1,x_2) = 3x_1 + x_2^2 $. Let's write 235 | 236 | \marginnote{Depending on the context, most people omit to write the $ (x) $ evaluation and just write \\ 237 | $ \frac{\partial f}{\partial x} \in \mathbb{R}^n $ instead of $ \frac{\partial f}{\partial x} (x) $} 238 | \begin{align*} 239 | \frac{\partial f}{\partial x_1}(\begin{pmatrix} 240 | a\\b 241 | \end{pmatrix}) &= \lim\limits_{h \rightarrow 0} \frac{f(\begin{pmatrix} 242 | a + h\\b 243 | \end{pmatrix}) - f(\begin{pmatrix} 244 | a \\b 245 | \end{pmatrix})}{h}\\ 246 | &= \lim\limits_{h \rightarrow 0} \frac{3(a+h) + b^2 - (3a + b^2)}{h}\\ 247 | &= \lim\limits_{h \rightarrow 0} \frac{3h}{h}\\ 248 | &= 3 249 | \end{align*} 250 | 251 | In a similar way, we find that 252 | 253 | $$ \frac{\partial f}{\partial x_2}(\begin{pmatrix} 254 | a \\ b 255 | \end{pmatrix}) = 2b $$ 256 | 257 | \subsection{Link with the partial derivatives} 258 | \marginnote{That's why we usually write 259 | $$ \nabla_x f = \frac{\partial f}{\partial x}(x)$$ 260 | (\textbf{same shape as $ x $}) $$ $$} 261 | 262 | \textbf{Formal definition} 263 | 264 | It can be shown that 265 | \marginnote{$e_i$ is a orthonormal basis. 
For instance, in the canonical basis\\ 266 | $$ e_i = (0, \ldots, 1, \ldots 0) $$ with $ 1 $ at index $ i $} 267 | \begin{align*} 268 | \nabla_x f &= \sum_{i=1}^{n} \frac{\partial f}{\partial x_i}(x) e_i\\ 269 | &= \begin{pmatrix} 270 | \frac{\partial f}{\partial x_1}(x)\\ 271 | \vdots\\ 272 | \frac{\partial f}{\partial x_n}(x) 273 | \end{pmatrix} 274 | \end{align*} 275 | 276 | 277 | where $ \frac{\partial f}{\partial x_i} (x) $ denotes the partial derivative of $ f $ with respect to the $ i $th component, evaluated in $ x $. 278 | 279 | \textbf{Example} 280 | 281 | We showed that 282 | 283 | $$ \begin{cases} 284 | \frac{\partial f}{\partial x_1}(\begin{pmatrix} 285 | a\\b 286 | \end{pmatrix}) &= 3\\ 287 | \frac{\partial f}{\partial x_2}(\begin{pmatrix} 288 | a \\ b 289 | \end{pmatrix}) &= 2b 290 | \end{cases}$$ 291 | 292 | and that 293 | 294 | $$ \nabla_{\begin{pmatrix} 295 | a \\ b 296 | \end{pmatrix}}f = \begin{pmatrix} 297 | 3 \\ 2b 298 | \end{pmatrix} $$ 299 | 300 | and then we verify that 301 | 302 | $$ \nabla_{\begin{pmatrix} 303 | a \\ b 304 | \end{pmatrix}}f = \begin{pmatrix} 305 | \frac{\partial f}{\partial x_1}(\begin{pmatrix} 306 | a \\ b 307 | \end{pmatrix})\\ 308 | \frac{\partial f}{\partial x_2}(\begin{pmatrix} 309 | a \\ b 310 | \end{pmatrix}) 311 | \end{pmatrix} 312 | $$ 313 | 314 | \section{Summary} 315 | \label{sec:summary} 316 | 317 | \textbf{Formal definition} 318 | 319 | For a function $ f : \mathbb{R}^n \mapsto \mathbb{R} $, we have defined the following objects which can be summarized in the following equation 320 | \marginnote{Recall that $ a^T \cdot b = \langle a | b \rangle = \sum_{i=1}^n a_i b_i$} 321 | \begin{align*} 322 | f(x+h) &= f(x) + \ud_x f(h) + o_{h \rightarrow 0} (h) & \text{\textbf{differential}}\\ 323 | &= f(x) + \langle \nabla_x f | h \rangle + o_{h \rightarrow 0} (h) & \text{\textbf{gradient}}\\ 324 | &= f(x) + \langle \frac{\partial f}{\partial x}(x) | h \rangle + o_{h \rightarrow 0}\\ 325 | &= f(x) + \langle \begin{pmatrix} 326 | \frac{\partial f}{\partial x_1}(x) \\ \vdots \\ \frac{\partial f}{\partial x_n}(x) 327 | \end{pmatrix}| h \rangle + o_{h \rightarrow 0} & \text{\textbf{partial derivatives}}\\ 328 | \end{align*} 329 | 330 | \textbf{Remark} 331 | 332 | Let's consider $ x : \mathbb{R} \mapsto \mathbb{R} $ such that $ x(u) = u $ for all $ u $. Then we can easily check that $ \ud_u x(h) = h $. As this differential does not depend on $ u $, we may simply write $ \ud x $. 
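Indeed, this is a one-line check from the definition of the differential: for the identity map we have $ x(u + h) = u + h = x(u) + 1 \cdot h $, so the linear part of the increment is exactly $ h $, which is why $ \ud_u x(h) = h $ for every $ u $.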
333 | \marginnote{The $ \ud x $ that we use refers to the differential of $ u \mapsto u $, the identity mapping!} 334 | That's why the following expression has some meaning, 335 | 336 | $$ \ud_x f (\cdot) = \frac{\partial f}{\partial x}(x) \ud x (\cdot) $$ 337 | 338 | because 339 | 340 | \begin{align*} 341 | \ud_x f (h) &= \frac{\partial f}{\partial x}(x) \ud x (h) \\ 342 | &= \frac{\partial f}{\partial x}(x) h 343 | \end{align*} 344 | In higher dimension, we write 345 | 346 | $$ \ud_x f = \sum_{i=1}^n \frac{\partial f}{\partial x_i}(x) \ud x_i $$ 347 | 348 | \section{\textbf{Jacobian}: Generalization to $ f: \mathbb{R}^n \mapsto \mathbb{R}^m $ } 349 | \label{sec:gen} 350 | For a function 351 | 352 | $$ f: 353 | \begin{pmatrix} 354 | x_1\\ 355 | \vdots\\ 356 | x_n 357 | \end{pmatrix} \mapsto 358 | \begin{pmatrix} 359 | f_1(x_1, \ldots, x_n)\\ 360 | \vdots\\ 361 | f_m(x_1, \ldots, x_n)\\ 362 | \end{pmatrix} 363 | $$ 364 | 365 | We can apply the previous section to each $ f_i(x) $ : 366 | 367 | \begin{align*} 368 | f_i(x+h) &= f_i(x) + \ud_x f_i(h) + o_{h \rightarrow 0} (h) \\ 369 | &= f_i(x) + \langle \nabla_x f_i | h \rangle + o_{h \rightarrow 0} (h)\\ 370 | &= f_i(x) + \langle \frac{\partial f_i}{\partial x}(x) | h \rangle + o_{h \rightarrow 0}\\ 371 | &= f_i(x) + \langle (\frac{\partial f_i}{\partial x_1}(x), \ldots, \frac{\partial f_i}{\partial x_n}(x))^T | h \rangle + o_{h \rightarrow 0}\\ 372 | \end{align*} 373 | 374 | Putting all this in the same vector yields 375 | 376 | $$ f \begin{pmatrix} 377 | x_1 + h_1\\ 378 | \vdots\\ 379 | x_n + h_n 380 | \end{pmatrix} = f 381 | \begin{pmatrix} 382 | x_1\\ 383 | \vdots\\ 384 | x_n 385 | \end{pmatrix} + 386 | \begin{pmatrix} 387 | \frac{\partial f_1}{\partial x}(x)^T \cdot h\\ 388 | \vdots\\ 389 | \frac{\partial f_m}{\partial x}(x)^T \cdot h\\ 390 | \end{pmatrix} + o(h) 391 | $$ 392 | 393 | Now, let's define the \textbf{Jacobian} matrix as 394 | \marginnote{The \textbf{Jacobian} matrix has dimensions $ m \times n $ and is a generalization of the gradient} 395 | $$ J(x) := \begin{pmatrix} 396 | \frac{\partial f_1}{\partial x}(x)^T\\ 397 | \vdots\\ 398 | \frac{\partial f_m}{\partial x}(x)^T\\ 399 | \end{pmatrix} = 400 | \begin{pmatrix} 401 | \frac{\partial f_1}{\partial x_1}(x) \ldots \frac{\partial f_1}{\partial x_n}(x)\\ 402 | \ddots \\ 403 | \frac{\partial f_m}{\partial x_1}(x) \ldots \frac{\partial f_m}{\partial x_n}(x)\\ 404 | \end{pmatrix} 405 | $$ 406 | 407 | 408 | Then, we have that 409 | 410 | \begin{align*} 411 | f \begin{pmatrix} 412 | x_1 + h_1\\ 413 | \vdots\\ 414 | x_n + h_n 415 | \end{pmatrix} &= f 416 | \begin{pmatrix} 417 | x_1\\ 418 | \vdots\\ 419 | x_n 420 | \end{pmatrix} + 421 | \begin{pmatrix} 422 | \frac{\partial f_1}{\partial x_1}(x) \ldots \frac{\partial f_1}{\partial x_n}(x)\\ 423 | \ddots \\ 424 | \frac{\partial f_m}{\partial x_1}(x) \ldots \frac{\partial f_m}{\partial x_n}(x)\\ 425 | \end{pmatrix} \cdot 426 | h + o(h)\\ 427 | &= f(x) + J(x)\cdot h + o(h) 428 | \end{align*} 429 | 430 | \textbf{Example 1 : $ m = 1 $} 431 | \label{example:jac1} 432 | \marginnote{In the case where $ m = 1 $, the \textbf{Jacobian} is a \textbf{row vector} \\ 433 | $ \frac{\partial f_1}{\partial x_1}(x) \ldots \frac{\partial f_1}{\partial x_n}(x) $\\ 434 | Remember that our \textbf{gradient} was defined as a column vector with the same elements. 
We thus have that \\ 435 | $ J(x) = \nabla_x f^T $ 436 | } 437 | 438 | 439 | Let's take our first function $ f : \mathbb{R}^2 \mapsto \mathbb{R} $ such that $ f(\begin{pmatrix} 440 | x_1 \\ x_2 441 | \end{pmatrix}) = 3x_1 + x_2^2 $. Then, the Jacobian of $ f $ is 442 | 443 | \begin{align*} 444 | \begin{pmatrix} 445 | \frac{\partial f}{\partial x_1}(x) & \frac{\partial f}{\partial x_2}(x) 446 | \end{pmatrix} &= 447 | \begin{pmatrix} 448 | 3 & 2x_2 449 | \end{pmatrix}\\ 450 | &= \begin{pmatrix} 451 | 3\\2x_2 452 | \end{pmatrix}^T\\ 453 | &= \nabla_x f^T 454 | \end{align*} 455 | 456 | \textbf{Example 2 : $ g : \mathbb{R}^3 \mapsto \mathbb{R}^2 $} 457 | \label{example:jac2} 458 | Let's define 459 | 460 | \begin{align*} 461 | g (\begin{pmatrix} 462 | y_1\\y_2\\y_3 463 | \end{pmatrix}) 464 | =\begin{pmatrix} 465 | y_1 + 2y_2 + 3y_3\\ y_1y_2y_3 466 | \end{pmatrix} 467 | \end{align*} 468 | 469 | 470 | Then, the Jacobian of $ g $ is 471 | 472 | \begin{align*} 473 | J_g(y) &= 474 | \begin{pmatrix} 475 | \frac{\partial (y_1 + 2y_2 + 3y_3)}{\partial y}(y)^T\\ 476 | \frac{\partial (y_1y_2y_3)}{\partial y}(y)^T\\ 477 | \end{pmatrix}\\ 478 | &= 479 | \begin{pmatrix} 480 | \frac{\partial (y_1 + 2y_2 + 3y_3)}{\partial y_1}(y) & \frac{\partial (y_1 + 2y_2 + 3y_3)}{\partial y_2}(y) & \frac{\partial (y_1 + 2y_2 + 3y_3)}{\partial y_3}(y)\\ 481 | \frac{\partial (y_1y_2y_3)}{\partial y_1}(y) & \frac{\partial (y_1y_2y_3)}{\partial y_2}(y) & \frac{\partial (y_1y_2y_3)}{\partial y_3}(y)\\ 482 | \end{pmatrix}\\ 483 | &= 484 | \begin{pmatrix} 485 | 1 & 2 & 3\\ 486 | y_2y_3 &y_1y_3 & y_1y_2 487 | \end{pmatrix} 488 | \end{align*} 489 | 490 | 491 | 492 | 493 | \section{Generalization to $ f : \mathbb{R}^{n \times p} \mapsto \mathbb{R} $} 494 | 495 | If a function takes as input a matrix $ A \in \mathbb{R}^{n \times p} $, we can transform this matrix into a vector $ a \in \mathbb{R}^{np} $, such that 496 | 497 | $$ A[i, j] = a[i + nj] $$ 498 | 499 | Then, we end up with a function $ \tilde{f} : \mathbb{R}^{np} \mapsto \mathbb{R} $. We can apply the results from \ref{sec:summary} and we obtain, for $ x, h \in \mathbb{R}^{np} $ corresponding to $ X, H \in \mathbb{R}^{n \times p} $, 500 | 501 | \begin{align*} 502 | \tilde{f}(x + h) &= \tilde{f}(x) + \langle \nabla_x f | h \rangle + o(h) \\ 503 | \end{align*} 504 | 505 | where $ \nabla_x f = \begin{pmatrix} 506 | \frac{\partial f}{\partial x_1}(x) \\ \vdots \\ \frac{\partial f}{\partial x_{np}}(x) 507 | \end{pmatrix} $. 
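For instance, take $ f(A) = \sum_{i,j} A_{ij}^2 $ (a simple function picked here only to illustrate the flattening). Its flattened version is $ \tilde{f}(a) = \sum_{k=1}^{np} a_k^2 $, so $ \frac{\partial \tilde{f}}{\partial a_k}(a) = 2 a_k $ and $ \nabla_a \tilde{f} = 2a $, which is exactly the vector obtained by flattening the matrix $ 2A $.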
508 | 509 | Now, we would like to give some meaning to the following equation 510 | 511 | \marginnote{The gradient of $ f $ wrt to a matrix $ X $ is a matrix of same shape as $ X $ and defined by \\ 512 | $ \nabla_X f_{ij} = \frac{\partial f}{\partial X_{ij}}(X) $ 513 | } 514 | 515 | $$ f(X + H) = f(X) + \langle \nabla_X f | H \rangle + o(H) $$ 516 | 517 | Now, you can check that if you define 518 | 519 | $$ \nabla_X f_{ij} = \frac{\partial f}{\partial X_{ij}}(X) $$ 520 | 521 | 522 | 523 | that these two terms are equivalent 524 | 525 | \begin{align*} 526 | \langle \nabla_x f | h \rangle &= \langle \nabla_X f | H \rangle \\ 527 | \sum_{i=1}^{np} \frac{\partial f}{\partial x_i}(x) h_i &= \sum_{i, j} \frac{\partial f}{\partial X_{ij}}(X) H_{ij} 528 | \end{align*} 529 | 530 | \section{Generalization to $ f : \mathbb{R}^{n \times p} \mapsto \mathbb{R}^m $} 531 | \marginnote{Let's generalize the generalization of the previous section} 532 | 533 | Applying the same idea as before, we can write 534 | 535 | $$ f(x + h) = f(x) + J(x) \cdot h + o(h) $$ 536 | 537 | where $ J $ has dimension $ m \times n \times p $ and is defined as 538 | 539 | $$ J_{ijk}(x) = \frac{\partial f_i}{\partial X_{jk}} (x) $$ 540 | 541 | Writing the 2d-dot product $ \delta = J(x) \cdot h \in \mathbb{R}^m $ means that the $i$-th component of $ \delta $ is 542 | \marginnote{You can apply the same idea to any dimensions!} 543 | 544 | $$ \delta_i = \sum_{j=1}^n \sum_{k=1}^p \frac{\partial f_i}{\partial X_{jk}} (x) h_{jk} $$ 545 | 546 | \section{Chain-rule} 547 | 548 | \textbf{Formal definition} 549 | 550 | Now let's consider $ f: \mathbb{R}^n \mapsto \mathbb{R}^m $ and $ g: \mathbb{R}^p \mapsto \mathbb{R}^n $. We want to compute the \textbf{differential} of the composition $ h = f \circ g $ such that $ h :x \mapsto u = g(x) \mapsto f(g(x)) = f(u) $, or 551 | 552 | $$ \ud_x (f \circ g) $$. 553 | 554 | It can be shown that the differential is the composition of the differentials 555 | 556 | $$ \ud_x (f \circ g) = \ud_{g(x)} f \circ \ud_x g $$ 557 | 558 | Where $ \circ $ is the composition operator. Here, $ \ud_{g(x)} f $ and $ \ud_x g $ are linear transformations (see section \ref{sec:gen}). Then, the resulting differential is also a linear transformation and the \textbf{jacobian} is just the dot product between the jacobians. In other words, 559 | 560 | \marginnote{The \textbf{chain-rule} is just writing the resulting \textbf{jacobian} as a dot product of \textbf{jacobians}. Order of the dot product is very important!} 561 | 562 | $$J_h(x) = J_f(g(x)) \cdot J_g(x) $$ 563 | 564 | where $ \cdot $ is the dot-product. This dot-product between two matrices can also be written component-wise: 565 | 566 | $$ J_h(x)_{ij} = \sum_{k=1}^n J_f (g(x))_{ik} \cdot J_g(x)_{kj}$$ 567 | 568 | \textbf{Example} 569 | 570 | Let's keep our example function $ f : (\begin{pmatrix} 571 | x_1\\x_2 572 | \end{pmatrix}) \mapsto 3x_1 + x_2^2 $ and our function 573 | $ g : (\begin{pmatrix} 574 | y_1\\y_2\\y_3 575 | \end{pmatrix}) = \begin{pmatrix} 576 | y_1 + 2y_2 + 3y_3\\ y_1y_2y_3 577 | \end{pmatrix} $. 
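Note that the composition makes sense: $ g $ maps $ \mathbb{R}^3 $ to $ \mathbb{R}^2 $ and $ f $ maps $ \mathbb{R}^2 $ to $ \mathbb{R} $, so we can feed the output of $ g $ into $ f $.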
578 | 579 | The composition of $ f $ and $ g $ is $ h = f \circ g : \mathbb{R}^3 \mapsto \mathbb{R} $ 580 | 581 | \begin{align*} 582 | h(\begin{pmatrix} 583 | y_1\\y_2\\y_3 584 | \end{pmatrix}) &= f(\begin{pmatrix} 585 | y_1 + 2y_2 + 3y_3 \\ y_1y_2y_3 586 | \end{pmatrix})\\ 587 | &= 3(y_1 + 2y_2 + 3y_3) + (y_1y_2y_3)^2 588 | \end{align*} 589 | 590 | We can compute the three components of the gradient of $ h $ with the partial derivatives 591 | 592 | \begin{align*} 593 | \frac{\partial h}{\partial y_1}(y) &= 3 + 2y_1y_2^2y_3^2\\ 594 | \frac{\partial h}{\partial y_2}(y) &= 6 + 2y_2y_1^2y_3^2\\ 595 | \frac{\partial h}{\partial y_3}(y) &= 9 + 2y_3y_1^2y_2^2\\ 596 | \end{align*} 597 | 598 | And then our gradient is 599 | 600 | $$ \nabla_y h = \begin{pmatrix} 601 | 3 + 2y_1y_2^2y_3^2 \\ 6 + 2y_2y_1^2y_3^2 \\ 9 + 2y_3y_1^2y_2^2 602 | \end{pmatrix} $$ 603 | 604 | In this process, we did not use our previous calculation, and that's a shame. Let's use the chain-rule to make use of it. With examples \ref{example:gradient} and \ref{example:jac1}, we had 605 | 606 | \marginnote{For a function $f : \mathbb{R}^n \mapsto \mathbb{R} $, the Jacobian is the transpose of the gradient\\ 607 | $$\nabla_x f^T = J_f(x)$$ } 608 | 609 | \begin{align*} 610 | J_f(x) &= \nabla_x f^T\\ 611 | &= \begin{pmatrix} 612 | 3 & 2 x_2 613 | \end{pmatrix} 614 | \end{align*} 615 | 616 | We also need the Jacobian of $ g $, which we computed in \ref{example:jac2} 617 | 618 | \begin{align*} 619 | J_g(y) &= 620 | \begin{pmatrix} 621 | 1 & 2 & 3\\ 622 | y_2y_3 &y_1y_3 & y_1y_2 623 | \end{pmatrix} 624 | \end{align*} 625 | 626 | Applying the chain rule, we obtain that the \textbf{Jacobian} of $ h $ is the product $ J_f \cdot J_g $ (\textbf{in this order}). Recall that for a function $ \mathbb{R}^n \mapsto \mathbb{R} $, the Jacobian is formally the transpose of the gradient. Then, 627 | 628 | 629 | \begin{align*} 630 | J_h (y) &= J_f(g(y))\cdot J_g(y)\\ 631 | &= \nabla_{g(y)} f^T \cdot J_g(y)\\ 632 | &= \begin{pmatrix} 633 | 3 & 2y_1y_2y_3 634 | \end{pmatrix}\cdot 635 | \begin{pmatrix} 636 | 1 & 2 & 3\\ 637 | y_2y_3 &y_1y_3 & y_1y_2 638 | \end{pmatrix}\\ 639 | &= 640 | \begin{pmatrix} 641 | 3 + 2y_1 y_2^2 y_3^2 & 6 + 2y_2y_1^2y_3^2 & 9 + 2y_3y_1^2y_2^2 642 | \end{pmatrix} 643 | \end{align*} 644 | 645 | and taking the transpose we find the same gradient that we computed before! 646 | 647 | \textbf{Important remark} 648 | \begin{itemize} 649 | \item The gradient is only defined for functions with values in $ \mathbb{R} $. 650 | \item Note that the chain rule gives us a way to compute the \textbf{Jacobian} and \underline{not the \textbf{gradient}}. However, we showed that in the case of a function $ f: \mathbb{R}^n \mapsto \mathbb{R} $, the \textbf{Jacobian} and the \textbf{gradient} are directly identifiable, because $ \nabla_x f^T = J(x) $. Thus, if we want to compute the gradient of a function by using the chain-rule, the best way to do it is to compute the Jacobian. 651 | \item As the gradient must have the same shape as the variable with respect to which we differentiate, and 652 | \begin{itemize} 653 | \item we know that the Jacobian is the transpose of the gradient 654 | \item and the Jacobian is the dot product of Jacobians 655 | \end{itemize} 656 | an efficient way of computing the gradient is to find the ordering of Jacobians (or transposes of Jacobians) that yields the correct shapes! 657 | \item the notation $ \frac{\partial \cdot }{\partial \cdot} $ is often ambiguous and can refer to either the gradient or the Jacobian. 
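\item As a quick shape check on the example above: $ J_f(g(y)) \in \mathbb{R}^{1 \times 2} $ and $ J_g(y) \in \mathbb{R}^{2 \times 3} $, so the only product with compatible shapes is $ J_f(g(y)) \cdot J_g(y) \in \mathbb{R}^{1 \times 3} $, whose transpose has the same shape as $ y \in \mathbb{R}^{3} $, exactly the shape required for $ \nabla_y h $.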
658 | \end{itemize} 659 | 660 | %\section{Examples} 661 | \end{document} 662 | -------------------------------------------------------------------------------- /sty/code_snippet.sty: -------------------------------------------------------------------------------- 1 | \usepackage{listings, textcomp, color, verbatim} 2 | \definecolor{deepgreen}{rgb}{0,0.4,0} 3 | 4 | \lstdefinestyle{python}{ 5 | language=python, 6 | basicstyle=\color{black}\ttfamily\footnotesize, 7 | stringstyle=\color{deepgreen}\slshape, 8 | commentstyle=\color{gray}\slshape, 9 | keywordstyle=\color{red}\bf, 10 | emphstyle=\color{blue}\bf, 11 | tabsize=2, 12 | %%%%%%%%%%%%%%% 13 | showstringspaces=false, 14 | emph={access,and,break,class,continue,def,del,elif,else,except,exec,finally,for,from,global,if,import,in,i s,lambda,not,or,pass,print,raise,return,try,while,as}, 15 | upquote=true, 16 | morecomment=[s]{"""}{"""}, 17 | literate=* 18 | {:}{{\textcolor{blue}:}}{1}% 19 | {=}{{\textcolor{blue}=}}{1}% 20 | {-}{{\textcolor{blue}-}}{1}% 21 | {+}{{\textcolor{blue}+}}{1}% 22 | {*}{{\textcolor{blue}*}}{1}% 23 | {!}{{\textcolor{blue}!}}{1}% 24 | {(}{{\textcolor{blue}(}}{1}% 25 | {)}{{\textcolor{blue})}}{1}% 26 | {[}{{\textcolor{blue}[}}{1}% 27 | {]}{{\textcolor{blue}]}}{1}% 28 | {<}{{\textcolor{blue}<}}{1}% 29 | {>}{{\textcolor{blue}>}}{1},% 30 | %%%%%%%%%%%%%%%% 31 | aboveskip=\baselineskip, 32 | xleftmargin=20pt, xrightmargin=15pt, 33 | frame=single, 34 | numbers=none, numberstyle=\tiny 35 | } 36 | \lstnewenvironment{python}{\lstset{style=python}}{} 37 | \newcommand{\inputsamplepython}[1]{\lstinputlisting[style=python]{../ClassCodes/#1}} 38 | \newcommand{\inputpython}[1]{\lstinputlisting[style=python]{#1}} 39 | -------------------------------------------------------------------------------- /sty/kbordermatrix.sty: -------------------------------------------------------------------------------- 1 | \NeedsTeXFormat{LaTeX2e} 2 | \ProvidesPackage{kbordermatrix}[2011/09/21 Bordered matrix with brackets] 3 | 4 | % Author: Kim C Border 5 | % Date: SuperBowl XXXVII (Go Bucs) 6 | 7 | % Revised 2003/09/20 8 | % to allow flush right option. 9 | % Revised 2011/09/21 10 | % at urging of Bruno Calfa (CMU) 11 | % to coexist with package arydshln 12 | % by adding \def\@xarraycr ... 13 | 14 | % Defines \kbordermatrix along the lines of plain tex's 15 | % \bordermatrix (which is still available in LaTeX). 16 | 17 | % In particular, as with \bordermatrix, 18 | % 1. It takes the array as an argument. It does not use \begin{}..\end{}. 19 | % Is this a feature or a bug? 20 | % 2. The first row is spaced a bit further apart from the rest. 21 | % 3. The lower (n-1) by (n-1) block is set off by delimiters. 22 | % 4. There is an invisible bottom row of the same height as the segregated 23 | % top row that adds to the height of the equation. 24 | 25 | % Differences from \bordermatrix: 26 | % 1. Square brackets are used in place of parentheses. 27 | % 2. You may use \\ instead of \cr. 28 | % 3. The line heights agree with LaTeX's line heights for the 29 | % array environment, and \arraystretch is respected. This means 30 | % the bottom (n-1) rows align with the rows of an (n-1)-rowed 31 | % \begin{array}..\end{array} (with or without delimiters). 32 | % 4. All columns are centered. 33 | % ** Modified 2003-9-20 to allow flush right option. 34 | % 5. The first column is spaced a bit further apart from the rest. 35 | 36 | 37 | % Differences from \left\[\begin{array}...\end{array}\right\] 38 | % 1. It takes the array as an argument. 
It does not use \begin{}..\end{}. 39 | % Is this a feature or a bug? 40 | % 2. Consequently, you cannot use a column specifier (e.g., {l|cr}). 41 | % 3. Consequently the maximum number of columns is not specified. 42 | % 4. Vertical rules must be put in each row in a separate column. 43 | % 5. You can use \hline and \cline. 44 | 45 | % At least it works in the cases I have tried, but I offer no guarantees. 46 | 47 | 48 | 49 | % cf. \bordermatrix p. 361, and \vrulealign p. 392 of The TeXbook 50 | 51 | 52 | 53 | % Style parameters, they may be redefined according to taste. 54 | \newcommand{\kbldelim}{[} % Left delimiter 55 | \newcommand{\kbrdelim}{]}% Right delimiter 56 | 57 | \newcommand{\kbrowstyle}{\scriptstyle}% Style applied to first row 58 | \newcommand{\kbcolstyle}{\scriptstyle}% Style applied to first column 59 | 60 | \newlength{\kbcolsep} % Extra separation after first border column 61 | \newlength{\kbrowsep} % Extra separation after first border row 62 | 63 | \setlength{\kbcolsep}{.5\arraycolsep} 64 | \setlength{\kbrowsep}{.2ex} 65 | 66 | \newif\ifkbalignright 67 | 68 | % Scratch lengths (to be computed) 69 | \newlength{\br@kwd} % Width of delimiter 70 | \newlength{\k@bordht} % Height of border column 71 | 72 | % This is it 73 | \newcommand{\kbordermatrix}[1]{% 74 | \begingroup 75 | % \br@kwd depends on font size, so compute it now. 76 | \setbox0=\hbox{$\left\kbldelim\right.$} 77 | \setlength{\br@kwd}{\wd0} 78 | % Compute the array strut based on current value of \arraystretch. 79 | \setbox\@arstrutbox\hbox{\vrule 80 | \@height\arraystretch\ht\strutbox 81 | \@depth\arraystretch\dp\strutbox 82 | \@width\z@} 83 | % Compute height of first row and extra space. 84 | \setlength{\k@bordht}{\kbrowsep} 85 | \addtolength{\k@bordht}{\ht\@arstrutbox} 86 | \addtolength{\k@bordht}{\dp\@arstrutbox} 87 | % turn off mathsurround 88 | \m@th 89 | % Set the first row style 90 | \def\@kbrowstyle{\kbrowstyle} 91 | % Swallow the alignment into box0: 92 | \setbox0=\vbox{% 93 | % Define \cr for first row to include the \kbrowsep 94 | % and to reset the row style 95 | \def\cr{\crcr\noalign{\kern\kbrowsep 96 | \global\let\cr=\endline 97 | \global\let\@kbrowstyle=\relax}} 98 | % Redefine \\ a la LaTeX: 99 | \let\\\@arraycr 100 | % The following are needed to make a solid \vrule with no gaps 101 | % between the lines. 102 | \lineskip\z@skip 103 | \baselineskip\z@skip 104 | % Compute the length of the skip after the first column 105 | \dimen0\kbcolsep \advance\dimen0\br@kwd 106 | % Here begins the alignment: 107 | \ialign{\tabskip\dimen0 % This space will show up after the first column 108 | \kern\arraycolsep\hfil\@arstrut$\kbcolstyle ##$\hfil\kern\arraycolsep& 109 | \tabskip\z@skip % Cancel extra space for other columns 110 | \kern\arraycolsep\hfil$\@kbrowstyle ##$\ifkbalignright\relax\else\hfil\fi\kern\arraycolsep&& 111 | \kern\arraycolsep\hfil$\@kbrowstyle ##$\ifkbalignright\relax\else\hfil\fi\kern\arraycolsep\crcr 112 | % That ends the template. 113 | % Here is the argument: 114 | #1\crcr}% End \ialign 115 | }% End \setbox0. 116 | % \box0 now holds the array. 117 | % 118 | % This next line uses \box2 to hold a throwaway 119 | % copy of \box0, leaving \box0 intact, 120 | % while putting the last row in \box5. 121 | \setbox2=\vbox{\unvcopy0 \global\setbox5=\lastbox} 122 | % We want the width of the first column, 123 | % so we lop off columns until there is only one left. 124 | % It's not elegant or efficient, but at 1 gHz, who cares. 
125 | \loop 126 | \setbox2=\hbox{\unhbox5 \unskip \global\setbox3=\lastbox} 127 | \ifhbox3 128 | \global\setbox5=\box2 129 | \global\setbox1=\box3 130 | \repeat 131 | % \box1 now holds the first column of last row. 132 | % 133 | % This next line stores the alignment in \box2, 134 | % while calculating the proper 135 | % delimiter height and placement. 136 | \setbox2=\hbox{$\kern\wd1\kern\kbcolsep\kern-\arraycolsep 137 | \left\kbldelim 138 | \kern-\wd1\kern-\kbcolsep\kern-\br@kwd 139 | % 140 | % Here is the output. The \vcenter aligns the array with the "math axis." 141 | % The negative vertical \kern only shrinks the delimiter's height. 142 | % BTW, I didn't find this in the TeXbook, 143 | % I had to try various \kerns to see what they did in a 144 | % \left[\vcenter{}\right]. 145 | \vcenter{\kern-\k@bordht\vbox{\unvbox0}} 146 | \right\kbrdelim$} 147 | \null\vbox{\kern\k@bordht\box2} 148 | % 149 | \endgroup 150 | } 151 | 152 | -------------------------------------------------------------------------------- /tensorflow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/tensorflow.pdf -------------------------------------------------------------------------------- /tensorflow.tex: -------------------------------------------------------------------------------- 1 | \documentclass{tufte-handout} 2 | 3 | \title{CS224n: Natural Language Processing with Deep Learning 4 | \thanks{Course Instructors: Christopher Manning, Richard Socher} \\ 5 | \Large Lecture Notes: TensorFlow\thanks{Authors: Zhedi Liu, Jon Gauthier, Bharath Ramsundar, Chip Huyen}} 6 | 7 | \date{Winter 2017} % without \date command, current date is supplied 8 | 9 | %\geometry{showframe} % display margins for debugging page layout 10 | 11 | \usepackage{graphicx} % allow embedded images 12 | \setkeys{Gin}{width=\linewidth,totalheight=\textheight,keepaspectratio} 13 | \graphicspath{{tensorflow/fig/}} % set of paths to search for images 14 | \usepackage{amsmath} % extended mathematics 15 | \usepackage{amstext} % extended text 16 | \usepackage{booktabs} % book-quality tables 17 | \usepackage{units} % non-stacked fractions and better unit spacing 18 | \usepackage{multicol} % multiple column layout facilities 19 | \usepackage{lipsum} % filler text 20 | \usepackage{fancyvrb} % extended verbatim environments 21 | \usepackage{placeins} 22 | \fvset{fontsize=\normalsize}% default font size for fancy-verbatim environments 23 | \usepackage[normalem]{ulem} 24 | \usepackage{algpseudocode} 25 | \usepackage{algorithm} 26 | \usepackage{listings} 27 | 28 | \usepackage{sty/code_snippet} 29 | 30 | % tikz package 31 | \usepackage{tikz} 32 | \usetikzlibrary{patterns, shapes,calc,positioning,arrows,mindmap,matrix} 33 | \usetikzlibrary{decorations.pathreplacing} 34 | 35 | % Standardize command font styles and environments 36 | \newcommand{\doccmd}[1]{\texttt{\textbackslash#1}}% command name -- adds backslash automatically 37 | \newcommand{\docopt}[1]{\ensuremath{\langle}\textrm{\textit{#1}}\ensuremath{\rangle}}% optional command argument 38 | \newcommand{\docarg}[1]{\textrm{\textit{#1}}}% (required) command argument 39 | \newcommand{\docenv}[1]{\textsf{#1}}% environment name 40 | \newcommand{\docpkg}[1]{\texttt{#1}}% package name 41 | \newcommand{\doccls}[1]{\texttt{#1}}% document class name 42 | \newcommand{\docclsopt}[1]{\texttt{#1}}% document class option name 43 | 
\newenvironment{docspec}{\begin{quote}\noindent}{\end{quote}}% command specification environment 44 | \newcommand{\argmin}{\operatornamewithlimits{argmin}} 45 | \newcommand{\argmax}{\operatornamewithlimits{argmax}} 46 | \newcommand{\textunderscript}[1]{$_{\text{#1}}$} 47 | 48 | \setcounter{secnumdepth}{3} 49 | \setcounter{tocdepth}{3} 50 | 51 | 52 | \begin{document} 53 | 54 | \maketitle% this prints the handout title, author, and date 55 | 56 | \textbf{Keyphrases: TensorFlow} \\ 57 | \noindent 58 | \textbf{Code Demo: \url{https://github.com/nishithbsk/tensorflow_tutorials}} 59 | 60 | \section{Introduction} 61 | TensorFlow is an open source software library for numerical computation using data flow graphs. It was originally developed by researchers and engineers working on the Google Brain Team within Google's Machine Intelligence research organization for the purposes of conducting machine learning and deep neural networks research. \\ 62 | 63 | \marginnote{Check the official tutorial\\ \url{https://www.tensorflow.org/get_started/}} 64 | 65 | Nodes in TensorFlow's data flow graph represent mathematical operations, while the edges represent the multidimensional data arrays (tensors) communicated between them. The advantage of the flexible architecture is that it allows users to build complex models step by step and makes gradient calculations simple. TensorFlow programs use a tensor data structure to represent all data -- only tensors are passed between operations in the computation graph. You can think of a TensorFlow tensor as an n-dimensional array or list. A tensor has a static type, a rank, and a shape. 66 | 67 | \section{Concepts} 68 | \subsection{Variables, Placeholders, Mathematical Operations} 69 | Let's use $$h = ReLU(Wx + b)$$ where $ReLU$ (Rectified Linear Unit) is defined as $f(x) = max(0, x)$ as an example to take a closer look at TensorFlow's data flow graph, shown in Figure~\ref{fig:tensorFlow}. There are three types of nodes in a flow graph: variables, placeholders and mathematical operations. 70 | 71 | \begin{marginfigure} 72 | \centering 73 | \includegraphics[width=\linewidth]{tensorFlow.png} 74 | \caption {An Illustration of a TensorFlow Flow Graph} 75 | \label{fig:tensorFlow} 76 | \end{marginfigure} 77 | 78 | Variables are stateful nodes that maintain state across executions of the graph. By stateful, we mean that variables retain their current values over multiple executions, and it's easy to restore those saved values. Variables can be saved to disk during and after training. Typically, variables are parameters in a neural network. In our example, weights $W$ and bias $b$ are variables. 79 | 80 | Placeholders are nodes whose values are fed in at execution time. The rationale behind having placeholders is that we want to be able to build flow graphs without having to load external data, as we only want to pass in them at run time. Placeholders, unlike variables, require initialization. In order to initialize a placeholder, type and shape of data have to be passed in as arguments. Input data and labels are some examples that need to be initialized as placeholders. In our example, placeholder is $x$. See the code snippet below for initializing an input placeholder that has type tf.float32 and shape (batch\_size, n\_features), and a labels placeholder that has type tf.int32 and shape (batch\_size, n\_classes). 
81 | 82 | 83 | \begin{python} 84 | ## Example code snippet 85 | input_placeholder = tf.placeholder(tf.float32, 86 | shape=(batch_size, n_features)) 87 | labels_placeholder = tf.placeholder(tf.int32, 88 | shape=(batch_size, n_classes)) 89 | \end{python} 90 | 91 | Mathematical operations, as the name suggests, represent mathematical operations in a flow graph. In our example, \texttt{MatMul} (multiply two matrix values), \texttt{Add} (add element-wise with broadcasting) and \texttt{ReLU} (activate with element-wise rectified linear function) are mathematical operations. 92 | 93 | Now we are ready to see our flow graph in code. Let's assume our input $x$ has shape ($N$, $Dx$), $W$ has shape ($Dx$, $N$) and type tf.float32, $b$ has shape ($N$,), and we will initialize $W\sim \textrm{Uniform}(-1, 1)$ and $b = \mathbf{0}$. Then the code snippet below shows us how to build our flow graph for $h = ReLU(Wx + b)$. 94 | 95 | \begin{python} 96 | ## Example code snippet 97 | import tensorflow as tf 98 | 99 | b = tf.Variable(tf.zeros((N,))) 100 | W = tf.Variable(tf.random_uniform((Dx, N), -1, 1)) 101 | x = tf.placeholder(tf.float32, (N, Dx)) 102 | h = tf.nn.relu(tf.matmul(x, W) + b) 103 | 104 | \end{python} 105 | 106 | The key thing to remember about a symbolic programming language is that, up to what we have written here, no data is actually being computed. $x$ is just a placeholder for our input data. A flow graph merely defines a function. We cannot do \texttt{print(h)} and get its value, as $h$ only represents a node in the graph. 107 | 108 | \subsection{Fetch and Feed} 109 | Now that we've defined a graph, the next steps are to deploy this graph with a session and run the session to get our outputs. A session is an environment that supports the execution of all the operations in a graph, bound to a particular execution context (e.g. CPU, GPU). A session can be easily built by doing \texttt{sess = tf.Session()}. In order for a session to run, two arguments have to be provided: fetches and feeds. We use feeds and fetches to get data into and out of arbitrary operations. 110 | 111 | Fetches represent a list of graph nodes and return the outputs of these nodes. We could fetch a single node or multiple tensors. See the code snippet below for an example of fetching two tensors: \texttt{mul} and \texttt{intermed}. 112 | 113 | \begin{python} 114 | ## Example code snippet 115 | import tensorflow as tf 116 | 117 | input1 = tf.constant([3.0]) 118 | input2 = tf.constant([2.0]) 119 | input3 = tf.constant([5.0]) 120 | intermed = tf.add(input2, input3) 121 | mul = tf.mul(input1, intermed) 122 | 123 | with tf.Session() as sess: 124 | result = sess.run([mul, intermed]) 125 | print(result) 126 | 127 | # output: 128 | # [array([ 21.], dtype=float32), array([ 7.], dtype=float32)] 129 | 130 | \end{python} 131 | 132 | A feed, supplied as an argument to a \texttt{run()} call, temporarily replaces the output of an operation with a tensor value. The feed is only used for the \texttt{run} call to which it is passed. Essentially, feeds are dictionaries mapping placeholders to their values. Nodes that depend on placeholders cannot run unless their values are fed. See the code snippet below for an example of feeding a \texttt{feed\_dict}. 
133 | 134 | \begin{python} 135 | ## Example code snippet 136 | import tensorflow as tf 137 | 138 | input1 = tf.placeholder(tf.float32) 139 | input2 = tf.placeholder(tf.float32) 140 | output = tf.mul(input1, input2) 141 | 142 | with tf.Session() as sess: 143 | print(sess.run([output], feed_dict={input1:[7.], input2:[2.]})) 144 | 145 | # output: 146 | # [array([ 14.], dtype=float32)] 147 | 148 | \end{python} 149 | 150 | Before moving on to how to train a model, let's see a slightly more complicated example combining fetch and feed. In this example, we have a placeholder $x$ whose value will be fed in at run time. We have two variables $W$ and $b$. It should be noted that when we launch a graph, all variables have to be explicitly initialized before one can run Ops that use their value. A variable can be initialized by running its initializer op, restoring the variable from a save file, or simply running an assign Op that assigns a value to the variable. In fact, the variable initializer op is just an assign Op that assigns the variable's initial value to the variable itself. An example usage is \texttt{sess.run(w.initializer)} where $w$ is a variable in the graph. The more common initialization pattern is to use the convenience function \texttt{tf.initialize\_all\_variables()} to add an Op to the graph that initializes all the variables, as illustrated in the code snippet below. 151 | 152 | \begin{python} 153 | ## Example code snippet 154 | import numpy as np 155 | import tensorflow as tf 156 | 157 | b = tf.Variable(tf.zeros((100,))) 158 | W = tf.Variable(tf.random_uniform((784, 100), 159 | -1, 1)) 160 | 161 | x = tf.placeholder(tf.float32, (100, 784)) 162 | h = tf.nn.relu(tf.matmul(x, W) + b) 163 | 164 | sess = tf.Session() 165 | sess.run(tf.initialize_all_variables()) 166 | # {x: np.random.random((100, 784))} is a feed 167 | # that assigns np.random.random((100, 784)) to placeholder x 168 | sess.run(h, {x: np.random.random((100, 784))}) 169 | 170 | \end{python} 171 | 172 | \subsection{How to Train a Model in TensorFlow} 173 | 174 | \textit{1. Define a Loss} \\ 175 | The first thing to do in order to train a model is to build a loss node. See the code snippet below for an example of defining a cross-entropy loss. We build the loss node using the labels and the prediction. Note that we use \texttt{tf.reduce\_sum} to compute the sum of elements across dimensions of a tensor. For our example, \texttt{axis=1} is used to perform a row-wise sum. 176 | 177 | \begin{python} 178 | ## Example code snippet 179 | import tensorflow as tf 180 | 181 | prediction = tf.nn.softmax(...) # Output of neural network 182 | label = tf.placeholder(tf.float32, [100, 10]) 183 | 184 | cross_entropy = -tf.reduce_sum(label * tf.log(prediction), axis=1) 185 | 186 | # More examples of using tf.reduce_sum 187 | # 'x' is [[1, 1, 1] 188 | # [1, 1, 1]] 189 | # tf.reduce_sum(x) ==> 6 190 | # tf.reduce_sum(x, 0) ==> [2, 2, 2] 191 | # tf.reduce_sum(x, 1) ==> [3, 3] 192 | # tf.reduce_sum(x, 1, keep_dims=True) ==> [[3], [3]] 193 | # tf.reduce_sum(x, [0, 1]) ==> 6 194 | \end{python} 195 | 196 | \noindent 197 | \textit{2. Compute Gradients} \\ 198 | The next thing we have to do is to compute gradients. TensorFlow nodes have attached operations; therefore gradients with respect to parameters are automatically computed with backpropagation. All we need to do is create an optimizer object and call the \texttt{minimize} function on the previously defined loss. 
See the code snippet below for an example of using a \texttt{GradientDescentOptimizer}, where \texttt{cross\_entropy} is the same as we introduced in the previous code snippet. Evaluating the minimization operation \texttt{train\_step} at runtime will automatically compute and apply gradients to all variables in the graph. 199 | 200 | \begin{python} 201 | ## Example code snippet 202 | import tensorflow as tf 203 | 204 | lr = 0.5 # learning rate 205 | optimizer = tf.train.GradientDescentOptimizer(lr) 206 | train_step = optimizer.minimize(cross_entropy) 207 | \end{python} 208 | 209 | \noindent 210 | \textit{3. Train Model} \\ 211 | Now we are ready to train a model. This can simply be done by creating an iterative training loop that feeds in data and labels and applies gradients to the variables, as shown in the code snippet below. 212 | 213 | \begin{python} 214 | ## Example code snippet 215 | import tensorflow as tf 216 | 217 | sess = tf.Session() 218 | sess.run(tf.initialize_all_variables()) 219 | 220 | for i in range(1000): 221 | batch_x, batch_label = data.next_batch() 222 | sess.run(train_step, feed_dict={x: batch_x, label: batch_label}) 223 | \end{python} 224 | 225 | \subsection{Variable Sharing} 226 | One last important concept is variable sharing. When building complex models, we often need to share large sets of variables and might want to initialize all of them in one place. This can be done by using \texttt{tf.variable\_scope()} and \texttt{tf.get\_variable()}. 227 | 228 | Imagine we are building a neural net with two layers. If we use \texttt{tf.Variable}, we would have two sets of weights and two sets of biases. Let's assume that these variables are initialized in \texttt{define\_variables()}. The problem arises when we want to use this model for two tasks that share the same parameters. We would have to call \texttt{define\_variables(inputs)} twice, resulting in two sets of variables, 4 variables in each one, for a total of 8 variables. A common attempt to share variables is to create them in a separate piece of code and pass them to the functions that use them, say by using a dictionary. That is, \texttt{define\_variables} now takes two arguments, \texttt{inputs} and \texttt{variables\_dict}. While convenient, creating a \texttt{variables\_dict} outside of the code that uses it breaks encapsulation: 1) the code that builds the graph must document the names, types, and shapes of variables to create, and 2) when the code changes, the callers may have to create more, fewer, or different variables. One way to address the problem is to use classes to create a model, where the classes take care of managing the variables they need. For a lighter solution, not involving classes, TensorFlow provides a Variable Scope mechanism that allows us to easily share named variables while constructing a graph. 229 | 230 | The Variable Scope mechanism in TensorFlow consists of two main functions: \texttt{tf.get\_variable(name, shape, initializer)} creates or returns a variable with a given name instead of a direct call to \texttt{tf.Variable}; \texttt{tf.variable\_scope()} manages namespaces for names passed to \texttt{tf.get\_variable()}. \texttt{tf.get\_variable} does one of two things depending on the scope it is called in. Let's set \texttt{v = tf.get\_variable(name, shape, dtype, initializer)}. 231 | 232 | Case 1: the scope is set for creating new variables, i.e. \texttt{tf.get\_variable\_scope(name, reuse=False)}. In this case, $v$ will be a newly created \texttt{tf.Variable} with the provided shape and data type. 
The full name of the created variable will be set to the current variable scope name + the provided name and a check will be performed to ensure that no variable with this full name exists yet. If a variable with this full name already exists, the function will raise a \texttt{ValueError}. If a new variable is created, it will be initialized to the value \texttt{initializer(shape)}. For example, 233 | 234 | \begin{python} 235 | ## Example code snippet 236 | import tensorflow as tf 237 | 238 | with tf.variable_scope("foo"): 239 | v = tf.get_variable("v", [1]) 240 | assert v.name == "foo/v:0" 241 | \end{python} 242 | 243 | Case 2: the scope is set for reusing variables, i.e. \texttt{tf.get\_variable\_scope(name, reuse=True)}. In this case, the call will search for an already existing variable with name equal to the current variable scope name + the provided name. If no such variable exists, a \texttt{ValueError} will be raised. If the variable is found, it will be returned. If a variable already exists but \texttt{reuse=False}, program will crash. For example: 244 | 245 | \begin{python} 246 | ## Example code snippet 247 | import tensorflow as tf 248 | 249 | with tf.variable_scope("foo"): 250 | v = tf.get_variable("v", [1]) 251 | with tf.variable_scope("foo", reuse=True): 252 | v1 = tf.get_variable("v", [1]) 253 | with tf.variable_scope("foo", reuse=False): 254 | v1 = tf.get_variable("v") # CRASH foo/v:0 already exists! 255 | \end{python} 256 | 257 | 258 | 259 | \end{document} 260 | -------------------------------------------------------------------------------- /tensorflow/fig/tensorFlow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/cs224n-winter17-notes/9df6ef031c148d484a487f50c2009cfba8276c59/tensorflow/fig/tensorFlow.png -------------------------------------------------------------------------------- /tensorflow/reference.bib: -------------------------------------------------------------------------------- 1 | @article{bengio2003neural, 2 | title={A neural probabilistic language model}, 3 | author={Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Janvin, Christian}, 4 | journal={The Journal of Machine Learning Research}, 5 | volume={3}, 6 | pages={1137--1155}, 7 | year={2003}, 8 | publisher={JMLR. org} 9 | } 10 | -------------------------------------------------------------------------------- /update_overleaf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # SCRIPT FOR INSTRUCTORS TO UPDATE THE OVERLEAF 4 | # must commit changes of the github repo first 5 | 6 | # add overleaf as a remote to the repo 7 | git remote add overleaf https://git.overleaf.com/7857286dnrjvpkjbnph 8 | # check that the remote is added 9 | git remote -v 10 | # fetch overleaf 11 | git fetch overleaf 12 | # merge with github repo 13 | git merge overleaf/master 14 | # RESOLVE conflict if there are some 15 | # push changes to both remotes 16 | git push overleaf master 17 | git push origin master 18 | 19 | --------------------------------------------------------------------------------