├── 01-intro.tex ├── 02-transformers.tex ├── 03-RNN.tex ├── 03.5-Results.tex ├── 04-Practicalities.tex ├── 05-Extensions.tex ├── 06-final.tex ├── DoWeNeedAttention.pdf ├── Figs ├── Allowed.png ├── Attention.png ├── Banana.png ├── BiGS.png ├── Biden.png ├── Complex.png ├── ComplexBad.png ├── Conv.png ├── Cumsum.png ├── DSSM.pdf ├── FeedForward.png ├── GLUE.png ├── H3.png ├── Is-Attention-All-You-Need-.png ├── Kernel1.png ├── MNLI.png ├── Mega.png ├── ModelSize0.jpg ├── ModelSize2.png ├── ModelSize3.png ├── RASP.png ├── RNNParam.pdf ├── RWKV.png ├── S4LRA.png ├── SGParam.pdf ├── SSM (1).pdf ├── SSM.pdf ├── SSMParam.pdf ├── SSMSide.pdf ├── SSMStart.pdf ├── assoc.png ├── assoc2.png ├── attention.png ├── attractors.png ├── big.png ├── comparison_results (1).png ├── comparison_results.png ├── elmo.png ├── ema.png ├── frame_10_delay-0.1s.png ├── frame_20_delay-0.1s.png ├── frame_30_delay-0.1s.png ├── frame_40_delay-0.1s.png ├── frame_50_delay-0.1s.png ├── graph.png ├── graph2.png ├── hippo.png ├── hippo_kernel.png ├── hyena.png ├── induct.png ├── induct1.png ├── induct2.png ├── kernel.pdf ├── kernel2.png ├── listops-s4.png ├── llama.png ├── lra-s4.png ├── match.png ├── model_architecture_comparison2.pdf ├── out-rnn.png ├── out.png ├── out2 (1).png ├── out2.png ├── out3 (1).png ├── out3.png ├── out4.png ├── out5.png ├── phase.png ├── rnn.png ├── sgconv.png ├── shift.png ├── solve.png ├── speech.png ├── ssm.png ├── ssmrec.png ├── ssmrec0.png ├── ssmrec1.png ├── temp.png └── transformer.png ├── LICENSE.md ├── MLSys_Slides (11).pdf ├── Makefile ├── README.md ├── SSM Start.pdf ├── anthology.bib ├── beamercolorthemeauriga.sty ├── beamerthemeauriga.sty ├── old.tex ├── p-notes.tex ├── p.tex ├── presentation-netflix.tex ├── presentation.tex ├── slides ├── brackets.tex ├── bullets.tex ├── centered.tex ├── figure.tex ├── link.tex ├── monospace.tex └── split.tex ├── ssm.bib └── temp.tex /01-intro.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{Do we need \textcolor{blue}{Attention}?} 2 | \centering 3 | \only<2>{Or can we use something simpler...} 4 | \begin{figure} 5 | \centering 6 | 7 | \includegraphics<1>[height=0.6\textheight,clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/Complex.png} 8 | \includegraphics<2>[height=0.55\textheight,clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/Allowed.png} 9 | 10 | 11 | \end{figure} 12 | \end{frame} 13 | 14 | \begin{frame}[label=current]{Proposition - One year ago} 15 | \begin{quote} 16 | On January 1, 2027, an \textcolor{blue}{Attention-based} model will be state-of-the-art in natural language processing. 17 | \end{quote} 18 | 19 | \begin{figure} 20 | \centering 21 | \includegraphics[width=0.7\linewidth,clip, trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/Is-Attention-All-You-Need-.png} 22 | \label{fig:my_label} 23 | \end{figure} 24 | 25 | \end{frame} 26 | 27 | 28 | \begin{frame}[c, label=current]{} 29 | \begin{figure} 30 | \centering 31 | \includegraphics[width=0.8\linewidth]{Figs/Biden.png} 32 | \label{fig:my_label} 33 | \end{figure} 34 | \end{frame} 35 | 36 | 37 | 38 | \begin{frame}{Algorithmic Goal} 39 | GPT models are growing, but still limited by context length. 
40 | \vspace{1cm} 41 | 42 | \begin{itemize} 43 | \item \textcolor{blue}{Training Speed} - Cost is quadratic in length 44 | \item \textcolor{blue}{Generation Speed} - Attention requires full lookback 45 | \end{itemize} 46 | \end{frame} 47 | 48 | 49 | 50 | 51 | \begin{frame}{Survey: Progress on Attention Alternatives} 52 | \begin{center} 53 | Recent research has made significant progress. 54 | \end{center} 55 | 56 | \begin{columns} 57 | \begin{column}{0.4\textwidth} 58 | \textit{S4}~\cite{gu2022parameterization} 59 | \textit{DSS}~\cite{gupta2022diagonal} 60 | \textit{GSS}~\cite{mehta2022long} 61 | \textit{S4D}~\cite{Gu2022-jz} 62 | \textit{H3}~\cite{dao2022hungry} 63 | \textit{S5}~\cite{smith2022simplified} 64 | \textit{BiGS}~\cite{Wang2022-un} 65 | \end{column} 66 | \begin{column}{0.4\textwidth} 67 | \textit{QRNN}~\cite{mccann2017learned} 68 | \textit{LRU}~\cite{Orvieto2023-an} 69 | \textit{RWKV}~\cite{Peng2023-yp} 70 | \textit{Mega}~\cite{ma2022mega} 71 | \textit{Hyena}~\cite{Poli2023-ag} 72 | \textit{SGConv}~\cite{Li2022-pn} 73 | \end{column} 74 | \end{columns} 75 | \pause 76 | 77 | 78 | \begin{center} 79 | \structure{Note:} Just one research direction. 80 | 81 | \end{center} 82 | 83 | 84 | \end{frame} 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /02-transformers.tex: -------------------------------------------------------------------------------- 1 | 2 | 3 | % \begin{frame}[c]{} 4 | % \centering 5 | % \begin{figure} 6 | % \centering 7 | % \includegraphics[height=0.9\textheight]{Figs/FeedForward.png} 8 | % \label{fig:my_label} 9 | % \end{figure} 10 | % \end{frame} 11 | 12 | \begin{frame}[c]{Transformers for Sequence Modeling} 13 | \centering 14 | \begin{columns} 15 | \begin{column}{0.3\textwidth} 16 | Repeated components 17 | \vspace{0.5cm} 18 | 19 | \begin{itemize} 20 | \item Feed Forward 21 | 22 | \item Attention 23 | \end{itemize} 24 | \end{column} 25 | \begin{column}{0.7\textwidth} 26 | 27 | \begin{figure} 28 | \centering 29 | \includegraphics[height=0.8\textheight, clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/out.png} 30 | %\includegraphics[height=0.8\textheight]{Figs/out2 (1).png} \label{fig:my_label} 31 | \end{figure} 32 | \end{column} 33 | \end{columns} 34 | 35 | \end{frame} 36 | 37 | \begin{frame}{Feed Forward} 38 | \begin{itemize} 39 | \item Acts on each position independently. 40 | \end{itemize} 41 | \begin{figure} 42 | \centering 43 | \includegraphics[height=0.5\textheight, clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/out4.png} 44 | \end{figure} 45 | \end{frame} 46 | 47 | \begin{frame}[c]{Attention} 48 | \begin{itemize} 49 | \item Fully connected interactions. 50 | \end{itemize} 51 | 52 | \centering 53 | \begin{figure} 54 | \centering 55 | \includegraphics[height=0.5\textheight, clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/out5.png} 56 | \label{fig:my_label} 57 | \end{figure} 58 | \end{frame} 59 | 60 | 61 | % \begin{frame}{Attention Matrix} 62 | % \centering 63 | % \begin{itemize} 64 | % \item Schematic of interactions at each layer (quadratic) 65 | % \end{itemize} 66 | 67 | 68 | % \begin{figure} 69 | % \centering 70 | % % \includegraphics[height=0.7\textheight,clip,trim={14cm 3cm 0.5cm 3cm}]{Figs/Attention.png} \hspace{1cm} 71 | % \includegraphics[height=0.7\textheight, clip, trim={1.5cm 1.3cm 0.1cm 0.1cm}]{Figs/Cumsum.png} 72 | % \label{fig:my_label} 73 | % \end{figure} 74 | % \end{frame} 75 | 76 | 77 | \begin{frame}[c]{Task: Language Generation} 78 | \centering 79 | Predict the next word. 
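To make the quadratic cost discussed above concrete, here is a minimal single-head causal self-attention layer in NumPy. This is an illustrative sketch with made-up random weights, not code from this repository: the L-by-L score matrix is what makes training cost quadratic in sequence length, and producing each new position requires attending over the entire prefix, which is the full-lookback issue at generation time.

```python
import numpy as np

def causal_self_attention(X, Wq, Wk, Wv):
    """Single-head causal self-attention. X: (L, d); Wq, Wk, Wv: (d, d)."""
    L, d = X.shape
    Q, K, V = X @ Wq, X @ Wk, X @ Wv
    scores = Q @ K.T / np.sqrt(d)                    # (L, L) matrix: quadratic in L
    mask = np.tril(np.ones((L, L), dtype=bool))      # causal: position k sees 1..k only
    scores = np.where(mask, scores, -np.inf)
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)   # softmax over the whole prefix
    return weights @ V

rng = np.random.default_rng(0)
L, d = 100, 16
X = rng.standard_normal((L, d))
Wq, Wk, Wv = (rng.standard_normal((d, d)) / np.sqrt(d) for _ in range(3))
print(causal_self_attention(X, Wq, Wk, Wv).shape)    # (100, 16)
```

Every alternative surveyed in these slides is, in one way or another, trying to remove that L-by-L interaction.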
80 | \vspace{1.5cm} 81 | 82 | 83 | \structure{Final:} The dog walked to the \textcolor{red}{park} 84 | 85 | \vspace{1.5cm} 86 | 87 | \textcolor{blue}{Input:} The dog walked to the \textcolor{red}{?} 88 | 89 | \end{frame} 90 | 91 | \begin{frame}[c]{Task: Long Range Arena (ListOps)} 92 | \centering 93 | Calculate the equation ($\uparrow$=max $\downarrow$=min) 94 | \vspace{1.5cm} 95 | 96 | 97 | \structure{Final:} [ $\uparrow$ 2 9 [ $\downarrow$ 4 7 ] 0 ] \textcolor{red}{9} 98 | 99 | \vspace{1.5cm} 100 | 101 | 102 | \textcolor{blue}{Input:} [ $\uparrow$ 2 9 [ $\downarrow$ 4 7 ] 0 ] \textcolor{red}{?} 103 | \end{frame} 104 | 105 | 106 | 107 | \begin{frame}[c]{Attention Matrix} 108 | 109 | \centering 110 | 111 | \begin{center} 112 | All quadratic interactions possible. 113 | \end{center} 114 | 115 | \begin{figure} 116 | \centering 117 | \includegraphics[height=0.6\textheight, clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/Complex.png} 118 | \label{fig:my_label} 119 | \end{figure} 120 | \end{frame} 121 | 122 | \begin{frame}[c]{Attention for Realistic Examples} 123 | \centering 124 | \begin{center} 125 | Listops goes to 2,000 steps. This is 100. 126 | \end{center} 127 | 128 | \begin{figure} 129 | \centering 130 | \includegraphics[height=0.6\textheight, clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/big.png} 131 | \label{fig:my_label} 132 | \end{figure} 133 | \end{frame} 134 | 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /03-RNN.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{Discrete Time Sequence} 2 | 3 | From \structure{scalar} sequence $u_{1}, \ldots, u_L$ to $y_1, \ldots, y_L$. 4 | 5 | \begin{figure} 6 | \centering 7 | \includegraphics[width=0.5\textwidth]{Figs/SSMStart.pdf} 8 | \label{fig:my_label} 9 | \end{figure} 10 | \end{frame} 11 | 12 | 13 | \begin{frame}{Review: RNN for Language Generation} 14 | \begin{columns} 15 | \begin{column}{0.5\textwidth} 16 | \centering 17 | \includegraphics[width=.8\textwidth]{Figs/rnn.png} 18 | 19 | \end{column} 20 | \begin{column}{0.5\textwidth} 21 | 22 | \begin{align*} 23 | x_{k} &= \textcolor{red}{\sigma}(\textcolor{green}{\boldsymbol{\overline{A}}} x_{k-1} + \textcolor{blue}{\boldsymbol{\overline{B}}} u_k) \\ 24 | y_k &= \phantom{\sigma (} \textcolor{orange}{\boldsymbol{\overline{C}}} x_{k \phantom{- 1}} 25 | \end{align*} 26 | \end{column} 27 | \end{columns} 28 | 29 | \end{frame} 30 | 31 | \begin{frame}{Review: RNN versus Attention} 32 | \begin{columns} 33 | \begin{column}{0.5\textwidth} 34 | \centering 35 | \includegraphics[width=.8\textwidth]{Figs/rnn.png} 36 | \end{column} 37 | \begin{column}{0.5\textwidth} 38 | \centering 39 | 40 | \includegraphics[width=0.8\textwidth]{Figs/out5.png} 41 | \end{column} 42 | \end{columns} 43 | \vspace{0.5cm} 44 | 45 | \begin{itemize} 46 | \item \structure{Training Speed:} Slow (\textcolor{red}{Serial} bottleneck) 47 | \item \structure{Generation Speed:} Fast (constant-time per step) 48 | 49 | \end{itemize} 50 | \end{frame} 51 | 52 | 53 | \begin{frame}{Didn't we try this RNN thing? 
} 54 | 55 | \begin{center} 56 | The last major RNN model in NLP - \textcolor{red}{ELMo} 57 | \end{center} 58 | 59 | \pause 60 | 61 | \begin{figure} 62 | \centering 63 | \includegraphics[width=0.5\textwidth]{Figs/GLUE.png} 64 | 65 | \label{fig:my_label} 66 | \end{figure} 67 | \blfootnote{\cite{DBLP:conf/naacl/PetersNIGCLZ18, devlin2018bert}} 68 | \end{frame} 69 | 70 | \begin{frame}{RNN Revival: Two Differences} 71 | \begin{columns} 72 | \begin{column}{0.5\textwidth} 73 | 74 | \begin{enumerate} 75 | \item Efficient Linear RNNs 76 | \item Effective Long-Range Parameterizations 77 | \end{enumerate} 78 | 79 | 80 | % Orthogonal RNN - Linear 81 | % QRNN - > Linear RNN. $\bar{A}$ time-varying Linear non-homogenous. input depdendent 82 | 83 | % A static over time. 84 | % SISO - property. 85 | % Orthogonal - > 86 | 87 | \end{column} 88 | \begin{column}{0.5\textwidth} 89 | \centering 90 | \includegraphics[width=0.4\textwidth, clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/out-rnn.png} 91 | \end{column} 92 | 93 | \end{columns} 94 | \end{frame} 95 | 96 | 97 | 98 | \begin{frame}{Component 1: \textcolor{blue}{Linear} RNN} 99 | 100 | \begin{align*} 101 | x_{k} &= \textcolor{green}{\boldsymbol{\overline{A}}} \textcolor{black}{\boldsymbol{x}_{k-1}} + \textcolor{blue}{\boldsymbol{\overline{B}}} \textcolor{black}{u_k} \\ 102 | y_k &= \textcolor{orange}{\boldsymbol{\overline{C}}} x_{k \phantom{- 1}} 103 | \end{align*} 104 | \pause 105 | \begin{figure} 106 | \centering 107 | \includegraphics[width=0.6\textwidth]{Figs/ssm.png} 108 | \label{fig:my_label} 109 | \end{figure} 110 | \end{frame} 111 | 112 | \begin{frame}{Expansion Of Terms} 113 | \vspace{-0.5cm} 114 | \begin{align*} 115 | y_k = \textcolor{orange}{\boldsymbol{\overline{C}}} x_{k \phantom{- 1}} \ 116 | x_{k} = \textcolor{green}{\boldsymbol{\overline{A}}} \textcolor{black}{\boldsymbol{x}_{k-1}} + \textcolor{blue}{\boldsymbol{\overline{B}}} \textcolor{black}{u_k} \ 117 | \end{align*} 118 | \pause 119 | \vspace{-2cm} 120 | \begin{figure} 121 | \centering 122 | \only<2>{\[y_1\]}\only<3>{\[y_2\]} \only<4->{\[y_3\]} 123 | \includegraphics<2>[height=0.12\textwidth]{Figs/ssmrec0} 124 | 125 | \includegraphics<3>[height=0.12\textwidth]{Figs/ssmrec1} 126 | 127 | \includegraphics<4->[height=0.1\textwidth]{Figs/ssmrec} 128 | \label{fig:my_label} 129 | \end{figure} 130 | \vspace{-0.5cm} 131 | 132 | \pause\pause\pause 133 | \begin{align*} 134 | \overline{K} &= (\textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{blue}{\boldsymbol{\overline{B}}}, \textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{green}{\boldsymbol{\overline{A}}}\textcolor{blue}{\boldsymbol{\overline{B}}}, \dots, \textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{green}{\boldsymbol{\overline{A}}^{L-1}}\textcolor{blue}{\boldsymbol{\overline{B}}}) 135 | \end{align*} 136 | \end{frame} 137 | 138 | \begin{frame}{Convolutional Form} 139 | 140 | \begin{align*} 141 | y_k = \textcolor{orange}{\boldsymbol{\overline{C}}} x_{k \phantom{- 1}} \ 142 | x_{k} = \textcolor{green}{\boldsymbol{\overline{A}}} \textcolor{black}{\boldsymbol{x}_{k-1}} + \textcolor{blue}{\boldsymbol{\overline{B}}} \textcolor{black}{u_k} \ 143 | \end{align*} 144 | 145 | 146 | 147 | \begin{align*} 148 | \overline{K} &= (\textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{blue}{\boldsymbol{\overline{B}}}, \textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{green}{\boldsymbol{\overline{A}}}\textcolor{blue}{\boldsymbol{\overline{B}}}, \dots, 
\textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{green}{\boldsymbol{\overline{A}}^{L-1}}\textcolor{blue}{\boldsymbol{\overline{B}}}) \\ 149 | y &= \text{conv1d}(\overline{K}_L \ldots \overline{K}_1, u_1 \ldots u_L) 150 | \end{align*} 151 | 152 | 153 | 154 | % Intuition: 155 | % \pause 156 | % $$y_1 = \boldsymbol{\overline{C}} \boldsymbol{\overline{B}} u_1$$ 157 | % \pause 158 | % $$y_2 = \boldsymbol{\overline{C}} \boldsymbol{\overline{A}} \boldsymbol{\overline{B}} u_1 + \boldsymbol{\overline{C}} \boldsymbol{\overline{B}} u_2 = \boldsymbol{\overline{C}} (\boldsymbol{\overline{A}} \boldsymbol{\overline{B}} u_1 + \boldsymbol{\overline{B}} u_2) = \boldsymbol{\overline{C}} (\boldsymbol{x}_1 + \boldsymbol{\overline{B}} u_2) $$ 159 | \end{frame} 160 | 161 | 162 | \begin{frame}{Convolutional Form} 163 | \begin{align*} 164 | \overline{K} &= (\textcolor{black}{\boldsymbol{\overline{C}}}\textcolor{black}{\boldsymbol{\overline{B}}}, \textcolor{black}{\boldsymbol{\overline{C}}}\textcolor{black}{\boldsymbol{\overline{A}}}\textcolor{black}{\boldsymbol{\overline{B}}}, \dots, \textcolor{black}{\boldsymbol{\overline{C}}}\textcolor{black}{\boldsymbol{\overline{A}}^{L-1}}\textcolor{black}{\boldsymbol{\overline{B}}}) \\ 165 | \end{align*} 166 | \begin{figure} 167 | \centering 168 | \includegraphics[width=0.6\textwidth]{Figs/SSM (1).pdf} 169 | \label{fig:my_label} 170 | \end{figure} 171 | \end{frame} 172 | 173 | \begin{frame}{Computation 1: FFT} 174 | Compute convolution in Fourier space, 175 | 176 | \begin{align*} 177 | &y = \boldsymbol{\overline{K}} \ast u 178 | \end{align*} 179 | \begin{itemize} 180 | \item $O(L \log L)$ for padded FFT of $K$ and $u$, mult, then iFFT 181 | \item Accelerators optimize this to different levels. 182 | \end{itemize} 183 | \end{frame} 184 | 185 | \begin{frame}[c]{Computation 2: Associative Scan (S5)} 186 | 187 | 188 | \begin{columns} 189 | \begin{column}{0.5\textwidth} 190 | Associative $e_1\bullet \ldots \bullet e_L$ 191 | 192 | \begin{center} 193 | \Tree [.$\bullet$ [.$\bullet$ [.$\bullet$ $e_1$ ] [.$\bullet$ $e_2$ ] ] [.$\bullet$ [.$\bullet$ $e_3$ ] [.$\bullet$ $e_4$ ] ] ] 194 | \end{center} 195 | \end{column} 196 | 197 | \begin{column}{0.5\textwidth} 198 | \centering 199 | \[e_k = (\boldsymbol{E}_k, \boldsymbol{e}_k) = (\bar{\textcolor{green}{\boldsymbol{A}}}, \bar{\textcolor{blue}{\boldsymbol{B}}}u_k)\] 200 | \begin{figure} 201 | \centering 202 | \includegraphics[height=0.1\textwidth,clip,trim={0cm 0cm 6cm 0cm}]{Figs/assoc.png} 203 | \label{fig:my_label} 204 | \end{figure} 205 | \[e_i \bullet e_j = (\boldsymbol{E}_i \boldsymbol{E}_j, \boldsymbol{E}_j \boldsymbol{e}_i + \boldsymbol{e}_j ) \] 206 | \begin{figure} 207 | \centering 208 | \includegraphics[height=0.1\textwidth,clip,trim={0cm 0cm 6cm 0cm}]{Figs/assoc2.png} 209 | \end{figure} 210 | 211 | \end{column} 212 | \end{columns} 213 | \blfootnote{\cite{Blelloch1990-yo,Martin2018-bq,smith2022simplified}} 214 | \end{frame} 215 | % \begin{frame}{Alternative Computation: Associative Scan \cite{smith2022simplified}} 216 | 217 | 218 | % \end{frame} 219 | 220 | 221 | % \begin{frame}{ Associative Scan: S5 } 222 | % Potential benefits versus FFT 223 | % \vspace{0.5cm} 224 | 225 | % \begin{itemize} 226 | % \item Compute hidden states explicitly 227 | % \item Allows alternative RNN forms. 
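As a sanity check on the algebra above, the following self-contained NumPy sketch (illustrative only, not from the repository) unrolls a small stable linear RNN, materializes the kernel Kbar = (CB, CAB, ..., C A^{L-1} B), and confirms that the recurrent form, the direct causal convolution, and the FFT-based convolution all give the same outputs.

```python
import numpy as np

rng = np.random.default_rng(0)
N, L = 4, 32                                    # state size, sequence length
A = rng.standard_normal((N, N))
A /= 1.1 * np.abs(np.linalg.eigvals(A)).max()   # keep the recurrence stable
B = rng.standard_normal((N, 1))
C = rng.standard_normal((1, N))
u = rng.standard_normal(L)

# 1) Recurrent form: x_k = A x_{k-1} + B u_k,  y_k = C x_k
x, y_rec = np.zeros((N, 1)), []
for k in range(L):
    x = A @ x + B * u[k]
    y_rec.append((C @ x).item())
y_rec = np.array(y_rec)

# 2) Convolutional form with kernel K = (CB, CAB, ..., C A^{L-1} B)
K = np.array([(C @ np.linalg.matrix_power(A, j) @ B).item() for j in range(L)])
y_conv = np.array([np.dot(K[:k + 1][::-1], u[:k + 1]) for k in range(L)])

# 3) FFT form: zero-pad to 2L so circular convolution matches the causal convolution
y_fft = np.fft.irfft(np.fft.rfft(K, 2 * L) * np.fft.rfft(u, 2 * L), 2 * L)[:L]

assert np.allclose(y_rec, y_conv) and np.allclose(y_rec, y_fft)
```

The FFT route is the O(L log L) training-time path, while the recurrence is what gives constant-time-per-step generation.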
228 | % \item Faster on some architectures 229 | % \end{itemize} 230 | % \end{frame} 231 | 232 | 233 | \begin{frame}{Linear RNN Computational Profile} 234 | 235 | \begin{align*} 236 | x_{k} &= \textcolor{green}{\boldsymbol{\overline{A}}} \textcolor{black}{{x}_{k-1}} + \textcolor{blue}{\boldsymbol{\overline{B}}} \textcolor{black}{u_k} \\ 237 | y_k &= \textcolor{orange}{\boldsymbol{\overline{C}}} x_{k \phantom{- 1}} 238 | \end{align*} 239 | \begin{itemize} 240 | \item \structure{Training Speed:} \sout{Weak} Strong (Parallelizable convolution) 241 | \item \structure{Generation Speed:} Strong (constant-time per step) \pause 242 | \item \structure{Accuracy:} Extremely \textcolor{red}{Poor...} Barely learns. 243 | \end{itemize} 244 | \end{frame} 245 | 246 | \begin{frame}{Interactions} 247 | \begin{center} 248 | Routing here must be static and regular (conv). 249 | \end{center} 250 | \begin{figure} 251 | \centering 252 | \includegraphics[height=0.45\textheight,clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/Allowed.png} 253 | \vspace{0.5cm} 254 | 255 | \includegraphics[width=0.5\textwidth]{Figs/SGParam.pdf} 256 | \label{fig:my_label} 257 | \end{figure} 258 | \end{frame} 259 | 260 | 261 | 262 | 263 | \begin{frame}{Component 2: Model Parameterization} 264 | 265 | Linear RNN behavior highly dependent on $\boldsymbol{\overline{A}}$ 266 | 267 | \begin{align*} 268 | \overline{K} &= (\textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{blue}{\boldsymbol{\overline{B}}}, \textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{green}{\boldsymbol{\overline{A}}}\textcolor{blue}{\boldsymbol{\overline{B}}}, \dots, \textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{green}{\boldsymbol{\overline{A}}^{L-1}}\textcolor{blue}{\boldsymbol{\overline{B}}}) 269 | \end{align*} 270 | \vspace{0.5cm} 271 | 272 | Choice of $\boldsymbol{\overline{A}}$ is critical: stable and informative. 273 | \end{frame} 274 | 275 | 276 | \begin{frame}{Mathematical Model: State Space Model (SSM) } 277 | 278 | A SSM is a continuous-time, differential equation. 279 | \begin{align*} 280 | x'(t) &= \boldsymbol{A}x(t) + \boldsymbol{B}u(t) \\ 281 | y(t) &= \boldsymbol{C}x(t). 282 | \end{align*} 283 | 284 | Used to explore Linear RNN parameterization. 285 | \end{frame} 286 | 287 | \begin{frame}{Hidden State Form~\cite{gu2020hippo}} 288 | \textcolor{red}{Summarize} history in vector $x$ with \textcolor{blue}{Legendre} coefficients 289 | \begin{figure} 290 | \centering 291 | \includegraphics[width=0.7\textwidth]{Figs/hippo.png} 292 | \end{figure} 293 | \end{frame} 294 | 295 | \begin{frame}{Choice of Parameters~\cite{gu2020hippo}} 296 | Intuition: Hidden state vector $\textcolor{blue}{x}$ should \textcolor{red}{summarize} past $u$. 297 | 298 | \begin{figure} 299 | \centering 300 | \includegraphics<1>[width=\textwidth]{Figs/frame_10_delay-0.1s.png} 301 | \includegraphics<2>[width=\textwidth]{Figs/frame_20_delay-0.1s.png} 302 | \includegraphics<3>[width=\textwidth]{Figs/frame_30_delay-0.1s.png} 303 | \includegraphics<4>[width=1\textwidth]{Figs/frame_40_delay-0.1s.png} 304 | \includegraphics<5>[width=1\textwidth]{Figs/frame_50_delay-0.1s.png} 305 | \end{figure} 306 | 307 | \end{frame} 308 | 309 | 310 | 311 | % \begin{frame}{Practical Consequence: HiPPO~\cite{gu2020hippo}} 312 | % Motivates an initialization of the (discrete-time) kernel $\bar{K}$. 
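For completeness: going from the continuous SSM parameters (A, B, C) above to the discrete linear-RNN parameters (Abar, Bbar, Cbar) is a standard discretization with a (learned) step size Delta. One representative choice is the bilinear (Tustin) rule used in the S4 line of work (other papers use a zero-order hold instead):

```latex
\begin{align*}
\boldsymbol{\overline{A}} &= (\boldsymbol{I} - \tfrac{\Delta}{2}\boldsymbol{A})^{-1}(\boldsymbol{I} + \tfrac{\Delta}{2}\boldsymbol{A}), &
\boldsymbol{\overline{B}} &= (\boldsymbol{I} - \tfrac{\Delta}{2}\boldsymbol{A})^{-1}\Delta\boldsymbol{B}, &
\boldsymbol{\overline{C}} &= \boldsymbol{C},
\end{align*}
```

so that the discrete state $x_k$ approximates $x(k\Delta)$; the step size $\Delta$ controls the time-scale over which the kernel decays.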
313 | 314 | % \begin{figure} 315 | % \centering 316 | % \includegraphics[width=0.5\textwidth]{Figs/hippo_kernel.png} 317 | 318 | % \includegraphics[width=0.5\textwidth]{Figs/SGParam.pdf} 319 | % \label{fig:enter-label} 320 | % \end{figure} 321 | % \end{frame} 322 | 323 | % \begin{frame}{S4 \cite{gu2022parameterization} } 324 | 325 | % Learn parameters of SSM, convert to linear RNN parameters 326 | 327 | % $$\boldsymbol{\overline{A}}, \boldsymbol{\overline{B}}, \boldsymbol{\overline{C}} = \text{discretize}(\boldsymbol{A}, \boldsymbol{B}, \boldsymbol{C}, \Delta )$$ 328 | 329 | % \begin{figure} 330 | % \centering 331 | % \includegraphics[width=0.6\textwidth]{Figs/SSMParam.pdf} 332 | % \caption{} 333 | % \label{fig:my_label} 334 | % \end{figure} 335 | % \pause 336 | % \vspace{-2cm} 337 | 338 | % Note: There are \textit{many more} important details here. 339 | 340 | % \end{frame} 341 | 342 | 343 | % \begin{frame}{Determining RNN Parameterization} 344 | % \begin{itemize} 345 | % \item \cite{gu2020hippo} develop \textit{HiPPO} Matrix for SSM $\boldsymbol{A}$ 346 | 347 | % % \begin{scriptsize} 348 | % % \begin{align*} 349 | % % \boldsymbol{A}_{nk}= - 350 | % % \begin{cases} 351 | % % (2n+1)^{1/2}(2k+1)^{1/2} & \text{if } n > k \\ n+1 &\text{if } n=k \text{\ else\ } 0 352 | % % \end{cases} 353 | % % \end{align*} 354 | % % \end{scriptsize} 355 | 356 | % \item Approximates history through Legendre coefficients 357 | % \end{itemize} 358 | % \begin{figure} 359 | % \centering 360 | % \includegraphics[width=0.7\textwidth]{Figs/hippo.png} 361 | % \end{figure} 362 | % \end{frame} 363 | 364 | % \begin{frame}{Key Insight: Choice of $\boldsymbol{A}$ } 365 | 366 | % \cite{gu2020hippo,gu2022parameterization} 367 | 368 | % Show that HiPPO 369 | 370 | 371 | % \end{frame} 372 | 373 | \begin{frame}[c]{Results: ListOps \cite{gu2022parameterization}} 374 | \centering 375 | Example: [ $\uparrow$ 2 9 [ $\downarrow$ 4 7 ] 0 ] \textcolor{red}{9} 376 | 377 | \begin{figure} 378 | \centering 379 | 380 | \includegraphics[height=0.6\textheight,clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/listops-s4.png} 381 | \label{fig:my_label} 382 | \end{figure} 383 | Requires communication over 2,000 steps 384 | 385 | \end{frame} 386 | 387 | 388 | \begin{frame}[c]{Results: Long-Range Arena \cite{gu2022parameterization}} 389 | \centering 390 | \begin{figure} 391 | \centering 392 | \includegraphics[height=0.8\textheight,clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/lra-s4.png} 393 | \label{fig:my_label} 394 | \end{figure} 395 | \end{frame} 396 | 397 | 398 | 399 | 400 | 401 | % \begin{frame}{Computing with Static Kernels} 402 | % \structure{Final:} a b c $\Rightarrow$ d e f \textcolor{red}{d} 403 | 404 | % \begin{figure} 405 | % \centering 406 | % \includegraphics[height=0.5\textheight]{Figs/induct1.png} 407 | % \includegraphics[height=0.5\textheight]{Figs/induct2.png} 408 | % \label{fig:my_label} 409 | % \end{figure} 410 | 411 | % \textcolor{blue}{Input:} a b c $\Rightarrow$ d e f \textcolor{red}{?} 412 | 413 | 414 | % \end{frame} -------------------------------------------------------------------------------- /03.5-Results.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{Applying Linear RNNs} 2 | \vspace{1cm} 3 | \begin{columns} 4 | \begin{column}{0.6\textwidth} 5 | \begin{itemize} 6 | \item Speech~\cite{goel2022s} 7 | \item Video~\cite{Nguyen2022-qi} 8 | \item RL~\cite{Lu2023-ov} 9 | \item \textcolor{red}{NLP} 10 | \end{itemize} 11 | \end{column} 12 | \begin{column}{0.4\textwidth} 13 | 
\includegraphics[width=0.9\textwidth, ,clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/speech.png} 14 | \end{column} 15 | 16 | \end{columns} 17 | \end{frame} 18 | 19 | \begin{frame}{NLP Results} 20 | Two types of model 21 | \vspace{1cm} 22 | 23 | \begin{itemize} 24 | \item Bidirectional LM (BERT) 25 | \item Unidirectional LM (GPT) 26 | \end{itemize} 27 | \vspace{1cm} 28 | 29 | % Different architectures used, Some with partial attention 30 | 31 | \end{frame} 32 | 33 | 34 | \begin{frame}{Results: Bidirectional LM \cite{Wang2022-un}} 35 | \begin{figure} 36 | \centering 37 | \includegraphics[height=0.6\textheight]{Figs/BiGS.png} 38 | \end{figure} 39 | \end{frame} 40 | 41 | 42 | \begin{frame}{Analysis: Kernel Visualization $\boldsymbol{\bar{K}}$} 43 | 44 | \begin{figure} 45 | \centering 46 | \includegraphics[width=\textwidth]{Figs/kernel1.png} 47 | \end{figure} 48 | 49 | \begin{itemize} 50 | \item Replaces Attention Matrix 51 | \item Single Kernel per layer 52 | \end{itemize} 53 | \end{frame} 54 | 55 | \begin{frame}{Analysis: All Kernels} 56 | \begin{figure} 57 | \centering 58 | \includegraphics[height=0.6\textheight]{Figs/kernel2.png} 59 | \end{figure} 60 | \end{frame} 61 | 62 | \begin{frame}{Analysis: Change in Kernels during Finetuning } 63 | 64 | \centerline{Task: Long-Range Sentence Matching} 65 | \begin{figure} 66 | \centering 67 | \includegraphics[width=0.8\textwidth]{Figs/comparison_results.png} 68 | \end{figure} 69 | \end{frame} 70 | 71 | 72 | 73 | \begin{frame}{Results: Unidirectional LM \cite{dao2022hungry} $\downarrow$} 74 | \begin{figure} 75 | \centering 76 | \includegraphics[width=0.7\textwidth]{Figs/H3.png} 77 | \caption{Caption} 78 | \label{fig:my_label} 79 | \end{figure} 80 | \end{frame} 81 | 82 | \begin{frame} 83 | \includegraphics[ clip, height=\textheight]{Figs/ModelSize0.jpg} 84 | \end{frame} 85 | 86 | % \begin{frame}{Frame Title} 87 | 88 | % \end{frame} 89 | 90 | \section{Alternative Parameterizations} 91 | 92 | \begin{frame}{Do we need the SSM?} 93 | \begin{figure} 94 | \centering 95 | \includegraphics[width=1\textwidth]{Figs/frame_50_delay-0.1s.png} 96 | \end{figure} 97 | \end{frame} 98 | 99 | \begin{frame}{CNN Param: Decaying Structure \cite{Li2022-pn}} 100 | Parameterization should decay $\bar{K}$ over time. 101 | 102 | \begin{figure} 103 | \centering 104 | \includegraphics[width=0.4\textwidth]{Figs/sgconv.png} 105 | \label{fig:my_label} 106 | \end{figure} 107 | 108 | \begin{figure} 109 | \centering 110 | \includegraphics[width=0.5\textwidth]{Figs/SGParam.pdf} 111 | \label{fig:my_label} 112 | \end{figure} 113 | 114 | \pause 115 | \begin{center} 116 | \alert{However}, no linear RNN form. 
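As a simplified illustration of this decaying-kernel idea (not the actual SGConv construction, which builds the global kernel from multi-scale sub-kernels), the sketch below contrasts a directly parameterized kernel wrapped in an explicit exponential-decay envelope with the kernel K[j] = C A^j B of a stable linear RNN, which decays automatically once the spectral radius of A is below one.

```python
import numpy as np

L, N = 256, 8
rng = np.random.default_rng(0)
j = np.arange(L)

# Directly parameterized kernel: free weights under an explicit decay envelope.
w, tau = rng.standard_normal(L), 32.0
K_direct = w * np.exp(-j / tau)

# SSM-style kernel K[j] = C A^j B: the decay comes from the eigenvalues of A.
A = rng.standard_normal((N, N))
A *= 0.9 / np.abs(np.linalg.eigvals(A)).max()
B, C = rng.standard_normal((N, 1)), rng.standard_normal((1, N))
K_ssm = np.array([(C @ np.linalg.matrix_power(A, k) @ B).item() for k in range(L)])

print(np.abs(K_direct)[::64].round(3))   # magnitudes fall off with position
print(np.abs(K_ssm)[::64].round(3))
```

The directly parameterized kernel is flexible and cheap to learn, but, as the slide notes, it does not come with a recurrent form for fast generation.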
117 | \end{center} 118 | 119 | \end{frame} 120 | 121 | 122 | 123 | \begin{frame}{RNN Param: LRU \cite{Orvieto2023-an}} 124 | Stable diagonal parameterization of Linear RNN 125 | \begin{align*} 126 | \textcolor{green}{\bar{A}}_{j,j} &= \exp(-\exp({\nu_j}) + i \exp(\theta_j))\\ 127 | \textcolor{blue}{\bar{B}}_{j} &= (1 - |\bar{A}_{j,j}|^2)^{1/2} 128 | \end{align*} 129 | 130 | \begin{figure} 131 | \centering 132 | \includegraphics[width=0.8\textwidth]{Figs/phase.png} 133 | \label{fig:my_label} 134 | \end{figure} 135 | \end{frame} 136 | 137 | \begin{frame}{RNN Param: MEGA \cite{ma2022mega}} 138 | Use a parameterized damped, exponential moving average 139 | \begin{align*} 140 | \textcolor{green}{\bar{A}}_{j,j} &= 1 − \alert{\alpha_j} \times \delta_j \\ 141 | \textcolor{blue}{\bar{B}}_{j} &= \alpha_j 142 | \end{align*} 143 | \begin{figure} 144 | \centering 145 | \includegraphics[width=0.7\textwidth]{Figs/ema.png} 146 | \label{fig:my_label} 147 | \end{figure} 148 | \begin{center} 149 | Very good results on NLP tasks like Translation. 150 | \end{center} 151 | 152 | \end{frame} 153 | 154 | 155 | \begin{frame}{RNN Param: RWKV \cite{Peng2023-yp}} 156 | Inspired by Attention 157 | 158 | Split into Keys, Values, and Receptance (no Query): 159 | \begin{align*} 160 | K_i, V_i, R_i 161 | \end{align*} 162 | \pause 163 | Then compute averaged values normalized by keys. 164 | 165 | \begin{align*} 166 | R_i\frac{\sum_{i'=1}^i \textcolor{green}{\exp(w)}^{i'}\exp(K_{i'}) V_{i'}} {\sum_{i'=1}^i \textcolor{green}{\exp(w)}^{i'}\exp(K_{i'})\phantom{ V_{i'}}} = R_i \frac{\text{LR}_1(\exp(K_i)V_i)}{\text{LR}_2(\exp(K_i))\phantom{V_i}}\\ 167 | \end{align*} 168 | 169 | Yields a product of Linear RNNs (Computed directly). 170 | 171 | \end{frame} 172 | 173 | 174 | \begin{frame}{Results: RWKV \cite{Peng2023-yp}} 175 | \begin{center} 176 | Largest RNN. Trained up to 14B parameter scale. 177 | \end{center} 178 | \pause 179 | \begin{figure} 180 | \centering 181 | \includegraphics[width=1\textwidth]{Figs/RWKV.png} 182 | \label{fig:my_label} 183 | \end{figure} 184 | Lots of practical interest and community. 185 | \end{frame} 186 | 187 | 188 | \begin{frame}{Open Question: In-Context Learning} 189 | \begin{itemize} 190 | \item Results show comparable loss at medium scales. 191 | \item Significant interest is in abilities such as in-context learning 192 | \item Current understanding relies of Attention mechanisms. 
193 | \end{itemize} 194 | \end{frame} 195 | 196 | 197 | % \begin{frame}{Parameterization: Diagonal RNN \cite{Li2022-pn}} 198 | % \begin{figure} 199 | % \centering 200 | % \includegraphics[width=0.5\textwidth]{Figs/DSSM.pdf} 201 | 202 | % \label{fig:my_label} 203 | % \end{figure} 204 | % \end{frame} 205 | 206 | % \begin{frame}{Results: GSS $\downarrow$} 207 | % \begin{figure} 208 | % \centering 209 | % \includegraphics[width=0.7\textwidth]{} 210 | % \caption{Caption} 211 | % \label{fig:my_label} 212 | % \end{figure} 213 | % \end{frame} 214 | 215 | 216 | 217 | 218 | 219 | % \begin{frame}{Results: MEGA \cite{ma2022mega} $\uparrow$} 220 | % \begin{figure} 221 | % \centering 222 | % \includegraphics[width=0.7\textwidth]{Figs/Mega.png} 223 | % \label{fig:my_label} 224 | % \end{figure} 225 | % \end{frame} 226 | -------------------------------------------------------------------------------- /04-Practicalities.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/04-Practicalities.tex -------------------------------------------------------------------------------- /05-Extensions.tex: -------------------------------------------------------------------------------- 1 | 2 | % \begin{frame}{Usage} 3 | 4 | % Linear RNNs opens up the modeling design space 5 | 6 | % \vspace{1cm} 7 | % \begin{itemize} 8 | % \item How to efficiently calculate? 9 | % \item How to parameterize? 10 | % \end{itemize} 11 | % \end{frame} 12 | 13 | % \begin{frame}{Calculation} 14 | % Recall the main calculation is a $L$ length convolution, 15 | 16 | % \begin{figure} 17 | % \centering 18 | % \includegraphics[width=0.7\textwidth]{Figs/SSM.pdf} 19 | % \label{fig:my_label} 20 | % \end{figure} 21 | % \end{frame} 22 | 23 | 24 | 25 | \begin{frame}{Method 2: Parallel Associative Scan \cite{smith2022simplified} } 26 | Compute $e_1\bullet \ldots \bullet e_l$ for any associative operator $\bullet$ 27 | 28 | \begin{center} 29 | \Tree [.$\bullet$ [.$\bullet$ [.$\bullet$ $e_1$ ] [.$\bullet$ $e_2$ ] ] [.$\bullet$ [.$\bullet$ $e_3$ ] [.$\bullet$ $e_4$ ] ] ] 30 | 31 | \end{center} 32 | \cite{Blelloch1990-yo,Martin2018-bq} 33 | \end{frame} 34 | 35 | \begin{frame}{} 36 | \[e_k = (\boldsymbol{E}_k, \boldsymbol{e}_k) = (\bar{\textcolor{green}{\boldsymbol{A}}}, \bar{\textcolor{blue}{\boldsymbol{B}}}u_k)\] 37 | \begin{figure} 38 | \centering 39 | \includegraphics[height=0.1\textwidth,clip,trim={0cm 0cm 5cm 0cm}]{Figs/assoc.png} 40 | \label{fig:my_label} 41 | \end{figure} 42 | 43 | \[e_i \bullet e_j = (\boldsymbol{E}_i \boldsymbol{E}_j, \boldsymbol{E}_j \boldsymbol{e}_i + \boldsymbol{e}_j ) \] 44 | \begin{figure} 45 | \centering 46 | \includegraphics[height=0.1\textwidth,clip,trim={0cm 0cm 6cm 0cm}]{Figs/assoc2.png} 47 | \end{figure} 48 | \end{frame} 49 | 50 | 51 | 52 | % \begin{frame}{Parmeterization of RNN Models} 53 | % SSM framing gives an elegant parameterization of Linear RNNs, 54 | 55 | % \begin{figure} 56 | % \centering 57 | % \includegraphics[width=0.7\textwidth]{Figs/SSMParam.pdf} 58 | % \label{fig:my_label} 59 | % \end{figure} 60 | 61 | % Researchers have explored other parameterizations 62 | % \end{frame} 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /06-final.tex: -------------------------------------------------------------------------------- 1 | 2 | 3 | \begin{frame}{Benefits of Linear RNNs} 4 | \begin{itemize} 5 
| \item Methods for training (CNN) and generation (RNN) 6 | \item Potentially more FLOP efficient. 7 | \item However not yet used in practice 8 | \end{itemize} 9 | \end{frame} 10 | 11 | \begin{frame}[c]{Current Efficiency with Scale \cite{Poli2023-ag}} 12 | \begin{figure} 13 | \centering 14 | \includegraphics[width=\textwidth]{Figs/hyena.png} 15 | \caption{} 16 | \end{figure} 17 | Models become more efficient at long time-scales. 18 | \end{frame} 19 | 20 | \begin{frame}{Issues on Accelerators} 21 | Approaches require: 22 | \vspace{0.5cm} 23 | 24 | \begin{itemize} 25 | \item Support for complex numbers 26 | \item Support for FFT (lower precision, TPU) 27 | \item Numerical Stability 28 | \item Fast Associative Scans 29 | \end{itemize} 30 | \vspace{0.5cm} 31 | 32 | Hard to compete with pure MatMul in Attention. 33 | \end{frame} 34 | 35 | \begin{frame}{} 36 | \begin{figure} 37 | \centering 38 | \includegraphics[width=0.7\linewidth,clip, trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/Is-Attention-All-You-Need-.png} 39 | \label{fig:my_label} 40 | \end{figure} 41 | \end{frame} 42 | 43 | 44 | % \begin{frame}{Frame Title} 45 | % Call to action. 46 | 47 | % * Modeling benefits 48 | % * Theoretical approaches 49 | % * interplay with hardware efficiency. 50 | % * Matching transformers 51 | % * FFT / Complex 52 | % * Associative scans 53 | % * GPU / TPUs 54 | % * Models are more flop efficient, FLOPs are not equal. 55 | % * Matmuls are more efficienct. 56 | % * Numerical stability / complex numbers 57 | % * 58 | % \end{frame} 59 | 60 | 61 | 62 | 63 | % \begin{frame}{State Retrieval} 64 | % \begin{itemize} 65 | % \item Benchmarks compare perplexity of models 66 | % \item Significant interest is in abilities such as in-context learning 67 | % \item Current understanding relies of set-based Transformer mechanisms. 68 | % \end{itemize} 69 | % \end{frame} 70 | 71 | 72 | 73 | 74 | % \begin{frame}{In-Context Learning} 75 | % \begin{itemize} 76 | % \item Benchmarks compare perplexity of models 77 | % \item Significant interest is in abilities such as in-context learning 78 | % \item Current understanding relies of set-based Transformer mechanisms. 79 | % \end{itemize} 80 | % \end{frame} 81 | 82 | % \begin{frame}{Better Transformers} 83 | % \begin{itemize} 84 | % \item Models are being scaled to longer ranges (>100k) 85 | % \item For language, approximations of attention may be fine. 86 | % \item 87 | % \end{itemize} 88 | % \end{frame} 89 | 90 | 91 | % \begin{frame}{Inductive Bias} 92 | % \begin{itemize} 93 | % \item Transformers are set-based models 94 | % \item Linear RNNs encoder sequential bias 95 | % \item For language, unclear whether this is beneficial or not. 
96 | % \end{itemize} 97 | % \end{frame} -------------------------------------------------------------------------------- /DoWeNeedAttention.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/DoWeNeedAttention.pdf -------------------------------------------------------------------------------- /Figs/Allowed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Allowed.png -------------------------------------------------------------------------------- /Figs/Attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Attention.png -------------------------------------------------------------------------------- /Figs/Banana.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Banana.png -------------------------------------------------------------------------------- /Figs/BiGS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/BiGS.png -------------------------------------------------------------------------------- /Figs/Biden.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Biden.png -------------------------------------------------------------------------------- /Figs/Complex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Complex.png -------------------------------------------------------------------------------- /Figs/ComplexBad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ComplexBad.png -------------------------------------------------------------------------------- /Figs/Conv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Conv.png -------------------------------------------------------------------------------- /Figs/Cumsum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Cumsum.png -------------------------------------------------------------------------------- /Figs/DSSM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/DSSM.pdf -------------------------------------------------------------------------------- /Figs/FeedForward.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/FeedForward.png -------------------------------------------------------------------------------- /Figs/GLUE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/GLUE.png -------------------------------------------------------------------------------- /Figs/H3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/H3.png -------------------------------------------------------------------------------- /Figs/Is-Attention-All-You-Need-.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Is-Attention-All-You-Need-.png -------------------------------------------------------------------------------- /Figs/Kernel1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Kernel1.png -------------------------------------------------------------------------------- /Figs/MNLI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/MNLI.png -------------------------------------------------------------------------------- /Figs/Mega.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Mega.png -------------------------------------------------------------------------------- /Figs/ModelSize0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ModelSize0.jpg -------------------------------------------------------------------------------- /Figs/ModelSize2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ModelSize2.png -------------------------------------------------------------------------------- /Figs/ModelSize3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ModelSize3.png -------------------------------------------------------------------------------- /Figs/RASP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/RASP.png -------------------------------------------------------------------------------- /Figs/RNNParam.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/RNNParam.pdf -------------------------------------------------------------------------------- /Figs/RWKV.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/RWKV.png -------------------------------------------------------------------------------- /Figs/S4LRA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/S4LRA.png -------------------------------------------------------------------------------- /Figs/SGParam.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/SGParam.pdf -------------------------------------------------------------------------------- /Figs/SSM (1).pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/SSM (1).pdf -------------------------------------------------------------------------------- /Figs/SSM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/SSM.pdf -------------------------------------------------------------------------------- /Figs/SSMParam.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/SSMParam.pdf -------------------------------------------------------------------------------- /Figs/SSMSide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/SSMSide.pdf -------------------------------------------------------------------------------- /Figs/SSMStart.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/SSMStart.pdf -------------------------------------------------------------------------------- /Figs/assoc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/assoc.png -------------------------------------------------------------------------------- /Figs/assoc2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/assoc2.png -------------------------------------------------------------------------------- /Figs/attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/attention.png -------------------------------------------------------------------------------- /Figs/attractors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/attractors.png 
-------------------------------------------------------------------------------- /Figs/big.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/big.png -------------------------------------------------------------------------------- /Figs/comparison_results (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/comparison_results (1).png -------------------------------------------------------------------------------- /Figs/comparison_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/comparison_results.png -------------------------------------------------------------------------------- /Figs/elmo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/elmo.png -------------------------------------------------------------------------------- /Figs/ema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ema.png -------------------------------------------------------------------------------- /Figs/frame_10_delay-0.1s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/frame_10_delay-0.1s.png -------------------------------------------------------------------------------- /Figs/frame_20_delay-0.1s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/frame_20_delay-0.1s.png -------------------------------------------------------------------------------- /Figs/frame_30_delay-0.1s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/frame_30_delay-0.1s.png -------------------------------------------------------------------------------- /Figs/frame_40_delay-0.1s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/frame_40_delay-0.1s.png -------------------------------------------------------------------------------- /Figs/frame_50_delay-0.1s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/frame_50_delay-0.1s.png -------------------------------------------------------------------------------- /Figs/graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/graph.png -------------------------------------------------------------------------------- /Figs/graph2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/graph2.png -------------------------------------------------------------------------------- /Figs/hippo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/hippo.png -------------------------------------------------------------------------------- /Figs/hippo_kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/hippo_kernel.png -------------------------------------------------------------------------------- /Figs/hyena.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/hyena.png -------------------------------------------------------------------------------- /Figs/induct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/induct.png -------------------------------------------------------------------------------- /Figs/induct1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/induct1.png -------------------------------------------------------------------------------- /Figs/induct2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/induct2.png -------------------------------------------------------------------------------- /Figs/kernel.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/kernel.pdf -------------------------------------------------------------------------------- /Figs/kernel2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/kernel2.png -------------------------------------------------------------------------------- /Figs/listops-s4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/listops-s4.png -------------------------------------------------------------------------------- /Figs/llama.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/llama.png -------------------------------------------------------------------------------- /Figs/lra-s4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/lra-s4.png 
-------------------------------------------------------------------------------- /Figs/match.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/match.png -------------------------------------------------------------------------------- /Figs/model_architecture_comparison2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/model_architecture_comparison2.pdf -------------------------------------------------------------------------------- /Figs/out-rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/out-rnn.png -------------------------------------------------------------------------------- /Figs/out.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/out.png -------------------------------------------------------------------------------- /Figs/out2 (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/out2 (1).png -------------------------------------------------------------------------------- /Figs/out2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/out2.png -------------------------------------------------------------------------------- /Figs/out3 (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/out3 (1).png -------------------------------------------------------------------------------- /Figs/out3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/out3.png -------------------------------------------------------------------------------- /Figs/out4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/out4.png -------------------------------------------------------------------------------- /Figs/out5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/out5.png -------------------------------------------------------------------------------- /Figs/phase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/phase.png -------------------------------------------------------------------------------- /Figs/rnn.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/rnn.png -------------------------------------------------------------------------------- /Figs/sgconv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/sgconv.png -------------------------------------------------------------------------------- /Figs/shift.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/shift.png -------------------------------------------------------------------------------- /Figs/solve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/solve.png -------------------------------------------------------------------------------- /Figs/speech.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/speech.png -------------------------------------------------------------------------------- /Figs/ssm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ssm.png -------------------------------------------------------------------------------- /Figs/ssmrec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ssmrec.png -------------------------------------------------------------------------------- /Figs/ssmrec0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ssmrec0.png -------------------------------------------------------------------------------- /Figs/ssmrec1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ssmrec1.png -------------------------------------------------------------------------------- /Figs/temp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/temp.png -------------------------------------------------------------------------------- /Figs/transformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/transformer.png -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | ===================== 3 | 4 | **Copyright (c) 2019 Anish Athalye (me@anishathalye.com)** 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of 7 | this software and associated documentation files (the "Software"), to 
deal in 8 | the Software without restriction, including without limitation the rights to 9 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 10 | of the Software, and to permit persons to whom the Software is furnished to do 11 | so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /MLSys_Slides (11).pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/MLSys_Slides (11).pdf -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BUILD := \ 2 | p \ 3 | p-notes \ 4 | 5 | 6 | DEPS := \ 7 | beamerthemeauriga.sty \ 8 | beamercolorthemeauriga.sty \ 9 | presentation.tex \ 10 | $(shell find slides -name '*.tex') \ 11 | 12 | 13 | LATEX := lualatex 14 | 15 | LATEXOPTS := -interaction nonstopmode 16 | 17 | TARGETS := $(patsubst %, %.pdf, $(BUILD)) 18 | 19 | # phony targets 20 | 21 | all: $(TARGETS) 22 | 23 | clean: 24 | rm -rf *.pdf *.aux *.bbl *.blg *.log *.nav *.out *.snm *.toc *.vrb 25 | 26 | .PHONY: all clean 27 | 28 | # main targets 29 | 30 | %.pdf: %.tex $(DEPS) 31 | $(eval SRC_$@ = $(patsubst %.tex, %, $<)) 32 | $(LATEX) $(LATEXOPTS) $(SRC_$@) 33 | $(LATEX) $(LATEXOPTS) $(SRC_$@) 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Do we need Attention? 
2 | 3 | Slides: https://github.com/srush/do-we-need-attention/blob/main/DoWeNeedAttention.pdf 4 | 5 | Video: https://www.youtube.com/watch?v=dKJEpOtVgXc 6 | -------------------------------------------------------------------------------- /SSM Start.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/SSM Start.pdf -------------------------------------------------------------------------------- /anthology.bib: -------------------------------------------------------------------------------- 1 | % Please download the latest anthology.bib from 2 | % 3 | % http://aclweb.org/anthology/anthology.bib.gz 4 | @article{gu2021efficiently, 5 | title={Efficiently Modeling Long Sequences with Structured State Spaces}, 6 | author={Gu, Albert and Goel, Karan and R{\'e}, Christopher}, 7 | journal={arXiv preprint arXiv:2111.00396}, 8 | year={2021} 9 | } 10 | 11 | @article{tay2020long, 12 | title={Long range arena: A benchmark for efficient transformers}, 13 | author={Tay, Yi and Dehghani, Mostafa and Abnar, Samira and Shen, Yikang and Bahri, Dara and Pham, Philip and Rao, Jinfeng and Yang, Liu and Ruder, Sebastian and Metzler, Donald}, 14 | journal={arXiv preprint arXiv:2011.04006}, 15 | year={2020} 16 | } 17 | 18 | @article{tay2020efficient, 19 | title={Efficient transformers: A survey}, 20 | author={Tay, Yi and Dehghani, Mostafa and Bahri, Dara and Metzler, Donald}, 21 | journal={arXiv preprint arXiv:2009.06732}, 22 | year={2020} 23 | } 24 | 25 | 26 | @inproceedings{katharopoulos2020transformers, 27 | title={Transformers are rnns: Fast autoregressive transformers with linear attention}, 28 | author={Katharopoulos, Angelos and Vyas, Apoorv and Pappas, Nikolaos and Fleuret, Fran{\c{c}}ois}, 29 | booktitle={International Conference on Machine Learning}, 30 | pages={5156--5165}, 31 | year={2020}, 32 | organization={PMLR} 33 | } 34 | 35 | 36 | @article{beltagy2020longformer, 37 | title={Longformer: The long-document transformer}, 38 | author={Beltagy, Iz and Peters, Matthew E and Cohan, Arman}, 39 | journal={arXiv preprint arXiv:2004.05150}, 40 | year={2020} 41 | } 42 | 43 | 44 | 45 | @article{izsak2021train, 46 | title={How to train bert with an academic budget}, 47 | author={Izsak, Peter and Berchansky, Moshe and Levy, Omer}, 48 | journal={arXiv preprint arXiv:2104.07705}, 49 | year={2021} 50 | } 51 | 52 | 53 | @article{gupta2022diagonal, 54 | title={Diagonal State Spaces are as Effective as Structured State Spaces}, 55 | author={Gupta, Ankit}, 56 | journal={arXiv preprint arXiv:2203.14343}, 57 | year={2022} 58 | } 59 | 60 | @article{devlin2018bert, 61 | title={Bert: Pre-training of deep bidirectional transformers for language understanding}, 62 | author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina}, 63 | journal={arXiv preprint arXiv:1810.04805}, 64 | year={2018} 65 | } 66 | 67 | @article{mccann2017learned, 68 | title={Learned in translation: Contextualized word vectors}, 69 | author={McCann, Bryan and Bradbury, James and Xiong, Caiming and Socher, Richard}, 70 | journal={Advances in neural information processing systems}, 71 | volume={30}, 72 | year={2017} 73 | } 74 | 75 | @article{peters2019tune, 76 | title={To tune or not to tune? 
adapting pretrained representations to diverse tasks}, 77 | author={Peters, Matthew E and Ruder, Sebastian and Smith, Noah A}, 78 | journal={arXiv preprint arXiv:1903.05987}, 79 | year={2019} 80 | } 81 | 82 | @article{vaswani2017attention, 83 | title={Attention is all you need}, 84 | author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia}, 85 | journal={Advances in neural information processing systems}, 86 | volume={30}, 87 | year={2017} 88 | } 89 | 90 | @article{rush2022s4, 91 | title={The Annotated S4}, 92 | author={Alexander Rush}, 93 | journal={International Conference on Learning Representations}, 94 | year={2022} 95 | } 96 | 97 | @inproceedings{wolf2020transformers, 98 | title={Transformers: State-of-the-art natural language processing}, 99 | author={Wolf, Thomas and Debut, Lysandre and Sanh, Victor and Chaumond, Julien and Delangue, Clement and Moi, Anthony and Cistac, Pierric and Rault, Tim and Louf, R{\'e}mi and Funtowicz, Morgan and others}, 100 | booktitle={Proceedings of the 2020 conference on empirical methods in natural language processing: system demonstrations}, 101 | pages={38--45}, 102 | year={2020} 103 | } 104 | 105 | @article{gu2020hippo, 106 | title={Hippo: Recurrent memory with optimal polynomial projections}, 107 | author={Gu, Albert and Dao, Tri and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher}, 108 | journal={Advances in Neural Information Processing Systems}, 109 | volume={33}, 110 | pages={1474--1487}, 111 | year={2020} 112 | } 113 | 114 | @article{gu2021combining, 115 | title={Combining Recurrent, Convolutional, and Continuous-time Models with Linear State Space Layers}, 116 | author={Gu, Albert and Johnson, Isys and Goel, Karan and Saab, Khaled and Dao, Tri and Rudra, Atri and R{\'e}, Christopher}, 117 | journal={Advances in Neural Information Processing Systems}, 118 | volume={34}, 119 | year={2021} 120 | } 121 | 122 | @article{liu2019roberta, 123 | title={Roberta: A robustly optimized bert pretraining approach}, 124 | author={Liu, Yinhan and Ott, Myle and Goyal, Naman and Du, Jingfei and Joshi, Mandar and Chen, Danqi and Levy, Omer and Lewis, Mike and Zettlemoyer, Luke and Stoyanov, Veselin}, 125 | journal={arXiv preprint arXiv:1907.11692}, 126 | year={2019} 127 | } 128 | 129 | @article{loshchilov2017decoupled, 130 | title={Decoupled weight decay regularization}, 131 | author={Loshchilov, Ilya and Hutter, Frank}, 132 | journal={arXiv preprint arXiv:1711.05101}, 133 | year={2017} 134 | } 135 | 136 | @article{lewis2019bart, 137 | title={Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension}, 138 | author={Lewis, Mike and Liu, Yinhan and Goyal, Naman and Ghazvininejad, Marjan and Mohamed, Abdelrahman and Levy, Omer and Stoyanov, Ves and Zettlemoyer, Luke}, 139 | journal={arXiv preprint arXiv:1910.13461}, 140 | year={2019} 141 | } 142 | 143 | @article{wang2018glue, 144 | title={GLUE: A multi-task benchmark and analysis platform for natural language understanding}, 145 | author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R}, 146 | journal={arXiv preprint arXiv:1804.07461}, 147 | year={2018} 148 | } 149 | 150 | @inproceedings{hua2022transformer, 151 | title={Transformer quality in linear time}, 152 | author={Hua, Weizhe and Dai, Zihang and Liu, Hanxiao and Le, Quoc}, 153 | booktitle={International Conference on Machine Learning}, 154 
| pages={9099--9117}, 155 | year={2022}, 156 | organization={PMLR} 157 | } 158 | 159 | @article{shaham2022scrolls, 160 | title={Scrolls: Standardized comparison over long language sequences}, 161 | author={Shaham, Uri and Segal, Elad and Ivgi, Maor and Efrat, Avia and Yoran, Ori and Haviv, Adi and Gupta, Ankit and Xiong, Wenhan and Geva, Mor and Berant, Jonathan and others}, 162 | journal={arXiv preprint arXiv:2201.03533}, 163 | year={2022} 164 | } 165 | 166 | @article{gu2022parameterization, 167 | title={On the parameterization and initialization of diagonal state space models}, 168 | author={Gu, Albert and Gupta, Ankit and Goel, Karan and R{\'e}, Christopher}, 169 | journal={arXiv preprint arXiv:2206.11893}, 170 | year={2022} 171 | } 172 | 173 | @article{mehta2022long, 174 | title={Long range language modeling via gated state spaces}, 175 | author={Mehta, Harsh and Gupta, Ankit and Cutkosky, Ashok and Neyshabur, Behnam}, 176 | journal={arXiv preprint arXiv:2206.13947}, 177 | year={2022} 178 | } 179 | 180 | @techreport{rumelhart1985learning, 181 | title={Learning internal representations by error propagation}, 182 | author={Rumelhart, David E and Hinton, Geoffrey E and Williams, Ronald J}, 183 | year={1985}, 184 | institution={California Univ San Diego La Jolla Inst for Cognitive Science} 185 | } 186 | 187 | @article{goel2022s, 188 | title={It's Raw! Audio Generation with State-Space Models}, 189 | author={Goel, Karan and Gu, Albert and Donahue, Chris and R{\'e}, Christopher}, 190 | journal={arXiv preprint arXiv:2202.09729}, 191 | year={2022} 192 | } 193 | 194 | @article{tay2021pre, 195 | title={Are Pre-trained Convolutions Better than Pre-trained Transformers?}, 196 | author={Tay, Yi and Dehghani, Mostafa and Gupta, Jai and Bahri, Dara and Aribandi, Vamsi and Qin, Zhen and Metzler, Donald}, 197 | journal={arXiv preprint arXiv:2105.03322}, 198 | year={2021} 199 | } 200 | 201 | @inproceedings{DBLP:conf/naacl/PetersNIGCLZ18, 202 | author = {Matthew E. Peters and 203 | Mark Neumann and 204 | Mohit Iyyer and 205 | Matt Gardner and 206 | Christopher Clark and 207 | Kenton Lee and 208 | Luke Zettlemoyer}, 209 | editor = {Marilyn A. 
Walker and 210 | Heng Ji and 211 | Amanda Stent}, 212 | title = {Deep Contextualized Word Representations}, 213 | booktitle = {Proceedings of the 2018 Conference of the North American Chapter of 214 | the Association for Computational Linguistics: Human Language Technologies, 215 | {NAACL-HLT} 2018, New Orleans, Louisiana, USA, June 1-6, 2018, Volume 216 | 1 (Long Papers)}, 217 | pages = {2227--2237}, 218 | publisher = {Association for Computational Linguistics}, 219 | year = {2018}, 220 | url = {https://doi.org/10.18653/v1/n18-1202}, 221 | doi = {10.18653/v1/n18-1202}, 222 | timestamp = {Fri, 06 Aug 2021 00:41:32 +0200}, 223 | biburl = {https://dblp.org/rec/conf/naacl/PetersNIGCLZ18.bib}, 224 | bibsource = {dblp computer science bibliography, https://dblp.org} 225 | } 226 | 227 | 228 | @article{smith2022simplified, 229 | title={Simplified state space layers for sequence modeling}, 230 | author={Smith, Jimmy TH and Warrington, Andrew and Linderman, Scott W}, 231 | journal={arXiv preprint arXiv:2208.04933}, 232 | year={2022} 233 | } 234 | 235 | 236 | @article{lee2021fnet, 237 | title={Fnet: Mixing tokens with fourier transforms}, 238 | author={Lee-Thorp, James and Ainslie, Joshua and Eckstein, Ilya and Ontanon, Santiago}, 239 | journal={arXiv preprint arXiv:2105.03824}, 240 | year={2021} 241 | } 242 | 243 | @article{marvin2018targeted, 244 | title={Targeted syntactic evaluation of language models}, 245 | author={Marvin, Rebecca and Linzen, Tal}, 246 | journal={arXiv preprint arXiv:1808.09031}, 247 | year={2018} 248 | } 249 | 250 | @article{linzen2016assessing, 251 | title={Assessing the ability of LSTMs to learn syntax-sensitive dependencies}, 252 | author={Linzen, Tal and Dupoux, Emmanuel and Goldberg, Yoav}, 253 | journal={Transactions of the Association for Computational Linguistics}, 254 | volume={4}, 255 | pages={521--535}, 256 | year={2016}, 257 | publisher={MIT Press} 258 | } 259 | 260 | @article{goldberg2019assessing, 261 | title={Assessing BERT's syntactic abilities}, 262 | author={Goldberg, Yoav}, 263 | journal={arXiv preprint arXiv:1901.05287}, 264 | year={2019} 265 | } 266 | 267 | @inproceedings{dauphin2017language, 268 | title={Language modeling with gated convolutional networks}, 269 | author={Dauphin, Yann N and Fan, Angela and Auli, Michael and Grangier, David}, 270 | booktitle={International conference on machine learning}, 271 | pages={933--941}, 272 | year={2017}, 273 | organization={PMLR} 274 | } 275 | 276 | @article{shazeer2020glu, 277 | title={Glu variants improve transformer}, 278 | author={Shazeer, Noam}, 279 | journal={arXiv preprint arXiv:2002.05202}, 280 | year={2020} 281 | } 282 | 283 | @article{narang2021transformer, 284 | title={Do transformer modifications transfer across implementations and applications?}, 285 | author={Narang, Sharan and Chung, Hyung Won and Tay, Yi and Fedus, William and Fevry, Thibault and Matena, Michael and Malkan, Karishma and Fiedel, Noah and Shazeer, Noam and Lan, Zhenzhong and others}, 286 | journal={arXiv preprint arXiv:2102.11972}, 287 | year={2021} 288 | } 289 | 290 | @article{warstadt2019linguistic, 291 | title={Linguistic analysis of pretrained sentence encoders with acceptability judgments}, 292 | author={Warstadt, Alex and Bowman, Samuel R}, 293 | journal={arXiv preprint arXiv:1901.03438}, 294 | year={2019} 295 | } 296 | 297 | @article{gulordava2018colorless, 298 | title={Colorless green recurrent networks dream hierarchically}, 299 | author={Gulordava, Kristina and Bojanowski, Piotr and Grave, Edouard and Linzen, Tal and 
Baroni, Marco}, 300 | journal={arXiv preprint arXiv:1803.11138}, 301 | year={2018} 302 | } 303 | 304 | @article{clark2019does, 305 | title={What does bert look at? an analysis of bert's attention}, 306 | author={Clark, Kevin and Khandelwal, Urvashi and Levy, Omer and Manning, Christopher D}, 307 | journal={arXiv preprint arXiv:1906.04341}, 308 | year={2019} 309 | } 310 | 311 | @article{tenney2019bert, 312 | title={BERT rediscovers the classical NLP pipeline}, 313 | author={Tenney, Ian and Das, Dipanjan and Pavlick, Ellie}, 314 | journal={arXiv preprint arXiv:1905.05950}, 315 | year={2019} 316 | } 317 | 318 | @inproceedings{rajpurkar2016squad, 319 | title={SQuAD: 100,000+ Questions for Machine Comprehension of Text}, 320 | author={Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and Liang, Percy}, 321 | booktitle={Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing}, 322 | pages={2383--2392}, 323 | year={2016} 324 | } 325 | 326 | @article{wettig2022should, 327 | title={Should You Mask 15\% in Masked Language Modeling?}, 328 | author={Wettig, Alexander and Gao, Tianyu and Zhong, Zexuan and Chen, Danqi}, 329 | journal={arXiv preprint arXiv:2202.08005}, 330 | year={2022} 331 | } 332 | 333 | @article{warstadt2019neural, 334 | title={Neural network acceptability judgments}, 335 | author={Warstadt, Alex and Singh, Amanpreet and Bowman, Samuel R}, 336 | journal={Transactions of the Association for Computational Linguistics}, 337 | volume={7}, 338 | pages={625--641}, 339 | year={2019}, 340 | publisher={MIT Press} 341 | } 342 | 343 | @article{hendrycks2016gaussian, 344 | title={Gaussian error linear units (gelus)}, 345 | author={Hendrycks, Dan and Gimpel, Kevin}, 346 | journal={arXiv preprint arXiv:1606.08415}, 347 | year={2016} 348 | } 349 | 350 | @article{ma2022mega, 351 | title={Mega: moving average equipped gated attention}, 352 | author={Ma, Xuezhe and Zhou, Chunting and Kong, Xiang and He, Junxian and Gui, Liangke and Neubig, Graham and May, Jonathan and Zettlemoyer, Luke}, 353 | journal={arXiv preprint arXiv:2209.10655}, 354 | year={2022} 355 | } 356 | 357 | @article{dao2022hungry, 358 | title={Hungry Hungry Hippos: Towards Language Modeling with State Space Models}, 359 | author={Dao, Tri and Fu, Daniel Y and Saab, Khaled K and Thomas, Armin W and Rudra, Atri and R{\'e}, Christopher}, 360 | journal={arXiv preprint arXiv:2212.14052}, 361 | year={2022} 362 | } 363 | 364 | @inproceedings{joshi2017triviaqa, 365 | title={TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}, 366 | author={Joshi, Mandar and Choi, Eunsol and Weld, Daniel S and Zettlemoyer, Luke}, 367 | booktitle={Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, 368 | pages={1601--1611}, 369 | year={2017} 370 | } 371 | 372 | @inproceedings{yang2015wikiqa, 373 | title={Wikiqa: A challenge dataset for open-domain question answering}, 374 | author={Yang, Yi and Yih, Wen-tau and Meek, Christopher}, 375 | booktitle={Proceedings of the 2015 conference on empirical methods in natural language processing}, 376 | pages={2013--2018}, 377 | year={2015} 378 | } 379 | 380 | -------------------------------------------------------------------------------- /beamercolorthemeauriga.sty: -------------------------------------------------------------------------------- 1 | % Auriga theme 2 | % https://github.com/anishathalye/auriga 3 | 4 | % ==================== 5 | % Definitions 6 | % ==================== 7 | 
8 | \definecolor{lightgray}{RGB}{245, 246, 250} 9 | \definecolor{darkgray}{RGB}{79,79,79} 10 | 11 | % ==================== 12 | % Theme 13 | % ==================== 14 | 15 | % Basic colors 16 | \setbeamercolor{palette primary}{fg=black,bg=white} 17 | \setbeamercolor{palette secondary}{fg=black,bg=white} 18 | \setbeamercolor{palette tertiary}{bg=black,fg=white} 19 | \setbeamercolor{palette quaternary}{fg=black,bg=white} 20 | \setbeamercolor{structure}{fg=darkgray} 21 | 22 | % Itemize 23 | \setbeamercolor{item}{fg=black} 24 | 25 | % Page numbering 26 | \setbeamercolor{page number in head/foot}{fg=structure.fg} 27 | 28 | % Frame titles 29 | \setbeamercolor{frametitle}{fg=black} 30 | -------------------------------------------------------------------------------- /beamerthemeauriga.sty: -------------------------------------------------------------------------------- 1 | % Auriga theme 2 | % https://github.com/anishathalye/auriga 3 | 4 | % ==================== 5 | % Dependencies 6 | % ==================== 7 | 8 | \RequirePackage{exscale} 9 | \RequirePackage{ragged2e} 10 | \RequirePackage{changepage} 11 | \RequirePackage{fontspec} 12 | \RequirePackage{xpatch} 13 | 14 | % ==================== 15 | % Fonts 16 | % ==================== 17 | 18 | \newfontfamily\Raleway[Ligatures=TeX]{Raleway} 19 | \newfontfamily\Lato[Ligatures=TeX]{Lato} 20 | 21 | \usefonttheme{professionalfonts} 22 | 23 | \setsansfont{Lato}[ 24 | UprightFont=*-Regular, 25 | ItalicFont=*-Italic, 26 | BoldFont=*-Bold, 27 | BoldItalicFont=*-BoldItalic 28 | ] 29 | \setmonofont{Hack} 30 | 31 | \setbeamerfont{title page}{family=\Raleway} 32 | \setbeamerfont{title page title}{size=\LARGE,series=\bfseries} 33 | \setbeamerfont{title page author}{size=\footnotesize} 34 | \setbeamerfont{title page institute}{size=\scriptsize} 35 | \setbeamerfont{frametitle}{family=\Raleway,size=\large,series=\bfseries} 36 | \setbeamerfont{caption}{size=\footnotesize} 37 | 38 | 39 | % ==================== 40 | % Macros 41 | % ==================== 42 | 43 | \newcommand{\samelineand}{\qquad} 44 | 45 | % ==================== 46 | % Elements 47 | % ==================== 48 | 49 | % Itemize 50 | 51 | \setbeamertemplate{itemize item}[circle] 52 | \setbeamertemplate{itemize subitem}[circle] 53 | \setbeamertemplate{itemize subsubitem}[circle] 54 | \xpatchcmd{\itemize} 55 | {\def\makelabel} 56 | {\ifnum\@itemdepth=1\relax 57 | \setlength\itemsep{3ex}% separation for first level 58 | \else 59 | \ifnum\@itemdepth=2\relax 60 | \setlength\itemsep{0.5ex}% separation for second level 61 | \else 62 | \ifnum\@itemdepth=3\relax 63 | \setlength\itemsep{0.5ex}% separation for third level 64 | \fi\fi\fi\def\makelabel 65 | } 66 | {} 67 | {} 68 | 69 | % Equation 70 | \setlength\belowdisplayshortskip{2ex} 71 | 72 | % Caption 73 | \setlength{\abovecaptionskip}{2ex} 74 | \setlength{\belowcaptionskip}{1ex} 75 | \setbeamertemplate{caption} 76 | { 77 | {\usebeamerfont{caption}\insertcaption} 78 | } 79 | 80 | % Navigation 81 | \beamertemplatenavigationsymbolsempty 82 | 83 | % ==================== 84 | % Components 85 | % ==================== 86 | 87 | % Title page 88 | \setbeamertemplate{title page} 89 | { 90 | \begin{centering} 91 | \vskip5ex plus 1filll 92 | {\usebeamerfont{title page title}\usebeamercolor[fg]{title page}\inserttitle\\[1.5ex]} 93 | {\usebeamerfont{title page author}\usebeamercolor[fg]{title page}\insertauthor\\[2ex]} 94 | {\usebeamerfont{title page institute}\usebeamercolor[fg]{title page}\insertinstitute\\[1ex]} 95 | \vskip0pt plus 1filll 96 | \end{centering} 97 | } 98 | 
99 | % Footer 100 | \setbeamertemplate{footline}{ 101 | \hfill% 102 | \usebeamercolor[fg]{page number in head/foot}% 103 | \usebeamerfont{page number in head/foot}% 104 | \hspace{2em}% 105 | %\insertframenumber\kern1em\vskip2ex% 106 | } 107 | 108 | % Frame title 109 | \setbeamertemplate{frametitle}{ 110 | \nointerlineskip 111 | \vskip2ex 112 | {\usebeamerfont{frametitle}\usebeamercolor[fg]{frametitle}\insertframetitle} 113 | } 114 | 115 | \renewcommand\footnoterule{} 116 | 117 | \setbeamertemplate{footnote}{% 118 | \parindent 0.5em\noindent% 119 | \raggedleft 120 | \usebeamercolor{footnote}\hbox to 5.8em{}\scriptsize \insertfootnotetext\par% 121 | } -------------------------------------------------------------------------------- /old.tex: -------------------------------------------------------------------------------- 1 | % Auriga theme 2 | % find the most up-to-date version here: https://github.com/anishathalye/auriga 3 | 4 | \documentclass[14pt,aspectratio=169]{beamer} 5 | \usepackage{pgfpages} 6 | \usepackage{fancyvrb} 7 | \usepackage{tikz} 8 | \usepackage{pgfplots} 9 | \usepackage{booktabs} 10 | \includeonlyframes{current} 11 | 12 | \usetheme{auriga} 13 | \usecolortheme{auriga} 14 | \setbeamercolor{math text}{fg=blue} 15 | 16 | \newcommand\blfootnote[1]{% 17 | \begingroup 18 | \renewcommand\thefootnote{}\footnote{#1}% 19 | \addtocounter{footnote}{-1}% 20 | \endgroup 21 | } 22 | 23 | %\setbeamertemplate{footline}[] 24 | %\renewcommand\footnotemark{} 25 | 26 | 27 | % define some colors for a consistent theme across slides 28 | \definecolor{red}{RGB}{181, 23, 0} 29 | \definecolor{blue}{RGB}{0, 118, 186} 30 | \definecolor{gray}{RGB}{146, 146, 146} 31 | 32 | \title{Do we need \textcolor{blue}{Attention}?} 33 | 34 | \author{Alexander "Sasha" Rush} 35 | 36 | \institute[shortinst]{} 37 | 38 | \begin{document} 39 | 40 | { 41 | % rather than use the frame options [noframenumbering,plain], we make the 42 | % color match, so that the indicated page numbers match PDF page numbers 43 | \setbeamercolor{page number in head/foot}{fg=background canvas.bg} 44 | \begin{frame} 45 | \titlepage 46 | \end{frame} 47 | } 48 | 49 | \begin{frame}[label=current]{} 50 | \cite{gu2022parameterization} 51 | \cite{} 52 | \cite{dao2022hungry} 53 | \cite{ma2022mega} 54 | \end{frame} 55 | 56 | 57 | \section{Context} 58 | % \begin{frame}{Outline} 59 | % \tableofcontents 60 | % \end{frame} 61 | 62 | 63 | \begin{frame}[label=current]{Caveats} 64 | \begin{itemize} 65 | \item LLMs are remarkable, we should use them for most things 66 | \item This talk is \structure{not} about LLMs 67 | \end{itemize} 68 | \end{frame} 69 | 70 | 71 | \begin{frame} 72 | \includegraphics[trim={10cm 0 10cm 0}, clip, height=\textheight]{Figs/ModelSize2.png} 73 | \end{frame} 74 | 75 | \begin{frame} 76 | \includegraphics[trim={10cm 0 10cm 0}, clip, height=\textheight]{Figs/ModelSize3.png} 77 | \end{frame} 78 | 79 | \begin{frame}{Context} 80 | \begin{itemize} 81 | \item BERT used to require non-trivial compute 82 | \item Belief: Open architecture questions in NLP 83 | \item Today's Talk: How important is \textit{attention}? 
84 | \end{itemize} 85 | \end{frame} 86 | 87 | 88 | \begin{frame}{\textcolor{red}{ELMo} } 89 | 90 | \begin{columns} 91 | \begin{column}{0.3\linewidth} 92 | \centerline{Bidirectional RNN} 93 | \end{column} 94 | \begin{column}{0.7\linewidth} 95 | 96 | \begin{figure} 97 | \includegraphics[width=0.8\textwidth]{Figs/elmo.png} 98 | \end{figure} 99 | \end{column} 100 | \end{columns} 101 | \blfootnote{\cite{DBLP:conf/naacl/PetersNIGCLZ18}} 102 | 103 | \end{frame} 104 | 105 | \begin{frame}{\textcolor{red}{ELMo} For Pretraining} 106 | \begin{table} 107 | \begin{tabular}{lc} 108 | \toprule 109 | Model & GLUE\\ 110 | \midrule 111 | ELMo& 67.7 \\ 112 | ELMo+Attn& 71.0\\ 113 | \visible<2>{BERT-Base & 79 - 83} \\ 114 | \bottomrule 115 | \end{tabular} 116 | \end{table} 117 | \blfootnote{\cite{DBLP:conf/naacl/PetersNIGCLZ18, devlin2018bert}} 118 | \end{frame} 119 | 120 | \begin{frame}{Architecture?} 121 | \begin{itemize} 122 | \item 123 | Several confounding differences, e.g. frozen model. 124 | \item Followup: \textit{To Tune or Not to Tune? Adapting Pretrained Representations to Diverse Tasks} \cite{peters2019tune} 125 | \pause 126 | 127 | \item Conclusion: Transformers significantly beat BiLSTMs 128 | \end{itemize} 129 | \end{frame} 130 | 131 | \begin{frame}{Other Models} 132 | 133 | Maybe there are other models 134 | 135 | \vspace{0.5cm} 136 | 137 | \begin{itemize} 138 | \item Convolutions? 139 | \item Mixers? 140 | \end{itemize} 141 | 142 | % \textit{Are Pre-trained Convolutions Better than Pre-trained Transformers?} 143 | % \\ 144 | % \\ 145 | % Answer: No. 146 | 147 | \end{frame} 148 | 149 | \begin{frame}{Pretraining with CNNs} 150 | \textit{Are Pre-trained Convolutions Better than Pre-trained Transformers?} \cite{tay2020efficient} 151 | 152 | \vspace{0.5cm} 153 | 154 | \visible<2>{\structure{Answer: No.} 155 | 156 | \begin{table} 157 | \begin{tabular}{lc} 158 | \toprule 159 | Model & SST-2\\ 160 | \midrule 161 | ELMo & 91.8 \\ 162 | Best CNN & 92.2 \\ 163 | BERT-Base & 93.5 \\ 164 | \bottomrule 165 | \end{tabular} 166 | \end{table} 167 | 168 | } 169 | 170 | \end{frame} 171 | 172 | 173 | % \begin{frame}{Results: CNNs} 174 | % \begin{table} 175 | % \begin{tabular}{lc} 176 | % \toprule 177 | % Model & SST-2\\ 178 | % \midrule 179 | % Best CNN & 92.2 \\ 180 | % ELMo & 91.8 \\ 181 | % BERT-Base & 93.5 \\ 182 | % \bottomrule 183 | % \end{tabular} 184 | % \end{table} 185 | % \end{frame} 186 | 187 | \begin{frame}{Pretraining with FNet} 188 | \textit{FNet: Mixing Tokens with Fourier Transforms} \cite{lee2021fnet} 189 | 190 | \vspace{0.5cm} 191 | 192 | Replaces attention with 2D FFT mixing-layer. 193 | 194 | \visible<2>{ 195 | \begin{table} 196 | \begin{tabular}{lc} 197 | \toprule 198 | Model & GLUE (dev)\\ 199 | \midrule 200 | Best FNet & 76.3 \\ 201 | BERT-Base & 83.3 \\ 202 | \bottomrule 203 | \end{tabular} 204 | \end{table} 205 | } 206 | \end{frame} 207 | 208 | 209 | 210 | \begin{frame}{Transformers are Great...} 211 | \begin{itemize} 212 | \item Highly optimized training 213 | \item Long-range ability 214 | \item Expensive $O(n^2)$, but we have the money... 
215 | \end{itemize} 216 | \vspace{0.5cm} 217 | 218 | \visible<2>{(But aren't you curious...)} 219 | \end{frame} 220 | 221 | \section{State Space Models} 222 | \begin{frame}{Outline} 223 | \tableofcontents[currentsection] 224 | \end{frame} 225 | 226 | 227 | \begin{frame}{State Space Models (SSM)} 228 | \begin{itemize} 229 | 230 | \item Think hybrid RNN / CNN 231 | 232 | \item SOTA on speech generation and long-range tasks 233 | 234 | \item Tutorial at \textit{The Annotated S4} 235 | \end{itemize} 236 | 237 | \blfootnote{\cite{gu2020hippo,gu2021combining,gu2021efficiently}} 238 | \end{frame} 239 | 240 | 241 | \begin{frame}{State Space Model - Continuous Time} 242 | Let $u(t) \in \mathbb{R}$ be a continuous input and $y(t) \in \mathbb{R}$ be output. 243 | 244 | \pause 245 | \vspace{0.5cm} 246 | 247 | SSM is a differential equation. 248 | \begin{align*} 249 | \boldsymbol{x}'(t) &= \boldsymbol{A}\boldsymbol{x}(t) + \boldsymbol{B}u(t) \\ 250 | y(t) &= \boldsymbol{C}\boldsymbol{x}(t) + \boldsymbol{D}u(t). 251 | \end{align*} 252 | 253 | \pause 254 | Where $\boldsymbol{x}(t) \in \mathbf{R}^N$ is a hidden state and model \structure{parameters}, 255 | 256 | $$\boldsymbol{A} \in \mathbb{R}^{N\times N}, \boldsymbol{B}\in \mathbb{R}^{N \times 1}, \boldsymbol{C} \in \mathbb{R}^{1 \times N}, \boldsymbol{D} \in \mathbb{R}^{1\times 1}$$ 257 | 258 | \end{frame} 259 | \begin{frame}{Discrete Time Sequence} 260 | 261 | Goal: Map scalar sequence $u_{1}, \ldots, u_L$ to $y_1, \ldots, y_L$, 262 | 263 | \begin{figure} 264 | \centering 265 | \includegraphics[width=0.5\textwidth]{Figs/SSMStart.pdf} 266 | \label{fig:my_label} 267 | \end{figure} 268 | \end{frame} 269 | 270 | \begin{frame}{Discrete Time SSM} 271 | 272 | SSM on discretize time data, 273 | 274 | \begin{align*} 275 | \boldsymbol{x}_{k} &= \boldsymbol{\overline{A}} \boldsymbol{x}_{k-1} + \boldsymbol{\overline{B}} u_k \\ 276 | y_k &= \boldsymbol{\overline{C}} \boldsymbol{x}_{k \phantom{- 1}} + \boldsymbol{\overline{D}} u_k. 277 | \end{align*} 278 | 279 | Using discretization with (learned) sampling rate parameter $\Delta$, 280 | 281 | $$\boldsymbol{\overline{A}}, \boldsymbol{\overline{B}}, \boldsymbol{\overline{C}} = \text{discretize}(\boldsymbol{A}, \boldsymbol{B}, \boldsymbol{C}, \Delta )$$ 282 | 283 | \end{frame} 284 | 285 | \begin{frame}{Recurrent Form} 286 | 287 | Output sequence $y_1, \ldots, y_L$ can be computed as a linear RNN, 288 | 289 | \begin{align*} 290 | \boldsymbol{x}_{k} &= \boldsymbol{\overline{A}} \boldsymbol{x}_{k-1} + \boldsymbol{\overline{B}} u_k \\ 291 | y_k &= \boldsymbol{\overline{C}} \boldsymbol{x}_{k \phantom{- 1}} + \boldsymbol{\overline{D}} u_k. 292 | \end{align*} 293 | 294 | Note $\boldsymbol{x}_k \in \mathbb{R}^N$ is the bigger hidden state for $u_k \in \mathbb{R}$, and $\boldsymbol{x}_0 = \mathbf{0}$. 
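% Note (source comment, not rendered): the "Discrete Time SSM" slide leaves discretize(.) abstract.
% One standard choice, used by S4 and The Annotated S4, is the bilinear transform:
%   \overline{A} = (I - \Delta/2 \cdot A)^{-1} (I + \Delta/2 \cdot A),
%   \overline{B} = (I - \Delta/2 \cdot A)^{-1} \Delta B, \qquad \overline{C} = C.
% Unrolling the recurrence above from x_0 = 0 gives
%   y_k = \sum_{j=0}^{k-1} \overline{C}\,\overline{A}^{j}\,\overline{B}\, u_{k-j} + \overline{D} u_k,
% which is exactly the convolution with kernel (\overline{C}\overline{B}, \overline{C}\overline{A}\overline{B}, \ldots, \overline{C}\overline{A}^{L-1}\overline{B}) on the next slide.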
295 | 296 | \end{frame} 297 | 298 | \begin{frame}{Convolutional Form} 299 | 300 | Alternative: 1D convolution with kernel $\boldsymbol{\overline{K}}$ (width $L$), 301 | 302 | \begin{align*} 303 | \overline{K} &= (\boldsymbol{\overline{C}}\boldsymbol{\overline{B}}, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}\boldsymbol{\overline{B}}, \dots, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}^{L-1}\boldsymbol{\overline{B}}) \\ 304 | y &= \text{conv1d}(\overline{K}_L \ldots \overline{K}_1, u_1 \ldots u_L) 305 | \end{align*} 306 | 307 | Intuition: 308 | \pause 309 | $$y_1 = \boldsymbol{\overline{C}} \boldsymbol{\overline{B}} u_1$$ 310 | \pause 311 | $$y_2 = \boldsymbol{\overline{C}} \boldsymbol{\overline{A}} \boldsymbol{\overline{B}} u_1 + \boldsymbol{\overline{C}} \boldsymbol{\overline{B}} u_2 = \boldsymbol{\overline{C}} (\boldsymbol{\overline{A}} \boldsymbol{\overline{B}} u_1 + \boldsymbol{\overline{B}} u_2) = \boldsymbol{\overline{C}} (\boldsymbol{x}_1 + \boldsymbol{\overline{B}} u_2) $$ 312 | \end{frame} 313 | 314 | \begin{frame}{Convolutional Form} 315 | Step 1: Discretize (Training Only). Step 2: Apply 1D Conv 316 | \begin{figure} 317 | \centering 318 | \includegraphics[width=0.6\textwidth]{Figs/SSMSide.pdf} 319 | \label{fig:my_label} 320 | \end{figure} 321 | \end{frame} 322 | 323 | \begin{frame}{Implementation - Computing Kernel} 324 | 325 | $$\boldsymbol{\overline{K}} = (\boldsymbol{\overline{C}}\boldsymbol{\overline{B}}, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}\boldsymbol{\overline{B}}, \dots, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}^{L-1}\boldsymbol{\overline{B}}) $$ 326 | 327 | \begin{itemize} 328 | \item Simple approximations work well (See S4D, DSS) 329 | \end{itemize} 330 | \blfootnote{\cite{gu2021efficiently,gupta2022diagonal,gu2022parameterization}} 331 | \end{frame} 332 | 333 | 334 | \begin{frame}{Implementation - Fourier Transform} 335 | \begin{align*} 336 | &y = \boldsymbol{\overline{K}} \ast u 337 | \end{align*} 338 | \begin{itemize} 339 | \item At long $L$, convolution computed with FFT. 340 | \item More efficient than self-attention or standard RNN. 
341 | \end{itemize} 342 | \end{frame} 343 | 344 | 345 | \begin{frame}{Important Training Initialization} 346 | \begin{itemize} 347 | \item Parameter $\boldsymbol{A}$ is initialized with HiPPO Matrix \cite{gu2020hippo} 348 | 349 | % \begin{scriptsize} 350 | % \begin{align*} 351 | % \boldsymbol{A}_{nk}= - 352 | % \begin{cases} 353 | % (2n+1)^{1/2}(2k+1)^{1/2} & \text{if } n > k \\ n+1 &\text{if } n=k \text{\ else\ } 0 354 | % \end{cases} 355 | % \end{align*} 356 | % \end{scriptsize} 357 | 358 | 359 | \item Kernel formed by Legendre coefficients 360 | \end{itemize} 361 | \begin{figure} 362 | \centering 363 | \includegraphics[width=0.7\textwidth]{Figs/hippo.png} 364 | \end{figure} 365 | \end{frame} 366 | 367 | 368 | 369 | \begin{frame}{Summary: SSM} 370 | \begin{itemize} 371 | \item Mapping from sequence-to-sequence 372 | \item Acts like an RNN, Computed like a CNN 373 | \item Fast to train and utilize 374 | \end{itemize} 375 | \end{frame} 376 | 377 | \section{Model Architectures} 378 | \begin{frame}{Outline} 379 | \tableofcontents[currentsection] 380 | \end{frame} 381 | 382 | \begin{frame}{Objective: Replicate BERT with SSM} 383 | \begin{itemize} 384 | \item Everything else identical (loss, number of parameters, data) 385 | \end{itemize} 386 | \end{frame} 387 | 388 | % \begin{frame}{Architectures for Pretraining} 389 | % \begin{itemize} 390 | % \item Idea 1: Just replace self-attention 391 | % \item Minimal change to Transformer arch 392 | % \end{itemize} 393 | % \end{frame} 394 | 395 | 396 | \begin{frame}{\structure{Naive Idea} Self-attention $\Rightarrow$ SSM} 397 | \begin{figure} 398 | \centering 399 | \includegraphics[height=0.8\textheight,trim={0 0 18cm 0},clip]{Figs/model_architecture_comparison2.pdf} 400 | \caption{} 401 | \label{} 402 | \end{figure} 403 | \end{frame} 404 | 405 | \begin{frame}{Can this work?} 406 | \begin{itemize} 407 | \item SSM is significantly less expressive than self-attention. 408 | \item Static routing through the model like a CNN. 409 | \item Can it learn to do \structure{matching} across sentences? 410 | \end{itemize} 411 | \pause 412 | \vspace{0.5cm} 413 | 414 | 415 | 416 | 417 | \end{frame} 418 | 419 | 420 | \begin{frame}{Test: Matching Across Gaps} 421 | \centerline{Task: QNLI \cite{wang2018glue}} 422 | \vspace{0.5cm} 423 | 424 | 425 | \centerline{\textcolor{red}{What percentage of farmland grows wheat?}} 426 | 427 | \centerline{$\sim \sim \sim $} 428 | 429 | \centerline{\textcolor{olivegreen}{More than 50\% of this area is sown for wheat and 33\% for barley.}} 430 | 431 | \pause 432 | 433 | \begin{table}[t] 434 | \center 435 | \begin{tabular}{ccc} 436 | \toprule 437 | \centering 438 | Arch & \textcolor{red}{H} P & \textcolor{red}{H} $\sim$ P \\ 439 | \midrule 440 | \textsc{stack} / \textsc{ssm} & 77.4 & 69.7\\ 441 | % \textsc{gated} / \textsc{ssm} & 77.4 & 77.7\\ 442 | \bottomrule 443 | \end{tabular} 444 | \caption{} 445 | \label{tab:synthetic} 446 | \end{table} 447 | \end{frame} 448 | 449 | 450 | 451 | % \begin{frame}{Does this work} 452 | 453 | % \end{frame} 454 | 455 | 456 | 457 | \begin{frame}{\structure{Proposed Fix}: Multiplicative Gating} 458 | 459 | Add dynamism to stacked model with multiplicative gating. 460 | 461 | $$\sigma(\mathbf{W} \mathbf{u}) \otimes (\mathbf{V} \mathbf{u})$$ 462 | 463 | Positive results with CNN, Transformer, and SSM models. 
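% Note (source comment, not rendered): a minimal shape sketch of the gate above,
% assuming u \in \mathbb{R}^d with learned W, V \in \mathbb{R}^{d \times d} (the exact
% projection sizes used in BiGS may differ): the sigmoid branch \sigma(W u) \in (0,1)^d
% rescales V u elementwise, so the output at each position depends multiplicatively on
% the content at that position, even though every SSM kernel in the stack is a fixed filter.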
464 | 465 | 466 | \blfootnote{\cite{dauphin2017language, shazeer2020glu, narang2021transformer}} 467 | 468 | \end{frame} 469 | 470 | \begin{frame}{Proposed Architecture: BiGS} 471 | \begin{figure} 472 | \centering 473 | \includegraphics[height=0.7\textheight,trim={16cm 0 0 0},clip]{Figs/model_architecture_comparison2.pdf} 474 | \caption{} 475 | \label{fig:my_label} 476 | \end{figure} 477 | \end{frame} 478 | 479 | \begin{frame}{Gating Adaptation} 480 | \centerline{\textcolor{red}{What percentage of farmland grows wheat?}} 481 | 482 | \centerline{$\sim \sim \sim $} 483 | 484 | \centerline{\textcolor{olivegreen}{More than 50\% of this area is sown for wheat and 33\% for barley.}} 485 | 486 | 487 | \begin{table}[t] 488 | \center 489 | \begin{tabular}{lcc} 490 | \toprule 491 | \centering 492 | Arch & \textcolor{red}{H} P & \textcolor{red}{H} $\sim$ P \\ 493 | \midrule 494 | \textsc{stack} / \textsc{ssm} & 77.4 & 69.7\\ 495 | \textsc{gated} / \textsc{ssm} & 77.4 & 77.7\\ 496 | \bottomrule 497 | \end{tabular} 498 | \caption{ } 499 | \label{tab:synthetic} 500 | \end{table} 501 | \pause 502 | 503 | 504 | \end{frame} 505 | 506 | \begin{frame}{Full Experiment: QNLI} 507 | 508 | Preview: Experimental results, pretraining for QNLI. 509 | 510 | \begin{figure} 511 | \centering 512 | \includegraphics[height=0.7\textheight]{Figs/graph.png} 513 | \label{fig:my_label} 514 | \end{figure} 515 | \end{frame} 516 | 517 | \begin{frame}{Related Result: Induction Heads (H3)} 518 | Synthetic \structure{induction head} experiment from \cite{dao2022hungry} 519 | 520 | \vspace{0.5cm} 521 | 522 | \centerline{a b c d e $\Rightarrow$ f g h i . . . x y z $\Rightarrow$ \ \ \ \ \textcolor{red}{f} } 523 | 524 | \begin{table}[t] 525 | \center 526 | \begin{tabular}{lcc} 527 | \toprule 528 | \centering 529 | Arch & Induction \\ 530 | \midrule 531 | \textsc{ssm} & 35.6 \\ 532 | \textsc{gating} + \textsc{ssm} & 100\\ 533 | \textsc{attention} & 100\\ 534 | \bottomrule 535 | \end{tabular} 536 | \caption{ } 537 | \label{tab:synthetic} 538 | \end{table} 539 | \end{frame} 540 | 541 | 542 | \begin{frame}{Induction Heads} 543 | 544 | \begin{columns} 545 | \begin{column}{0.5\textwidth} 546 | \begin{figure} 547 | \centering 548 | \includegraphics[height=0.8\textheight]{Figs/induct.png} 549 | 550 | \label{fig:my_label} 551 | \end{figure} 552 | \end{column} 553 | \begin{column}{0.5\textwidth} 554 | \begin{figure} 555 | \centering 556 | 557 | \includegraphics[height=0.3\textheight]{Figs/RASP.png} 558 | \label{fig:my_label} 559 | \end{figure} 560 | \end{column} 561 | \end{columns} 562 | 563 | \end{frame} 564 | 565 | 566 | 567 | % \begin{frame}{Gating} 568 | 569 | % \end{frame} 570 | 571 | % \begin{frame}{Simpler multiplicative Interactions} 572 | % \begin{figure} 573 | % \centering 574 | % % \includegraphics{Figs/model_architecture_comparison2.pdf} 575 | % \caption{Caption} 576 | % \label{fig:my_label} 577 | % \end{figure} 578 | % \end{frame} 579 | 580 | \section{Experiments} 581 | 582 | \begin{frame}{Outline} 583 | \tableofcontents[currentsection] 584 | \end{frame} 585 | 586 | 587 | \begin{frame}{\structure{Experiment 1:} BERT} 588 | \begin{itemize} 589 | \item Models trained using ``24 Hour'' BERT \cite{izsak2021train} 590 | \begin{itemize} 591 | \item All BERT-Large Size 592 | \item Training length (Short 11B, Medium 22B, Full >100B) 593 | \item 128 Length Sequences 594 | \end{itemize} 595 | 596 | \item Codebase in JAX (from Annotated S4 {\small \cite{rush2022s4}}) using S4D 597 | \item Training data and masking is identical 598 | 
\end{itemize} 599 | \end{frame} 600 | 601 | % \begin{frame}{Short Training $\sim$11B Tokens} 602 | % \begin{table} 603 | % \begin{tabular}{lc} 604 | % \toprule 605 | % Model & GLUE (Dev)\\ 606 | % \midrule 607 | % BERT & 84.1\\ 608 | % Stacked-SSM & 77.2 \\ 609 | % BiGS & 84.0 \\ 610 | % \bottomrule 611 | % \end{tabular} 612 | % \end{table} 613 | % \end{frame} 614 | 615 | \begin{frame}{Short Training $\sim$11B Tokens} 616 | \begin{table} 617 | \begin{tabular}{lc} 618 | \toprule 619 | Model & GLUE (Dev)\\ 620 | \midrule 621 | ELMo & 68.7 \\ 622 | BERT & 84.1\\ 623 | Stacked-SSM & 77.2 \\ 624 | BiGS & 84.0 \\ 625 | \bottomrule 626 | \end{tabular} 627 | \end{table} 628 | \end{frame} 629 | 630 | \begin{frame}{Is it just Gating?} 631 | \begin{table} 632 | \begin{tabular}{lc} 633 | \toprule 634 | Model & GLUE \\ 635 | \midrule 636 | BERT & 84.1\\ 637 | Gated-BERT & 82.6 \\ 638 | \bottomrule 639 | \end{tabular} 640 | \end{table} 641 | \end{frame} 642 | 643 | 644 | \begin{frame}{BERT Large > 100B Tokens} 645 | \begin{table} 646 | \begin{tabular}{lc} 647 | \toprule 648 | Model & GLUE (Test)\\ 649 | \midrule 650 | BERT-Large^* & 83.0\\ 651 | BiGS & 83.0 \\ 652 | \bottomrule 653 | \end{tabular} 654 | \end{table} 655 | \centerline{$^*$Best reported BERT-Large Results.} 656 | \end{frame} 657 | 658 | \begin{frame}{Analysis: Masked PPL Transfer} 659 | \begin{figure} 660 | \centering 661 | \includegraphics[width=0.6\textwidth]{Figs/MNLI.png} 662 | \end{figure} 663 | \end{frame} 664 | 665 | \begin{frame}{Analysis: Kernel Visualization} 666 | 667 | 668 | \begin{figure} 669 | \centering 670 | \includegraphics[width=\textwidth]{Figs/kernel1.png} 671 | \end{figure} 672 | 673 | \begin{itemize} 674 | \item Each BiGS layer only has 2 kernels (forward / backward). 675 | \item Shows \structure{all routing} in layer 2! (vs $O(HT^2)$ attention coef.) 676 | \end{itemize} 677 | \end{frame} 678 | 679 | \begin{frame}{Analysis: All Kernels} 680 | \begin{figure} 681 | \centering 682 | \includegraphics[height=0.6\textheight]{Figs/kernel2.png} 683 | \end{figure} 684 | \end{frame} 685 | 686 | \begin{frame}{Analysis: Change in Kernels during Finetuning } 687 | 688 | \centerline{Task: MNLI} 689 | \begin{figure} 690 | \centering 691 | \includegraphics[width=0.8\textwidth]{Figs/comparison_results.png} 692 | \end{figure} 693 | \end{frame} 694 | 695 | \begin{frame}{Analysis: Syntax} 696 | \begin{itemize} 697 | \item Observation: SSM model seems to do better on syntax-centric tasks 698 | \item Hypothesis: Locality of features encourages a stack-like inductive bias. 699 | \end{itemize} 700 | \end{frame} 701 | 702 | \begin{frame}{\structure{Observation 1}: COLA} 703 | \begin{table} 704 | \begin{tabular}{lc} 705 | \toprule 706 | Model & COLA \\ 707 | \midrule 708 | BERT & 60.5\\ 709 | BiGS & 64.7 \\ 710 | \bottomrule 711 | \end{tabular} 712 | \end{table} 713 | Statistically significant across runs. 714 | \end{frame} 715 | 716 | 717 | \begin{frame}{\structure{Observation 2}: Agreement Attractors} 718 | Task from \cite{linzen2016assessing,goldberg2019assessing}. 
719 | \vspace{0.5cm} 720 | 721 | \begin{quote} 722 | Yet the \textbf{ratio} of \underline{men} who survive to the \underline{women} and \underline{children} who survive [is] not clear in this story 723 | \end{quote} 724 | 725 | \begin{figure} 726 | \centering 727 | \includegraphics[height=0.5\textheight]{Figs/attractors.png} 728 | \label{fig:my_label} 729 | \end{figure} 730 | 731 | \end{frame} 732 | 733 | \begin{frame}{\structure{Observation 3}: Diagnostics } 734 | From \cite{marvin2018targeted,goldberg2019assessing}: 735 | \begin{table}[t] 736 | \centering 737 | \scriptsize 738 | \begin{tabular}{lrrr} 739 | \toprule 740 | & BiGS & BERT & LSTM \\ 741 | \midrule 742 | \textsl{SUBJECT-VERB:} & & & \\ 743 | Simple & 100.0 & 100.0& 94.0 \\ 744 | Sentential complement & 85.1 & 85.6 & 99.0 \\ 745 | Short VP coordination & 91.0 & 86.5 & 90.0 \\ 746 | Long VP coordination & 97.5 & 97.5 & 61.0 \\ 747 | Across prep phrase & 88.6 & 84.8 & 57.0 \\ 748 | Across subj relative clause & 88.4 & 84.9 & 56.0 \\ 749 | Across obj relative clause & 89.9 & 85.1 & 50.0 \\ 750 | Across obj relative (-that) & 86.9 & 81.1 & 52.0 \\ 751 | In obj relative clause & 97.2 & 99.1 & 84.0 \\ 752 | In obj relative (-that) & 88.7 & 81.6 & 71.0 \\ 753 | \midrule 754 | \textsl{REFL ANAPHORA:} & & & \\ 755 | Simple & 97.1 & 98.9 & 83.0 \\ 756 | In a sentential complement & 79.9 & 86.2 & 86.0 \\ 757 | Across a relative clause & 79.1 & 75.9 & 55.0 \\ 758 | \bottomrule 759 | \end{tabular} 760 | \end{table} 761 | \end{frame} 762 | 763 | 764 | \begin{frame}{\structure{Experiment 2:} Longformer} 765 | \begin{itemize} 766 | \item Can we lengthen SSM $L\rightarrow L'$ without approximation? 767 | 768 | \item Continued training based on Longformer protocol. 769 | 770 | \item Two experimental scales 771 | % \begin{itemize} 772 | % \item 128->512 SQuAD \cite{rajpurkar2016squad} 773 | % \item 128->4096 SCROLLS \cite{shaham2022scrolls} 774 | % \end{itemize} 775 | \end{itemize} 776 | \end{frame} 777 | 778 | % \begin{frame}{SQuAD} 779 | % \begin{table}[tb] 780 | % \centering 781 | % \begin{tabular}{ll|c} 782 | % \toprule 783 | % & & SQuAD 1.1 \\ 784 | % \midrule 785 | % BERT & (512) & 90.9\\ 786 | % \midrule 787 | % BERT &(128 $\rightarrow$ 512) & 87.3 \\ 788 | % BiGS & (128 $\rightarrow$ 512) & 89.5 \\ 789 | % \bottomrule 790 | % \end{tabular} 791 | % \caption{ } 792 | % \label{tab:squad} 793 | % \end{table} 794 | % \end{frame} 795 | 796 | 797 | 798 | \begin{frame}{SCROLLS} 799 | \begin{table}[tb] 800 | \centering 801 | \begin{tabular}{lr|cc} 802 | \toprule 803 | & Length & QALT & CNLI \\ 804 | \midrule 805 | LED & 1024 & 26.6/27.2 & 73.4\\ 806 | & 4096 & 26.6/27.3 & 71.5\\ 807 | & 16384 & 25.8/25.4 & 71.5\\ 808 | \midrule 809 | BART & 256 & 26.0/25.8 & 69.8\\ 810 | & 512 & 26.8/27.4 & 71.6\\ 811 | & 1024 & 26.0/25.9 & 77.4\\ 812 | \midrule 813 | BiGS & 128 & 32.3/30.0 & 68.7 \\ 814 | % BiGS & 1024 & & \\ 815 | & 4096 & 32.8/31.7 & 71.4 \\ 816 | \bottomrule 817 | \end{tabular} 818 | \caption{} 819 | \label{tab:scroll} 820 | \end{table} 821 | \end{frame} 822 | 823 | \begin{frame}{FLOPs} 824 | \begin{figure} 825 | \centering 826 | \includegraphics[height=0.5\textheight]{Figs/graph2.png} 827 | 828 | \label{fig:my_label} 829 | \end{figure} 830 | \end{frame} 831 | 832 | \begin{frame}{Related Results: H3 - SSM For Language Modeling} 833 | \begin{itemize} 834 | \item Alternative gating method for language modeling 835 | \item Use 2 attention layers + SSM and reach Transformer PPL. 836 | \item Efficient implementation targeting on GPUs. 
837 | \end{itemize} 838 | 839 | \blfootnote{\cite{dao2022hungry}} 840 | \end{frame} 841 | 842 | 843 | % \section{Next Steps} 844 | % \begin{frame}{Outline} 845 | % \tableofcontents[currentsection] 846 | % \end{frame} 847 | 848 | 849 | \begin{frame}{Next Steps} 850 | \begin{itemize} 851 | \item Attention may not be required? Simpler routing + gating. 852 | \item More analysis on feed-forward contribution. 853 | \item Transfer from pretraining unclear. 854 | \end{itemize} 855 | \end{frame} 856 | 857 | % \begin{frame} 858 | % \includegraphics[height=\textheight]{Figs/ModelSize0.jpg} 859 | % \end{frame} 860 | 861 | 862 | % \input{slides/bullets} 863 | % \input{slides/split} 864 | % \input{slides/figure} 865 | % \input{slides/centered} 866 | % \input{slides/monospace} 867 | % \input{slides/brackets} 868 | % \input{slides/link} 869 | \begin{frame}[allowframebreaks, label=current] 870 | \frametitle{References} 871 | \footnotesize 872 | \bibliographystyle{apalike} 873 | \bibliography{anthology.bib} 874 | \bibliography{ssm.bib} 875 | 876 | \end{frame} 877 | \end{document} 878 | -------------------------------------------------------------------------------- /p-notes.tex: -------------------------------------------------------------------------------- 1 | \newif\ifnotes\notestrue\input{presentation.tex} 2 | -------------------------------------------------------------------------------- /p.tex: -------------------------------------------------------------------------------- 1 | % Auriga theme 2 | % find the most up-to-date version here: https://github.com/anishathalye/auriga 3 | \newif\ifnotes\notesfalse\input{presentation.tex} 4 | -------------------------------------------------------------------------------- /presentation-netflix.tex: -------------------------------------------------------------------------------- 1 | % Auriga theme 2 | % find the most up-to-date version here: https://github.com/anishathalye/auriga 3 | 4 | \documentclass[14pt,aspectratio=169]{beamer} 5 | \usepackage{pgfpages} 6 | \usepackage{fancyvrb} 7 | \usepackage{tikz} 8 | \usepackage{pgfplots} 9 | \usepackage{booktabs} 10 | 11 | \usetheme{auriga} 12 | \usecolortheme{auriga} 13 | \setbeamercolor{math text}{fg=blue} 14 | 15 | \newcommand\blfootnote[1]{% 16 | \begingroup 17 | \renewcommand\thefootnote{}\footnote{#1}% 18 | \addtocounter{footnote}{-1}% 19 | \endgroup 20 | } 21 | 22 | %\setbeamertemplate{footline}[] 23 | %\renewcommand\footnotemark{} 24 | 25 | 26 | % define some colors for a consistent theme across slides 27 | \definecolor{red}{RGB}{181, 23, 0} 28 | \definecolor{blue}{RGB}{0, 118, 186} 29 | \definecolor{gray}{RGB}{146, 146, 146} 30 | 31 | \title{Pretraining Without Attention} 32 | 33 | \author{Junxiong Wang \and Jing Nathan Yan \and Albert Gu \and \underline{Sasha Rush} \inst{*}} 34 | 35 | \institute[shortinst]{\inst{*} Preprint} 36 | 37 | \begin{document} 38 | 39 | { 40 | % rather than use the frame options [noframenumbering,plain], we make the 41 | % color match, so that the indicated page numbers match PDF page numbers 42 | \setbeamercolor{page number in head/foot}{fg=background canvas.bg} 43 | \begin{frame} 44 | \titlepage 45 | \end{frame} 46 | } 47 | 48 | % \begin{frame}{Introduction - Sasha Rush} 49 | % \begin{itemize} 50 | % \item \structure{Associate Professor} - Cornell Tech 51 | % \item \structure{Researcher} - Hugging Face 52 | % \item \structure{Open Source Machine Learning} - @srush 53 | % \end{itemize} 54 | % \end{frame} 55 | 56 | 57 | % \begin{frame}{Transformer} 58 | % \begin{figure} 59 | % 
\centering 60 | % \includegraphics[height=0.6\textheight] 61 | % {Figs/transformer.png} 62 | % \end{figure} 63 | % \end{frame} 64 | 65 | % \begin{frame}{Transformer Self-Attention} 66 | % \begin{figure} 67 | % \centering 68 | % \includegraphics[height=0.8\textheight] 69 | % {Figs/attention.png} 70 | % \end{figure} 71 | % \end{frame} 72 | 73 | \section{Context} 74 | % \begin{frame}{Outline} 75 | % \tableofcontents 76 | % \end{frame} 77 | \begin{frame} 78 | \includegraphics[trim={10cm 0 10cm 0}, clip, height=\textheight]{Figs/ModelSize2.png} 79 | \end{frame} 80 | 81 | 82 | \begin{frame} 83 | \includegraphics[width=\textwidth]{Figs/Banana.png} 84 | \end{frame} 85 | 86 | % \begin{frame} 87 | % \includegraphics[width=\textwidth]{Figs/llama.png} 88 | % \end{frame} 89 | 90 | 91 | 92 | 93 | 94 | \begin{frame}{Caveats} 95 | \begin{itemize} 96 | \item LLMs are remarkable, we should use them for most things 97 | \item This talk is \structure{not} about LLMs 98 | \end{itemize} 99 | \end{frame} 100 | 101 | 102 | 103 | 104 | \begin{frame} 105 | \includegraphics[trim={10cm 0 10cm 0}, clip, height=\textheight]{Figs/ModelSize3.png} 106 | \end{frame} 107 | 108 | \begin{frame}{Context} 109 | \begin{itemize} 110 | \item BERT used to require non-trivial compute 111 | \item Belief: Open architecture questions in NLP 112 | \item Today's Talk: How important is \textit{attention}? 113 | \end{itemize} 114 | \end{frame} 115 | 116 | 117 | \begin{frame}{\textcolor{red}{ELMo} } 118 | 119 | \begin{columns} 120 | \begin{column}{0.3\linewidth} 121 | \centerline{Bidirectional RNN} 122 | \end{column} 123 | \begin{column}{0.7\linewidth} 124 | 125 | \begin{figure} 126 | \includegraphics[width=0.8\textwidth]{Figs/elmo.png} 127 | \end{figure} 128 | \end{column} 129 | \end{columns} 130 | \blfootnote{\cite{DBLP:conf/naacl/PetersNIGCLZ18}} 131 | 132 | \end{frame} 133 | 134 | \begin{frame}{\textcolor{red}{ELMo} For Pretraining} 135 | \begin{table} 136 | \begin{tabular}{lc} 137 | \toprule 138 | Model & GLUE\\ 139 | \midrule 140 | ELMo& 67.7 \\ 141 | ELMo+Attn& 71.0\\ 142 | \visible<2>{BERT-Base & 79 - 83} \\ 143 | \bottomrule 144 | \end{tabular} 145 | \end{table} 146 | \blfootnote{\cite{DBLP:conf/naacl/PetersNIGCLZ18, devlin2018bert}} 147 | \end{frame} 148 | 149 | \begin{frame}{Architecture?} 150 | \begin{itemize} 151 | \item 152 | Several confounding differences, e.g. frozen model. 153 | \item Followup: \textit{To Tune or Not to Tune? Adapting Pretrained Representations to Diverse Tasks} \cite{peters2019tune} 154 | \pause 155 | 156 | \item Conclusion: Transformers significantly beat BiLSTMs 157 | \end{itemize} 158 | \end{frame} 159 | 160 | \begin{frame}{Other Models} 161 | 162 | Maybe there are other models 163 | 164 | \vspace{0.5cm} 165 | 166 | \begin{itemize} 167 | \item Convolutions? 168 | \item Mixers? 169 | \end{itemize} 170 | 171 | % \textit{Are Pre-trained Convolutions Better than Pre-trained Transformers?} 172 | % \\ 173 | % \\ 174 | % Answer: No. 
175 | 176 | \end{frame} 177 | 178 | \begin{frame}{Pretraining with CNNs} 179 | \textit{Are Pre-trained Convolutions Better than Pre-trained Transformers?} \cite{tay2020efficient} 180 | 181 | \vspace{0.5cm} 182 | 183 | \visible<2>{\structure{Answer: No.} 184 | 185 | \begin{table} 186 | \begin{tabular}{lc} 187 | \toprule 188 | Model & SST-2\\ 189 | \midrule 190 | ELMo & 91.8 \\ 191 | Best CNN & 92.2 \\ 192 | BERT-Base & 93.5 \\ 193 | \bottomrule 194 | \end{tabular} 195 | \end{table} 196 | 197 | } 198 | 199 | \end{frame} 200 | 201 | 202 | % \begin{frame}{Results: CNNs} 203 | % \begin{table} 204 | % \begin{tabular}{lc} 205 | % \toprule 206 | % Model & SST-2\\ 207 | % \midrule 208 | % Best CNN & 92.2 \\ 209 | % ELMo & 91.8 \\ 210 | % BERT-Base & 93.5 \\ 211 | % \bottomrule 212 | % \end{tabular} 213 | % \end{table} 214 | % \end{frame} 215 | 216 | \begin{frame}{Pretraining with FNet} 217 | \textit{FNet: Mixing Tokens with Fourier Transforms} \cite{lee2021fnet} 218 | 219 | \vspace{0.5cm} 220 | 221 | Replaces attention with 2D FFT mixing-layer. 222 | 223 | \visible<2>{ 224 | \begin{table} 225 | \begin{tabular}{lc} 226 | \toprule 227 | Model & GLUE (dev)\\ 228 | \midrule 229 | Best FNet & 76.3 \\ 230 | BERT-Base & 83.3 \\ 231 | \bottomrule 232 | \end{tabular} 233 | \end{table} 234 | } 235 | \end{frame} 236 | 237 | 238 | 239 | \begin{frame}{Transformers are Great...} 240 | \begin{itemize} 241 | \item Highly optimized training 242 | \item Long-range ability 243 | \item Expensive $O(n^2)$, but we have the money... 244 | \end{itemize} 245 | \vspace{0.5cm} 246 | 247 | \visible<2>{(But aren't you curious...)} 248 | \end{frame} 249 | 250 | \section{State Space Models} 251 | \begin{frame}{Outline} 252 | \tableofcontents[currentsection] 253 | \end{frame} 254 | 255 | 256 | \begin{frame}{State Space Models (SSM)} 257 | \begin{itemize} 258 | 259 | \item Think hybrid RNN / CNN 260 | 261 | \item SOTA on speech generation and long-range tasks 262 | 263 | \item Tutorial at \textit{The Annotated S4} 264 | \end{itemize} 265 | 266 | \blfootnote{\cite{gu2020hippo,gu2021combining,gu2021efficiently}} 267 | \end{frame} 268 | 269 | 270 | \begin{frame}{State Space Model - Continuous Time} 271 | Let $u(t) \in \mathbb{R}$ be a continuous input and $y(t) \in \mathbb{R}$ be output. 272 | 273 | \pause 274 | \vspace{0.5cm} 275 | 276 | SSM is a differential equation. 277 | \begin{align*} 278 | \boldsymbol{x}'(t) &= \boldsymbol{A}\boldsymbol{x}(t) + \boldsymbol{B}u(t) \\ 279 | y(t) &= \boldsymbol{C}\boldsymbol{x}(t) + \boldsymbol{D}u(t). 280 | \end{align*} 281 | 282 | \pause 283 | Where $\boldsymbol{x}(t) \in \mathbf{R}^N$ is a hidden state and model \structure{parameters}, 284 | 285 | $$\boldsymbol{A} \in \mathbb{R}^{N\times N}, \boldsymbol{B}\in \mathbb{R}^{N \times 1}, \boldsymbol{C} \in \mathbb{R}^{1 \times N}, \boldsymbol{D} \in \mathbb{R}^{1\times 1}$$ 286 | 287 | \end{frame} 288 | \begin{frame}{Discrete Time Sequence} 289 | 290 | Goal: Map scalar sequence $u_{1}, \ldots, u_L$ to $y_1, \ldots, y_L$, 291 | 292 | \begin{figure} 293 | \centering 294 | \includegraphics[width=0.5\textwidth]{Figs/SSMStart.pdf} 295 | \label{fig:my_label} 296 | \end{figure} 297 | \end{frame} 298 | 299 | \begin{frame}{Discrete Time SSM} 300 | 301 | SSM on discretize time data, 302 | 303 | \begin{align*} 304 | \boldsymbol{x}_{k} &= \boldsymbol{\overline{A}} \boldsymbol{x}_{k-1} + \boldsymbol{\overline{B}} u_k \\ 305 | y_k &= \boldsymbol{\overline{C}} \boldsymbol{x}_{k \phantom{- 1}} + \boldsymbol{\overline{D}} u_k. 
306 | \end{align*} 307 | 308 | Using discretization with (learned) sampling rate parameter $\Delta$, 309 | 310 | $$\boldsymbol{\overline{A}}, \boldsymbol{\overline{B}}, \boldsymbol{\overline{C}} = \text{discretize}(\boldsymbol{A}, \boldsymbol{B}, \boldsymbol{C}, \Delta )$$ 311 | 312 | \end{frame} 313 | 314 | \begin{frame}{Recurrent Form} 315 | 316 | Output sequence $y_1, \ldots, y_L$ can be computed as a linear RNN, 317 | 318 | \begin{align*} 319 | \boldsymbol{x}_{k} &= \boldsymbol{\overline{A}} \boldsymbol{x}_{k-1} + \boldsymbol{\overline{B}} u_k \\ 320 | y_k &= \boldsymbol{\overline{C}} \boldsymbol{x}_{k \phantom{- 1}} + \boldsymbol{\overline{D}} u_k. 321 | \end{align*} 322 | 323 | Note $\boldsymbol{x}_k \in \mathbb{R}^N$ is the bigger hidden state for $u_k \in \mathbb{R}$, and $\boldsymbol{x}_0 = \mathbf{0}$. 324 | 325 | \end{frame} 326 | 327 | \begin{frame}{Convolutional Form} 328 | 329 | Alternative: 1D convolution with kernel $\boldsymbol{\overline{K}}$ (width $L$), 330 | 331 | \begin{align*} 332 | \overline{K} &= (\boldsymbol{\overline{C}}\boldsymbol{\overline{B}}, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}\boldsymbol{\overline{B}}, \dots, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}^{L-1}\boldsymbol{\overline{B}}) \\ 333 | y &= \text{conv1d}(\overline{K}_L \ldots \overline{K}_1, u_1 \ldots u_L) 334 | \end{align*} 335 | 336 | Intuition: 337 | \pause 338 | $$y_1 = \boldsymbol{\overline{C}} \boldsymbol{\overline{B}} u_1$$ 339 | \pause 340 | $$y_2 = \boldsymbol{\overline{C}} \boldsymbol{\overline{A}} \boldsymbol{\overline{B}} u_1 + \boldsymbol{\overline{C}} \boldsymbol{\overline{B}} u_2 = \boldsymbol{\overline{C}} (\boldsymbol{\overline{A}} \boldsymbol{\overline{B}} u_1 + \boldsymbol{\overline{B}} u_2) = \boldsymbol{\overline{C}} (\boldsymbol{x}_1 + \boldsymbol{\overline{B}} u_2) $$ 341 | \end{frame} 342 | 343 | \begin{frame}{Convolutional Form} 344 | Step 1: Discretize (Training Only). Step 2: Apply 1D Conv 345 | \begin{figure} 346 | \centering 347 | \includegraphics[width=0.6\textwidth]{Figs/SSMSide.pdf} 348 | \label{fig:my_label} 349 | \end{figure} 350 | \end{frame} 351 | 352 | \begin{frame}{Implementation - Computing Kernel} 353 | 354 | $$\boldsymbol{\overline{K}} = (\boldsymbol{\overline{C}}\boldsymbol{\overline{B}}, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}\boldsymbol{\overline{B}}, \dots, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}^{L-1}\boldsymbol{\overline{B}}) $$ 355 | 356 | \begin{itemize} 357 | \item Simple approximations work well (See S4D, DSS) 358 | \end{itemize} 359 | \blfootnote{\cite{gu2021efficiently,gupta2022diagonal,gu2022parameterization}} 360 | \end{frame} 361 | 362 | 363 | \begin{frame}{Implementation - Fourier Transform} 364 | \begin{align*} 365 | &y = \boldsymbol{\overline{K}} \ast u 366 | \end{align*} 367 | \begin{itemize} 368 | \item At long $L$, convolution computed with FFT. 369 | \item More efficient than self-attention or standard RNN. 
370 | \end{itemize} 371 | \end{frame} 372 | 373 | 374 | \begin{frame}{Important Training Initialization} 375 | \begin{itemize} 376 | \item Parameter $\boldsymbol{A}$ is initialized with HiPPO Matrix \cite{gu2020hippo} 377 | 378 | % \begin{scriptsize} 379 | % \begin{align*} 380 | % \boldsymbol{A}_{nk}= - 381 | % \begin{cases} 382 | % (2n+1)^{1/2}(2k+1)^{1/2} & \text{if } n > k \\ n+1 &\text{if } n=k \text{\ else\ } 0 383 | % \end{cases} 384 | % \end{align*} 385 | % \end{scriptsize} 386 | 387 | 388 | \item Kernel formed by Legendre coefficients 389 | \end{itemize} 390 | \begin{figure} 391 | \centering 392 | \includegraphics[width=0.7\textwidth]{Figs/hippo.png} 393 | \end{figure} 394 | \end{frame} 395 | 396 | 397 | 398 | \begin{frame}{Summary: SSM} 399 | \begin{itemize} 400 | \item Mapping from sequence-to-sequence 401 | \item Acts like an RNN, Computed like a CNN 402 | \item Fast to train and utilize 403 | \end{itemize} 404 | \end{frame} 405 | 406 | \section{Model Architectures} 407 | \begin{frame}{Outline} 408 | \tableofcontents[currentsection] 409 | \end{frame} 410 | 411 | \begin{frame}{Objective: Replicate BERT with SSM} 412 | \begin{itemize} 413 | \item Everything else identical (loss, number of parameters, data) 414 | \end{itemize} 415 | \end{frame} 416 | 417 | % \begin{frame}{Architectures for Pretraining} 418 | % \begin{itemize} 419 | % \item Idea 1: Just replace self-attention 420 | % \item Minimal change to Transformer arch 421 | % \end{itemize} 422 | % \end{frame} 423 | 424 | 425 | \begin{frame}{\structure{Naive Idea}: Self-attention $\Rightarrow$ SSM} 426 | \begin{figure} 427 | \centering 428 | \includegraphics[height=0.8\textheight,trim={0 0 18cm 0},clip]{Figs/model_architecture_comparison2.pdf} 429 | \caption{} 430 | \label{} 431 | \end{figure} 432 | \end{frame} 433 | 434 | \begin{frame}{Can this work?} 435 | \begin{itemize} 436 | \item SSM is significantly less expressive than self-attention. 437 | \item Static routing through the model like a CNN. 438 | \item Can it learn to do \structure{matching} across sentences? 439 | \end{itemize} 440 | \pause 441 | \vspace{0.5cm} 442 | 443 | 444 | 445 | 446 | \end{frame} 447 | 448 | 449 | \begin{frame}{Test: Matching Across Gaps} 450 | \centerline{Task: QNLI \cite{wang2018glue}} 451 | \vspace{0.5cm} 452 | 453 | 454 | \centerline{\textcolor{red}{What percentage of farmland grows wheat?}} 455 | 456 | \centerline{$\sim \sim \sim $} 457 | 458 | \centerline{\textcolor{olivegreen}{More than 50\% of this area is sown for wheat and 33\% for barley.}} 459 | 460 | \pause 461 | 462 | \begin{table}[t] 463 | \center 464 | \begin{tabular}{ccc} 465 | \toprule 466 | \centering 467 | Arch & \textcolor{red}{H} P & \textcolor{red}{H} $\sim$ P \\ 468 | \midrule 469 | \textsc{stack} / \textsc{ssm} & 77.4 & 69.7\\ 470 | % \textsc{gated} / \textsc{ssm} & 77.4 & 77.7\\ 471 | \bottomrule 472 | \end{tabular} 473 | \caption{} 474 | \label{tab:synthetic} 475 | \end{table} 476 | \end{frame} 477 | 478 | 479 | 480 | % \begin{frame}{Does this work} 481 | 482 | % \end{frame} 483 | 484 | 485 | 486 | \begin{frame}{\structure{Proposed Fix}: Multiplicative Gating} 487 | 488 | Add dynamism to stacked model with multiplicative gating. 489 | 490 | $$\sigma(\mathbf{W} \mathbf{u}) \otimes (\mathbf{V} \mathbf{u})$$ 491 | 492 | Positive results with CNN, Transformer, and SSM models. 
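% A tiny NumPy sketch of the gate above (shapes are assumptions: W and V are
% d x d, u is a length-d vector; in practice the gate is applied position-wise
% over the sequence):
%
%   import numpy as np
%
%   def gated_unit(u, W, V):
%       gate = 1.0 / (1.0 + np.exp(-(W @ u)))   # sigma(W u)
%       return gate * (V @ u)                   # elementwise product with V u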
493 |
494 |
495 | \blfootnote{\cite{dauphin2017language, shazeer2020glu, narang2021transformer}}
496 |
497 | \end{frame}
498 |
499 | \begin{frame}{Proposed Architecture: BiGS}
500 | \begin{figure}
501 | \centering
502 | \includegraphics[height=0.7\textheight,trim={16cm 0 0 0},clip]{Figs/model_architecture_comparison2.pdf}
503 | \caption{}
504 | \label{fig:my_label}
505 | \end{figure}
506 | \end{frame}
507 |
508 | \begin{frame}{Gating Adaptation}
509 | \centerline{\textcolor{red}{What percentage of farmland grows wheat?}}
510 |
511 | \centerline{$\sim \sim \sim $}
512 |
513 | \centerline{\textcolor{olivegreen}{More than 50\% of this area is sown for wheat and 33\% for barley.}}
514 |
515 |
516 | \begin{table}[t]
517 | \center
518 | \begin{tabular}{lcc}
519 | \toprule
520 | \centering
521 | Arch & \textcolor{red}{H} P & \textcolor{red}{H} $\sim$ P \\
522 | \midrule
523 | \textsc{stack} / \textsc{ssm} & 77.4 & 69.7\\
524 | \textsc{gated} / \textsc{ssm} & 77.4 & 77.7\\
525 | \bottomrule
526 | \end{tabular}
527 | \caption{ }
528 | \label{tab:synthetic}
529 | \end{table}
530 | \pause
531 |
532 |
533 | \end{frame}
534 |
535 | \begin{frame}{Full Experiment: QNLI}
536 |
537 | A preview of the experimental results: pretrained models evaluated on QNLI.
538 |
539 | \begin{figure}
540 | \centering
541 | \includegraphics[height=0.7\textheight]{Figs/graph.png}
542 | \label{fig:my_label}
543 | \end{figure}
544 | \end{frame}
545 |
546 | \begin{frame}{Related Result: Induction Heads (H3)}
547 | Synthetic \structure{induction head} experiment from \cite{dao2022hungry}.
548 |
549 | \vspace{0.5cm}
550 |
551 | \centerline{a b c d e $\Rightarrow$ f g h i . . . x y z $\Rightarrow$ \ \ \ \ \textcolor{red}{f} }
552 |
553 | \begin{table}[t]
554 | \center
555 | \begin{tabular}{lc}
556 | \toprule
557 | \centering
558 | Arch & Induction \\
559 | \midrule
560 | \textsc{ssm} & 35.6 \\
561 | \textsc{gating} + \textsc{ssm} & 100\\
562 | \textsc{attention} & 100\\
563 | \bottomrule
564 | \end{tabular}
565 | \caption{ }
566 | \label{tab:synthetic}
567 | \end{table}
568 | \end{frame}
569 |
570 |
571 | \begin{frame}{Induction Heads}
572 |
573 | \begin{columns}
574 | \begin{column}{0.5\textwidth}
575 | \begin{figure}
576 | \centering
577 | \includegraphics[height=0.8\textheight]{Figs/induct.png}
578 |
579 | \label{fig:my_label}
580 | \end{figure}
581 | \end{column}
582 | \begin{column}{0.5\textwidth}
583 | \begin{figure}
584 | \centering
585 |
586 | \includegraphics[height=0.3\textheight]{Figs/RASP.png}
587 | \label{fig:my_label}
588 | \end{figure}
589 | \end{column}
590 | \end{columns}
591 |
592 | \end{frame}
593 |
594 |
595 |
596 | % \begin{frame}{Gating}
597 |
598 | % \end{frame}
599 |
600 | % \begin{frame}{Simpler multiplicative Interactions}
601 | % \begin{figure}
602 | % \centering
603 | % % \includegraphics{Figs/model_architecture_comparison2.pdf}
604 | % \caption{Caption}
605 | % \label{fig:my_label}
606 | % \end{figure}
607 | % \end{frame}
608 |
609 | \section{Experiments}
610 |
611 | \begin{frame}{Outline}
612 | \tableofcontents[currentsection]
613 | \end{frame}
614 |
615 |
616 | \begin{frame}{\structure{Experiment 1:} BERT}
617 | \begin{itemize}
618 | \item Models trained using ``24 Hour'' BERT \cite{izsak2021train}
619 | \begin{itemize}
620 | \item All BERT-Large Size
621 | \item Training length (Short 11B, Medium 22B, Full $>$100B)
622 | \item Sequence length 128
623 | \end{itemize}
624 |
625 | \item Codebase in JAX (from Annotated S4 {\small \cite{rush2022s4}}) using S4D
626 | \item Training data and masking are identical
627 |
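% Rough sketch of a gated bidirectional SSM block in the spirit of the BiGS
% architecture shown earlier (illustrative only, not the exact BiGS wiring; the
% projections W, V and the placement of norms/nonlinearities are assumptions).
% It reuses ssm_conv from the earlier kernel sketch.
%
%   def bidirectional_gated_block(X, Kf, Kb, W, V):
%       # X: (L, d) token states; Kf, Kb: (L, d) forward/backward kernels
%       d = X.shape[1]
%       f = np.stack([ssm_conv(Kf[:, i], X[:, i]) for i in range(d)], axis=1)
%       b = np.stack([ssm_conv(Kb[:, i], X[::-1, i])[::-1] for i in range(d)], axis=1)
%       gate = 1.0 / (1.0 + np.exp(-(X @ W)))   # sigma(X W), elementwise
%       return X + gate * ((f + b) @ V)         # static routing + gating + residual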
\end{itemize}
628 | \end{frame}
629 |
630 | % \begin{frame}{Short Training $\sim$11B Tokens}
631 | % \begin{table}
632 | % \begin{tabular}{lc}
633 | % \toprule
634 | % Model & GLUE (Dev)\\
635 | % \midrule
636 | % BERT & 84.1\\
637 | % Stacked-SSM & 77.2 \\
638 | % BiGS & 84.0 \\
639 | % \bottomrule
640 | % \end{tabular}
641 | % \end{table}
642 | % \end{frame}
643 |
644 | \begin{frame}{Short Training $\sim$11B Tokens}
645 | \begin{table}
646 | \begin{tabular}{lc}
647 | \toprule
648 | Model & GLUE (Dev)\\
649 | \midrule
650 | ELMo & 68.7 \\
651 | BERT & 84.1\\
652 | Stacked-SSM & 77.2 \\
653 | BiGS & 84.0 \\
654 | \bottomrule
655 | \end{tabular}
656 | \end{table}
657 | \end{frame}
658 |
659 | \begin{frame}{Is it just Gating?}
660 | \begin{table}
661 | \begin{tabular}{lc}
662 | \toprule
663 | Model & GLUE \\
664 | \midrule
665 | BERT & 84.1\\
666 | Gated-BERT & 82.6 \\
667 | \bottomrule
668 | \end{tabular}
669 | \end{table}
670 | \end{frame}
671 |
672 |
673 | \begin{frame}{BERT Large $>$ 100B Tokens}
674 | \begin{table}
675 | \begin{tabular}{lc}
676 | \toprule
677 | Model & GLUE (Test)\\
678 | \midrule
679 | BERT-Large$^*$ & 83.0\\
680 | BiGS & 83.0 \\
681 | \bottomrule
682 | \end{tabular}
683 | \end{table}
684 | \centerline{$^*$Best reported BERT-Large results.}
685 | \end{frame}
686 |
687 | \begin{frame}{Analysis: Masked PPL Transfer}
688 | \begin{figure}
689 | \centering
690 | \includegraphics[width=0.6\textwidth]{Figs/MNLI.png}
691 | \end{figure}
692 | \end{frame}
693 |
694 | \begin{frame}{Analysis: Kernel Visualization}
695 |
696 |
697 | \begin{figure}
698 | \centering
699 | \includegraphics[width=\textwidth]{Figs/Kernel1.png}
700 | \end{figure}
701 |
702 | \begin{itemize}
703 | \item Each BiGS layer only has 2 kernels (forward / backward).
704 | \item Shows \structure{all routing} in layer 2! (vs $O(HT^2)$ attention coefficients)
705 | \end{itemize}
706 |
707 |
708 | \end{frame}
709 |
710 | \begin{frame}{Analysis: All Kernels}
711 | \begin{figure}
712 | \centering
713 | \includegraphics[height=0.6\textheight]{Figs/kernel2.png}
714 | \end{figure}
715 | \end{frame}
716 |
717 | \begin{frame}{Analysis: Change in Kernels during Finetuning}
718 |
719 | \centerline{Task: MNLI}
720 | \begin{figure}
721 | \centering
722 | \includegraphics[width=0.8\textwidth]{Figs/comparison_results.png}
723 | \end{figure}
724 | \end{frame}
725 |
726 | \begin{frame}{Analysis: Syntax}
727 | \begin{itemize}
728 | \item Observation: The SSM model seems to do better on syntax-centric tasks.
729 | \item Hypothesis: Locality of features encourages a stack-like inductive bias.
730 | \end{itemize}
731 | \end{frame}
732 |
733 | \begin{frame}{\structure{Observation 1}: CoLA}
734 | \begin{table}
735 | \begin{tabular}{lc}
736 | \toprule
737 | Model & CoLA \\
738 | \midrule
739 | BERT & 60.5\\
740 | BiGS & 64.7 \\
741 | \bottomrule
742 | \end{tabular}
743 | \end{table}
744 | Statistically significant across runs.
745 | \end{frame}
746 |
747 |
748 | \begin{frame}{\structure{Observation 2}: Agreement Attractors}
749 | Task from \cite{linzen2016assessing,goldberg2019assessing}.
750 | \vspace{0.5cm}
751 |
752 | \begin{quote}
753 | Yet the \textbf{ratio} of \underline{men} who survive to the \underline{women} and \underline{children} who survive [is] not clear in this story
754 | \end{quote}
755 |
756 | \begin{figure}
757 | \centering
758 | \includegraphics[height=0.5\textheight]{Figs/attractors.png}
759 | \label{fig:my_label}
760 | \end{figure}
761 |
762 | \end{frame}
763 |
764 | \begin{frame}{\structure{Observation 3}: Diagnostics}
765 | From \cite{marvin2018targeted,goldberg2019assessing}:
766 | \begin{table}[t]
767 | \centering
768 | \scriptsize
769 | \begin{tabular}{lrrr}
770 | \toprule
771 | & BiGS & BERT & LSTM \\
772 | \midrule
773 | \textsl{SUBJECT-VERB:} & & & \\
774 | Simple & 100.0 & 100.0 & 94.0 \\
775 | Sentential complement & 85.1 & 85.6 & 99.0 \\
776 | Short VP coordination & 91.0 & 86.5 & 90.0 \\
777 | Long VP coordination & 97.5 & 97.5 & 61.0 \\
778 | Across prep phrase & 88.6 & 84.8 & 57.0 \\
779 | Across subj relative clause & 88.4 & 84.9 & 56.0 \\
780 | Across obj relative clause & 89.9 & 85.1 & 50.0 \\
781 | Across obj relative (-that) & 86.9 & 81.1 & 52.0 \\
782 | In obj relative clause & 97.2 & 99.1 & 84.0 \\
783 | In obj relative (-that) & 88.7 & 81.6 & 71.0 \\
784 | \midrule
785 | \textsl{REFL ANAPHORA:} & & & \\
786 | Simple & 97.1 & 98.9 & 83.0 \\
787 | In a sentential complement & 79.9 & 86.2 & 86.0 \\
788 | Across a relative clause & 79.1 & 75.9 & 55.0 \\
789 | \bottomrule
790 | \end{tabular}
791 | \end{table}
792 | \end{frame}
793 |
794 |
795 | \begin{frame}{\structure{Experiment 2:} Longformer}
796 | \begin{itemize}
797 | \item Can we lengthen SSM $L\rightarrow L'$ without approximation?
798 |
799 | \item Continued training based on the Longformer protocol.
800 |
801 | \item Two experimental scales
802 | % \begin{itemize}
803 | % \item 128->512 SQuAD \cite{rajpurkar2016squad}
804 | % \item 128->4096 SCROLLS \cite{shaham2022scrolls}
805 | % \end{itemize}
806 | \end{itemize}
807 | \end{frame}
808 |
809 | % \begin{frame}{SQuAD}
810 | % \begin{table}[tb]
811 | % \centering
812 | % \begin{tabular}{ll|c}
813 | % \toprule
814 | % & & SQuAD 1.1 \\
815 | % \midrule
816 | % BERT & (512) & 90.9\\
817 | % \midrule
818 | % BERT &(128 $\rightarrow$ 512) & 87.3 \\
819 | % BiGS & (128 $\rightarrow$ 512) & 89.5 \\
820 | % \bottomrule
821 | % \end{tabular}
822 | % \caption{ }
823 | % \label{tab:squad}
824 | % \end{table}
825 | % \end{frame}
826 |
827 |
828 |
829 | \begin{frame}{SCROLLS}
830 | \begin{table}[tb]
831 | \centering
832 | \begin{tabular}{lr|cc}
833 | \toprule
834 | & Length & QALT & CNLI \\
835 | \midrule
836 | LED & 1024 & 26.6/27.2 & 73.4\\
837 | & 4096 & 26.6/27.3 & 71.5\\
838 | & 16384 & 25.8/25.4 & 71.5\\
839 | \midrule
840 | BART & 256 & 26.0/25.8 & 69.8\\
841 | & 512 & 26.8/27.4 & 71.6\\
842 | & 1024 & 26.0/25.9 & 77.4\\
843 | \midrule
844 | BiGS & 128 & 32.3/30.0 & 68.7 \\
845 | % BiGS & 1024 & & \\
846 | & 4096 & 32.8/31.7 & 71.4 \\
847 | \bottomrule
848 | \end{tabular}
849 | \caption{}
850 | \label{tab:scroll}
851 | \end{table}
852 | \end{frame}
853 |
854 | \begin{frame}{FLOPs}
855 | \begin{figure}
856 | \centering
857 | \includegraphics[height=0.5\textheight]{Figs/graph2.png}
858 |
859 | \label{fig:my_label}
860 | \end{figure}
861 | \end{frame}
862 |
863 | \begin{frame}{Related Results: H3 - SSM For Language Modeling}
864 | \begin{itemize}
865 | \item Alternative gating method for language modeling
866 | \item Uses 2 attention layers + SSM and reaches Transformer PPL.
867 | \item Efficient implementation targeting GPUs.
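% Rough sketch of the H3 layer described in the cited paper (my paraphrase;
% the projection names and the stand-in functions shift_ssm / diag_ssm for its
% two SSM sub-layers are assumptions): two SSMs plus multiplicative
% interactions supply the "recall the previous token" and "compare across the
% sequence" abilities that a plain SSM layer lacks.
%
%   def h3_layer(X, W_q, W_k, W_v):            # X: (L, d)
%       q, k, v = X @ W_q, X @ W_k, X @ W_v
%       s = shift_ssm(k) * v                   # shift SSM: local recall
%       return q * diag_ssm(s)                 # diagonal SSM: aggregate over time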
868 | \end{itemize}
869 |
870 | \blfootnote{\cite{dao2022hungry}}
871 | \end{frame}
872 |
873 |
874 | % \section{Next Steps}
875 | % \begin{frame}{Outline}
876 | % \tableofcontents[currentsection]
877 | % \end{frame}
878 |
879 |
880 | \begin{frame}{Next Steps}
881 | \begin{itemize}
882 | \item Attention may not be required? Simpler routing + gating.
883 | \item More analysis on the feed-forward contribution.
884 | \item Transfer from pretraining unclear.
885 | \end{itemize}
886 | \end{frame}
887 |
888 | % \begin{frame}
889 | % \includegraphics[height=\textheight]{Figs/ModelSize0.jpg}
890 | % \end{frame}
891 |
892 |
893 | % \input{slides/bullets}
894 | % \input{slides/split}
895 | % \input{slides/figure}
896 | % \input{slides/centered}
897 | % \input{slides/monospace}
898 | % \input{slides/brackets}
899 | % \input{slides/link}
900 | \begin{frame}[allowframebreaks]
901 | \frametitle{References}
902 | \footnotesize
903 | \bibliographystyle{apalike}
904 | \bibliography{anthology.bib}
905 | \end{frame}
906 | \end{document}
907 |
-------------------------------------------------------------------------------- /presentation.tex: --------------------------------------------------------------------------------
1 | % Auriga theme
2 | % find the most up-to-date version here: https://github.com/anishathalye/auriga
3 |
4 | \documentclass[14pt,aspectratio=169]{beamer}
5 | \usepackage{pgfpages}
6 | \usepackage{fancyvrb}
7 | \usepackage{tikz}
8 | \usepackage{tikz-qtree}
9 |
10 | \usepackage{pgfplots}
11 |
12 | \usepackage{booktabs}
13 | \usepackage[normalem]{ulem}
14 |
15 |
16 | \usetheme{auriga}
17 | \usecolortheme{auriga}
18 | %\setbeamercolor{math text}{fg=blue}
19 |
20 | \newcommand\blfootnote[1]{%
21 | \begingroup
22 | \renewcommand\thefootnote{}\footnote{#1}%
23 | \addtocounter{footnote}{-1}%
24 | \endgroup
25 | }
26 |
27 | %\setbeamertemplate{footline}[]
28 | %\renewcommand\footnotemark{}
29 |
30 | \setbeamertemplate{footline}[frame number]
31 |
32 | % define some colors for a consistent theme across slides
33 | \definecolor{red}{RGB}{181, 23, 0}
34 | \definecolor{blue}{RGB}{0, 118, 186}
35 | \definecolor{gray}{RGB}{146, 146, 146}
36 | \definecolor{orange}{RGB}{255, 165, 0}
37 | \definecolor{green}{RGB}{0, 128, 0}
38 | % Create a slide for each section
39 | \AtBeginSection[]{
40 | \begin{frame}
41 | \vfill
42 | \centering
43 | \begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title}
44 | \usebeamerfont{title}\insertsectionhead\par%
45 | \end{beamercolorbox}
46 | \vfill
47 | \end{frame}
48 | }
49 |
50 | \title{Do we need \textcolor{blue}{Attention}?}
51 |
52 | \author{Presented by Sasha Rush}
53 |
54 | % \institute[shortinst]{}
55 |
56 | \begin{document}
57 |
58 | {
59 | % rather than use the frame options [noframenumbering,plain], we make the
60 | % color match, so that the indicated page numbers match PDF page numbers
61 | \setbeamercolor{page number in head/foot}{fg=background canvas.bg}
62 | \begin{frame}
63 | \titlepage
64 | \end{frame}
65 | }
66 |
67 | \begin{frame}[label=c]{}
68 | \textit{
69 | This talk is a survey of work done by:
70 | }
71 |
72 | \begin{center}
73 | Albert Gu, Ankit Gupta, Tri Dao, Dan Fu, Shuangfei Zhai, Antonio Orvieto, Michael Poli, Chris Re, Yuhong Li, Tianle Cai, Harsh Mehta, Jimmy Smith, Scott Linderman, Xuezhe Ma, Chunting Zhou, Xiang Kong, Bo Peng, Eric Alcaide, Quentin Anthony, Andrew Warrington, Yi Zhang, Stefano Massaroli, \\and many others
74 | \end{center}
75 |
76 | \end{frame}
77 |
78 | \section{Preface: Transformers and Attention}
79 |
80 |
\input{02-transformers} 81 | 82 | \section{The Challenge} 83 | \input{01-intro} 84 | 85 | \section{An RNN Revival} 86 | \input{03-RNN} 87 | 88 | \section{Are we GPT yet?} 89 | \input{03.5-Results} 90 | 91 | % \section{Computation and Parameterization} 92 | % \input{05-Extensions} 93 | 94 | 95 | \section{Scaling Linear RNNs} 96 | \input{06-final} 97 | 98 | 99 | % \section{Practicalities} 100 | 101 | 102 | 103 | 104 | % \input{slides/bullets} 105 | % \input{slides/split} 106 | % \input{slides/figure} 107 | % \input{slides/centered} 108 | % \input{slides/monospace} 109 | % \input{slides/brackets} 110 | % \input{slides/link} 111 | \begin{frame}[allowframebreaks] 112 | \frametitle{References} 113 | \footnotesize 114 | \bibliographystyle{apalike} 115 | \bibliography{ssm.bib,anthology.bib} 116 | \end{frame} 117 | \end{document} 118 | -------------------------------------------------------------------------------- /slides/brackets.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{A slide with some bracketed text} 2 | 3 | \begin{itemize} 4 | \item Some statement {\color{gray} [Some citation]} 5 | \item Another statement {\color{gray} [Another citation]} 6 | \item A final statement {\color{gray} [The last citation]} 7 | \end{itemize} 8 | 9 | \vspace{3ex} 10 | \begin{center} 11 | \scriptsize (a small note) 12 | \end{center} 13 | 14 | \end{frame} 15 | 16 | -------------------------------------------------------------------------------- /slides/bullets.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{A slide title} 2 | 3 | \begin{itemize} 4 | \item A bulleted item 5 | \item Another item 6 | \begin{itemize} 7 | \item With sub-bullets 8 | \item And another, with some \textbf{bold} text 9 | \end{itemize} 10 | \item And another, at the top level, with \textit{italic} text 11 | \end{itemize} 12 | 13 | \note{ 14 | Here's a note for this slide. 15 | } 16 | 17 | \end{frame} 18 | -------------------------------------------------------------------------------- /slides/centered.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{A slide with centered text} 2 | 3 | \begin{center} 4 | Some statement that is centered. 
5 | \end{center} 6 | 7 | \vspace{2ex} 8 | \begin{center} 9 | \scriptsize (a small note) 10 | \end{center} 11 | 12 | \end{frame} 13 | -------------------------------------------------------------------------------- /slides/figure.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{Full-slide figure} 2 | 3 | \begin{figure} 4 | \centering 5 | \begin{tikzpicture}[scale=0.5] 6 | \begin{axis}[ 7 | scale only axis, 8 | no markers, 9 | domain=0:2*pi, 10 | samples=100, 11 | axis lines=center, 12 | axis line style={-}, 13 | ticks=none] 14 | \addplot[red] {sin(deg(x))}; 15 | \addplot[blue] {cos(deg(x))}; 16 | \end{axis} 17 | \end{tikzpicture} 18 | \end{figure} 19 | \blfootnote{[Here is a citation]} 20 | 21 | 22 | \end{frame} 23 | -------------------------------------------------------------------------------- /slides/link.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{A slide with some text and a link} 2 | 3 | \begin{itemize} 4 | \item This slide has some text along with a link 5 | \begin{itemize} 6 | \item \textbf{Some bold text}: followed by an explanation 7 | \item \textbf{More bold text}: followed by more text 8 | \end{itemize} 9 | \item Another bullet, with sub-bullets 10 | \begin{itemize} 11 | \item A sub-bullet 12 | \item Another sub-bullet, with more text 13 | \end{itemize} 14 | \end{itemize} 15 | 16 | \vspace{2ex} 17 | \begin{center} 18 | \color{blue} \href{https://github.com/anishathalye/auriga}{github.com/anishathalye/auriga} 19 | \end{center} 20 | 21 | \end{frame} 22 | -------------------------------------------------------------------------------- /slides/monospace.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}[fragile]{A slide with some code} 2 | 3 | \begin{columns} 4 | \begin{column}{0.5\linewidth} 5 | \footnotesize 6 | \begin{Verbatim}[commandchars=\\\{\}] 7 | /* some code */ 8 | def foo(x): 9 | return x**0.5 + 2*x 10 | 11 | \color{blue}/* some can be highlighted */ 12 | \color{blue}foo(3) 13 | \end{Verbatim} 14 | \end{column} 15 | \begin{column}{0.5\linewidth} 16 | {\color{red} Some explanatory text, in red, with some \texttt{monospace} text.} 17 | There might be some math, too: 18 | 19 | $$\sqrt{x} + 2x$$ 20 | \end{column} 21 | \end{columns} 22 | 23 | \end{frame} 24 | -------------------------------------------------------------------------------- /slides/split.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{A 50-50 split slide} 2 | 3 | \begin{columns} 4 | \begin{column}{0.5\linewidth} 5 | \begin{itemize} 6 | \item This side has a bullet 7 | \item And another bullet, with text that wraps if it's long 8 | \end{itemize} 9 | \end{column} 10 | \begin{column}{0.5\linewidth} 11 | \begin{figure} 12 | \centering 13 | \begin{tikzpicture}[scale=2] 14 | \draw[step=0.25cm,color=gray] (-1,-1) grid (1,1); 15 | \draw[color=red] (1,0) -- (0.2,0.2) -- (0,1) -- (-0.2,0.2) -- (-1,0) 16 | -- (-0.2,-0.2) -- (0,-1) -- (0.2,-0.2) -- cycle; 17 | \end{tikzpicture} 18 | \caption{A figure caption} 19 | \end{figure} 20 | \end{column} 21 | \end{columns} 22 | 23 | \note{ 24 | This slide has notes too. 
25 | } 26 | 27 | \end{frame} 28 | -------------------------------------------------------------------------------- /ssm.bib: -------------------------------------------------------------------------------- 1 | @ARTICLE{Orvieto2023-an, 2 | title = "Resurrecting Recurrent Neural Networks for Long Sequences", 3 | author = "Orvieto, Antonio and Smith, Samuel L and Gu, Albert and 4 | Fernando, Anushan and Gulcehre, Caglar and Pascanu, Razvan 5 | and De, Soham", 6 | abstract = "Recurrent Neural Networks (RNNs) offer fast inference on 7 | long sequences but are hard to optimize and slow to train. 8 | Deep state-space models (SSMs) have recently been shown to 9 | perform remarkably well on long sequence modeling tasks, and 10 | have the added benefits of fast parallelizable training and 11 | RNN-like fast inference. However, while SSMs are 12 | superficially similar to RNNs, there are important 13 | differences that make it unclear where their performance 14 | boost over RNNs comes from. In this paper, we show that 15 | careful design of deep RNNs using standard signal 16 | propagation arguments can recover the impressive performance 17 | of deep SSMs on long-range reasoning tasks, while also 18 | matching their training speed. To achieve this, we analyze 19 | and ablate a series of changes to standard RNNs including 20 | linearizing and diagonalizing the recurrence, using better 21 | parameterizations and initializations, and ensuring proper 22 | normalization of the forward pass. Our results provide new 23 | insights on the origins of the impressive performance of 24 | deep SSMs, while also introducing an RNN block called the 25 | Linear Recurrent Unit that matches both their performance on 26 | the Long Range Arena benchmark and their computational 27 | efficiency.", 28 | month = mar, 29 | year = 2023, 30 | keywords = "SSM", 31 | archivePrefix = "arXiv", 32 | primaryClass = "cs.LG", 33 | eprint = "2303.06349" 34 | } 35 | 36 | @ARTICLE{Zhai2021-gz, 37 | title = "An Attention Free Transformer", 38 | author = "Zhai, Shuangfei and Talbott, Walter and Srivastava, Nitish 39 | and Huang, Chen and Goh, Hanlin and Zhang, Ruixiang and 40 | Susskind, Josh", 41 | abstract = "We introduce Attention Free Transformer (AFT), an efficient 42 | variant of Transformers that eliminates the need for dot 43 | product self attention. In an AFT layer, the key and value 44 | are first combined with a set of learned position biases, 45 | the result of which is multiplied with the query in an 46 | element-wise fashion. This new operation has a memory 47 | complexity linear w.r.t. both the context size and the 48 | dimension of features, making it compatible to both large 49 | input and model sizes. We also introduce AFT-local and 50 | AFT-conv, two model variants that take advantage of the idea 51 | of locality and spatial weight sharing while maintaining 52 | global connectivity. We conduct extensive experiments on two 53 | autoregressive modeling tasks (CIFAR10 and Enwik8) as well 54 | as an image recognition task (ImageNet-1K classification). 
55 | We show that AFT demonstrates competitive performance on all 56 | the benchmarks, while providing excellent efficiency at the 57 | same time.", 58 | month = may, 59 | year = 2021, 60 | keywords = "SSM", 61 | archivePrefix = "arXiv", 62 | primaryClass = "cs.LG", 63 | eprint = "2105.14103" 64 | } 65 | 66 | @ARTICLE{Poli2023-ag, 67 | title = "Hyena Hierarchy: Towards Larger Convolutional Language 68 | Models", 69 | author = "Poli, Michael and Massaroli, Stefano and Nguyen, Eric and 70 | Fu, Daniel Y and Dao, Tri and Baccus, Stephen and Bengio, 71 | Yoshua and Ermon, Stefano and R{\'e}, Christopher", 72 | abstract = "Recent advances in deep learning have relied heavily on the 73 | use of large Transformers due to their ability to learn at 74 | scale. However, the core building block of Transformers, the 75 | attention operator, exhibits quadratic cost in sequence 76 | length, limiting the amount of context accessible. Existing 77 | subquadratic methods based on low-rank and sparse 78 | approximations need to be combined with dense attention 79 | layers to match Transformers, indicating a gap in 80 | capability. In this work, we propose Hyena, a subquadratic 81 | drop-in replacement for attention constructed by 82 | interleaving implicitly parametrized long convolutions and 83 | data-controlled gating. In recall and reasoning tasks on 84 | sequences of thousands to hundreds of thousands of tokens, 85 | Hyena improves accuracy by more than 50 points over 86 | operators relying on state-spaces and other implicit and 87 | explicit methods, matching attention-based models. We set a 88 | new state-of-the-art for dense-attention-free architectures 89 | on language modeling in standard datasets (WikiText103 and 90 | The Pile), reaching Transformer quality with a 20\% 91 | reduction in training compute required at sequence length 92 | 2K. Hyena operators are twice as fast as highly optimized 93 | attention at sequence length 8K, and 100x faster at sequence 94 | length 64K.", 95 | month = feb, 96 | year = 2023, 97 | keywords = "SSM", 98 | archivePrefix = "arXiv", 99 | primaryClass = "cs.LG", 100 | eprint = "2302.10866" 101 | } 102 | 103 | @ARTICLE{Li2022-pn, 104 | title = "What Makes Convolutional Models Great on Long Sequence 105 | Modeling?", 106 | author = "Li, Yuhong and Cai, Tianle and Zhang, Yi and Chen, Deming 107 | and Dey, Debadeepta", 108 | abstract = "Convolutional models have been widely used in multiple 109 | domains. However, most existing models only use local 110 | convolution, making the model unable to handle long-range 111 | dependency efficiently. Attention overcomes this problem by 112 | aggregating global information but also makes the 113 | computational complexity quadratic to the sequence length. 114 | Recently, Gu et al. [2021] proposed a model called S4 115 | inspired by the state space model. S4 can be efficiently 116 | implemented as a global convolutional model whose kernel 117 | size equals the input sequence length. S4 can model much 118 | longer sequences than Transformers and achieve significant 119 | gains over SoTA on several long-range tasks. Despite its 120 | empirical success, S4 is involved. It requires sophisticated 121 | parameterization and initialization schemes. As a result, S4 122 | is less intuitive and hard to use. Here we aim to demystify 123 | S4 and extract basic principles that contribute to the 124 | success of S4 as a global convolutional model. 
We focus on 125 | the structure of the convolution kernel and identify two 126 | critical but intuitive principles enjoyed by S4 that are 127 | sufficient to make up an effective global convolutional 128 | model: 1) The parameterization of the convolutional kernel 129 | needs to be efficient in the sense that the number of 130 | parameters should scale sub-linearly with sequence length. 131 | 2) The kernel needs to satisfy a decaying structure that the 132 | weights for convolving with closer neighbors are larger than 133 | the more distant ones. Based on the two principles, we 134 | propose a simple yet effective convolutional model called 135 | Structured Global Convolution (SGConv). SGConv exhibits 136 | strong empirical performance over several tasks: 1) With 137 | faster speed, SGConv surpasses S4 on Long Range Arena and 138 | Speech Command datasets. 2) When plugging SGConv into 139 | standard language and vision models, it shows the potential 140 | to improve both efficiency and performance.", 141 | month = oct, 142 | year = 2022, 143 | keywords = "SSM", 144 | archivePrefix = "arXiv", 145 | primaryClass = "cs.LG", 146 | eprint = "2210.09298" 147 | } 148 | 149 | @ARTICLE{Fu2022-bw, 150 | title = "Hungry Hungry Hippos: Towards Language Modeling with State 151 | Space Models", 152 | author = "Fu, Daniel Y and Dao, Tri and Saab, Khaled K and Thomas, 153 | Armin W and Rudra, Atri and R{\'e}, Christopher", 154 | abstract = "State space models (SSMs) have demonstrated state-of-the-art 155 | sequence modeling performance in some modalities, but 156 | underperform attention in language modeling. Moreover, 157 | despite scaling nearly linearly in sequence length instead 158 | of quadratically, SSMs are still slower than Transformers 159 | due to poor hardware utilization. In this paper, we make 160 | progress on understanding the expressivity gap between SSMs 161 | and attention in language modeling, and on reducing the 162 | hardware barrier between SSMs and attention. First, we use 163 | synthetic language modeling tasks to understand the gap 164 | between SSMs and attention. We find that existing SSMs 165 | struggle with two capabilities: recalling earlier tokens in 166 | the sequence and comparing tokens across the sequence. To 167 | understand the impact on language modeling, we propose a new 168 | SSM layer, H3, that is explicitly designed for these 169 | abilities. H3 matches attention on the synthetic languages 170 | and comes within 0.4 PPL of Transformers on OpenWebText. 171 | Furthermore, a hybrid 125M-parameter H3-attention model that 172 | retains two attention layers surprisingly outperforms 173 | Transformers on OpenWebText by 1.0 PPL. Next, to improve the 174 | efficiency of training SSMs on modern hardware, we propose 175 | FlashConv. FlashConv uses a fused block FFT algorithm to 176 | improve efficiency on sequences up to 8K, and introduces a 177 | novel state passing algorithm that exploits the recurrent 178 | properties of SSMs to scale to longer sequences. FlashConv 179 | yields 2$\times$ speedup on the long-range arena benchmark 180 | and allows hybrid language models to generate text 181 | 2.4$\times$ faster than Transformers. 
Using FlashConv, we 182 | scale hybrid H3-attention language models up to 2.7B 183 | parameters on the Pile and find promising initial results, 184 | achieving lower perplexity than Transformers and 185 | outperforming Transformers in zero- and few-shot learning on 186 | a majority of tasks in the SuperGLUE benchmark.", 187 | month = dec, 188 | year = 2022, 189 | keywords = "SSM", 190 | archivePrefix = "arXiv", 191 | primaryClass = "cs.LG", 192 | eprint = "2212.14052" 193 | } 194 | 195 | @ARTICLE{Mehta2022-pz, 196 | title = "Long Range Language Modeling via Gated State Spaces", 197 | author = "Mehta, Harsh and Gupta, Ankit and Cutkosky, Ashok and 198 | Neyshabur, Behnam", 199 | abstract = "State space models have shown to be effective at modeling 200 | long range dependencies, specially on sequence 201 | classification tasks. In this work we focus on 202 | autoregressive sequence modeling over English books, Github 203 | source code and ArXiv mathematics articles. Based on recent 204 | developments around the effectiveness of gated activation 205 | functions, we propose a new layer named Gated State Space 206 | (GSS) and show that it trains significantly faster than the 207 | diagonal version of S4 (i.e. DSS) on TPUs, is fairly 208 | competitive with several well-tuned Transformer-based 209 | baselines and exhibits zero-shot generalization to longer 210 | inputs while being straightforward to implement. Finally, we 211 | show that leveraging self-attention to model local 212 | dependencies improves the performance of GSS even further.", 213 | month = jun, 214 | year = 2022, 215 | keywords = "SSM", 216 | archivePrefix = "arXiv", 217 | primaryClass = "cs.LG", 218 | eprint = "2206.13947" 219 | } 220 | 221 | @ARTICLE{Smith2022-at, 222 | title = "Simplified State Space Layers for Sequence Modeling", 223 | author = "Smith, Jimmy T H and Warrington, Andrew and Linderman, Scott 224 | W", 225 | abstract = "Models using structured state space sequence (S4) layers 226 | have achieved state-of-the-art performance on long-range 227 | sequence modeling tasks. An S4 layer combines linear state 228 | space models (SSMs), the HiPPO framework, and deep learning 229 | to achieve high performance. We build on the design of the 230 | S4 layer and introduce a new state space layer, the S5 231 | layer. Whereas an S4 layer uses many independent 232 | single-input, single-output SSMs, the S5 layer uses one 233 | multi-input, multi-output SSM. We establish a connection 234 | between S5 and S4, and use this to develop the 235 | initialization and parameterization used by the S5 model. 236 | The result is a state space layer that can leverage 237 | efficient and widely implemented parallel scans, allowing S5 238 | to match the computational efficiency of S4, while also 239 | achieving state-of-the-art performance on several long-range 240 | sequence modeling tasks. 
S5 averages 87.4\% on the long 241 | range arena benchmark, and 98.5\% on the most difficult 242 | Path-X task.", 243 | month = aug, 244 | year = 2022, 245 | keywords = "SSM", 246 | archivePrefix = "arXiv", 247 | primaryClass = "cs.LG", 248 | eprint = "2208.04933" 249 | } 250 | 251 | @ARTICLE{Ma2022-xw, 252 | title = "Mega: Moving Average Equipped Gated Attention", 253 | author = "Ma, Xuezhe and Zhou, Chunting and Kong, Xiang and He, 254 | Junxian and Gui, Liangke and Neubig, Graham and May, 255 | Jonathan and Zettlemoyer, Luke", 256 | abstract = "The design choices in the Transformer attention mechanism, 257 | including weak inductive bias and quadratic computational 258 | complexity, have limited its application for modeling long 259 | sequences. In this paper, we introduce Mega, a simple, 260 | theoretically grounded, single-head gated attention 261 | mechanism equipped with (exponential) moving average to 262 | incorporate inductive bias of position-aware local 263 | dependencies into the position-agnostic attention mechanism. 264 | We further propose a variant of Mega that offers linear time 265 | and space complexity yet yields only minimal quality loss, 266 | by efficiently splitting the whole sequence into multiple 267 | chunks with fixed length. Extensive experiments on a wide 268 | range of sequence modeling benchmarks, including the Long 269 | Range Arena, neural machine translation, auto-regressive 270 | language modeling, and image and speech classification, show 271 | that Mega achieves significant improvements over other 272 | sequence models, including variants of Transformers and 273 | recent state space models.", 274 | month = sep, 275 | year = 2022, 276 | keywords = "SSM", 277 | archivePrefix = "arXiv", 278 | primaryClass = "cs.LG", 279 | eprint = "2209.10655" 280 | } 281 | 282 | @ARTICLE{Peng2023-yp, 283 | title = "{RWKV}: Reinventing {RNNs} for the Transformer Era", 284 | author = "Peng, Bo and Alcaide, Eric and Anthony, Quentin and Albalak, 285 | Alon and Arcadinho, Samuel and Cao, Huanqi and Cheng, Xin 286 | and Chung, Michael and Grella, Matteo and Gv, Kranthi Kiran 287 | and He, Xuzheng and Hou, Haowen and Kazienko, Przemyslaw and 288 | Kocon, Jan and Kong, Jiaming and Koptyra, Bartlomiej and 289 | Lau, Hayden and Mantri, Krishna Sri Ipsit and Mom, Ferdinand 290 | and Saito, Atsushi and Tang, Xiangru and Wang, Bolun and 291 | Wind, Johan S and Wozniak, Stansilaw and Zhang, Ruichong and 292 | Zhang, Zhenyuan and Zhao, Qihang and Zhou, Peng and Zhu, 293 | Jian and Zhu, Rui-Jie", 294 | abstract = "Transformers have revolutionized almost all natural language 295 | processing (NLP) tasks but suffer from memory and 296 | computational complexity that scales quadratically with 297 | sequence length. In contrast, recurrent neural networks 298 | (RNNs) exhibit linear scaling in memory and computational 299 | requirements but struggle to match the same performance as 300 | Transformers due to limitations in parallelization and 301 | scalability. We propose a novel model architecture, 302 | Receptance Weighted Key Value (RWKV), that combines the 303 | efficient parallelizable training of Transformers with the 304 | efficient inference of RNNs. 
Our approach leverages a linear 305 | attention mechanism and allows us to formulate the model as 306 | either a Transformer or an RNN, which parallelizes 307 | computations during training and maintains constant 308 | computational and memory complexity during inference, 309 | leading to the first non-transformer architecture to be 310 | scaled to tens of billions of parameters. Our experiments 311 | reveal that RWKV performs on par with similarly sized 312 | Transformers, suggesting that future work can leverage this 313 | architecture to create more efficient models. This work 314 | presents a significant step towards reconciling the 315 | trade-offs between computational efficiency and model 316 | performance in sequence processing tasks.", 317 | month = may, 318 | year = 2023, 319 | keywords = "SSM", 320 | archivePrefix = "arXiv", 321 | primaryClass = "cs.CL", 322 | eprint = "2305.13048" 323 | } 324 | 325 | @UNPUBLISHED{Martin2018-bq, 326 | title = "Parallelizing Linear Recurrent Neural Nets Over Sequence Length", 327 | author = "Martin, Eric and Cundy, Chris", 328 | abstract = "Recurrent neural networks (RNNs) are widely used to model 329 | sequential data but their non-linear dependencies between 330 | sequence elements prevent parallelizing training over sequence 331 | length. We show the training of RNNs with only linear sequential 332 | dependencies can be parallelized over the sequence length using 333 | the parallel scan algorithm, leading to rapid training on long 334 | sequences even with small minibatch size. We develop a parallel 335 | linear recurrence CUDA kernel and show that it can be applied to 336 | immediately speed up training and inference of several state of 337 | the art RNN architectures by up to 9x. We abstract recent work on 338 | linear RNNs into a new framework of linear surrogate RNNs and 339 | develop a linear surrogate model for the long short-term memory 340 | unit, the GILR-LSTM, that utilizes parallel linear recurrence. We 341 | extend sequence learning to new extremely long sequence regimes 342 | that were previously out of reach by successfully training a 343 | GILR-LSTM on a synthetic sequence classification task with a one 344 | million timestep dependency.", 345 | month = feb, 346 | year = 2018, 347 | keywords = "SSM" 348 | } 349 | 350 | @ARTICLE{Wang2022-un, 351 | title = "Pretraining Without Attention", 352 | author = "Wang, Junxiong and Yan, Jing Nathan and Gu, Albert and Rush, 353 | Alexander M", 354 | abstract = "Transformers have been essential to pretraining success in 355 | NLP. While other architectures have been used, downstream 356 | accuracy is either significantly worse, or requires 357 | attention layers to match standard benchmarks such as GLUE. 358 | This work explores pretraining without attention by using 359 | recent advances in sequence routing based on state-space 360 | models (SSMs). Our proposed model, Bidirectional Gated SSM 361 | (BiGS), combines SSM layers with a multiplicative gating 362 | architecture that has been effective in simplified sequence 363 | modeling architectures. The model learns static layers that 364 | do not consider pair-wise interactions. Even so, BiGS is 365 | able to match BERT pretraining accuracy on GLUE and can be 366 | extended to long-form pretraining of 4096 tokens without 367 | approximation. Analysis shows that while the models have 368 | similar average accuracy, the approach has different 369 | inductive biases than BERT in terms of interactions and 370 | syntactic representations. 
All models from this work are 371 | available at https://github.com/jxiw/BiGS.", 372 | month = dec, 373 | year = 2022, 374 | keywords = "SSM", 375 | archivePrefix = "arXiv", 376 | primaryClass = "cs.CL", 377 | eprint = "2212.10544" 378 | } 379 | 380 | @ARTICLE{Gupta2022-vp, 381 | title = "Diagonal State Spaces are as Effective as Structured State 382 | Spaces", 383 | author = "Gupta, Ankit and Gu, Albert and Berant, Jonathan", 384 | abstract = "Modeling long range dependencies in sequential data is a 385 | fundamental step towards attaining human-level performance 386 | in many modalities such as text, vision, audio and video. 387 | While attention-based models are a popular and effective 388 | choice in modeling short-range interactions, their 389 | performance on tasks requiring long range reasoning has been 390 | largely inadequate. In an exciting result, Gu et al. (ICLR 391 | 2022) proposed the $\textit\{Structured State Space\}$ (S4) 392 | architecture delivering large gains over state-of-the-art 393 | models on several long-range tasks across various 394 | modalities. The core proposition of S4 is the 395 | parameterization of state matrices via a diagonal plus low 396 | rank structure, allowing efficient computation. In this 397 | work, we show that one can match the performance of S4 even 398 | without the low rank correction and thus assuming the state 399 | matrices to be diagonal. Our $\textit\{Diagonal State 400 | Space\}$ (DSS) model matches the performance of S4 on Long 401 | Range Arena tasks, speech classification on Speech Commands 402 | dataset, while being conceptually simpler and 403 | straightforward to implement.", 404 | month = mar, 405 | year = 2022, 406 | keywords = "SSM", 407 | archivePrefix = "arXiv", 408 | primaryClass = "cs.LG", 409 | eprint = "2203.14343" 410 | } 411 | 412 | @MISC{Blelloch1990-yo, 413 | title = "Prefix sums and their applications", 414 | author = "Blelloch, Guy E and Reif, John H", 415 | abstract = "Experienced algorithm designers rely heavily on a set of 416 | building blocks and on the tools needed to put the blocks 417 | together into an algorithm. The understanding of these basic 418 | blocks and tools is therefore critical to the understanding 419 | of algorithms. Many of the blocks and tools needed for 420 | parallel algorithms extend from sequential algorithms, such 421 | as dynamic-programming and divide-and-conquer, but others are 422 | new. This paper introduces one of the simplest and most 423 | useful building blocks for parallel algorithms: the 424 | all-prefixsums operation. The paper defines the operation, 425 | shows how to implement it on a PRAM and illustrates many 426 | applications of the operation. 
In addition to being a useful 427 | building block, the all-prefix-sums operation is a good 428 | example of a computation that seems inherently sequential, 429 | but for which there is an efficient parallel algorithm.", 430 | publisher = "shelf2.library.cmu.edu", 431 | year = 1990, 432 | howpublished = "\url{http://shelf2.library.cmu.edu/Tech/23445461.pdf}", 433 | note = "Accessed: 2023-5-30", 434 | keywords = "SSM" 435 | } 436 | 437 | @ARTICLE{Gu2022-jz, 438 | title = "On the Parameterization and Initialization of Diagonal State 439 | Space Models", 440 | author = "Gu, Albert and Gupta, Ankit and Goel, Karan and R{\'e}, 441 | Christopher", 442 | abstract = "State space models (SSM) have recently been shown to be very 443 | effective as a deep learning layer as a promising 444 | alternative to sequence models such as RNNs, CNNs, or 445 | Transformers. The first version to show this potential was 446 | the S4 model, which is particularly effective on tasks 447 | involving long-range dependencies by using a prescribed 448 | state matrix called the HiPPO matrix. While this has an 449 | interpretable mathematical mechanism for modeling long 450 | dependencies, it introduces a custom representation and 451 | algorithm that can be difficult to implement. On the other 452 | hand, a recent variant of S4 called DSS showed that 453 | restricting the state matrix to be fully diagonal can still 454 | preserve the performance of the original model when using a 455 | specific initialization based on approximating S4's matrix. 456 | This work seeks to systematically understand how to 457 | parameterize and initialize such diagonal state space 458 | models. While it follows from classical results that almost 459 | all SSMs have an equivalent diagonal form, we show that the 460 | initialization is critical for performance. We explain why 461 | DSS works mathematically, by showing that the diagonal 462 | restriction of S4's matrix surprisingly recovers the same 463 | kernel in the limit of infinite state dimension. We also 464 | systematically describe various design choices in 465 | parameterizing and computing diagonal SSMs, and perform a 466 | controlled empirical study ablating the effects of these 467 | choices. Our final model S4D is a simple diagonal version of 468 | S4 whose kernel computation requires just 2 lines of code 469 | and performs comparably to S4 in almost all settings, with 470 | state-of-the-art results for image, audio, and medical 471 | time-series domains, and averaging 85\% on the Long Range 472 | Arena benchmark.", 473 | month = jun, 474 | year = 2022, 475 | keywords = "SSM", 476 | archivePrefix = "arXiv", 477 | primaryClass = "cs.LG", 478 | eprint = "2206.11893" 479 | } 480 | 481 | @ARTICLE{Goel2022-lv, 482 | title = "It's Raw! Audio Generation with {State-Space} Models", 483 | author = "Goel, Karan and Gu, Albert and Donahue, Chris and R{\'e}, 484 | Christopher", 485 | abstract = "Developing architectures suitable for modeling raw audio is 486 | a challenging problem due to the high sampling rates of 487 | audio waveforms. Standard sequence modeling approaches like 488 | RNNs and CNNs have previously been tailored to fit the 489 | demands of audio, but the resultant architectures make 490 | undesirable computational tradeoffs and struggle to model 491 | waveforms effectively. We propose SaShiMi, a new multi-scale 492 | architecture for waveform modeling built around the recently 493 | introduced S4 model for long sequence modeling. 
We identify 494 | that S4 can be unstable during autoregressive generation, 495 | and provide a simple improvement to its parameterization by 496 | drawing connections to Hurwitz matrices. SaShiMi yields 497 | state-of-the-art performance for unconditional waveform 498 | generation in the autoregressive setting. Additionally, 499 | SaShiMi improves non-autoregressive generation performance 500 | when used as the backbone architecture for a diffusion 501 | model. Compared to prior architectures in the autoregressive 502 | generation setting, SaShiMi generates piano and speech 503 | waveforms which humans find more musical and coherent 504 | respectively, e.g. 2x better mean opinion scores than 505 | WaveNet on an unconditional speech generation task. On a 506 | music generation task, SaShiMi outperforms WaveNet on 507 | density estimation and speed at both training and inference 508 | even when using 3x fewer parameters. Code can be found at 509 | https://github.com/HazyResearch/state-spaces and samples at 510 | https://hazyresearch.stanford.edu/sashimi-examples.", 511 | month = feb, 512 | year = 2022, 513 | keywords = "SSM", 514 | archivePrefix = "arXiv", 515 | primaryClass = "cs.SD", 516 | eprint = "2202.09729" 517 | } 518 | 519 | @ARTICLE{Lu2023-ov, 520 | title = "Structured State Space Models for {In-Context} Reinforcement 521 | Learning", 522 | author = "Lu, Chris and Schroecker, Yannick and Gu, Albert and 523 | Parisotto, Emilio and Foerster, Jakob and Singh, Satinder 524 | and Behbahani, Feryal", 525 | abstract = "Structured state space sequence (S4) models have recently 526 | achieved state-of-the-art performance on long-range sequence 527 | modeling tasks. These models also have fast inference speeds 528 | and parallelisable training, making them potentially useful 529 | in many reinforcement learning settings. We propose a 530 | modification to a variant of S4 that enables us to 531 | initialise and reset the hidden state in parallel, allowing 532 | us to tackle reinforcement learning tasks. We show that our 533 | modified architecture runs asymptotically faster than 534 | Transformers and performs better than LSTM models on a 535 | simple memory-based task. Then, by leveraging the model's 536 | ability to handle long-range sequences, we achieve strong 537 | performance on a challenging meta-learning task in which the 538 | agent is given a randomly-sampled continuous control 539 | environment, combined with a randomly-sampled linear 540 | projection of the environment's observations and actions. 541 | Furthermore, we show the resulting model can adapt to 542 | out-of-distribution held-out tasks. Overall, the results 543 | presented in this paper suggest that the S4 models are a 544 | strong contender for the default architecture used for 545 | in-context reinforcement learning", 546 | month = mar, 547 | year = 2023, 548 | keywords = "SSM", 549 | archivePrefix = "arXiv", 550 | primaryClass = "cs.LG", 551 | eprint = "2303.03982" 552 | } 553 | 554 | @ARTICLE{Nguyen2022-qi, 555 | title = "{S4ND}: Modeling Images and Videos as Multidimensional 556 | Signals Using State Spaces", 557 | author = "Nguyen, Eric and Goel, Karan and Gu, Albert and Downs, 558 | Gordon W and Shah, Preey and Dao, Tri and Baccus, Stephen A 559 | and R{\'e}, Christopher", 560 | abstract = "Visual data such as images and videos are typically modeled 561 | as discretizations of inherently continuous, 562 | multidimensional signals. 
Existing continuous-signal models 563 | attempt to exploit this fact by modeling the underlying 564 | signals of visual (e.g., image) data directly. However, 565 | these models have not yet been able to achieve competitive 566 | performance on practical vision tasks such as large-scale 567 | image and video classification. Building on a recent line of 568 | work on deep state space models (SSMs), we propose S4ND, a 569 | new multidimensional SSM layer that extends the 570 | continuous-signal modeling ability of SSMs to 571 | multidimensional data including images and videos. We show 572 | that S4ND can model large-scale visual data in $1$D, $2$D, 573 | and $3$D as continuous multidimensional signals and 574 | demonstrates strong performance by simply swapping Conv2D 575 | and self-attention layers with S4ND layers in existing 576 | state-of-the-art models. On ImageNet-1k, S4ND exceeds the 577 | performance of a Vision Transformer baseline by $1.5\%$ when 578 | training with a $1$D sequence of patches, and matches 579 | ConvNeXt when modeling images in $2$D. For videos, S4ND 580 | improves on an inflated $3$D ConvNeXt in activity 581 | classification on HMDB-51 by $4\%$. S4ND implicitly learns 582 | global, continuous convolutional kernels that are resolution 583 | invariant by construction, providing an inductive bias that 584 | enables generalization across multiple resolutions. By 585 | developing a simple bandlimiting modification to S4 to 586 | overcome aliasing, S4ND achieves strong zero-shot (unseen at 587 | training time) resolution performance, outperforming a 588 | baseline Conv2D by $40\%$ on CIFAR-10 when trained on $8 589 | \times 8$ and tested on $32 \times 32$ images. When trained 590 | with progressive resizing, S4ND comes within $\sim 1\%$ of a 591 | high-resolution model while training $22\%$ faster.", 592 | month = oct, 593 | year = 2022, 594 | keywords = "SSM", 595 | archivePrefix = "arXiv", 596 | primaryClass = "cs.CV", 597 | eprint = "2210.06583" 598 | } 599 | -------------------------------------------------------------------------------- /temp.tex: -------------------------------------------------------------------------------- 1 | \documentclass[tikz,border=2mm]{standalone} 2 | \usepackage{tikz} 3 | \usetikzlibrary{positioning} 4 | \begin{document} 5 | \begin{tikzpicture}[scale=0.8] 6 | \tikzset{layer/.style={draw,minimum width=1.5cm,minimum height=1.5cm}} 7 | \tikzset{dot/.style={circle,fill,inner sep=1.5pt}} 8 | \tikzset{vec/.style={draw,thick,-latex}} 9 | 10 | \node[dot,label=left:$x_1$] (x1) at (0,0) {}; 11 | \node[dot,label=left:$x_2$,below=0.5cm of x1] (x2) {}; 12 | \node[dot,label=left:$x_3$,below=0.5cm of x2] (x3) {}; 13 | 14 | \node[layer,right=1.5cm of x2,align=center] (self-att) {Self-\\Attention}; 15 | 16 | \node[dot,label=right:$y_1$,right=1.5cm of self-att] (y1) {}; 17 | \node[dot,label=right:$y_2$,below=0.5cm of y1] (y2) {}; 18 | \node[dot,label=right:$y_3$,below=0.5cm of y2] (y3) {}; 19 | 20 | \foreach \i in {1,...,3} { 21 | \draw[vec] (x\i) -- (self-att.west |- x\i); 22 | \draw[vec] (self-att.east |- y\i) -- (y\i); 23 | } 24 | 25 | \draw[vec] (self-att) -- (self-att); 26 | \end{tikzpicture} 27 | \end{document} 28 | --------------------------------------------------------------------------------