├── 01-intro.tex ├── 02-transformers.tex ├── 03-RNN.tex ├── 03.5-Results.tex ├── 04-Practicalities.tex ├── 05-Extensions.tex ├── 06-final.tex ├── DoWeNeedAttention.pdf ├── Figs ├── Allowed.png ├── Attention.png ├── Banana.png ├── BiGS.png ├── Biden.png ├── Complex.png ├── ComplexBad.png ├── Conv.png ├── Cumsum.png ├── DSSM.pdf ├── FeedForward.png ├── GLUE.png ├── H3.png ├── Is-Attention-All-You-Need-.png ├── Kernel1.png ├── MNLI.png ├── Mega.png ├── ModelSize0.jpg ├── ModelSize2.png ├── ModelSize3.png ├── RASP.png ├── RNNParam.pdf ├── RWKV.png ├── S4LRA.png ├── SGParam.pdf ├── SSM (1).pdf ├── SSM.pdf ├── SSMParam.pdf ├── SSMSide.pdf ├── SSMStart.pdf ├── assoc.png ├── assoc2.png ├── attention.png ├── attractors.png ├── big.png ├── comparison_results (1).png ├── comparison_results.png ├── elmo.png ├── ema.png ├── frame_10_delay-0.1s.png ├── frame_20_delay-0.1s.png ├── frame_30_delay-0.1s.png ├── frame_40_delay-0.1s.png ├── frame_50_delay-0.1s.png ├── graph.png ├── graph2.png ├── hippo.png ├── hippo_kernel.png ├── hyena.png ├── induct.png ├── induct1.png ├── induct2.png ├── kernel.pdf ├── kernel2.png ├── listops-s4.png ├── llama.png ├── lra-s4.png ├── match.png ├── model_architecture_comparison2.pdf ├── out-rnn.png ├── out.png ├── out2 (1).png ├── out2.png ├── out3 (1).png ├── out3.png ├── out4.png ├── out5.png ├── phase.png ├── rnn.png ├── sgconv.png ├── shift.png ├── solve.png ├── speech.png ├── ssm.png ├── ssmrec.png ├── ssmrec0.png ├── ssmrec1.png ├── temp.png └── transformer.png ├── LICENSE.md ├── MLSys_Slides (11).pdf ├── Makefile ├── README.md ├── SSM Start.pdf ├── anthology.bib ├── beamercolorthemeauriga.sty ├── beamerthemeauriga.sty ├── old.tex ├── p-notes.tex ├── p.tex ├── presentation-netflix.tex ├── presentation.tex ├── slides ├── brackets.tex ├── bullets.tex ├── centered.tex ├── figure.tex ├── link.tex ├── monospace.tex └── split.tex ├── ssm.bib └── temp.tex /01-intro.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{Do we need \textcolor{blue}{Attention}?} 2 | \centering 3 | \only<2>{Or can we use something simpler...} 4 | \begin{figure} 5 | \centering 6 | 7 | \includegraphics<1>[height=0.6\textheight,clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/Complex.png} 8 | \includegraphics<2>[height=0.55\textheight,clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/Allowed.png} 9 | 10 | 11 | \end{figure} 12 | \end{frame} 13 | 14 | \begin{frame}[label=current]{Proposition - One year ago} 15 | \begin{quote} 16 | On January 1, 2027, an \textcolor{blue}{Attention-based} model will be state-of-the-art in natural language processing. 17 | \end{quote} 18 | 19 | \begin{figure} 20 | \centering 21 | \includegraphics[width=0.7\linewidth,clip, trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/Is-Attention-All-You-Need-.png} 22 | \label{fig:my_label} 23 | \end{figure} 24 | 25 | \end{frame} 26 | 27 | 28 | \begin{frame}[c, label=current]{} 29 | \begin{figure} 30 | \centering 31 | \includegraphics[width=0.8\linewidth]{Figs/Biden.png} 32 | \label{fig:my_label} 33 | \end{figure} 34 | \end{frame} 35 | 36 | 37 | 38 | \begin{frame}{Algorithmic Goal} 39 | GPT models are growing, but still limited by context length. 
40 | \vspace{1cm} 41 | 42 | \begin{itemize} 43 | \item \textcolor{blue}{Training Speed} - Cost is quadratic in length 44 | \item \textcolor{blue}{Generation Speed} - Attention requires full lookback 45 | \end{itemize} 46 | \end{frame} 47 | 48 | 49 | 50 | 51 | \begin{frame}{Survey: Progress on Attention Alternatives} 52 | \begin{center} 53 | Recent research has made significant progress. 54 | \end{center} 55 | 56 | \begin{columns} 57 | \begin{column}{0.4\textwidth} 58 | \textit{S4}~\cite{gu2022parameterization} 59 | \textit{DSS}~\cite{gupta2022diagonal} 60 | \textit{GSS}~\cite{mehta2022long} 61 | \textit{S4D}~\cite{Gu2022-jz} 62 | \textit{H3}~\cite{dao2022hungry} 63 | \textit{S5}~\cite{smith2022simplified} 64 | \textit{BiGS}~\cite{Wang2022-un} 65 | \end{column} 66 | \begin{column}{0.4\textwidth} 67 | \textit{QRNN}~\cite{mccann2017learned} 68 | \textit{LRU}~\cite{Orvieto2023-an} 69 | \textit{RWKV}~\cite{Peng2023-yp} 70 | \textit{Mega}~\cite{ma2022mega} 71 | \textit{Hyena}~\cite{Poli2023-ag} 72 | \textit{SGConv}~\cite{Li2022-pn} 73 | \end{column} 74 | \end{columns} 75 | \pause 76 | 77 | 78 | \begin{center} 79 | \structure{Note:} Just one research direction. 80 | 81 | \end{center} 82 | 83 | 84 | \end{frame} 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /02-transformers.tex: -------------------------------------------------------------------------------- 1 | 2 | 3 | % \begin{frame}[c]{} 4 | % \centering 5 | % \begin{figure} 6 | % \centering 7 | % \includegraphics[height=0.9\textheight]{Figs/FeedForward.png} 8 | % \label{fig:my_label} 9 | % \end{figure} 10 | % \end{frame} 11 | 12 | \begin{frame}[c]{Transformers for Sequence Modeling} 13 | \centering 14 | \begin{columns} 15 | \begin{column}{0.3\textwidth} 16 | Repeated components 17 | \vspace{0.5cm} 18 | 19 | \begin{itemize} 20 | \item Feed Forward 21 | 22 | \item Attention 23 | \end{itemize} 24 | \end{column} 25 | \begin{column}{0.7\textwidth} 26 | 27 | \begin{figure} 28 | \centering 29 | \includegraphics[height=0.8\textheight, clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/out.png} 30 | %\includegraphics[height=0.8\textheight]{Figs/out2 (1).png} \label{fig:my_label} 31 | \end{figure} 32 | \end{column} 33 | \end{columns} 34 | 35 | \end{frame} 36 | 37 | \begin{frame}{Feed Forward} 38 | \begin{itemize} 39 | \item Acts on each position independently. 40 | \end{itemize} 41 | \begin{figure} 42 | \centering 43 | \includegraphics[height=0.5\textheight, clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/out4.png} 44 | \end{figure} 45 | \end{frame} 46 | 47 | \begin{frame}[c]{Attention} 48 | \begin{itemize} 49 | \item Fully connected interactions. 50 | \end{itemize} 51 | 52 | \centering 53 | \begin{figure} 54 | \centering 55 | \includegraphics[height=0.5\textheight, clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/out5.png} 56 | \label{fig:my_label} 57 | \end{figure} 58 | \end{frame} 59 | 60 | 61 | % \begin{frame}{Attention Matrix} 62 | % \centering 63 | % \begin{itemize} 64 | % \item Schematic of interactions at each layer (quadratic) 65 | % \end{itemize} 66 | 67 | 68 | % \begin{figure} 69 | % \centering 70 | % % \includegraphics[height=0.7\textheight,clip,trim={14cm 3cm 0.5cm 3cm}]{Figs/Attention.png} \hspace{1cm} 71 | % \includegraphics[height=0.7\textheight, clip, trim={1.5cm 1.3cm 0.1cm 0.1cm}]{Figs/Cumsum.png} 72 | % \label{fig:my_label} 73 | % \end{figure} 74 | % \end{frame} 75 | 76 | 77 | \begin{frame}[c]{Task: Language Generation} 78 | \centering 79 | Predict the next word. 
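To make the quadratic cost discussed above concrete, here is a minimal single-head causal self-attention layer in NumPy. This is an illustrative sketch with made-up random weights, not code from this repository: the L-by-L score matrix is what makes training cost quadratic in sequence length, and producing each new position requires attending over the entire prefix, which is the full-lookback issue at generation time.

```python
import numpy as np

def causal_self_attention(X, Wq, Wk, Wv):
    """Single-head causal self-attention. X: (L, d); Wq, Wk, Wv: (d, d)."""
    L, d = X.shape
    Q, K, V = X @ Wq, X @ Wk, X @ Wv
    scores = Q @ K.T / np.sqrt(d)                    # (L, L) matrix: quadratic in L
    mask = np.tril(np.ones((L, L), dtype=bool))      # causal: position k sees 1..k only
    scores = np.where(mask, scores, -np.inf)
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)   # softmax over the whole prefix
    return weights @ V

rng = np.random.default_rng(0)
L, d = 100, 16
X = rng.standard_normal((L, d))
Wq, Wk, Wv = (rng.standard_normal((d, d)) / np.sqrt(d) for _ in range(3))
print(causal_self_attention(X, Wq, Wk, Wv).shape)    # (100, 16)
```

Every alternative surveyed in these slides is, in one way or another, trying to remove that L-by-L interaction.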
80 | \vspace{1.5cm} 81 | 82 | 83 | \structure{Final:} The dog walked to the \textcolor{red}{park} 84 | 85 | \vspace{1.5cm} 86 | 87 | \textcolor{blue}{Input:} The dog walked to the \textcolor{red}{?} 88 | 89 | \end{frame} 90 | 91 | \begin{frame}[c]{Task: Long Range Arena (ListOps)} 92 | \centering 93 | Calculate the equation ($\uparrow$=max $\downarrow$=min) 94 | \vspace{1.5cm} 95 | 96 | 97 | \structure{Final:} [ $\uparrow$ 2 9 [ $\downarrow$ 4 7 ] 0 ] \textcolor{red}{9} 98 | 99 | \vspace{1.5cm} 100 | 101 | 102 | \textcolor{blue}{Input:} [ $\uparrow$ 2 9 [ $\downarrow$ 4 7 ] 0 ] \textcolor{red}{?} 103 | \end{frame} 104 | 105 | 106 | 107 | \begin{frame}[c]{Attention Matrix} 108 | 109 | \centering 110 | 111 | \begin{center} 112 | All quadratic interactions possible. 113 | \end{center} 114 | 115 | \begin{figure} 116 | \centering 117 | \includegraphics[height=0.6\textheight, clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/Complex.png} 118 | \label{fig:my_label} 119 | \end{figure} 120 | \end{frame} 121 | 122 | \begin{frame}[c]{Attention for Realistic Examples} 123 | \centering 124 | \begin{center} 125 | Listops goes to 2,000 steps. This is 100. 126 | \end{center} 127 | 128 | \begin{figure} 129 | \centering 130 | \includegraphics[height=0.6\textheight, clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/big.png} 131 | \label{fig:my_label} 132 | \end{figure} 133 | \end{frame} 134 | 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /03-RNN.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{Discrete Time Sequence} 2 | 3 | From \structure{scalar} sequence $u_{1}, \ldots, u_L$ to $y_1, \ldots, y_L$. 4 | 5 | \begin{figure} 6 | \centering 7 | \includegraphics[width=0.5\textwidth]{Figs/SSMStart.pdf} 8 | \label{fig:my_label} 9 | \end{figure} 10 | \end{frame} 11 | 12 | 13 | \begin{frame}{Review: RNN for Language Generation} 14 | \begin{columns} 15 | \begin{column}{0.5\textwidth} 16 | \centering 17 | \includegraphics[width=.8\textwidth]{Figs/rnn.png} 18 | 19 | \end{column} 20 | \begin{column}{0.5\textwidth} 21 | 22 | \begin{align*} 23 | x_{k} &= \textcolor{red}{\sigma}(\textcolor{green}{\boldsymbol{\overline{A}}} x_{k-1} + \textcolor{blue}{\boldsymbol{\overline{B}}} u_k) \\ 24 | y_k &= \phantom{\sigma (} \textcolor{orange}{\boldsymbol{\overline{C}}} x_{k \phantom{- 1}} 25 | \end{align*} 26 | \end{column} 27 | \end{columns} 28 | 29 | \end{frame} 30 | 31 | \begin{frame}{Review: RNN versus Attention} 32 | \begin{columns} 33 | \begin{column}{0.5\textwidth} 34 | \centering 35 | \includegraphics[width=.8\textwidth]{Figs/rnn.png} 36 | \end{column} 37 | \begin{column}{0.5\textwidth} 38 | \centering 39 | 40 | \includegraphics[width=0.8\textwidth]{Figs/out5.png} 41 | \end{column} 42 | \end{columns} 43 | \vspace{0.5cm} 44 | 45 | \begin{itemize} 46 | \item \structure{Training Speed:} Slow (\textcolor{red}{Serial} bottleneck) 47 | \item \structure{Generation Speed:} Fast (constant-time per step) 48 | 49 | \end{itemize} 50 | \end{frame} 51 | 52 | 53 | \begin{frame}{Didn't we try this RNN thing? 
} 54 | 55 | \begin{center} 56 | The last major RNN model in NLP - \textcolor{red}{ELMo} 57 | \end{center} 58 | 59 | \pause 60 | 61 | \begin{figure} 62 | \centering 63 | \includegraphics[width=0.5\textwidth]{Figs/GLUE.png} 64 | 65 | \label{fig:my_label} 66 | \end{figure} 67 | \blfootnote{\cite{DBLP:conf/naacl/PetersNIGCLZ18, devlin2018bert}} 68 | \end{frame} 69 | 70 | \begin{frame}{RNN Revival: Two Differences} 71 | \begin{columns} 72 | \begin{column}{0.5\textwidth} 73 | 74 | \begin{enumerate} 75 | \item Efficient Linear RNNs 76 | \item Effective Long-Range Parameterizations 77 | \end{enumerate} 78 | 79 | 80 | % Orthogonal RNN - Linear 81 | % QRNN - > Linear RNN. $\bar{A}$ time-varying Linear non-homogenous. input depdendent 82 | 83 | % A static over time. 84 | % SISO - property. 85 | % Orthogonal - > 86 | 87 | \end{column} 88 | \begin{column}{0.5\textwidth} 89 | \centering 90 | \includegraphics[width=0.4\textwidth, clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/out-rnn.png} 91 | \end{column} 92 | 93 | \end{columns} 94 | \end{frame} 95 | 96 | 97 | 98 | \begin{frame}{Component 1: \textcolor{blue}{Linear} RNN} 99 | 100 | \begin{align*} 101 | x_{k} &= \textcolor{green}{\boldsymbol{\overline{A}}} \textcolor{black}{\boldsymbol{x}_{k-1}} + \textcolor{blue}{\boldsymbol{\overline{B}}} \textcolor{black}{u_k} \\ 102 | y_k &= \textcolor{orange}{\boldsymbol{\overline{C}}} x_{k \phantom{- 1}} 103 | \end{align*} 104 | \pause 105 | \begin{figure} 106 | \centering 107 | \includegraphics[width=0.6\textwidth]{Figs/ssm.png} 108 | \label{fig:my_label} 109 | \end{figure} 110 | \end{frame} 111 | 112 | \begin{frame}{Expansion Of Terms} 113 | \vspace{-0.5cm} 114 | \begin{align*} 115 | y_k = \textcolor{orange}{\boldsymbol{\overline{C}}} x_{k \phantom{- 1}} \ 116 | x_{k} = \textcolor{green}{\boldsymbol{\overline{A}}} \textcolor{black}{\boldsymbol{x}_{k-1}} + \textcolor{blue}{\boldsymbol{\overline{B}}} \textcolor{black}{u_k} \ 117 | \end{align*} 118 | \pause 119 | \vspace{-2cm} 120 | \begin{figure} 121 | \centering 122 | \only<2>{\[y_1\]}\only<3>{\[y_2\]} \only<4->{\[y_3\]} 123 | \includegraphics<2>[height=0.12\textwidth]{Figs/ssmrec0} 124 | 125 | \includegraphics<3>[height=0.12\textwidth]{Figs/ssmrec1} 126 | 127 | \includegraphics<4->[height=0.1\textwidth]{Figs/ssmrec} 128 | \label{fig:my_label} 129 | \end{figure} 130 | \vspace{-0.5cm} 131 | 132 | \pause\pause\pause 133 | \begin{align*} 134 | \overline{K} &= (\textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{blue}{\boldsymbol{\overline{B}}}, \textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{green}{\boldsymbol{\overline{A}}}\textcolor{blue}{\boldsymbol{\overline{B}}}, \dots, \textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{green}{\boldsymbol{\overline{A}}^{L-1}}\textcolor{blue}{\boldsymbol{\overline{B}}}) 135 | \end{align*} 136 | \end{frame} 137 | 138 | \begin{frame}{Convolutional Form} 139 | 140 | \begin{align*} 141 | y_k = \textcolor{orange}{\boldsymbol{\overline{C}}} x_{k \phantom{- 1}} \ 142 | x_{k} = \textcolor{green}{\boldsymbol{\overline{A}}} \textcolor{black}{\boldsymbol{x}_{k-1}} + \textcolor{blue}{\boldsymbol{\overline{B}}} \textcolor{black}{u_k} \ 143 | \end{align*} 144 | 145 | 146 | 147 | \begin{align*} 148 | \overline{K} &= (\textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{blue}{\boldsymbol{\overline{B}}}, \textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{green}{\boldsymbol{\overline{A}}}\textcolor{blue}{\boldsymbol{\overline{B}}}, \dots, 
\textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{green}{\boldsymbol{\overline{A}}^{L-1}}\textcolor{blue}{\boldsymbol{\overline{B}}}) \\ 149 | y &= \text{conv1d}(\overline{K}_L \ldots \overline{K}_1, u_1 \ldots u_L) 150 | \end{align*} 151 | 152 | 153 | 154 | % Intuition: 155 | % \pause 156 | % $$y_1 = \boldsymbol{\overline{C}} \boldsymbol{\overline{B}} u_1$$ 157 | % \pause 158 | % $$y_2 = \boldsymbol{\overline{C}} \boldsymbol{\overline{A}} \boldsymbol{\overline{B}} u_1 + \boldsymbol{\overline{C}} \boldsymbol{\overline{B}} u_2 = \boldsymbol{\overline{C}} (\boldsymbol{\overline{A}} \boldsymbol{\overline{B}} u_1 + \boldsymbol{\overline{B}} u_2) = \boldsymbol{\overline{C}} (\boldsymbol{x}_1 + \boldsymbol{\overline{B}} u_2) $$ 159 | \end{frame} 160 | 161 | 162 | \begin{frame}{Convolutional Form} 163 | \begin{align*} 164 | \overline{K} &= (\textcolor{black}{\boldsymbol{\overline{C}}}\textcolor{black}{\boldsymbol{\overline{B}}}, \textcolor{black}{\boldsymbol{\overline{C}}}\textcolor{black}{\boldsymbol{\overline{A}}}\textcolor{black}{\boldsymbol{\overline{B}}}, \dots, \textcolor{black}{\boldsymbol{\overline{C}}}\textcolor{black}{\boldsymbol{\overline{A}}^{L-1}}\textcolor{black}{\boldsymbol{\overline{B}}}) \\ 165 | \end{align*} 166 | \begin{figure} 167 | \centering 168 | \includegraphics[width=0.6\textwidth]{Figs/SSM (1).pdf} 169 | \label{fig:my_label} 170 | \end{figure} 171 | \end{frame} 172 | 173 | \begin{frame}{Computation 1: FFT} 174 | Compute convolution in Fourier space, 175 | 176 | \begin{align*} 177 | &y = \boldsymbol{\overline{K}} \ast u 178 | \end{align*} 179 | \begin{itemize} 180 | \item $O(L \log L)$ for padded FFT of $K$ and $u$, mult, then iFFT 181 | \item Accelerators optimize this to different levels. 182 | \end{itemize} 183 | \end{frame} 184 | 185 | \begin{frame}[c]{Computation 2: Associative Scan (S5)} 186 | 187 | 188 | \begin{columns} 189 | \begin{column}{0.5\textwidth} 190 | Associative $e_1\bullet \ldots \bullet e_L$ 191 | 192 | \begin{center} 193 | \Tree [.$\bullet$ [.$\bullet$ [.$\bullet$ $e_1$ ] [.$\bullet$ $e_2$ ] ] [.$\bullet$ [.$\bullet$ $e_3$ ] [.$\bullet$ $e_4$ ] ] ] 194 | \end{center} 195 | \end{column} 196 | 197 | \begin{column}{0.5\textwidth} 198 | \centering 199 | \[e_k = (\boldsymbol{E}_k, \boldsymbol{e}_k) = (\bar{\textcolor{green}{\boldsymbol{A}}}, \bar{\textcolor{blue}{\boldsymbol{B}}}u_k)\] 200 | \begin{figure} 201 | \centering 202 | \includegraphics[height=0.1\textwidth,clip,trim={0cm 0cm 6cm 0cm}]{Figs/assoc.png} 203 | \label{fig:my_label} 204 | \end{figure} 205 | \[e_i \bullet e_j = (\boldsymbol{E}_i \boldsymbol{E}_j, \boldsymbol{E}_j \boldsymbol{e}_i + \boldsymbol{e}_j ) \] 206 | \begin{figure} 207 | \centering 208 | \includegraphics[height=0.1\textwidth,clip,trim={0cm 0cm 6cm 0cm}]{Figs/assoc2.png} 209 | \end{figure} 210 | 211 | \end{column} 212 | \end{columns} 213 | \blfootnote{\cite{Blelloch1990-yo,Martin2018-bq,smith2022simplified}} 214 | \end{frame} 215 | % \begin{frame}{Alternative Computation: Associative Scan \cite{smith2022simplified}} 216 | 217 | 218 | % \end{frame} 219 | 220 | 221 | % \begin{frame}{ Associative Scan: S5 } 222 | % Potential benefits versus FFT 223 | % \vspace{0.5cm} 224 | 225 | % \begin{itemize} 226 | % \item Compute hidden states explicitly 227 | % \item Allows alternative RNN forms. 
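As a sanity check on the algebra above, the following self-contained NumPy sketch (illustrative only, not from the repository) unrolls a small stable linear RNN, materializes the kernel Kbar = (CB, CAB, ..., C A^{L-1} B), and confirms that the recurrent form, the direct causal convolution, and the FFT-based convolution all give the same outputs.

```python
import numpy as np

rng = np.random.default_rng(0)
N, L = 4, 32                                    # state size, sequence length
A = rng.standard_normal((N, N))
A /= 1.1 * np.abs(np.linalg.eigvals(A)).max()   # keep the recurrence stable
B = rng.standard_normal((N, 1))
C = rng.standard_normal((1, N))
u = rng.standard_normal(L)

# 1) Recurrent form: x_k = A x_{k-1} + B u_k,  y_k = C x_k
x, y_rec = np.zeros((N, 1)), []
for k in range(L):
    x = A @ x + B * u[k]
    y_rec.append((C @ x).item())
y_rec = np.array(y_rec)

# 2) Convolutional form with kernel K = (CB, CAB, ..., C A^{L-1} B)
K = np.array([(C @ np.linalg.matrix_power(A, j) @ B).item() for j in range(L)])
y_conv = np.array([np.dot(K[:k + 1][::-1], u[:k + 1]) for k in range(L)])

# 3) FFT form: zero-pad to 2L so circular convolution matches the causal convolution
y_fft = np.fft.irfft(np.fft.rfft(K, 2 * L) * np.fft.rfft(u, 2 * L), 2 * L)[:L]

assert np.allclose(y_rec, y_conv) and np.allclose(y_rec, y_fft)
```

The FFT route is the O(L log L) training-time path, while the recurrence is what gives constant-time-per-step generation.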
228 | % \item Faster on some architectures 229 | % \end{itemize} 230 | % \end{frame} 231 | 232 | 233 | \begin{frame}{Linear RNN Computational Profile} 234 | 235 | \begin{align*} 236 | x_{k} &= \textcolor{green}{\boldsymbol{\overline{A}}} \textcolor{black}{{x}_{k-1}} + \textcolor{blue}{\boldsymbol{\overline{B}}} \textcolor{black}{u_k} \\ 237 | y_k &= \textcolor{orange}{\boldsymbol{\overline{C}}} x_{k \phantom{- 1}} 238 | \end{align*} 239 | \begin{itemize} 240 | \item \structure{Training Speed:} \sout{Weak} Strong (Parallelizable convolution) 241 | \item \structure{Generation Speed:} Strong (constant-time per step) \pause 242 | \item \structure{Accuracy:} Extremely \textcolor{red}{Poor...} Barely learns. 243 | \end{itemize} 244 | \end{frame} 245 | 246 | \begin{frame}{Interactions} 247 | \begin{center} 248 | Routing here must be static and regular (conv). 249 | \end{center} 250 | \begin{figure} 251 | \centering 252 | \includegraphics[height=0.45\textheight,clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/Allowed.png} 253 | \vspace{0.5cm} 254 | 255 | \includegraphics[width=0.5\textwidth]{Figs/SGParam.pdf} 256 | \label{fig:my_label} 257 | \end{figure} 258 | \end{frame} 259 | 260 | 261 | 262 | 263 | \begin{frame}{Component 2: Model Parameterization} 264 | 265 | Linear RNN behavior highly dependent on $\boldsymbol{\overline{A}}$ 266 | 267 | \begin{align*} 268 | \overline{K} &= (\textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{blue}{\boldsymbol{\overline{B}}}, \textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{green}{\boldsymbol{\overline{A}}}\textcolor{blue}{\boldsymbol{\overline{B}}}, \dots, \textcolor{orange}{\boldsymbol{\overline{C}}}\textcolor{green}{\boldsymbol{\overline{A}}^{L-1}}\textcolor{blue}{\boldsymbol{\overline{B}}}) 269 | \end{align*} 270 | \vspace{0.5cm} 271 | 272 | Choice of $\boldsymbol{\overline{A}}$ is critical: stable and informative. 273 | \end{frame} 274 | 275 | 276 | \begin{frame}{Mathematical Model: State Space Model (SSM) } 277 | 278 | A SSM is a continuous-time, differential equation. 279 | \begin{align*} 280 | x'(t) &= \boldsymbol{A}x(t) + \boldsymbol{B}u(t) \\ 281 | y(t) &= \boldsymbol{C}x(t). 282 | \end{align*} 283 | 284 | Used to explore Linear RNN parameterization. 285 | \end{frame} 286 | 287 | \begin{frame}{Hidden State Form~\cite{gu2020hippo}} 288 | \textcolor{red}{Summarize} history in vector $x$ with \textcolor{blue}{Legendre} coefficients 289 | \begin{figure} 290 | \centering 291 | \includegraphics[width=0.7\textwidth]{Figs/hippo.png} 292 | \end{figure} 293 | \end{frame} 294 | 295 | \begin{frame}{Choice of Parameters~\cite{gu2020hippo}} 296 | Intuition: Hidden state vector $\textcolor{blue}{x}$ should \textcolor{red}{summarize} past $u$. 297 | 298 | \begin{figure} 299 | \centering 300 | \includegraphics<1>[width=\textwidth]{Figs/frame_10_delay-0.1s.png} 301 | \includegraphics<2>[width=\textwidth]{Figs/frame_20_delay-0.1s.png} 302 | \includegraphics<3>[width=\textwidth]{Figs/frame_30_delay-0.1s.png} 303 | \includegraphics<4>[width=1\textwidth]{Figs/frame_40_delay-0.1s.png} 304 | \includegraphics<5>[width=1\textwidth]{Figs/frame_50_delay-0.1s.png} 305 | \end{figure} 306 | 307 | \end{frame} 308 | 309 | 310 | 311 | % \begin{frame}{Practical Consequence: HiPPO~\cite{gu2020hippo}} 312 | % Motivates an initialization of the (discrete-time) kernel $\bar{K}$. 
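For completeness: going from the continuous SSM parameters (A, B, C) above to the discrete linear-RNN parameters (Abar, Bbar, Cbar) is a standard discretization with a (learned) step size Delta. One representative choice is the bilinear (Tustin) rule used in the S4 line of work (other papers use a zero-order hold instead):

```latex
\begin{align*}
\boldsymbol{\overline{A}} &= (\boldsymbol{I} - \tfrac{\Delta}{2}\boldsymbol{A})^{-1}(\boldsymbol{I} + \tfrac{\Delta}{2}\boldsymbol{A}), &
\boldsymbol{\overline{B}} &= (\boldsymbol{I} - \tfrac{\Delta}{2}\boldsymbol{A})^{-1}\Delta\boldsymbol{B}, &
\boldsymbol{\overline{C}} &= \boldsymbol{C},
\end{align*}
```

so that the discrete state $x_k$ approximates $x(k\Delta)$; the step size $\Delta$ controls the time-scale over which the kernel decays.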
313 | 314 | % \begin{figure} 315 | % \centering 316 | % \includegraphics[width=0.5\textwidth]{Figs/hippo_kernel.png} 317 | 318 | % \includegraphics[width=0.5\textwidth]{Figs/SGParam.pdf} 319 | % \label{fig:enter-label} 320 | % \end{figure} 321 | % \end{frame} 322 | 323 | % \begin{frame}{S4 \cite{gu2022parameterization} } 324 | 325 | % Learn parameters of SSM, convert to linear RNN parameters 326 | 327 | % $$\boldsymbol{\overline{A}}, \boldsymbol{\overline{B}}, \boldsymbol{\overline{C}} = \text{discretize}(\boldsymbol{A}, \boldsymbol{B}, \boldsymbol{C}, \Delta )$$ 328 | 329 | % \begin{figure} 330 | % \centering 331 | % \includegraphics[width=0.6\textwidth]{Figs/SSMParam.pdf} 332 | % \caption{} 333 | % \label{fig:my_label} 334 | % \end{figure} 335 | % \pause 336 | % \vspace{-2cm} 337 | 338 | % Note: There are \textit{many more} important details here. 339 | 340 | % \end{frame} 341 | 342 | 343 | % \begin{frame}{Determining RNN Parameterization} 344 | % \begin{itemize} 345 | % \item \cite{gu2020hippo} develop \textit{HiPPO} Matrix for SSM $\boldsymbol{A}$ 346 | 347 | % % \begin{scriptsize} 348 | % % \begin{align*} 349 | % % \boldsymbol{A}_{nk}= - 350 | % % \begin{cases} 351 | % % (2n+1)^{1/2}(2k+1)^{1/2} & \text{if } n > k \\ n+1 &\text{if } n=k \text{\ else\ } 0 352 | % % \end{cases} 353 | % % \end{align*} 354 | % % \end{scriptsize} 355 | 356 | % \item Approximates history through Legendre coefficients 357 | % \end{itemize} 358 | % \begin{figure} 359 | % \centering 360 | % \includegraphics[width=0.7\textwidth]{Figs/hippo.png} 361 | % \end{figure} 362 | % \end{frame} 363 | 364 | % \begin{frame}{Key Insight: Choice of $\boldsymbol{A}$ } 365 | 366 | % \cite{gu2020hippo,gu2022parameterization} 367 | 368 | % Show that HiPPO 369 | 370 | 371 | % \end{frame} 372 | 373 | \begin{frame}[c]{Results: ListOps \cite{gu2022parameterization}} 374 | \centering 375 | Example: [ $\uparrow$ 2 9 [ $\downarrow$ 4 7 ] 0 ] \textcolor{red}{9} 376 | 377 | \begin{figure} 378 | \centering 379 | 380 | \includegraphics[height=0.6\textheight,clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/listops-s4.png} 381 | \label{fig:my_label} 382 | \end{figure} 383 | Requires communication over 2,000 steps 384 | 385 | \end{frame} 386 | 387 | 388 | \begin{frame}[c]{Results: Long-Range Arena \cite{gu2022parameterization}} 389 | \centering 390 | \begin{figure} 391 | \centering 392 | \includegraphics[height=0.8\textheight,clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/lra-s4.png} 393 | \label{fig:my_label} 394 | \end{figure} 395 | \end{frame} 396 | 397 | 398 | 399 | 400 | 401 | % \begin{frame}{Computing with Static Kernels} 402 | % \structure{Final:} a b c $\Rightarrow$ d e f \textcolor{red}{d} 403 | 404 | % \begin{figure} 405 | % \centering 406 | % \includegraphics[height=0.5\textheight]{Figs/induct1.png} 407 | % \includegraphics[height=0.5\textheight]{Figs/induct2.png} 408 | % \label{fig:my_label} 409 | % \end{figure} 410 | 411 | % \textcolor{blue}{Input:} a b c $\Rightarrow$ d e f \textcolor{red}{?} 412 | 413 | 414 | % \end{frame} -------------------------------------------------------------------------------- /03.5-Results.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{Applying Linear RNNs} 2 | \vspace{1cm} 3 | \begin{columns} 4 | \begin{column}{0.6\textwidth} 5 | \begin{itemize} 6 | \item Speech~\cite{goel2022s} 7 | \item Video~\cite{Nguyen2022-qi} 8 | \item RL~\cite{Lu2023-ov} 9 | \item \textcolor{red}{NLP} 10 | \end{itemize} 11 | \end{column} 12 | \begin{column}{0.4\textwidth} 13 | 
\includegraphics[width=0.9\textwidth, ,clip,trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/speech.png} 14 | \end{column} 15 | 16 | \end{columns} 17 | \end{frame} 18 | 19 | \begin{frame}{NLP Results} 20 | Two types of model 21 | \vspace{1cm} 22 | 23 | \begin{itemize} 24 | \item Bidirectional LM (BERT) 25 | \item Unidirectional LM (GPT) 26 | \end{itemize} 27 | \vspace{1cm} 28 | 29 | % Different architectures used, Some with partial attention 30 | 31 | \end{frame} 32 | 33 | 34 | \begin{frame}{Results: Bidirectional LM \cite{Wang2022-un}} 35 | \begin{figure} 36 | \centering 37 | \includegraphics[height=0.6\textheight]{Figs/BiGS.png} 38 | \end{figure} 39 | \end{frame} 40 | 41 | 42 | \begin{frame}{Analysis: Kernel Visualization $\boldsymbol{\bar{K}}$} 43 | 44 | \begin{figure} 45 | \centering 46 | \includegraphics[width=\textwidth]{Figs/kernel1.png} 47 | \end{figure} 48 | 49 | \begin{itemize} 50 | \item Replaces Attention Matrix 51 | \item Single Kernel per layer 52 | \end{itemize} 53 | \end{frame} 54 | 55 | \begin{frame}{Analysis: All Kernels} 56 | \begin{figure} 57 | \centering 58 | \includegraphics[height=0.6\textheight]{Figs/kernel2.png} 59 | \end{figure} 60 | \end{frame} 61 | 62 | \begin{frame}{Analysis: Change in Kernels during Finetuning } 63 | 64 | \centerline{Task: Long-Range Sentence Matching} 65 | \begin{figure} 66 | \centering 67 | \includegraphics[width=0.8\textwidth]{Figs/comparison_results.png} 68 | \end{figure} 69 | \end{frame} 70 | 71 | 72 | 73 | \begin{frame}{Results: Unidirectional LM \cite{dao2022hungry} $\downarrow$} 74 | \begin{figure} 75 | \centering 76 | \includegraphics[width=0.7\textwidth]{Figs/H3.png} 77 | \caption{Caption} 78 | \label{fig:my_label} 79 | \end{figure} 80 | \end{frame} 81 | 82 | \begin{frame} 83 | \includegraphics[ clip, height=\textheight]{Figs/ModelSize0.jpg} 84 | \end{frame} 85 | 86 | % \begin{frame}{Frame Title} 87 | 88 | % \end{frame} 89 | 90 | \section{Alternative Parameterizations} 91 | 92 | \begin{frame}{Do we need the SSM?} 93 | \begin{figure} 94 | \centering 95 | \includegraphics[width=1\textwidth]{Figs/frame_50_delay-0.1s.png} 96 | \end{figure} 97 | \end{frame} 98 | 99 | \begin{frame}{CNN Param: Decaying Structure \cite{Li2022-pn}} 100 | Parameterization should decay $\bar{K}$ over time. 101 | 102 | \begin{figure} 103 | \centering 104 | \includegraphics[width=0.4\textwidth]{Figs/sgconv.png} 105 | \label{fig:my_label} 106 | \end{figure} 107 | 108 | \begin{figure} 109 | \centering 110 | \includegraphics[width=0.5\textwidth]{Figs/SGParam.pdf} 111 | \label{fig:my_label} 112 | \end{figure} 113 | 114 | \pause 115 | \begin{center} 116 | \alert{However}, no linear RNN form. 
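As a simplified illustration of this decaying-kernel idea (not the actual SGConv construction, which builds the global kernel from multi-scale sub-kernels), the sketch below contrasts a directly parameterized kernel wrapped in an explicit exponential-decay envelope with the kernel K[j] = C A^j B of a stable linear RNN, which decays automatically once the spectral radius of A is below one.

```python
import numpy as np

L, N = 256, 8
rng = np.random.default_rng(0)
j = np.arange(L)

# Directly parameterized kernel: free weights under an explicit decay envelope.
w, tau = rng.standard_normal(L), 32.0
K_direct = w * np.exp(-j / tau)

# SSM-style kernel K[j] = C A^j B: the decay comes from the eigenvalues of A.
A = rng.standard_normal((N, N))
A *= 0.9 / np.abs(np.linalg.eigvals(A)).max()
B, C = rng.standard_normal((N, 1)), rng.standard_normal((1, N))
K_ssm = np.array([(C @ np.linalg.matrix_power(A, k) @ B).item() for k in range(L)])

print(np.abs(K_direct)[::64].round(3))   # magnitudes fall off with position
print(np.abs(K_ssm)[::64].round(3))
```

The directly parameterized kernel is flexible and cheap to learn, but, as the slide notes, it does not come with a recurrent form for fast generation.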
117 | \end{center} 118 | 119 | \end{frame} 120 | 121 | 122 | 123 | \begin{frame}{RNN Param: LRU \cite{Orvieto2023-an}} 124 | Stable diagonal parameterization of Linear RNN 125 | \begin{align*} 126 | \textcolor{green}{\bar{A}}_{j,j} &= \exp(-\exp({\nu_j}) + i \exp(\theta_j))\\ 127 | \textcolor{blue}{\bar{B}}_{j} &= (1 - |\bar{A}_{j,j}|^2)^{1/2} 128 | \end{align*} 129 | 130 | \begin{figure} 131 | \centering 132 | \includegraphics[width=0.8\textwidth]{Figs/phase.png} 133 | \label{fig:my_label} 134 | \end{figure} 135 | \end{frame} 136 | 137 | \begin{frame}{RNN Param: MEGA \cite{ma2022mega}} 138 | Use a parameterized damped, exponential moving average 139 | \begin{align*} 140 | \textcolor{green}{\bar{A}}_{j,j} &= 1 − \alert{\alpha_j} \times \delta_j \\ 141 | \textcolor{blue}{\bar{B}}_{j} &= \alpha_j 142 | \end{align*} 143 | \begin{figure} 144 | \centering 145 | \includegraphics[width=0.7\textwidth]{Figs/ema.png} 146 | \label{fig:my_label} 147 | \end{figure} 148 | \begin{center} 149 | Very good results on NLP tasks like Translation. 150 | \end{center} 151 | 152 | \end{frame} 153 | 154 | 155 | \begin{frame}{RNN Param: RWKV \cite{Peng2023-yp}} 156 | Inspired by Attention 157 | 158 | Split into Keys, Values, and Receptance (no Query): 159 | \begin{align*} 160 | K_i, V_i, R_i 161 | \end{align*} 162 | \pause 163 | Then compute averaged values normalized by keys. 164 | 165 | \begin{align*} 166 | R_i\frac{\sum_{i'=1}^i \textcolor{green}{\exp(w)}^{i'}\exp(K_{i'}) V_{i'}} {\sum_{i'=1}^i \textcolor{green}{\exp(w)}^{i'}\exp(K_{i'})\phantom{ V_{i'}}} = R_i \frac{\text{LR}_1(\exp(K_i)V_i)}{\text{LR}_2(\exp(K_i))\phantom{V_i}}\\ 167 | \end{align*} 168 | 169 | Yields a product of Linear RNNs (Computed directly). 170 | 171 | \end{frame} 172 | 173 | 174 | \begin{frame}{Results: RWKV \cite{Peng2023-yp}} 175 | \begin{center} 176 | Largest RNN. Trained up to 14B parameter scale. 177 | \end{center} 178 | \pause 179 | \begin{figure} 180 | \centering 181 | \includegraphics[width=1\textwidth]{Figs/RWKV.png} 182 | \label{fig:my_label} 183 | \end{figure} 184 | Lots of practical interest and community. 185 | \end{frame} 186 | 187 | 188 | \begin{frame}{Open Question: In-Context Learning} 189 | \begin{itemize} 190 | \item Results show comparable loss at medium scales. 191 | \item Significant interest is in abilities such as in-context learning 192 | \item Current understanding relies of Attention mechanisms. 
193 | \end{itemize} 194 | \end{frame} 195 | 196 | 197 | % \begin{frame}{Parameterization: Diagonal RNN \cite{Li2022-pn}} 198 | % \begin{figure} 199 | % \centering 200 | % \includegraphics[width=0.5\textwidth]{Figs/DSSM.pdf} 201 | 202 | % \label{fig:my_label} 203 | % \end{figure} 204 | % \end{frame} 205 | 206 | % \begin{frame}{Results: GSS $\downarrow$} 207 | % \begin{figure} 208 | % \centering 209 | % \includegraphics[width=0.7\textwidth]{} 210 | % \caption{Caption} 211 | % \label{fig:my_label} 212 | % \end{figure} 213 | % \end{frame} 214 | 215 | 216 | 217 | 218 | 219 | % \begin{frame}{Results: MEGA \cite{ma2022mega} $\uparrow$} 220 | % \begin{figure} 221 | % \centering 222 | % \includegraphics[width=0.7\textwidth]{Figs/Mega.png} 223 | % \label{fig:my_label} 224 | % \end{figure} 225 | % \end{frame} 226 | -------------------------------------------------------------------------------- /04-Practicalities.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/04-Practicalities.tex -------------------------------------------------------------------------------- /05-Extensions.tex: -------------------------------------------------------------------------------- 1 | 2 | % \begin{frame}{Usage} 3 | 4 | % Linear RNNs opens up the modeling design space 5 | 6 | % \vspace{1cm} 7 | % \begin{itemize} 8 | % \item How to efficiently calculate? 9 | % \item How to parameterize? 10 | % \end{itemize} 11 | % \end{frame} 12 | 13 | % \begin{frame}{Calculation} 14 | % Recall the main calculation is a $L$ length convolution, 15 | 16 | % \begin{figure} 17 | % \centering 18 | % \includegraphics[width=0.7\textwidth]{Figs/SSM.pdf} 19 | % \label{fig:my_label} 20 | % \end{figure} 21 | % \end{frame} 22 | 23 | 24 | 25 | \begin{frame}{Method 2: Parallel Associative Scan \cite{smith2022simplified} } 26 | Compute $e_1\bullet \ldots \bullet e_l$ for any associative operator $\bullet$ 27 | 28 | \begin{center} 29 | \Tree [.$\bullet$ [.$\bullet$ [.$\bullet$ $e_1$ ] [.$\bullet$ $e_2$ ] ] [.$\bullet$ [.$\bullet$ $e_3$ ] [.$\bullet$ $e_4$ ] ] ] 30 | 31 | \end{center} 32 | \cite{Blelloch1990-yo,Martin2018-bq} 33 | \end{frame} 34 | 35 | \begin{frame}{} 36 | \[e_k = (\boldsymbol{E}_k, \boldsymbol{e}_k) = (\bar{\textcolor{green}{\boldsymbol{A}}}, \bar{\textcolor{blue}{\boldsymbol{B}}}u_k)\] 37 | \begin{figure} 38 | \centering 39 | \includegraphics[height=0.1\textwidth,clip,trim={0cm 0cm 5cm 0cm}]{Figs/assoc.png} 40 | \label{fig:my_label} 41 | \end{figure} 42 | 43 | \[e_i \bullet e_j = (\boldsymbol{E}_i \boldsymbol{E}_j, \boldsymbol{E}_j \boldsymbol{e}_i + \boldsymbol{e}_j ) \] 44 | \begin{figure} 45 | \centering 46 | \includegraphics[height=0.1\textwidth,clip,trim={0cm 0cm 6cm 0cm}]{Figs/assoc2.png} 47 | \end{figure} 48 | \end{frame} 49 | 50 | 51 | 52 | % \begin{frame}{Parmeterization of RNN Models} 53 | % SSM framing gives an elegant parameterization of Linear RNNs, 54 | 55 | % \begin{figure} 56 | % \centering 57 | % \includegraphics[width=0.7\textwidth]{Figs/SSMParam.pdf} 58 | % \label{fig:my_label} 59 | % \end{figure} 60 | 61 | % Researchers have explored other parameterizations 62 | % \end{frame} 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /06-final.tex: -------------------------------------------------------------------------------- 1 | 2 | 3 | \begin{frame}{Benefits of Linear RNNs} 4 | \begin{itemize} 5 
| \item Methods for training (CNN) and generation (RNN) 6 | \item Potentially more FLOP efficient. 7 | \item However not yet used in practice 8 | \end{itemize} 9 | \end{frame} 10 | 11 | \begin{frame}[c]{Current Efficiency with Scale \cite{Poli2023-ag}} 12 | \begin{figure} 13 | \centering 14 | \includegraphics[width=\textwidth]{Figs/hyena.png} 15 | \caption{} 16 | \end{figure} 17 | Models become more efficient at long time-scales. 18 | \end{frame} 19 | 20 | \begin{frame}{Issues on Accelerators} 21 | Approaches require: 22 | \vspace{0.5cm} 23 | 24 | \begin{itemize} 25 | \item Support for complex numbers 26 | \item Support for FFT (lower precision, TPU) 27 | \item Numerical Stability 28 | \item Fast Associative Scans 29 | \end{itemize} 30 | \vspace{0.5cm} 31 | 32 | Hard to compete with pure MatMul in Attention. 33 | \end{frame} 34 | 35 | \begin{frame}{} 36 | \begin{figure} 37 | \centering 38 | \includegraphics[width=0.7\linewidth,clip, trim={0.1cm 0.1cm 0.1cm 0.1cm}]{Figs/Is-Attention-All-You-Need-.png} 39 | \label{fig:my_label} 40 | \end{figure} 41 | \end{frame} 42 | 43 | 44 | % \begin{frame}{Frame Title} 45 | % Call to action. 46 | 47 | % * Modeling benefits 48 | % * Theoretical approaches 49 | % * interplay with hardware efficiency. 50 | % * Matching transformers 51 | % * FFT / Complex 52 | % * Associative scans 53 | % * GPU / TPUs 54 | % * Models are more flop efficient, FLOPs are not equal. 55 | % * Matmuls are more efficienct. 56 | % * Numerical stability / complex numbers 57 | % * 58 | % \end{frame} 59 | 60 | 61 | 62 | 63 | % \begin{frame}{State Retrieval} 64 | % \begin{itemize} 65 | % \item Benchmarks compare perplexity of models 66 | % \item Significant interest is in abilities such as in-context learning 67 | % \item Current understanding relies of set-based Transformer mechanisms. 68 | % \end{itemize} 69 | % \end{frame} 70 | 71 | 72 | 73 | 74 | % \begin{frame}{In-Context Learning} 75 | % \begin{itemize} 76 | % \item Benchmarks compare perplexity of models 77 | % \item Significant interest is in abilities such as in-context learning 78 | % \item Current understanding relies of set-based Transformer mechanisms. 79 | % \end{itemize} 80 | % \end{frame} 81 | 82 | % \begin{frame}{Better Transformers} 83 | % \begin{itemize} 84 | % \item Models are being scaled to longer ranges (>100k) 85 | % \item For language, approximations of attention may be fine. 86 | % \item 87 | % \end{itemize} 88 | % \end{frame} 89 | 90 | 91 | % \begin{frame}{Inductive Bias} 92 | % \begin{itemize} 93 | % \item Transformers are set-based models 94 | % \item Linear RNNs encoder sequential bias 95 | % \item For language, unclear whether this is beneficial or not. 
96 | % \end{itemize} 97 | % \end{frame} -------------------------------------------------------------------------------- /DoWeNeedAttention.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/DoWeNeedAttention.pdf -------------------------------------------------------------------------------- /Figs/Allowed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Allowed.png -------------------------------------------------------------------------------- /Figs/Attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Attention.png -------------------------------------------------------------------------------- /Figs/Banana.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Banana.png -------------------------------------------------------------------------------- /Figs/BiGS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/BiGS.png -------------------------------------------------------------------------------- /Figs/Biden.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Biden.png -------------------------------------------------------------------------------- /Figs/Complex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Complex.png -------------------------------------------------------------------------------- /Figs/ComplexBad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ComplexBad.png -------------------------------------------------------------------------------- /Figs/Conv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Conv.png -------------------------------------------------------------------------------- /Figs/Cumsum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Cumsum.png -------------------------------------------------------------------------------- /Figs/DSSM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/DSSM.pdf -------------------------------------------------------------------------------- /Figs/FeedForward.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/FeedForward.png -------------------------------------------------------------------------------- /Figs/GLUE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/GLUE.png -------------------------------------------------------------------------------- /Figs/H3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/H3.png -------------------------------------------------------------------------------- /Figs/Is-Attention-All-You-Need-.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Is-Attention-All-You-Need-.png -------------------------------------------------------------------------------- /Figs/Kernel1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Kernel1.png -------------------------------------------------------------------------------- /Figs/MNLI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/MNLI.png -------------------------------------------------------------------------------- /Figs/Mega.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/Mega.png -------------------------------------------------------------------------------- /Figs/ModelSize0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ModelSize0.jpg -------------------------------------------------------------------------------- /Figs/ModelSize2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ModelSize2.png -------------------------------------------------------------------------------- /Figs/ModelSize3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ModelSize3.png -------------------------------------------------------------------------------- /Figs/RASP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/RASP.png -------------------------------------------------------------------------------- /Figs/RNNParam.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/RNNParam.pdf -------------------------------------------------------------------------------- /Figs/RWKV.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/RWKV.png -------------------------------------------------------------------------------- /Figs/S4LRA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/S4LRA.png -------------------------------------------------------------------------------- /Figs/SGParam.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/SGParam.pdf -------------------------------------------------------------------------------- /Figs/SSM (1).pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/SSM (1).pdf -------------------------------------------------------------------------------- /Figs/SSM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/SSM.pdf -------------------------------------------------------------------------------- /Figs/SSMParam.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/SSMParam.pdf -------------------------------------------------------------------------------- /Figs/SSMSide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/SSMSide.pdf -------------------------------------------------------------------------------- /Figs/SSMStart.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/SSMStart.pdf -------------------------------------------------------------------------------- /Figs/assoc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/assoc.png -------------------------------------------------------------------------------- /Figs/assoc2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/assoc2.png -------------------------------------------------------------------------------- /Figs/attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/attention.png -------------------------------------------------------------------------------- /Figs/attractors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/attractors.png 
-------------------------------------------------------------------------------- /Figs/big.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/big.png -------------------------------------------------------------------------------- /Figs/comparison_results (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/comparison_results (1).png -------------------------------------------------------------------------------- /Figs/comparison_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/comparison_results.png -------------------------------------------------------------------------------- /Figs/elmo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/elmo.png -------------------------------------------------------------------------------- /Figs/ema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ema.png -------------------------------------------------------------------------------- /Figs/frame_10_delay-0.1s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/frame_10_delay-0.1s.png -------------------------------------------------------------------------------- /Figs/frame_20_delay-0.1s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/frame_20_delay-0.1s.png -------------------------------------------------------------------------------- /Figs/frame_30_delay-0.1s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/frame_30_delay-0.1s.png -------------------------------------------------------------------------------- /Figs/frame_40_delay-0.1s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/frame_40_delay-0.1s.png -------------------------------------------------------------------------------- /Figs/frame_50_delay-0.1s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/frame_50_delay-0.1s.png -------------------------------------------------------------------------------- /Figs/graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/graph.png -------------------------------------------------------------------------------- /Figs/graph2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/graph2.png -------------------------------------------------------------------------------- /Figs/hippo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/hippo.png -------------------------------------------------------------------------------- /Figs/hippo_kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/hippo_kernel.png -------------------------------------------------------------------------------- /Figs/hyena.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/hyena.png -------------------------------------------------------------------------------- /Figs/induct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/induct.png -------------------------------------------------------------------------------- /Figs/induct1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/induct1.png -------------------------------------------------------------------------------- /Figs/induct2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/induct2.png -------------------------------------------------------------------------------- /Figs/kernel.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/kernel.pdf -------------------------------------------------------------------------------- /Figs/kernel2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/kernel2.png -------------------------------------------------------------------------------- /Figs/listops-s4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/listops-s4.png -------------------------------------------------------------------------------- /Figs/llama.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/llama.png -------------------------------------------------------------------------------- /Figs/lra-s4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/lra-s4.png 
-------------------------------------------------------------------------------- /Figs/match.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/match.png -------------------------------------------------------------------------------- /Figs/model_architecture_comparison2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/model_architecture_comparison2.pdf -------------------------------------------------------------------------------- /Figs/out-rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/out-rnn.png -------------------------------------------------------------------------------- /Figs/out.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/out.png -------------------------------------------------------------------------------- /Figs/out2 (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/out2 (1).png -------------------------------------------------------------------------------- /Figs/out2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/out2.png -------------------------------------------------------------------------------- /Figs/out3 (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/out3 (1).png -------------------------------------------------------------------------------- /Figs/out3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/out3.png -------------------------------------------------------------------------------- /Figs/out4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/out4.png -------------------------------------------------------------------------------- /Figs/out5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/out5.png -------------------------------------------------------------------------------- /Figs/phase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/phase.png -------------------------------------------------------------------------------- /Figs/rnn.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/rnn.png -------------------------------------------------------------------------------- /Figs/sgconv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/sgconv.png -------------------------------------------------------------------------------- /Figs/shift.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/shift.png -------------------------------------------------------------------------------- /Figs/solve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/solve.png -------------------------------------------------------------------------------- /Figs/speech.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/speech.png -------------------------------------------------------------------------------- /Figs/ssm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ssm.png -------------------------------------------------------------------------------- /Figs/ssmrec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ssmrec.png -------------------------------------------------------------------------------- /Figs/ssmrec0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ssmrec0.png -------------------------------------------------------------------------------- /Figs/ssmrec1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/ssmrec1.png -------------------------------------------------------------------------------- /Figs/temp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/temp.png -------------------------------------------------------------------------------- /Figs/transformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/Figs/transformer.png -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | ===================== 3 | 4 | **Copyright (c) 2019 Anish Athalye (me@anishathalye.com)** 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of 7 | this software and associated documentation files (the "Software"), to 
deal in 8 | the Software without restriction, including without limitation the rights to 9 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 10 | of the Software, and to permit persons to whom the Software is furnished to do 11 | so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /MLSys_Slides (11).pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/MLSys_Slides (11).pdf -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BUILD := \ 2 | p \ 3 | p-notes \ 4 | 5 | 6 | DEPS := \ 7 | beamerthemeauriga.sty \ 8 | beamercolorthemeauriga.sty \ 9 | presentation.tex \ 10 | $(shell find slides -name '*.tex') \ 11 | 12 | 13 | LATEX := lualatex 14 | 15 | LATEXOPTS := -interaction nonstopmode 16 | 17 | TARGETS := $(patsubst %, %.pdf, $(BUILD)) 18 | 19 | # phony targets 20 | 21 | all: $(TARGETS) 22 | 23 | clean: 24 | rm -rf *.pdf *.aux *.bbl *.blg *.log *.nav *.out *.snm *.toc *.vrb 25 | 26 | .PHONY: all clean 27 | 28 | # main targets 29 | 30 | %.pdf: %.tex $(DEPS) 31 | $(eval SRC_$@ = $(patsubst %.tex, %, $<)) 32 | $(LATEX) $(LATEXOPTS) $(SRC_$@) 33 | $(LATEX) $(LATEXOPTS) $(SRC_$@) 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Do we need Attention? 
2 | 3 | Slides: https://github.com/srush/do-we-need-attention/blob/main/DoWeNeedAttention.pdf 4 | 5 | Video: https://www.youtube.com/watch?v=dKJEpOtVgXc 6 | -------------------------------------------------------------------------------- /SSM Start.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srush/do-we-need-attention/b846d5aeca259978740a074f6c7f1ce671c00f7a/SSM Start.pdf -------------------------------------------------------------------------------- /anthology.bib: -------------------------------------------------------------------------------- 1 | % Please download the latest anthology.bib from 2 | % 3 | % http://aclweb.org/anthology/anthology.bib.gz 4 | @article{gu2021efficiently, 5 | title={Efficiently Modeling Long Sequences with Structured State Spaces}, 6 | author={Gu, Albert and Goel, Karan and R{\'e}, Christopher}, 7 | journal={arXiv preprint arXiv:2111.00396}, 8 | year={2021} 9 | } 10 | 11 | @article{tay2020long, 12 | title={Long range arena: A benchmark for efficient transformers}, 13 | author={Tay, Yi and Dehghani, Mostafa and Abnar, Samira and Shen, Yikang and Bahri, Dara and Pham, Philip and Rao, Jinfeng and Yang, Liu and Ruder, Sebastian and Metzler, Donald}, 14 | journal={arXiv preprint arXiv:2011.04006}, 15 | year={2020} 16 | } 17 | 18 | @article{tay2020efficient, 19 | title={Efficient transformers: A survey}, 20 | author={Tay, Yi and Dehghani, Mostafa and Bahri, Dara and Metzler, Donald}, 21 | journal={arXiv preprint arXiv:2009.06732}, 22 | year={2020} 23 | } 24 | 25 | 26 | @inproceedings{katharopoulos2020transformers, 27 | title={Transformers are rnns: Fast autoregressive transformers with linear attention}, 28 | author={Katharopoulos, Angelos and Vyas, Apoorv and Pappas, Nikolaos and Fleuret, Fran{\c{c}}ois}, 29 | booktitle={International Conference on Machine Learning}, 30 | pages={5156--5165}, 31 | year={2020}, 32 | organization={PMLR} 33 | } 34 | 35 | 36 | @article{beltagy2020longformer, 37 | title={Longformer: The long-document transformer}, 38 | author={Beltagy, Iz and Peters, Matthew E and Cohan, Arman}, 39 | journal={arXiv preprint arXiv:2004.05150}, 40 | year={2020} 41 | } 42 | 43 | 44 | 45 | @article{izsak2021train, 46 | title={How to train bert with an academic budget}, 47 | author={Izsak, Peter and Berchansky, Moshe and Levy, Omer}, 48 | journal={arXiv preprint arXiv:2104.07705}, 49 | year={2021} 50 | } 51 | 52 | 53 | @article{gupta2022diagonal, 54 | title={Diagonal State Spaces are as Effective as Structured State Spaces}, 55 | author={Gupta, Ankit}, 56 | journal={arXiv preprint arXiv:2203.14343}, 57 | year={2022} 58 | } 59 | 60 | @article{devlin2018bert, 61 | title={Bert: Pre-training of deep bidirectional transformers for language understanding}, 62 | author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina}, 63 | journal={arXiv preprint arXiv:1810.04805}, 64 | year={2018} 65 | } 66 | 67 | @article{mccann2017learned, 68 | title={Learned in translation: Contextualized word vectors}, 69 | author={McCann, Bryan and Bradbury, James and Xiong, Caiming and Socher, Richard}, 70 | journal={Advances in neural information processing systems}, 71 | volume={30}, 72 | year={2017} 73 | } 74 | 75 | @article{peters2019tune, 76 | title={To tune or not to tune? 
adapting pretrained representations to diverse tasks}, 77 | author={Peters, Matthew E and Ruder, Sebastian and Smith, Noah A}, 78 | journal={arXiv preprint arXiv:1903.05987}, 79 | year={2019} 80 | } 81 | 82 | @article{vaswani2017attention, 83 | title={Attention is all you need}, 84 | author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia}, 85 | journal={Advances in neural information processing systems}, 86 | volume={30}, 87 | year={2017} 88 | } 89 | 90 | @article{rush2022s4, 91 | title={The Annotated S4}, 92 | author={Alexander Rush}, 93 | journal={International Conference on Learning Representations}, 94 | year={2022} 95 | } 96 | 97 | @inproceedings{wolf2020transformers, 98 | title={Transformers: State-of-the-art natural language processing}, 99 | author={Wolf, Thomas and Debut, Lysandre and Sanh, Victor and Chaumond, Julien and Delangue, Clement and Moi, Anthony and Cistac, Pierric and Rault, Tim and Louf, R{\'e}mi and Funtowicz, Morgan and others}, 100 | booktitle={Proceedings of the 2020 conference on empirical methods in natural language processing: system demonstrations}, 101 | pages={38--45}, 102 | year={2020} 103 | } 104 | 105 | @article{gu2020hippo, 106 | title={Hippo: Recurrent memory with optimal polynomial projections}, 107 | author={Gu, Albert and Dao, Tri and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher}, 108 | journal={Advances in Neural Information Processing Systems}, 109 | volume={33}, 110 | pages={1474--1487}, 111 | year={2020} 112 | } 113 | 114 | @article{gu2021combining, 115 | title={Combining Recurrent, Convolutional, and Continuous-time Models with Linear State Space Layers}, 116 | author={Gu, Albert and Johnson, Isys and Goel, Karan and Saab, Khaled and Dao, Tri and Rudra, Atri and R{\'e}, Christopher}, 117 | journal={Advances in Neural Information Processing Systems}, 118 | volume={34}, 119 | year={2021} 120 | } 121 | 122 | @article{liu2019roberta, 123 | title={Roberta: A robustly optimized bert pretraining approach}, 124 | author={Liu, Yinhan and Ott, Myle and Goyal, Naman and Du, Jingfei and Joshi, Mandar and Chen, Danqi and Levy, Omer and Lewis, Mike and Zettlemoyer, Luke and Stoyanov, Veselin}, 125 | journal={arXiv preprint arXiv:1907.11692}, 126 | year={2019} 127 | } 128 | 129 | @article{loshchilov2017decoupled, 130 | title={Decoupled weight decay regularization}, 131 | author={Loshchilov, Ilya and Hutter, Frank}, 132 | journal={arXiv preprint arXiv:1711.05101}, 133 | year={2017} 134 | } 135 | 136 | @article{lewis2019bart, 137 | title={Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension}, 138 | author={Lewis, Mike and Liu, Yinhan and Goyal, Naman and Ghazvininejad, Marjan and Mohamed, Abdelrahman and Levy, Omer and Stoyanov, Ves and Zettlemoyer, Luke}, 139 | journal={arXiv preprint arXiv:1910.13461}, 140 | year={2019} 141 | } 142 | 143 | @article{wang2018glue, 144 | title={GLUE: A multi-task benchmark and analysis platform for natural language understanding}, 145 | author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R}, 146 | journal={arXiv preprint arXiv:1804.07461}, 147 | year={2018} 148 | } 149 | 150 | @inproceedings{hua2022transformer, 151 | title={Transformer quality in linear time}, 152 | author={Hua, Weizhe and Dai, Zihang and Liu, Hanxiao and Le, Quoc}, 153 | booktitle={International Conference on Machine Learning}, 154 
| pages={9099--9117}, 155 | year={2022}, 156 | organization={PMLR} 157 | } 158 | 159 | @article{shaham2022scrolls, 160 | title={Scrolls: Standardized comparison over long language sequences}, 161 | author={Shaham, Uri and Segal, Elad and Ivgi, Maor and Efrat, Avia and Yoran, Ori and Haviv, Adi and Gupta, Ankit and Xiong, Wenhan and Geva, Mor and Berant, Jonathan and others}, 162 | journal={arXiv preprint arXiv:2201.03533}, 163 | year={2022} 164 | } 165 | 166 | @article{gu2022parameterization, 167 | title={On the parameterization and initialization of diagonal state space models}, 168 | author={Gu, Albert and Gupta, Ankit and Goel, Karan and R{\'e}, Christopher}, 169 | journal={arXiv preprint arXiv:2206.11893}, 170 | year={2022} 171 | } 172 | 173 | @article{mehta2022long, 174 | title={Long range language modeling via gated state spaces}, 175 | author={Mehta, Harsh and Gupta, Ankit and Cutkosky, Ashok and Neyshabur, Behnam}, 176 | journal={arXiv preprint arXiv:2206.13947}, 177 | year={2022} 178 | } 179 | 180 | @techreport{rumelhart1985learning, 181 | title={Learning internal representations by error propagation}, 182 | author={Rumelhart, David E and Hinton, Geoffrey E and Williams, Ronald J}, 183 | year={1985}, 184 | institution={California Univ San Diego La Jolla Inst for Cognitive Science} 185 | } 186 | 187 | @article{goel2022s, 188 | title={It's Raw! Audio Generation with State-Space Models}, 189 | author={Goel, Karan and Gu, Albert and Donahue, Chris and R{\'e}, Christopher}, 190 | journal={arXiv preprint arXiv:2202.09729}, 191 | year={2022} 192 | } 193 | 194 | @article{tay2021pre, 195 | title={Are Pre-trained Convolutions Better than Pre-trained Transformers?}, 196 | author={Tay, Yi and Dehghani, Mostafa and Gupta, Jai and Bahri, Dara and Aribandi, Vamsi and Qin, Zhen and Metzler, Donald}, 197 | journal={arXiv preprint arXiv:2105.03322}, 198 | year={2021} 199 | } 200 | 201 | @inproceedings{DBLP:conf/naacl/PetersNIGCLZ18, 202 | author = {Matthew E. Peters and 203 | Mark Neumann and 204 | Mohit Iyyer and 205 | Matt Gardner and 206 | Christopher Clark and 207 | Kenton Lee and 208 | Luke Zettlemoyer}, 209 | editor = {Marilyn A. 
Walker and 210 | Heng Ji and 211 | Amanda Stent}, 212 | title = {Deep Contextualized Word Representations}, 213 | booktitle = {Proceedings of the 2018 Conference of the North American Chapter of 214 | the Association for Computational Linguistics: Human Language Technologies, 215 | {NAACL-HLT} 2018, New Orleans, Louisiana, USA, June 1-6, 2018, Volume 216 | 1 (Long Papers)}, 217 | pages = {2227--2237}, 218 | publisher = {Association for Computational Linguistics}, 219 | year = {2018}, 220 | url = {https://doi.org/10.18653/v1/n18-1202}, 221 | doi = {10.18653/v1/n18-1202}, 222 | timestamp = {Fri, 06 Aug 2021 00:41:32 +0200}, 223 | biburl = {https://dblp.org/rec/conf/naacl/PetersNIGCLZ18.bib}, 224 | bibsource = {dblp computer science bibliography, https://dblp.org} 225 | } 226 | 227 | 228 | @article{smith2022simplified, 229 | title={Simplified state space layers for sequence modeling}, 230 | author={Smith, Jimmy TH and Warrington, Andrew and Linderman, Scott W}, 231 | journal={arXiv preprint arXiv:2208.04933}, 232 | year={2022} 233 | } 234 | 235 | 236 | @article{lee2021fnet, 237 | title={Fnet: Mixing tokens with fourier transforms}, 238 | author={Lee-Thorp, James and Ainslie, Joshua and Eckstein, Ilya and Ontanon, Santiago}, 239 | journal={arXiv preprint arXiv:2105.03824}, 240 | year={2021} 241 | } 242 | 243 | @article{marvin2018targeted, 244 | title={Targeted syntactic evaluation of language models}, 245 | author={Marvin, Rebecca and Linzen, Tal}, 246 | journal={arXiv preprint arXiv:1808.09031}, 247 | year={2018} 248 | } 249 | 250 | @article{linzen2016assessing, 251 | title={Assessing the ability of LSTMs to learn syntax-sensitive dependencies}, 252 | author={Linzen, Tal and Dupoux, Emmanuel and Goldberg, Yoav}, 253 | journal={Transactions of the Association for Computational Linguistics}, 254 | volume={4}, 255 | pages={521--535}, 256 | year={2016}, 257 | publisher={MIT Press} 258 | } 259 | 260 | @article{goldberg2019assessing, 261 | title={Assessing BERT's syntactic abilities}, 262 | author={Goldberg, Yoav}, 263 | journal={arXiv preprint arXiv:1901.05287}, 264 | year={2019} 265 | } 266 | 267 | @inproceedings{dauphin2017language, 268 | title={Language modeling with gated convolutional networks}, 269 | author={Dauphin, Yann N and Fan, Angela and Auli, Michael and Grangier, David}, 270 | booktitle={International conference on machine learning}, 271 | pages={933--941}, 272 | year={2017}, 273 | organization={PMLR} 274 | } 275 | 276 | @article{shazeer2020glu, 277 | title={Glu variants improve transformer}, 278 | author={Shazeer, Noam}, 279 | journal={arXiv preprint arXiv:2002.05202}, 280 | year={2020} 281 | } 282 | 283 | @article{narang2021transformer, 284 | title={Do transformer modifications transfer across implementations and applications?}, 285 | author={Narang, Sharan and Chung, Hyung Won and Tay, Yi and Fedus, William and Fevry, Thibault and Matena, Michael and Malkan, Karishma and Fiedel, Noah and Shazeer, Noam and Lan, Zhenzhong and others}, 286 | journal={arXiv preprint arXiv:2102.11972}, 287 | year={2021} 288 | } 289 | 290 | @article{warstadt2019linguistic, 291 | title={Linguistic analysis of pretrained sentence encoders with acceptability judgments}, 292 | author={Warstadt, Alex and Bowman, Samuel R}, 293 | journal={arXiv preprint arXiv:1901.03438}, 294 | year={2019} 295 | } 296 | 297 | @article{gulordava2018colorless, 298 | title={Colorless green recurrent networks dream hierarchically}, 299 | author={Gulordava, Kristina and Bojanowski, Piotr and Grave, Edouard and Linzen, Tal and 
Baroni, Marco}, 300 | journal={arXiv preprint arXiv:1803.11138}, 301 | year={2018} 302 | } 303 | 304 | @article{clark2019does, 305 | title={What does bert look at? an analysis of bert's attention}, 306 | author={Clark, Kevin and Khandelwal, Urvashi and Levy, Omer and Manning, Christopher D}, 307 | journal={arXiv preprint arXiv:1906.04341}, 308 | year={2019} 309 | } 310 | 311 | @article{tenney2019bert, 312 | title={BERT rediscovers the classical NLP pipeline}, 313 | author={Tenney, Ian and Das, Dipanjan and Pavlick, Ellie}, 314 | journal={arXiv preprint arXiv:1905.05950}, 315 | year={2019} 316 | } 317 | 318 | @inproceedings{rajpurkar2016squad, 319 | title={SQuAD: 100,000+ Questions for Machine Comprehension of Text}, 320 | author={Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and Liang, Percy}, 321 | booktitle={Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing}, 322 | pages={2383--2392}, 323 | year={2016} 324 | } 325 | 326 | @article{wettig2022should, 327 | title={Should You Mask 15\% in Masked Language Modeling?}, 328 | author={Wettig, Alexander and Gao, Tianyu and Zhong, Zexuan and Chen, Danqi}, 329 | journal={arXiv preprint arXiv:2202.08005}, 330 | year={2022} 331 | } 332 | 333 | @article{warstadt2019neural, 334 | title={Neural network acceptability judgments}, 335 | author={Warstadt, Alex and Singh, Amanpreet and Bowman, Samuel R}, 336 | journal={Transactions of the Association for Computational Linguistics}, 337 | volume={7}, 338 | pages={625--641}, 339 | year={2019}, 340 | publisher={MIT Press} 341 | } 342 | 343 | @article{hendrycks2016gaussian, 344 | title={Gaussian error linear units (gelus)}, 345 | author={Hendrycks, Dan and Gimpel, Kevin}, 346 | journal={arXiv preprint arXiv:1606.08415}, 347 | year={2016} 348 | } 349 | 350 | @article{ma2022mega, 351 | title={Mega: moving average equipped gated attention}, 352 | author={Ma, Xuezhe and Zhou, Chunting and Kong, Xiang and He, Junxian and Gui, Liangke and Neubig, Graham and May, Jonathan and Zettlemoyer, Luke}, 353 | journal={arXiv preprint arXiv:2209.10655}, 354 | year={2022} 355 | } 356 | 357 | @article{dao2022hungry, 358 | title={Hungry Hungry Hippos: Towards Language Modeling with State Space Models}, 359 | author={Dao, Tri and Fu, Daniel Y and Saab, Khaled K and Thomas, Armin W and Rudra, Atri and R{\'e}, Christopher}, 360 | journal={arXiv preprint arXiv:2212.14052}, 361 | year={2022} 362 | } 363 | 364 | @inproceedings{joshi2017triviaqa, 365 | title={TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}, 366 | author={Joshi, Mandar and Choi, Eunsol and Weld, Daniel S and Zettlemoyer, Luke}, 367 | booktitle={Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, 368 | pages={1601--1611}, 369 | year={2017} 370 | } 371 | 372 | @inproceedings{yang2015wikiqa, 373 | title={Wikiqa: A challenge dataset for open-domain question answering}, 374 | author={Yang, Yi and Yih, Wen-tau and Meek, Christopher}, 375 | booktitle={Proceedings of the 2015 conference on empirical methods in natural language processing}, 376 | pages={2013--2018}, 377 | year={2015} 378 | } 379 | 380 | -------------------------------------------------------------------------------- /beamercolorthemeauriga.sty: -------------------------------------------------------------------------------- 1 | % Auriga theme 2 | % https://github.com/anishathalye/auriga 3 | 4 | % ==================== 5 | % Definitions 6 | % ==================== 7 | 
8 | \definecolor{lightgray}{RGB}{245, 246, 250} 9 | \definecolor{darkgray}{RGB}{79,79,79} 10 | 11 | % ==================== 12 | % Theme 13 | % ==================== 14 | 15 | % Basic colors 16 | \setbeamercolor{palette primary}{fg=black,bg=white} 17 | \setbeamercolor{palette secondary}{fg=black,bg=white} 18 | \setbeamercolor{palette tertiary}{bg=black,fg=white} 19 | \setbeamercolor{palette quaternary}{fg=black,bg=white} 20 | \setbeamercolor{structure}{fg=darkgray} 21 | 22 | % Itemize 23 | \setbeamercolor{item}{fg=black} 24 | 25 | % Page numbering 26 | \setbeamercolor{page number in head/foot}{fg=structure.fg} 27 | 28 | % Frame titles 29 | \setbeamercolor{frametitle}{fg=black} 30 | -------------------------------------------------------------------------------- /beamerthemeauriga.sty: -------------------------------------------------------------------------------- 1 | % Auriga theme 2 | % https://github.com/anishathalye/auriga 3 | 4 | % ==================== 5 | % Dependencies 6 | % ==================== 7 | 8 | \RequirePackage{exscale} 9 | \RequirePackage{ragged2e} 10 | \RequirePackage{changepage} 11 | \RequirePackage{fontspec} 12 | \RequirePackage{xpatch} 13 | 14 | % ==================== 15 | % Fonts 16 | % ==================== 17 | 18 | \newfontfamily\Raleway[Ligatures=TeX]{Raleway} 19 | \newfontfamily\Lato[Ligatures=TeX]{Lato} 20 | 21 | \usefonttheme{professionalfonts} 22 | 23 | \setsansfont{Lato}[ 24 | UprightFont=*-Regular, 25 | ItalicFont=*-Italic, 26 | BoldFont=*-Bold, 27 | BoldItalicFont=*-BoldItalic 28 | ] 29 | \setmonofont{Hack} 30 | 31 | \setbeamerfont{title page}{family=\Raleway} 32 | \setbeamerfont{title page title}{size=\LARGE,series=\bfseries} 33 | \setbeamerfont{title page author}{size=\footnotesize} 34 | \setbeamerfont{title page institute}{size=\scriptsize} 35 | \setbeamerfont{frametitle}{family=\Raleway,size=\large,series=\bfseries} 36 | \setbeamerfont{caption}{size=\footnotesize} 37 | 38 | 39 | % ==================== 40 | % Macros 41 | % ==================== 42 | 43 | \newcommand{\samelineand}{\qquad} 44 | 45 | % ==================== 46 | % Elements 47 | % ==================== 48 | 49 | % Itemize 50 | 51 | \setbeamertemplate{itemize item}[circle] 52 | \setbeamertemplate{itemize subitem}[circle] 53 | \setbeamertemplate{itemize subsubitem}[circle] 54 | \xpatchcmd{\itemize} 55 | {\def\makelabel} 56 | {\ifnum\@itemdepth=1\relax 57 | \setlength\itemsep{3ex}% separation for first level 58 | \else 59 | \ifnum\@itemdepth=2\relax 60 | \setlength\itemsep{0.5ex}% separation for second level 61 | \else 62 | \ifnum\@itemdepth=3\relax 63 | \setlength\itemsep{0.5ex}% separation for third level 64 | \fi\fi\fi\def\makelabel 65 | } 66 | {} 67 | {} 68 | 69 | % Equation 70 | \setlength\belowdisplayshortskip{2ex} 71 | 72 | % Caption 73 | \setlength{\abovecaptionskip}{2ex} 74 | \setlength{\belowcaptionskip}{1ex} 75 | \setbeamertemplate{caption} 76 | { 77 | {\usebeamerfont{caption}\insertcaption} 78 | } 79 | 80 | % Navigation 81 | \beamertemplatenavigationsymbolsempty 82 | 83 | % ==================== 84 | % Components 85 | % ==================== 86 | 87 | % Title page 88 | \setbeamertemplate{title page} 89 | { 90 | \begin{centering} 91 | \vskip5ex plus 1filll 92 | {\usebeamerfont{title page title}\usebeamercolor[fg]{title page}\inserttitle\\[1.5ex]} 93 | {\usebeamerfont{title page author}\usebeamercolor[fg]{title page}\insertauthor\\[2ex]} 94 | {\usebeamerfont{title page institute}\usebeamercolor[fg]{title page}\insertinstitute\\[1ex]} 95 | \vskip0pt plus 1filll 96 | \end{centering} 97 | } 98 | 
99 | % Footer 100 | \setbeamertemplate{footline}{ 101 | \hfill% 102 | \usebeamercolor[fg]{page number in head/foot}% 103 | \usebeamerfont{page number in head/foot}% 104 | \hspace{2em}% 105 | %\insertframenumber\kern1em\vskip2ex% 106 | } 107 | 108 | % Frame title 109 | \setbeamertemplate{frametitle}{ 110 | \nointerlineskip 111 | \vskip2ex 112 | {\usebeamerfont{frametitle}\usebeamercolor[fg]{frametitle}\insertframetitle} 113 | } 114 | 115 | \renewcommand\footnoterule{} 116 | 117 | \setbeamertemplate{footnote}{% 118 | \parindent 0.5em\noindent% 119 | \raggedleft 120 | \usebeamercolor{footnote}\hbox to 5.8em{}\scriptsize \insertfootnotetext\par% 121 | } -------------------------------------------------------------------------------- /old.tex: -------------------------------------------------------------------------------- 1 | % Auriga theme 2 | % find the most up-to-date version here: https://github.com/anishathalye/auriga 3 | 4 | \documentclass[14pt,aspectratio=169]{beamer} 5 | \usepackage{pgfpages} 6 | \usepackage{fancyvrb} 7 | \usepackage{tikz} 8 | \usepackage{pgfplots} 9 | \usepackage{booktabs} 10 | \includeonlyframes{current} 11 | 12 | \usetheme{auriga} 13 | \usecolortheme{auriga} 14 | \setbeamercolor{math text}{fg=blue} 15 | 16 | \newcommand\blfootnote[1]{% 17 | \begingroup 18 | \renewcommand\thefootnote{}\footnote{#1}% 19 | \addtocounter{footnote}{-1}% 20 | \endgroup 21 | } 22 | 23 | %\setbeamertemplate{footline}[] 24 | %\renewcommand\footnotemark{} 25 | 26 | 27 | % define some colors for a consistent theme across slides 28 | \definecolor{red}{RGB}{181, 23, 0} 29 | \definecolor{blue}{RGB}{0, 118, 186} 30 | \definecolor{gray}{RGB}{146, 146, 146} 31 | 32 | \title{Do we need \textcolor{blue}{Attention}?} 33 | 34 | \author{Alexander "Sasha" Rush} 35 | 36 | \institute[shortinst]{} 37 | 38 | \begin{document} 39 | 40 | { 41 | % rather than use the frame options [noframenumbering,plain], we make the 42 | % color match, so that the indicated page numbers match PDF page numbers 43 | \setbeamercolor{page number in head/foot}{fg=background canvas.bg} 44 | \begin{frame} 45 | \titlepage 46 | \end{frame} 47 | } 48 | 49 | \begin{frame}[label=current]{} 50 | \cite{gu2022parameterization} 51 | \cite{} 52 | \cite{dao2022hungry} 53 | \cite{ma2022mega} 54 | \end{frame} 55 | 56 | 57 | \section{Context} 58 | % \begin{frame}{Outline} 59 | % \tableofcontents 60 | % \end{frame} 61 | 62 | 63 | \begin{frame}[label=current]{Caveats} 64 | \begin{itemize} 65 | \item LLMs are remarkable, we should use them for most things 66 | \item This talk is \structure{not} about LLMs 67 | \end{itemize} 68 | \end{frame} 69 | 70 | 71 | \begin{frame} 72 | \includegraphics[trim={10cm 0 10cm 0}, clip, height=\textheight]{Figs/ModelSize2.png} 73 | \end{frame} 74 | 75 | \begin{frame} 76 | \includegraphics[trim={10cm 0 10cm 0}, clip, height=\textheight]{Figs/ModelSize3.png} 77 | \end{frame} 78 | 79 | \begin{frame}{Context} 80 | \begin{itemize} 81 | \item BERT used to require non-trivial compute 82 | \item Belief: Open architecture questions in NLP 83 | \item Today's Talk: How important is \textit{attention}? 
84 | \end{itemize} 85 | \end{frame} 86 | 87 | 88 | \begin{frame}{\textcolor{red}{ELMo} } 89 | 90 | \begin{columns} 91 | \begin{column}{0.3\linewidth} 92 | \centerline{Bidirectional RNN} 93 | \end{column} 94 | \begin{column}{0.7\linewidth} 95 | 96 | \begin{figure} 97 | \includegraphics[width=0.8\textwidth]{Figs/elmo.png} 98 | \end{figure} 99 | \end{column} 100 | \end{columns} 101 | \blfootnote{\cite{DBLP:conf/naacl/PetersNIGCLZ18}} 102 | 103 | \end{frame} 104 | 105 | \begin{frame}{\textcolor{red}{ELMo} For Pretraining} 106 | \begin{table} 107 | \begin{tabular}{lc} 108 | \toprule 109 | Model & GLUE\\ 110 | \midrule 111 | ELMo& 67.7 \\ 112 | ELMo+Attn& 71.0\\ 113 | \visible<2>{BERT-Base & 79 - 83} \\ 114 | \bottomrule 115 | \end{tabular} 116 | \end{table} 117 | \blfootnote{\cite{DBLP:conf/naacl/PetersNIGCLZ18, devlin2018bert}} 118 | \end{frame} 119 | 120 | \begin{frame}{Architecture?} 121 | \begin{itemize} 122 | \item 123 | Several confounding differences, e.g. frozen model. 124 | \item Followup: \textit{To Tune or Not to Tune? Adapting Pretrained Representations to Diverse Tasks} \cite{peters2019tune} 125 | \pause 126 | 127 | \item Conclusion: Transformers significantly beat BiLSTMs 128 | \end{itemize} 129 | \end{frame} 130 | 131 | \begin{frame}{Other Models} 132 | 133 | Maybe there are other models 134 | 135 | \vspace{0.5cm} 136 | 137 | \begin{itemize} 138 | \item Convolutions? 139 | \item Mixers? 140 | \end{itemize} 141 | 142 | % \textit{Are Pre-trained Convolutions Better than Pre-trained Transformers?} 143 | % \\ 144 | % \\ 145 | % Answer: No. 146 | 147 | \end{frame} 148 | 149 | \begin{frame}{Pretraining with CNNs} 150 | \textit{Are Pre-trained Convolutions Better than Pre-trained Transformers?} \cite{tay2020efficient} 151 | 152 | \vspace{0.5cm} 153 | 154 | \visible<2>{\structure{Answer: No.} 155 | 156 | \begin{table} 157 | \begin{tabular}{lc} 158 | \toprule 159 | Model & SST-2\\ 160 | \midrule 161 | ELMo & 91.8 \\ 162 | Best CNN & 92.2 \\ 163 | BERT-Base & 93.5 \\ 164 | \bottomrule 165 | \end{tabular} 166 | \end{table} 167 | 168 | } 169 | 170 | \end{frame} 171 | 172 | 173 | % \begin{frame}{Results: CNNs} 174 | % \begin{table} 175 | % \begin{tabular}{lc} 176 | % \toprule 177 | % Model & SST-2\\ 178 | % \midrule 179 | % Best CNN & 92.2 \\ 180 | % ELMo & 91.8 \\ 181 | % BERT-Base & 93.5 \\ 182 | % \bottomrule 183 | % \end{tabular} 184 | % \end{table} 185 | % \end{frame} 186 | 187 | \begin{frame}{Pretraining with FNet} 188 | \textit{FNet: Mixing Tokens with Fourier Transforms} \cite{lee2021fnet} 189 | 190 | \vspace{0.5cm} 191 | 192 | Replaces attention with 2D FFT mixing-layer. 193 | 194 | \visible<2>{ 195 | \begin{table} 196 | \begin{tabular}{lc} 197 | \toprule 198 | Model & GLUE (dev)\\ 199 | \midrule 200 | Best FNet & 76.3 \\ 201 | BERT-Base & 83.3 \\ 202 | \bottomrule 203 | \end{tabular} 204 | \end{table} 205 | } 206 | \end{frame} 207 | 208 | 209 | 210 | \begin{frame}{Transformers are Great...} 211 | \begin{itemize} 212 | \item Highly optimized training 213 | \item Long-range ability 214 | \item Expensive $O(n^2)$, but we have the money... 
215 | \end{itemize} 216 | \vspace{0.5cm} 217 | 218 | \visible<2>{(But aren't you curious...)} 219 | \end{frame} 220 | 221 | \section{State Space Models} 222 | \begin{frame}{Outline} 223 | \tableofcontents[currentsection] 224 | \end{frame} 225 | 226 | 227 | \begin{frame}{State Space Models (SSM)} 228 | \begin{itemize} 229 | 230 | \item Think hybrid RNN / CNN 231 | 232 | \item SOTA on speech generation and long-range tasks 233 | 234 | \item Tutorial at \textit{The Annotated S4} 235 | \end{itemize} 236 | 237 | \blfootnote{\cite{gu2020hippo,gu2021combining,gu2021efficiently}} 238 | \end{frame} 239 | 240 | 241 | \begin{frame}{State Space Model - Continuous Time} 242 | Let $u(t) \in \mathbb{R}$ be a continuous input and $y(t) \in \mathbb{R}$ be output. 243 | 244 | \pause 245 | \vspace{0.5cm} 246 | 247 | SSM is a differential equation. 248 | \begin{align*} 249 | \boldsymbol{x}'(t) &= \boldsymbol{A}\boldsymbol{x}(t) + \boldsymbol{B}u(t) \\ 250 | y(t) &= \boldsymbol{C}\boldsymbol{x}(t) + \boldsymbol{D}u(t). 251 | \end{align*} 252 | 253 | \pause 254 | Where $\boldsymbol{x}(t) \in \mathbf{R}^N$ is a hidden state and model \structure{parameters}, 255 | 256 | $$\boldsymbol{A} \in \mathbb{R}^{N\times N}, \boldsymbol{B}\in \mathbb{R}^{N \times 1}, \boldsymbol{C} \in \mathbb{R}^{1 \times N}, \boldsymbol{D} \in \mathbb{R}^{1\times 1}$$ 257 | 258 | \end{frame} 259 | \begin{frame}{Discrete Time Sequence} 260 | 261 | Goal: Map scalar sequence $u_{1}, \ldots, u_L$ to $y_1, \ldots, y_L$, 262 | 263 | \begin{figure} 264 | \centering 265 | \includegraphics[width=0.5\textwidth]{Figs/SSMStart.pdf} 266 | \label{fig:my_label} 267 | \end{figure} 268 | \end{frame} 269 | 270 | \begin{frame}{Discrete Time SSM} 271 | 272 | SSM on discretize time data, 273 | 274 | \begin{align*} 275 | \boldsymbol{x}_{k} &= \boldsymbol{\overline{A}} \boldsymbol{x}_{k-1} + \boldsymbol{\overline{B}} u_k \\ 276 | y_k &= \boldsymbol{\overline{C}} \boldsymbol{x}_{k \phantom{- 1}} + \boldsymbol{\overline{D}} u_k. 277 | \end{align*} 278 | 279 | Using discretization with (learned) sampling rate parameter $\Delta$, 280 | 281 | $$\boldsymbol{\overline{A}}, \boldsymbol{\overline{B}}, \boldsymbol{\overline{C}} = \text{discretize}(\boldsymbol{A}, \boldsymbol{B}, \boldsymbol{C}, \Delta )$$ 282 | 283 | \end{frame} 284 | 285 | \begin{frame}{Recurrent Form} 286 | 287 | Output sequence $y_1, \ldots, y_L$ can be computed as a linear RNN, 288 | 289 | \begin{align*} 290 | \boldsymbol{x}_{k} &= \boldsymbol{\overline{A}} \boldsymbol{x}_{k-1} + \boldsymbol{\overline{B}} u_k \\ 291 | y_k &= \boldsymbol{\overline{C}} \boldsymbol{x}_{k \phantom{- 1}} + \boldsymbol{\overline{D}} u_k. 292 | \end{align*} 293 | 294 | Note $\boldsymbol{x}_k \in \mathbb{R}^N$ is the bigger hidden state for $u_k \in \mathbb{R}$, and $\boldsymbol{x}_0 = \mathbf{0}$. 
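% Note (source comment, not rendered): the "Discrete Time SSM" slide leaves discretize(.) abstract.
% One standard choice, used by S4 and The Annotated S4, is the bilinear transform:
%   \overline{A} = (I - \Delta/2 \cdot A)^{-1} (I + \Delta/2 \cdot A),
%   \overline{B} = (I - \Delta/2 \cdot A)^{-1} \Delta B, \qquad \overline{C} = C.
% Unrolling the recurrence above from x_0 = 0 gives
%   y_k = \sum_{j=0}^{k-1} \overline{C}\,\overline{A}^{j}\,\overline{B}\, u_{k-j} + \overline{D} u_k,
% which is exactly the convolution with kernel (\overline{C}\overline{B}, \overline{C}\overline{A}\overline{B}, \ldots, \overline{C}\overline{A}^{L-1}\overline{B}) on the next slide.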
295 | 296 | \end{frame} 297 | 298 | \begin{frame}{Convolutional Form} 299 | 300 | Alternative: 1D convolution with kernel $\boldsymbol{\overline{K}}$ (width $L$), 301 | 302 | \begin{align*} 303 | \overline{K} &= (\boldsymbol{\overline{C}}\boldsymbol{\overline{B}}, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}\boldsymbol{\overline{B}}, \dots, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}^{L-1}\boldsymbol{\overline{B}}) \\ 304 | y &= \text{conv1d}(\overline{K}_L \ldots \overline{K}_1, u_1 \ldots u_L) 305 | \end{align*} 306 | 307 | Intuition: 308 | \pause 309 | $$y_1 = \boldsymbol{\overline{C}} \boldsymbol{\overline{B}} u_1$$ 310 | \pause 311 | $$y_2 = \boldsymbol{\overline{C}} \boldsymbol{\overline{A}} \boldsymbol{\overline{B}} u_1 + \boldsymbol{\overline{C}} \boldsymbol{\overline{B}} u_2 = \boldsymbol{\overline{C}} (\boldsymbol{\overline{A}} \boldsymbol{\overline{B}} u_1 + \boldsymbol{\overline{B}} u_2) = \boldsymbol{\overline{C}} (\boldsymbol{x}_1 + \boldsymbol{\overline{B}} u_2) $$ 312 | \end{frame} 313 | 314 | \begin{frame}{Convolutional Form} 315 | Step 1: Discretize (Training Only). Step 2: Apply 1D Conv 316 | \begin{figure} 317 | \centering 318 | \includegraphics[width=0.6\textwidth]{Figs/SSMSide.pdf} 319 | \label{fig:my_label} 320 | \end{figure} 321 | \end{frame} 322 | 323 | \begin{frame}{Implementation - Computing Kernel} 324 | 325 | $$\boldsymbol{\overline{K}} = (\boldsymbol{\overline{C}}\boldsymbol{\overline{B}}, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}\boldsymbol{\overline{B}}, \dots, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}^{L-1}\boldsymbol{\overline{B}}) $$ 326 | 327 | \begin{itemize} 328 | \item Simple approximations work well (See S4D, DSS) 329 | \end{itemize} 330 | \blfootnote{\cite{gu2021efficiently,gupta2022diagonal,gu2022parameterization}} 331 | \end{frame} 332 | 333 | 334 | \begin{frame}{Implementation - Fourier Transform} 335 | \begin{align*} 336 | &y = \boldsymbol{\overline{K}} \ast u 337 | \end{align*} 338 | \begin{itemize} 339 | \item At long $L$, convolution computed with FFT. 340 | \item More efficient than self-attention or standard RNN. 
341 | \end{itemize} 342 | \end{frame} 343 | 344 | 345 | \begin{frame}{Important Training Initialization} 346 | \begin{itemize} 347 | \item Parameter $\boldsymbol{A}$ is initialized with HiPPO Matrix \cite{gu2020hippo} 348 | 349 | % \begin{scriptsize} 350 | % \begin{align*} 351 | % \boldsymbol{A}_{nk}= - 352 | % \begin{cases} 353 | % (2n+1)^{1/2}(2k+1)^{1/2} & \text{if } n > k \\ n+1 &\text{if } n=k \text{\ else\ } 0 354 | % \end{cases} 355 | % \end{align*} 356 | % \end{scriptsize} 357 | 358 | 359 | \item Kernel formed by Legendre coefficients 360 | \end{itemize} 361 | \begin{figure} 362 | \centering 363 | \includegraphics[width=0.7\textwidth]{Figs/hippo.png} 364 | \end{figure} 365 | \end{frame} 366 | 367 | 368 | 369 | \begin{frame}{Summary: SSM} 370 | \begin{itemize} 371 | \item Mapping from sequence-to-sequence 372 | \item Acts like an RNN, Computed like a CNN 373 | \item Fast to train and utilize 374 | \end{itemize} 375 | \end{frame} 376 | 377 | \section{Model Architectures} 378 | \begin{frame}{Outline} 379 | \tableofcontents[currentsection] 380 | \end{frame} 381 | 382 | \begin{frame}{Objective: Replicate BERT with SSM} 383 | \begin{itemize} 384 | \item Everything else identical (loss, number of parameters, data) 385 | \end{itemize} 386 | \end{frame} 387 | 388 | % \begin{frame}{Architectures for Pretraining} 389 | % \begin{itemize} 390 | % \item Idea 1: Just replace self-attention 391 | % \item Minimal change to Transformer arch 392 | % \end{itemize} 393 | % \end{frame} 394 | 395 | 396 | \begin{frame}{\structure{Naive Idea} Self-attention $\Rightarrow$ SSM} 397 | \begin{figure} 398 | \centering 399 | \includegraphics[height=0.8\textheight,trim={0 0 18cm 0},clip]{Figs/model_architecture_comparison2.pdf} 400 | \caption{} 401 | \label{} 402 | \end{figure} 403 | \end{frame} 404 | 405 | \begin{frame}{Can this work?} 406 | \begin{itemize} 407 | \item SSM is significantly less expressive than self-attention. 408 | \item Static routing through the model like a CNN. 409 | \item Can it learn to do \structure{matching} across sentences? 410 | \end{itemize} 411 | \pause 412 | \vspace{0.5cm} 413 | 414 | 415 | 416 | 417 | \end{frame} 418 | 419 | 420 | \begin{frame}{Test: Matching Across Gaps} 421 | \centerline{Task: QNLI \cite{wang2018glue}} 422 | \vspace{0.5cm} 423 | 424 | 425 | \centerline{\textcolor{red}{What percentage of farmland grows wheat?}} 426 | 427 | \centerline{$\sim \sim \sim $} 428 | 429 | \centerline{\textcolor{olivegreen}{More than 50\% of this area is sown for wheat and 33\% for barley.}} 430 | 431 | \pause 432 | 433 | \begin{table}[t] 434 | \center 435 | \begin{tabular}{ccc} 436 | \toprule 437 | \centering 438 | Arch & \textcolor{red}{H} P & \textcolor{red}{H} $\sim$ P \\ 439 | \midrule 440 | \textsc{stack} / \textsc{ssm} & 77.4 & 69.7\\ 441 | % \textsc{gated} / \textsc{ssm} & 77.4 & 77.7\\ 442 | \bottomrule 443 | \end{tabular} 444 | \caption{} 445 | \label{tab:synthetic} 446 | \end{table} 447 | \end{frame} 448 | 449 | 450 | 451 | % \begin{frame}{Does this work} 452 | 453 | % \end{frame} 454 | 455 | 456 | 457 | \begin{frame}{\structure{Proposed Fix}: Multiplicative Gating} 458 | 459 | Add dynamism to stacked model with multiplicative gating. 460 | 461 | $$\sigma(\mathbf{W} \mathbf{u}) \otimes (\mathbf{V} \mathbf{u})$$ 462 | 463 | Positive results with CNN, Transformer, and SSM models. 
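% Note (source comment, not rendered): a minimal shape sketch of the gate above,
% assuming u \in \mathbb{R}^d with learned W, V \in \mathbb{R}^{d \times d} (the exact
% projection sizes used in BiGS may differ): the sigmoid branch \sigma(W u) \in (0,1)^d
% rescales V u elementwise, so the output at each position depends multiplicatively on
% the content at that position, even though every SSM kernel in the stack is a fixed filter.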
464 | 465 | 466 | \blfootnote{\cite{dauphin2017language, shazeer2020glu, narang2021transformer}} 467 | 468 | \end{frame} 469 | 470 | \begin{frame}{Proposed Architecture: BiGS} 471 | \begin{figure} 472 | \centering 473 | \includegraphics[height=0.7\textheight,trim={16cm 0 0 0},clip]{Figs/model_architecture_comparison2.pdf} 474 | \caption{} 475 | \label{fig:my_label} 476 | \end{figure} 477 | \end{frame} 478 | 479 | \begin{frame}{Gating Adaptation} 480 | \centerline{\textcolor{red}{What percentage of farmland grows wheat?}} 481 | 482 | \centerline{$\sim \sim \sim $} 483 | 484 | \centerline{\textcolor{olivegreen}{More than 50\% of this area is sown for wheat and 33\% for barley.}} 485 | 486 | 487 | \begin{table}[t] 488 | \center 489 | \begin{tabular}{lcc} 490 | \toprule 491 | \centering 492 | Arch & \textcolor{red}{H} P & \textcolor{red}{H} $\sim$ P \\ 493 | \midrule 494 | \textsc{stack} / \textsc{ssm} & 77.4 & 69.7\\ 495 | \textsc{gated} / \textsc{ssm} & 77.4 & 77.7\\ 496 | \bottomrule 497 | \end{tabular} 498 | \caption{ } 499 | \label{tab:synthetic} 500 | \end{table} 501 | \pause 502 | 503 | 504 | \end{frame} 505 | 506 | \begin{frame}{Full Experiment: QNLI} 507 | 508 | Preview: Experimental results, pretraining for QNLI. 509 | 510 | \begin{figure} 511 | \centering 512 | \includegraphics[height=0.7\textheight]{Figs/graph.png} 513 | \label{fig:my_label} 514 | \end{figure} 515 | \end{frame} 516 | 517 | \begin{frame}{Related Result: Induction Heads (H3)} 518 | Synthetic \structure{induction head} experiment from \cite{dao2022hungry} 519 | 520 | \vspace{0.5cm} 521 | 522 | \centerline{a b c d e $\Rightarrow$ f g h i . . . x y z $\Rightarrow$ \ \ \ \ \textcolor{red}{f} } 523 | 524 | \begin{table}[t] 525 | \center 526 | \begin{tabular}{lcc} 527 | \toprule 528 | \centering 529 | Arch & Induction \\ 530 | \midrule 531 | \textsc{ssm} & 35.6 \\ 532 | \textsc{gating} + \textsc{ssm} & 100\\ 533 | \textsc{attention} & 100\\ 534 | \bottomrule 535 | \end{tabular} 536 | \caption{ } 537 | \label{tab:synthetic} 538 | \end{table} 539 | \end{frame} 540 | 541 | 542 | \begin{frame}{Induction Heads} 543 | 544 | \begin{columns} 545 | \begin{column}{0.5\textwidth} 546 | \begin{figure} 547 | \centering 548 | \includegraphics[height=0.8\textheight]{Figs/induct.png} 549 | 550 | \label{fig:my_label} 551 | \end{figure} 552 | \end{column} 553 | \begin{column}{0.5\textwidth} 554 | \begin{figure} 555 | \centering 556 | 557 | \includegraphics[height=0.3\textheight]{Figs/RASP.png} 558 | \label{fig:my_label} 559 | \end{figure} 560 | \end{column} 561 | \end{columns} 562 | 563 | \end{frame} 564 | 565 | 566 | 567 | % \begin{frame}{Gating} 568 | 569 | % \end{frame} 570 | 571 | % \begin{frame}{Simpler multiplicative Interactions} 572 | % \begin{figure} 573 | % \centering 574 | % % \includegraphics{Figs/model_architecture_comparison2.pdf} 575 | % \caption{Caption} 576 | % \label{fig:my_label} 577 | % \end{figure} 578 | % \end{frame} 579 | 580 | \section{Experiments} 581 | 582 | \begin{frame}{Outline} 583 | \tableofcontents[currentsection] 584 | \end{frame} 585 | 586 | 587 | \begin{frame}{\structure{Experiment 1:} BERT} 588 | \begin{itemize} 589 | \item Models trained using ``24 Hour'' BERT \cite{izsak2021train} 590 | \begin{itemize} 591 | \item All BERT-Large Size 592 | \item Training length (Short 11B, Medium 22B, Full >100B) 593 | \item 128 Length Sequences 594 | \end{itemize} 595 | 596 | \item Codebase in JAX (from Annotated S4 {\small \cite{rush2022s4}}) using S4D 597 | \item Training data and masking is identical 598 | 
\end{itemize} 599 | \end{frame} 600 | 601 | % \begin{frame}{Short Training $\sim$11B Tokens} 602 | % \begin{table} 603 | % \begin{tabular}{lc} 604 | % \toprule 605 | % Model & GLUE (Dev)\\ 606 | % \midrule 607 | % BERT & 84.1\\ 608 | % Stacked-SSM & 77.2 \\ 609 | % BiGS & 84.0 \\ 610 | % \bottomrule 611 | % \end{tabular} 612 | % \end{table} 613 | % \end{frame} 614 | 615 | \begin{frame}{Short Training $\sim$11B Tokens} 616 | \begin{table} 617 | \begin{tabular}{lc} 618 | \toprule 619 | Model & GLUE (Dev)\\ 620 | \midrule 621 | ELMo & 68.7 \\ 622 | BERT & 84.1\\ 623 | Stacked-SSM & 77.2 \\ 624 | BiGS & 84.0 \\ 625 | \bottomrule 626 | \end{tabular} 627 | \end{table} 628 | \end{frame} 629 | 630 | \begin{frame}{Is it just Gating?} 631 | \begin{table} 632 | \begin{tabular}{lc} 633 | \toprule 634 | Model & GLUE \\ 635 | \midrule 636 | BERT & 84.1\\ 637 | Gated-BERT & 82.6 \\ 638 | \bottomrule 639 | \end{tabular} 640 | \end{table} 641 | \end{frame} 642 | 643 | 644 | \begin{frame}{BERT Large > 100B Tokens} 645 | \begin{table} 646 | \begin{tabular}{lc} 647 | \toprule 648 | Model & GLUE (Test)\\ 649 | \midrule 650 | BERT-Large^* & 83.0\\ 651 | BiGS & 83.0 \\ 652 | \bottomrule 653 | \end{tabular} 654 | \end{table} 655 | \centerline{$^*$Best reported BERT-Large Results.} 656 | \end{frame} 657 | 658 | \begin{frame}{Analysis: Masked PPL Transfer} 659 | \begin{figure} 660 | \centering 661 | \includegraphics[width=0.6\textwidth]{Figs/MNLI.png} 662 | \end{figure} 663 | \end{frame} 664 | 665 | \begin{frame}{Analysis: Kernel Visualization} 666 | 667 | 668 | \begin{figure} 669 | \centering 670 | \includegraphics[width=\textwidth]{Figs/kernel1.png} 671 | \end{figure} 672 | 673 | \begin{itemize} 674 | \item Each BiGS layer only has 2 kernels (forward / backward). 675 | \item Shows \structure{all routing} in layer 2! (vs $O(HT^2)$ attention coef.) 676 | \end{itemize} 677 | \end{frame} 678 | 679 | \begin{frame}{Analysis: All Kernels} 680 | \begin{figure} 681 | \centering 682 | \includegraphics[height=0.6\textheight]{Figs/kernel2.png} 683 | \end{figure} 684 | \end{frame} 685 | 686 | \begin{frame}{Analysis: Change in Kernels during Finetuning } 687 | 688 | \centerline{Task: MNLI} 689 | \begin{figure} 690 | \centering 691 | \includegraphics[width=0.8\textwidth]{Figs/comparison_results.png} 692 | \end{figure} 693 | \end{frame} 694 | 695 | \begin{frame}{Analysis: Syntax} 696 | \begin{itemize} 697 | \item Observation: SSM model seems to do better on syntax-centric tasks 698 | \item Hypothesis: Locality of features encourages a stack-like inductive bias. 699 | \end{itemize} 700 | \end{frame} 701 | 702 | \begin{frame}{\structure{Observation 1}: COLA} 703 | \begin{table} 704 | \begin{tabular}{lc} 705 | \toprule 706 | Model & COLA \\ 707 | \midrule 708 | BERT & 60.5\\ 709 | BiGS & 64.7 \\ 710 | \bottomrule 711 | \end{tabular} 712 | \end{table} 713 | Statistically significant across runs. 714 | \end{frame} 715 | 716 | 717 | \begin{frame}{\structure{Observation 2}: Agreement Attractors} 718 | Task from \cite{linzen2016assessing,goldberg2019assessing}. 
719 | \vspace{0.5cm} 720 | 721 | \begin{quote} 722 | Yet the \textbf{ratio} of \underline{men} who survive to the \underline{women} and \underline{children} who survive [is] not clear in this story 723 | \end{quote} 724 | 725 | \begin{figure} 726 | \centering 727 | \includegraphics[height=0.5\textheight]{Figs/attractors.png} 728 | \label{fig:my_label} 729 | \end{figure} 730 | 731 | \end{frame} 732 | 733 | \begin{frame}{\structure{Observation 3}: Diagnostics } 734 | From \cite{marvin2018targeted,goldberg2019assessing}: 735 | \begin{table}[t] 736 | \centering 737 | \scriptsize 738 | \begin{tabular}{lrrr} 739 | \toprule 740 | & BiGS & BERT & LSTM \\ 741 | \midrule 742 | \textsl{SUBJECT-VERB:} & & & \\ 743 | Simple & 100.0 & 100.0& 94.0 \\ 744 | Sentential complement & 85.1 & 85.6 & 99.0 \\ 745 | Short VP coordination & 91.0 & 86.5 & 90.0 \\ 746 | Long VP coordination & 97.5 & 97.5 & 61.0 \\ 747 | Across prep phrase & 88.6 & 84.8 & 57.0 \\ 748 | Across subj relative clause & 88.4 & 84.9 & 56.0 \\ 749 | Across obj relative clause & 89.9 & 85.1 & 50.0 \\ 750 | Across obj relative (-that) & 86.9 & 81.1 & 52.0 \\ 751 | In obj relative clause & 97.2 & 99.1 & 84.0 \\ 752 | In obj relative (-that) & 88.7 & 81.6 & 71.0 \\ 753 | \midrule 754 | \textsl{REFL ANAPHORA:} & & & \\ 755 | Simple & 97.1 & 98.9 & 83.0 \\ 756 | In a sentential complement & 79.9 & 86.2 & 86.0 \\ 757 | Across a relative clause & 79.1 & 75.9 & 55.0 \\ 758 | \bottomrule 759 | \end{tabular} 760 | \end{table} 761 | \end{frame} 762 | 763 | 764 | \begin{frame}{\structure{Experiment 2:} Longformer} 765 | \begin{itemize} 766 | \item Can we lengthen SSM $L\rightarrow L'$ without approximation? 767 | 768 | \item Continued training based on Longformer protocol. 769 | 770 | \item Two experimental scales 771 | % \begin{itemize} 772 | % \item 128->512 SQuAD \cite{rajpurkar2016squad} 773 | % \item 128->4096 SCROLLS \cite{shaham2022scrolls} 774 | % \end{itemize} 775 | \end{itemize} 776 | \end{frame} 777 | 778 | % \begin{frame}{SQuAD} 779 | % \begin{table}[tb] 780 | % \centering 781 | % \begin{tabular}{ll|c} 782 | % \toprule 783 | % & & SQuAD 1.1 \\ 784 | % \midrule 785 | % BERT & (512) & 90.9\\ 786 | % \midrule 787 | % BERT &(128 $\rightarrow$ 512) & 87.3 \\ 788 | % BiGS & (128 $\rightarrow$ 512) & 89.5 \\ 789 | % \bottomrule 790 | % \end{tabular} 791 | % \caption{ } 792 | % \label{tab:squad} 793 | % \end{table} 794 | % \end{frame} 795 | 796 | 797 | 798 | \begin{frame}{SCROLLS} 799 | \begin{table}[tb] 800 | \centering 801 | \begin{tabular}{lr|cc} 802 | \toprule 803 | & Length & QALT & CNLI \\ 804 | \midrule 805 | LED & 1024 & 26.6/27.2 & 73.4\\ 806 | & 4096 & 26.6/27.3 & 71.5\\ 807 | & 16384 & 25.8/25.4 & 71.5\\ 808 | \midrule 809 | BART & 256 & 26.0/25.8 & 69.8\\ 810 | & 512 & 26.8/27.4 & 71.6\\ 811 | & 1024 & 26.0/25.9 & 77.4\\ 812 | \midrule 813 | BiGS & 128 & 32.3/30.0 & 68.7 \\ 814 | % BiGS & 1024 & & \\ 815 | & 4096 & 32.8/31.7 & 71.4 \\ 816 | \bottomrule 817 | \end{tabular} 818 | \caption{} 819 | \label{tab:scroll} 820 | \end{table} 821 | \end{frame} 822 | 823 | \begin{frame}{FLOPs} 824 | \begin{figure} 825 | \centering 826 | \includegraphics[height=0.5\textheight]{Figs/graph2.png} 827 | 828 | \label{fig:my_label} 829 | \end{figure} 830 | \end{frame} 831 | 832 | \begin{frame}{Related Results: H3 - SSM For Language Modeling} 833 | \begin{itemize} 834 | \item Alternative gating method for language modeling 835 | \item Use 2 attention layers + SSM and reach Transformer PPL. 836 | \item Efficient implementation targeting on GPUs. 
837 | \end{itemize} 838 | 839 | \blfootnote{\cite{dao2022hungry}} 840 | \end{frame} 841 | 842 | 843 | % \section{Next Steps} 844 | % \begin{frame}{Outline} 845 | % \tableofcontents[currentsection] 846 | % \end{frame} 847 | 848 | 849 | \begin{frame}{Next Steps} 850 | \begin{itemize} 851 | \item Attention may not be required? Simpler routing + gating. 852 | \item More analysis on feed-forward contribution. 853 | \item Transfer from pretraining unclear. 854 | \end{itemize} 855 | \end{frame} 856 | 857 | % \begin{frame} 858 | % \includegraphics[height=\textheight]{Figs/ModelSize0.jpg} 859 | % \end{frame} 860 | 861 | 862 | % \input{slides/bullets} 863 | % \input{slides/split} 864 | % \input{slides/figure} 865 | % \input{slides/centered} 866 | % \input{slides/monospace} 867 | % \input{slides/brackets} 868 | % \input{slides/link} 869 | \begin{frame}[allowframebreaks, label=current] 870 | \frametitle{References} 871 | \footnotesize 872 | \bibliographystyle{apalike} 873 | \bibliography{anthology.bib} 874 | \bibliography{ssm.bib} 875 | 876 | \end{frame} 877 | \end{document} 878 | -------------------------------------------------------------------------------- /p-notes.tex: -------------------------------------------------------------------------------- 1 | \newif\ifnotes\notestrue\input{presentation.tex} 2 | -------------------------------------------------------------------------------- /p.tex: -------------------------------------------------------------------------------- 1 | % Auriga theme 2 | % find the most up-to-date version here: https://github.com/anishathalye/auriga 3 | \newif\ifnotes\notesfalse\input{presentation.tex} 4 | -------------------------------------------------------------------------------- /presentation-netflix.tex: -------------------------------------------------------------------------------- 1 | % Auriga theme 2 | % find the most up-to-date version here: https://github.com/anishathalye/auriga 3 | 4 | \documentclass[14pt,aspectratio=169]{beamer} 5 | \usepackage{pgfpages} 6 | \usepackage{fancyvrb} 7 | \usepackage{tikz} 8 | \usepackage{pgfplots} 9 | \usepackage{booktabs} 10 | 11 | \usetheme{auriga} 12 | \usecolortheme{auriga} 13 | \setbeamercolor{math text}{fg=blue} 14 | 15 | \newcommand\blfootnote[1]{% 16 | \begingroup 17 | \renewcommand\thefootnote{}\footnote{#1}% 18 | \addtocounter{footnote}{-1}% 19 | \endgroup 20 | } 21 | 22 | %\setbeamertemplate{footline}[] 23 | %\renewcommand\footnotemark{} 24 | 25 | 26 | % define some colors for a consistent theme across slides 27 | \definecolor{red}{RGB}{181, 23, 0} 28 | \definecolor{blue}{RGB}{0, 118, 186} 29 | \definecolor{gray}{RGB}{146, 146, 146} 30 | 31 | \title{Pretraining Without Attention} 32 | 33 | \author{Junxiong Wang \and Jing Nathan Yan \and Albert Gu \and \underline{Sasha Rush} \inst{*}} 34 | 35 | \institute[shortinst]{\inst{*} Preprint} 36 | 37 | \begin{document} 38 | 39 | { 40 | % rather than use the frame options [noframenumbering,plain], we make the 41 | % color match, so that the indicated page numbers match PDF page numbers 42 | \setbeamercolor{page number in head/foot}{fg=background canvas.bg} 43 | \begin{frame} 44 | \titlepage 45 | \end{frame} 46 | } 47 | 48 | % \begin{frame}{Introduction - Sasha Rush} 49 | % \begin{itemize} 50 | % \item \structure{Associate Professor} - Cornell Tech 51 | % \item \structure{Researcher} - Hugging Face 52 | % \item \structure{Open Source Machine Learning} - @srush 53 | % \end{itemize} 54 | % \end{frame} 55 | 56 | 57 | % \begin{frame}{Transformer} 58 | % \begin{figure} 59 | % 
\centering 60 | % \includegraphics[height=0.6\textheight] 61 | % {Figs/transformer.png} 62 | % \end{figure} 63 | % \end{frame} 64 | 65 | % \begin{frame}{Transformer Self-Attention} 66 | % \begin{figure} 67 | % \centering 68 | % \includegraphics[height=0.8\textheight] 69 | % {Figs/attention.png} 70 | % \end{figure} 71 | % \end{frame} 72 | 73 | \section{Context} 74 | % \begin{frame}{Outline} 75 | % \tableofcontents 76 | % \end{frame} 77 | \begin{frame} 78 | \includegraphics[trim={10cm 0 10cm 0}, clip, height=\textheight]{Figs/ModelSize2.png} 79 | \end{frame} 80 | 81 | 82 | \begin{frame} 83 | \includegraphics[width=\textwidth]{Figs/Banana.png} 84 | \end{frame} 85 | 86 | % \begin{frame} 87 | % \includegraphics[width=\textwidth]{Figs/llama.png} 88 | % \end{frame} 89 | 90 | 91 | 92 | 93 | 94 | \begin{frame}{Caveats} 95 | \begin{itemize} 96 | \item LLMs are remarkable, we should use them for most things 97 | \item This talk is \structure{not} about LLMs 98 | \end{itemize} 99 | \end{frame} 100 | 101 | 102 | 103 | 104 | \begin{frame} 105 | \includegraphics[trim={10cm 0 10cm 0}, clip, height=\textheight]{Figs/ModelSize3.png} 106 | \end{frame} 107 | 108 | \begin{frame}{Context} 109 | \begin{itemize} 110 | \item BERT used to require non-trivial compute 111 | \item Belief: Open architecture questions in NLP 112 | \item Today's Talk: How important is \textit{attention}? 113 | \end{itemize} 114 | \end{frame} 115 | 116 | 117 | \begin{frame}{\textcolor{red}{ELMo} } 118 | 119 | \begin{columns} 120 | \begin{column}{0.3\linewidth} 121 | \centerline{Bidirectional RNN} 122 | \end{column} 123 | \begin{column}{0.7\linewidth} 124 | 125 | \begin{figure} 126 | \includegraphics[width=0.8\textwidth]{Figs/elmo.png} 127 | \end{figure} 128 | \end{column} 129 | \end{columns} 130 | \blfootnote{\cite{DBLP:conf/naacl/PetersNIGCLZ18}} 131 | 132 | \end{frame} 133 | 134 | \begin{frame}{\textcolor{red}{ELMo} For Pretraining} 135 | \begin{table} 136 | \begin{tabular}{lc} 137 | \toprule 138 | Model & GLUE\\ 139 | \midrule 140 | ELMo& 67.7 \\ 141 | ELMo+Attn& 71.0\\ 142 | \visible<2>{BERT-Base & 79 - 83} \\ 143 | \bottomrule 144 | \end{tabular} 145 | \end{table} 146 | \blfootnote{\cite{DBLP:conf/naacl/PetersNIGCLZ18, devlin2018bert}} 147 | \end{frame} 148 | 149 | \begin{frame}{Architecture?} 150 | \begin{itemize} 151 | \item 152 | Several confounding differences, e.g. frozen model. 153 | \item Followup: \textit{To Tune or Not to Tune? Adapting Pretrained Representations to Diverse Tasks} \cite{peters2019tune} 154 | \pause 155 | 156 | \item Conclusion: Transformers significantly beat BiLSTMs 157 | \end{itemize} 158 | \end{frame} 159 | 160 | \begin{frame}{Other Models} 161 | 162 | Maybe there are other models 163 | 164 | \vspace{0.5cm} 165 | 166 | \begin{itemize} 167 | \item Convolutions? 168 | \item Mixers? 169 | \end{itemize} 170 | 171 | % \textit{Are Pre-trained Convolutions Better than Pre-trained Transformers?} 172 | % \\ 173 | % \\ 174 | % Answer: No. 
175 | 176 | \end{frame} 177 | 178 | \begin{frame}{Pretraining with CNNs} 179 | \textit{Are Pre-trained Convolutions Better than Pre-trained Transformers?} \cite{tay2020efficient} 180 | 181 | \vspace{0.5cm} 182 | 183 | \visible<2>{\structure{Answer: No.} 184 | 185 | \begin{table} 186 | \begin{tabular}{lc} 187 | \toprule 188 | Model & SST-2\\ 189 | \midrule 190 | ELMo & 91.8 \\ 191 | Best CNN & 92.2 \\ 192 | BERT-Base & 93.5 \\ 193 | \bottomrule 194 | \end{tabular} 195 | \end{table} 196 | 197 | } 198 | 199 | \end{frame} 200 | 201 | 202 | % \begin{frame}{Results: CNNs} 203 | % \begin{table} 204 | % \begin{tabular}{lc} 205 | % \toprule 206 | % Model & SST-2\\ 207 | % \midrule 208 | % Best CNN & 92.2 \\ 209 | % ELMo & 91.8 \\ 210 | % BERT-Base & 93.5 \\ 211 | % \bottomrule 212 | % \end{tabular} 213 | % \end{table} 214 | % \end{frame} 215 | 216 | \begin{frame}{Pretraining with FNet} 217 | \textit{FNet: Mixing Tokens with Fourier Transforms} \cite{lee2021fnet} 218 | 219 | \vspace{0.5cm} 220 | 221 | Replaces attention with 2D FFT mixing-layer. 222 | 223 | \visible<2>{ 224 | \begin{table} 225 | \begin{tabular}{lc} 226 | \toprule 227 | Model & GLUE (dev)\\ 228 | \midrule 229 | Best FNet & 76.3 \\ 230 | BERT-Base & 83.3 \\ 231 | \bottomrule 232 | \end{tabular} 233 | \end{table} 234 | } 235 | \end{frame} 236 | 237 | 238 | 239 | \begin{frame}{Transformers are Great...} 240 | \begin{itemize} 241 | \item Highly optimized training 242 | \item Long-range ability 243 | \item Expensive $O(n^2)$, but we have the money... 244 | \end{itemize} 245 | \vspace{0.5cm} 246 | 247 | \visible<2>{(But aren't you curious...)} 248 | \end{frame} 249 | 250 | \section{State Space Models} 251 | \begin{frame}{Outline} 252 | \tableofcontents[currentsection] 253 | \end{frame} 254 | 255 | 256 | \begin{frame}{State Space Models (SSM)} 257 | \begin{itemize} 258 | 259 | \item Think hybrid RNN / CNN 260 | 261 | \item SOTA on speech generation and long-range tasks 262 | 263 | \item Tutorial at \textit{The Annotated S4} 264 | \end{itemize} 265 | 266 | \blfootnote{\cite{gu2020hippo,gu2021combining,gu2021efficiently}} 267 | \end{frame} 268 | 269 | 270 | \begin{frame}{State Space Model - Continuous Time} 271 | Let $u(t) \in \mathbb{R}$ be a continuous input and $y(t) \in \mathbb{R}$ be output. 272 | 273 | \pause 274 | \vspace{0.5cm} 275 | 276 | SSM is a differential equation. 277 | \begin{align*} 278 | \boldsymbol{x}'(t) &= \boldsymbol{A}\boldsymbol{x}(t) + \boldsymbol{B}u(t) \\ 279 | y(t) &= \boldsymbol{C}\boldsymbol{x}(t) + \boldsymbol{D}u(t). 280 | \end{align*} 281 | 282 | \pause 283 | Where $\boldsymbol{x}(t) \in \mathbf{R}^N$ is a hidden state and model \structure{parameters}, 284 | 285 | $$\boldsymbol{A} \in \mathbb{R}^{N\times N}, \boldsymbol{B}\in \mathbb{R}^{N \times 1}, \boldsymbol{C} \in \mathbb{R}^{1 \times N}, \boldsymbol{D} \in \mathbb{R}^{1\times 1}$$ 286 | 287 | \end{frame} 288 | \begin{frame}{Discrete Time Sequence} 289 | 290 | Goal: Map scalar sequence $u_{1}, \ldots, u_L$ to $y_1, \ldots, y_L$, 291 | 292 | \begin{figure} 293 | \centering 294 | \includegraphics[width=0.5\textwidth]{Figs/SSMStart.pdf} 295 | \label{fig:my_label} 296 | \end{figure} 297 | \end{frame} 298 | 299 | \begin{frame}{Discrete Time SSM} 300 | 301 | SSM on discretize time data, 302 | 303 | \begin{align*} 304 | \boldsymbol{x}_{k} &= \boldsymbol{\overline{A}} \boldsymbol{x}_{k-1} + \boldsymbol{\overline{B}} u_k \\ 305 | y_k &= \boldsymbol{\overline{C}} \boldsymbol{x}_{k \phantom{- 1}} + \boldsymbol{\overline{D}} u_k. 
306 | \end{align*} 307 | 308 | Using discretization with (learned) sampling rate parameter $\Delta$, 309 | 310 | $$\boldsymbol{\overline{A}}, \boldsymbol{\overline{B}}, \boldsymbol{\overline{C}} = \text{discretize}(\boldsymbol{A}, \boldsymbol{B}, \boldsymbol{C}, \Delta )$$ 311 | 312 | \end{frame} 313 | 314 | \begin{frame}{Recurrent Form} 315 | 316 | Output sequence $y_1, \ldots, y_L$ can be computed as a linear RNN, 317 | 318 | \begin{align*} 319 | \boldsymbol{x}_{k} &= \boldsymbol{\overline{A}} \boldsymbol{x}_{k-1} + \boldsymbol{\overline{B}} u_k \\ 320 | y_k &= \boldsymbol{\overline{C}} \boldsymbol{x}_{k \phantom{- 1}} + \boldsymbol{\overline{D}} u_k. 321 | \end{align*} 322 | 323 | Note $\boldsymbol{x}_k \in \mathbb{R}^N$ is the bigger hidden state for $u_k \in \mathbb{R}$, and $\boldsymbol{x}_0 = \mathbf{0}$. 324 | 325 | \end{frame} 326 | 327 | \begin{frame}{Convolutional Form} 328 | 329 | Alternative: 1D convolution with kernel $\boldsymbol{\overline{K}}$ (width $L$), 330 | 331 | \begin{align*} 332 | \overline{K} &= (\boldsymbol{\overline{C}}\boldsymbol{\overline{B}}, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}\boldsymbol{\overline{B}}, \dots, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}^{L-1}\boldsymbol{\overline{B}}) \\ 333 | y &= \text{conv1d}(\overline{K}_L \ldots \overline{K}_1, u_1 \ldots u_L) 334 | \end{align*} 335 | 336 | Intuition: 337 | \pause 338 | $$y_1 = \boldsymbol{\overline{C}} \boldsymbol{\overline{B}} u_1$$ 339 | \pause 340 | $$y_2 = \boldsymbol{\overline{C}} \boldsymbol{\overline{A}} \boldsymbol{\overline{B}} u_1 + \boldsymbol{\overline{C}} \boldsymbol{\overline{B}} u_2 = \boldsymbol{\overline{C}} (\boldsymbol{\overline{A}} \boldsymbol{\overline{B}} u_1 + \boldsymbol{\overline{B}} u_2) = \boldsymbol{\overline{C}} (\boldsymbol{x}_1 + \boldsymbol{\overline{B}} u_2) $$ 341 | \end{frame} 342 | 343 | \begin{frame}{Convolutional Form} 344 | Step 1: Discretize (Training Only). Step 2: Apply 1D Conv 345 | \begin{figure} 346 | \centering 347 | \includegraphics[width=0.6\textwidth]{Figs/SSMSide.pdf} 348 | \label{fig:my_label} 349 | \end{figure} 350 | \end{frame} 351 | 352 | \begin{frame}{Implementation - Computing Kernel} 353 | 354 | $$\boldsymbol{\overline{K}} = (\boldsymbol{\overline{C}}\boldsymbol{\overline{B}}, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}\boldsymbol{\overline{B}}, \dots, \boldsymbol{\overline{C}}\boldsymbol{\overline{A}}^{L-1}\boldsymbol{\overline{B}}) $$ 355 | 356 | \begin{itemize} 357 | \item Simple approximations work well (See S4D, DSS) 358 | \end{itemize} 359 | \blfootnote{\cite{gu2021efficiently,gupta2022diagonal,gu2022parameterization}} 360 | \end{frame} 361 | 362 | 363 | \begin{frame}{Implementation - Fourier Transform} 364 | \begin{align*} 365 | &y = \boldsymbol{\overline{K}} \ast u 366 | \end{align*} 367 | \begin{itemize} 368 | \item At long $L$, convolution computed with FFT. 369 | \item More efficient than self-attention or standard RNN. 
370 | \end{itemize} 371 | \end{frame} 372 | 373 | 374 | \begin{frame}{Important Training Initialization} 375 | \begin{itemize} 376 | \item Parameter $\boldsymbol{A}$ is initialized with HiPPO Matrix \cite{gu2020hippo} 377 | 378 | % \begin{scriptsize} 379 | % \begin{align*} 380 | % \boldsymbol{A}_{nk}= - 381 | % \begin{cases} 382 | % (2n+1)^{1/2}(2k+1)^{1/2} & \text{if } n > k \\ n+1 &\text{if } n=k \text{\ else\ } 0 383 | % \end{cases} 384 | % \end{align*} 385 | % \end{scriptsize} 386 | 387 | 388 | \item Kernel formed by Legendre coefficients 389 | \end{itemize} 390 | \begin{figure} 391 | \centering 392 | \includegraphics[width=0.7\textwidth]{Figs/hippo.png} 393 | \end{figure} 394 | \end{frame} 395 | 396 | 397 | 398 | \begin{frame}{Summary: SSM} 399 | \begin{itemize} 400 | \item Mapping from sequence-to-sequence 401 | \item Acts like an RNN, Computed like a CNN 402 | \item Fast to train and utilize 403 | \end{itemize} 404 | \end{frame} 405 | 406 | \section{Model Architectures} 407 | \begin{frame}{Outline} 408 | \tableofcontents[currentsection] 409 | \end{frame} 410 | 411 | \begin{frame}{Objective: Replicate BERT with SSM} 412 | \begin{itemize} 413 | \item Everything else identical (loss, number of parameters, data) 414 | \end{itemize} 415 | \end{frame} 416 | 417 | % \begin{frame}{Architectures for Pretraining} 418 | % \begin{itemize} 419 | % \item Idea 1: Just replace self-attention 420 | % \item Minimal change to Transformer arch 421 | % \end{itemize} 422 | % \end{frame} 423 | 424 | 425 | \begin{frame}{\structure{Naive Idea}: Self-attention $\Rightarrow$ SSM} 426 | \begin{figure} 427 | \centering 428 | \includegraphics[height=0.8\textheight,trim={0 0 18cm 0},clip]{Figs/model_architecture_comparison2.pdf} 429 | \caption{} 430 | \label{} 431 | \end{figure} 432 | \end{frame} 433 | 434 | \begin{frame}{Can this work?} 435 | \begin{itemize} 436 | \item SSM is significantly less expressive than self-attention. 437 | \item Static routing through the model like a CNN. 438 | \item Can it learn to do \structure{matching} across sentences? 439 | \end{itemize} 440 | \pause 441 | \vspace{0.5cm} 442 | 443 | 444 | 445 | 446 | \end{frame} 447 | 448 | 449 | \begin{frame}{Test: Matching Across Gaps} 450 | \centerline{Task: QNLI \cite{wang2018glue}} 451 | \vspace{0.5cm} 452 | 453 | 454 | \centerline{\textcolor{red}{What percentage of farmland grows wheat?}} 455 | 456 | \centerline{$\sim \sim \sim $} 457 | 458 | \centerline{\textcolor{olivegreen}{More than 50\% of this area is sown for wheat and 33\% for barley.}} 459 | 460 | \pause 461 | 462 | \begin{table}[t] 463 | \center 464 | \begin{tabular}{ccc} 465 | \toprule 466 | \centering 467 | Arch & \textcolor{red}{H} P & \textcolor{red}{H} $\sim$ P \\ 468 | \midrule 469 | \textsc{stack} / \textsc{ssm} & 77.4 & 69.7\\ 470 | % \textsc{gated} / \textsc{ssm} & 77.4 & 77.7\\ 471 | \bottomrule 472 | \end{tabular} 473 | \caption{} 474 | \label{tab:synthetic} 475 | \end{table} 476 | \end{frame} 477 | 478 | 479 | 480 | % \begin{frame}{Does this work} 481 | 482 | % \end{frame} 483 | 484 | 485 | 486 | \begin{frame}{\structure{Proposed Fix}: Multiplicative Gating} 487 | 488 | Add dynamism to stacked model with multiplicative gating. 489 | 490 | $$\sigma(\mathbf{W} \mathbf{u}) \otimes (\mathbf{V} \mathbf{u})$$ 491 | 492 | Positive results with CNN, Transformer, and SSM models. 
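% A tiny NumPy sketch of the gate above (shapes are assumptions: W and V are
% d x d, u is a length-d vector; in practice the gate is applied position-wise
% over the sequence):
%
%   import numpy as np
%
%   def gated_unit(u, W, V):
%       gate = 1.0 / (1.0 + np.exp(-(W @ u)))   # sigma(W u)
%       return gate * (V @ u)                   # elementwise product with V u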
493 |
494 |
495 | \blfootnote{\cite{dauphin2017language, shazeer2020glu, narang2021transformer}}
496 |
497 | \end{frame}
498 |
499 | \begin{frame}{Proposed Architecture: BiGS}
500 | \begin{figure}
501 | \centering
502 | \includegraphics[height=0.7\textheight,trim={16cm 0 0 0},clip]{Figs/model_architecture_comparison2.pdf}
503 | \caption{}
504 | \label{fig:my_label}
505 | \end{figure}
506 | \end{frame}
507 |
508 | \begin{frame}{Gating Adaptation}
509 | \centerline{\textcolor{red}{What percentage of farmland grows wheat?}}
510 |
511 | \centerline{$\sim \sim \sim $}
512 |
513 | \centerline{\textcolor{olivegreen}{More than 50\% of this area is sown for wheat and 33\% for barley.}}
514 |
515 |
516 | \begin{table}[t]
517 | \center
518 | \begin{tabular}{lcc}
519 | \toprule
520 | \centering
521 | Arch & \textcolor{red}{H} P & \textcolor{red}{H} $\sim$ P \\
522 | \midrule
523 | \textsc{stack} / \textsc{ssm} & 77.4 & 69.7\\
524 | \textsc{gated} / \textsc{ssm} & 77.4 & 77.7\\
525 | \bottomrule
526 | \end{tabular}
527 | \caption{ }
528 | \label{tab:synthetic}
529 | \end{table}
530 | \pause
531 |
532 |
533 | \end{frame}
534 |
535 | \begin{frame}{Full Experiment: QNLI}
536 |
537 | A preview of the experimental results: pretrained models evaluated on QNLI.
538 |
539 | \begin{figure}
540 | \centering
541 | \includegraphics[height=0.7\textheight]{Figs/graph.png}
542 | \label{fig:my_label}
543 | \end{figure}
544 | \end{frame}
545 |
546 | \begin{frame}{Related Result: Induction Heads (H3)}
547 | Synthetic \structure{induction head} experiment from \cite{dao2022hungry}.
548 |
549 | \vspace{0.5cm}
550 |
551 | \centerline{a b c d e $\Rightarrow$ f g h i . . . x y z $\Rightarrow$ \ \ \ \ \textcolor{red}{f} }
552 |
553 | \begin{table}[t]
554 | \center
555 | \begin{tabular}{lc}
556 | \toprule
557 | \centering
558 | Arch & Induction \\
559 | \midrule
560 | \textsc{ssm} & 35.6 \\
561 | \textsc{gating} + \textsc{ssm} & 100\\
562 | \textsc{attention} & 100\\
563 | \bottomrule
564 | \end{tabular}
565 | \caption{ }
566 | \label{tab:synthetic}
567 | \end{table}
568 | \end{frame}
569 |
570 |
571 | \begin{frame}{Induction Heads}
572 |
573 | \begin{columns}
574 | \begin{column}{0.5\textwidth}
575 | \begin{figure}
576 | \centering
577 | \includegraphics[height=0.8\textheight]{Figs/induct.png}
578 |
579 | \label{fig:my_label}
580 | \end{figure}
581 | \end{column}
582 | \begin{column}{0.5\textwidth}
583 | \begin{figure}
584 | \centering
585 |
586 | \includegraphics[height=0.3\textheight]{Figs/RASP.png}
587 | \label{fig:my_label}
588 | \end{figure}
589 | \end{column}
590 | \end{columns}
591 |
592 | \end{frame}
593 |
594 |
595 |
596 | % \begin{frame}{Gating}
597 |
598 | % \end{frame}
599 |
600 | % \begin{frame}{Simpler multiplicative Interactions}
601 | % \begin{figure}
602 | % \centering
603 | % % \includegraphics{Figs/model_architecture_comparison2.pdf}
604 | % \caption{Caption}
605 | % \label{fig:my_label}
606 | % \end{figure}
607 | % \end{frame}
608 |
609 | \section{Experiments}
610 |
611 | \begin{frame}{Outline}
612 | \tableofcontents[currentsection]
613 | \end{frame}
614 |
615 |
616 | \begin{frame}{\structure{Experiment 1:} BERT}
617 | \begin{itemize}
618 | \item Models trained using ``24 Hour'' BERT \cite{izsak2021train}
619 | \begin{itemize}
620 | \item All BERT-Large Size
621 | \item Training length (Short 11B, Medium 22B, Full $>$100B)
622 | \item Sequence length 128
623 | \end{itemize}
624 |
625 | \item Codebase in JAX (from Annotated S4 {\small \cite{rush2022s4}}) using S4D
626 | \item Training data and masking are identical
627 |
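% Rough sketch of a gated bidirectional SSM block in the spirit of the BiGS
% architecture shown earlier (illustrative only, not the exact BiGS wiring; the
% projections W, V and the placement of norms/nonlinearities are assumptions).
% It reuses ssm_conv from the earlier kernel sketch.
%
%   def bidirectional_gated_block(X, Kf, Kb, W, V):
%       # X: (L, d) token states; Kf, Kb: (L, d) forward/backward kernels
%       d = X.shape[1]
%       f = np.stack([ssm_conv(Kf[:, i], X[:, i]) for i in range(d)], axis=1)
%       b = np.stack([ssm_conv(Kb[:, i], X[::-1, i])[::-1] for i in range(d)], axis=1)
%       gate = 1.0 / (1.0 + np.exp(-(X @ W)))   # sigma(X W), elementwise
%       return X + gate * ((f + b) @ V)         # static routing + gating + residual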
\end{itemize}
628 | \end{frame}
629 |
630 | % \begin{frame}{Short Training $\sim$11B Tokens}
631 | % \begin{table}
632 | % \begin{tabular}{lc}
633 | % \toprule
634 | % Model & GLUE (Dev)\\
635 | % \midrule
636 | % BERT & 84.1\\
637 | % Stacked-SSM & 77.2 \\
638 | % BiGS & 84.0 \\
639 | % \bottomrule
640 | % \end{tabular}
641 | % \end{table}
642 | % \end{frame}
643 |
644 | \begin{frame}{Short Training $\sim$11B Tokens}
645 | \begin{table}
646 | \begin{tabular}{lc}
647 | \toprule
648 | Model & GLUE (Dev)\\
649 | \midrule
650 | ELMo & 68.7 \\
651 | BERT & 84.1\\
652 | Stacked-SSM & 77.2 \\
653 | BiGS & 84.0 \\
654 | \bottomrule
655 | \end{tabular}
656 | \end{table}
657 | \end{frame}
658 |
659 | \begin{frame}{Is it just Gating?}
660 | \begin{table}
661 | \begin{tabular}{lc}
662 | \toprule
663 | Model & GLUE \\
664 | \midrule
665 | BERT & 84.1\\
666 | Gated-BERT & 82.6 \\
667 | \bottomrule
668 | \end{tabular}
669 | \end{table}
670 | \end{frame}
671 |
672 |
673 | \begin{frame}{BERT Large $>$ 100B Tokens}
674 | \begin{table}
675 | \begin{tabular}{lc}
676 | \toprule
677 | Model & GLUE (Test)\\
678 | \midrule
679 | BERT-Large$^*$ & 83.0\\
680 | BiGS & 83.0 \\
681 | \bottomrule
682 | \end{tabular}
683 | \end{table}
684 | \centerline{$^*$Best reported BERT-Large results.}
685 | \end{frame}
686 |
687 | \begin{frame}{Analysis: Masked PPL Transfer}
688 | \begin{figure}
689 | \centering
690 | \includegraphics[width=0.6\textwidth]{Figs/MNLI.png}
691 | \end{figure}
692 | \end{frame}
693 |
694 | \begin{frame}{Analysis: Kernel Visualization}
695 |
696 |
697 | \begin{figure}
698 | \centering
699 | \includegraphics[width=\textwidth]{Figs/Kernel1.png}
700 | \end{figure}
701 |
702 | \begin{itemize}
703 | \item Each BiGS layer only has 2 kernels (forward / backward).
704 | \item Shows \structure{all routing} in layer 2! (vs $O(HT^2)$ attention coefficients)
705 | \end{itemize}
706 |
707 |
708 | \end{frame}
709 |
710 | \begin{frame}{Analysis: All Kernels}
711 | \begin{figure}
712 | \centering
713 | \includegraphics[height=0.6\textheight]{Figs/kernel2.png}
714 | \end{figure}
715 | \end{frame}
716 |
717 | \begin{frame}{Analysis: Change in Kernels during Finetuning}
718 |
719 | \centerline{Task: MNLI}
720 | \begin{figure}
721 | \centering
722 | \includegraphics[width=0.8\textwidth]{Figs/comparison_results.png}
723 | \end{figure}
724 | \end{frame}
725 |
726 | \begin{frame}{Analysis: Syntax}
727 | \begin{itemize}
728 | \item Observation: The SSM model seems to do better on syntax-centric tasks.
729 | \item Hypothesis: Locality of features encourages a stack-like inductive bias.
730 | \end{itemize}
731 | \end{frame}
732 |
733 | \begin{frame}{\structure{Observation 1}: CoLA}
734 | \begin{table}
735 | \begin{tabular}{lc}
736 | \toprule
737 | Model & CoLA \\
738 | \midrule
739 | BERT & 60.5\\
740 | BiGS & 64.7 \\
741 | \bottomrule
742 | \end{tabular}
743 | \end{table}
744 | Statistically significant across runs.
745 | \end{frame}
746 |
747 |
748 | \begin{frame}{\structure{Observation 2}: Agreement Attractors}
749 | Task from \cite{linzen2016assessing,goldberg2019assessing}.
750 | \vspace{0.5cm}
751 |
752 | \begin{quote}
753 | Yet the \textbf{ratio} of \underline{men} who survive to the \underline{women} and \underline{children} who survive [is] not clear in this story
754 | \end{quote}
755 |
756 | \begin{figure}
757 | \centering
758 | \includegraphics[height=0.5\textheight]{Figs/attractors.png}
759 | \label{fig:my_label}
760 | \end{figure}
761 |
762 | \end{frame}
763 |
764 | \begin{frame}{\structure{Observation 3}: Diagnostics}
765 | From \cite{marvin2018targeted,goldberg2019assessing}:
766 | \begin{table}[t]
767 | \centering
768 | \scriptsize
769 | \begin{tabular}{lrrr}
770 | \toprule
771 | & BiGS & BERT & LSTM \\
772 | \midrule
773 | \textsl{SUBJECT-VERB:} & & & \\
774 | Simple & 100.0 & 100.0 & 94.0 \\
775 | Sentential complement & 85.1 & 85.6 & 99.0 \\
776 | Short VP coordination & 91.0 & 86.5 & 90.0 \\
777 | Long VP coordination & 97.5 & 97.5 & 61.0 \\
778 | Across prep phrase & 88.6 & 84.8 & 57.0 \\
779 | Across subj relative clause & 88.4 & 84.9 & 56.0 \\
780 | Across obj relative clause & 89.9 & 85.1 & 50.0 \\
781 | Across obj relative (-that) & 86.9 & 81.1 & 52.0 \\
782 | In obj relative clause & 97.2 & 99.1 & 84.0 \\
783 | In obj relative (-that) & 88.7 & 81.6 & 71.0 \\
784 | \midrule
785 | \textsl{REFL ANAPHORA:} & & & \\
786 | Simple & 97.1 & 98.9 & 83.0 \\
787 | In a sentential complement & 79.9 & 86.2 & 86.0 \\
788 | Across a relative clause & 79.1 & 75.9 & 55.0 \\
789 | \bottomrule
790 | \end{tabular}
791 | \end{table}
792 | \end{frame}
793 |
794 |
795 | \begin{frame}{\structure{Experiment 2:} Longformer}
796 | \begin{itemize}
797 | \item Can we lengthen SSM $L\rightarrow L'$ without approximation?
798 |
799 | \item Continued training based on the Longformer protocol.
800 |
801 | \item Two experimental scales
802 | % \begin{itemize}
803 | % \item 128->512 SQuAD \cite{rajpurkar2016squad}
804 | % \item 128->4096 SCROLLS \cite{shaham2022scrolls}
805 | % \end{itemize}
806 | \end{itemize}
807 | \end{frame}
808 |
809 | % \begin{frame}{SQuAD}
810 | % \begin{table}[tb]
811 | % \centering
812 | % \begin{tabular}{ll|c}
813 | % \toprule
814 | % & & SQuAD 1.1 \\
815 | % \midrule
816 | % BERT & (512) & 90.9\\
817 | % \midrule
818 | % BERT &(128 $\rightarrow$ 512) & 87.3 \\
819 | % BiGS & (128 $\rightarrow$ 512) & 89.5 \\
820 | % \bottomrule
821 | % \end{tabular}
822 | % \caption{ }
823 | % \label{tab:squad}
824 | % \end{table}
825 | % \end{frame}
826 |
827 |
828 |
829 | \begin{frame}{SCROLLS}
830 | \begin{table}[tb]
831 | \centering
832 | \begin{tabular}{lr|cc}
833 | \toprule
834 | & Length & QALT & CNLI \\
835 | \midrule
836 | LED & 1024 & 26.6/27.2 & 73.4\\
837 | & 4096 & 26.6/27.3 & 71.5\\
838 | & 16384 & 25.8/25.4 & 71.5\\
839 | \midrule
840 | BART & 256 & 26.0/25.8 & 69.8\\
841 | & 512 & 26.8/27.4 & 71.6\\
842 | & 1024 & 26.0/25.9 & 77.4\\
843 | \midrule
844 | BiGS & 128 & 32.3/30.0 & 68.7 \\
845 | % BiGS & 1024 & & \\
846 | & 4096 & 32.8/31.7 & 71.4 \\
847 | \bottomrule
848 | \end{tabular}
849 | \caption{}
850 | \label{tab:scroll}
851 | \end{table}
852 | \end{frame}
853 |
854 | \begin{frame}{FLOPs}
855 | \begin{figure}
856 | \centering
857 | \includegraphics[height=0.5\textheight]{Figs/graph2.png}
858 |
859 | \label{fig:my_label}
860 | \end{figure}
861 | \end{frame}
862 |
863 | \begin{frame}{Related Results: H3 - SSM For Language Modeling}
864 | \begin{itemize}
865 | \item Alternative gating method for language modeling
866 | \item Uses 2 attention layers + SSM and reaches Transformer PPL.
867 | \item Efficient implementation targeting GPUs.
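% Rough sketch of the H3 layer described in the cited paper (my paraphrase;
% the projection names and the stand-in functions shift_ssm / diag_ssm for its
% two SSM sub-layers are assumptions): two SSMs plus multiplicative
% interactions supply the "recall the previous token" and "compare across the
% sequence" abilities that a plain SSM layer lacks.
%
%   def h3_layer(X, W_q, W_k, W_v):            # X: (L, d)
%       q, k, v = X @ W_q, X @ W_k, X @ W_v
%       s = shift_ssm(k) * v                   # shift SSM: local recall
%       return q * diag_ssm(s)                 # diagonal SSM: aggregate over time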
868 | \end{itemize}
869 |
870 | \blfootnote{\cite{dao2022hungry}}
871 | \end{frame}
872 |
873 |
874 | % \section{Next Steps}
875 | % \begin{frame}{Outline}
876 | % \tableofcontents[currentsection]
877 | % \end{frame}
878 |
879 |
880 | \begin{frame}{Next Steps}
881 | \begin{itemize}
882 | \item Attention may not be required? Simpler routing + gating.
883 | \item More analysis on the feed-forward contribution.
884 | \item Transfer from pretraining unclear.
885 | \end{itemize}
886 | \end{frame}
887 |
888 | % \begin{frame}
889 | % \includegraphics[height=\textheight]{Figs/ModelSize0.jpg}
890 | % \end{frame}
891 |
892 |
893 | % \input{slides/bullets}
894 | % \input{slides/split}
895 | % \input{slides/figure}
896 | % \input{slides/centered}
897 | % \input{slides/monospace}
898 | % \input{slides/brackets}
899 | % \input{slides/link}
900 | \begin{frame}[allowframebreaks]
901 | \frametitle{References}
902 | \footnotesize
903 | \bibliographystyle{apalike}
904 | \bibliography{anthology.bib}
905 | \end{frame}
906 | \end{document}
907 |
-------------------------------------------------------------------------------- /presentation.tex: --------------------------------------------------------------------------------
1 | % Auriga theme
2 | % find the most up-to-date version here: https://github.com/anishathalye/auriga
3 |
4 | \documentclass[14pt,aspectratio=169]{beamer}
5 | \usepackage{pgfpages}
6 | \usepackage{fancyvrb}
7 | \usepackage{tikz}
8 | \usepackage{tikz-qtree}
9 |
10 | \usepackage{pgfplots}
11 |
12 | \usepackage{booktabs}
13 | \usepackage[normalem]{ulem}
14 |
15 |
16 | \usetheme{auriga}
17 | \usecolortheme{auriga}
18 | %\setbeamercolor{math text}{fg=blue}
19 |
20 | \newcommand\blfootnote[1]{%
21 | \begingroup
22 | \renewcommand\thefootnote{}\footnote{#1}%
23 | \addtocounter{footnote}{-1}%
24 | \endgroup
25 | }
26 |
27 | %\setbeamertemplate{footline}[]
28 | %\renewcommand\footnotemark{}
29 |
30 | \setbeamertemplate{footline}[frame number]
31 |
32 | % define some colors for a consistent theme across slides
33 | \definecolor{red}{RGB}{181, 23, 0}
34 | \definecolor{blue}{RGB}{0, 118, 186}
35 | \definecolor{gray}{RGB}{146, 146, 146}
36 | \definecolor{orange}{RGB}{255, 165, 0}
37 | \definecolor{green}{RGB}{0, 128, 0}
38 | % Create a slide for each section
39 | \AtBeginSection[]{
40 | \begin{frame}
41 | \vfill
42 | \centering
43 | \begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title}
44 | \usebeamerfont{title}\insertsectionhead\par%
45 | \end{beamercolorbox}
46 | \vfill
47 | \end{frame}
48 | }
49 |
50 | \title{Do we need \textcolor{blue}{Attention}?}
51 |
52 | \author{Presented by Sasha Rush}
53 |
54 | % \institute[shortinst]{}
55 |
56 | \begin{document}
57 |
58 | {
59 | % rather than use the frame options [noframenumbering,plain], we make the
60 | % color match, so that the indicated page numbers match PDF page numbers
61 | \setbeamercolor{page number in head/foot}{fg=background canvas.bg}
62 | \begin{frame}
63 | \titlepage
64 | \end{frame}
65 | }
66 |
67 | \begin{frame}[label=c]{}
68 | \textit{
69 | This talk is a survey of work done by:
70 | }
71 |
72 | \begin{center}
73 | Albert Gu, Ankit Gupta, Tri Dao, Dan Fu, Shuangfei Zhai, Antonio Orvieto, Michael Poli, Chris Re, Yuhong Li, Tianle Cai, Harsh Mehta, Jimmy Smith, Scott Linderman, Xuezhe Ma, Chunting Zhou, Xiang Kong, Bo Peng, Eric Alcaide, Quentin Anthony, Andrew Warrington, Yi Zhang, Stefano Massaroli, \\and many others
74 | \end{center}
75 |
76 | \end{frame}
77 |
78 | \section{Preface: Transformers and Attention}
79 |
80 |
\input{02-transformers} 81 | 82 | \section{The Challenge} 83 | \input{01-intro} 84 | 85 | \section{An RNN Revival} 86 | \input{03-RNN} 87 | 88 | \section{Are we GPT yet?} 89 | \input{03.5-Results} 90 | 91 | % \section{Computation and Parameterization} 92 | % \input{05-Extensions} 93 | 94 | 95 | \section{Scaling Linear RNNs} 96 | \input{06-final} 97 | 98 | 99 | % \section{Practicalities} 100 | 101 | 102 | 103 | 104 | % \input{slides/bullets} 105 | % \input{slides/split} 106 | % \input{slides/figure} 107 | % \input{slides/centered} 108 | % \input{slides/monospace} 109 | % \input{slides/brackets} 110 | % \input{slides/link} 111 | \begin{frame}[allowframebreaks] 112 | \frametitle{References} 113 | \footnotesize 114 | \bibliographystyle{apalike} 115 | \bibliography{ssm.bib,anthology.bib} 116 | \end{frame} 117 | \end{document} 118 | -------------------------------------------------------------------------------- /slides/brackets.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{A slide with some bracketed text} 2 | 3 | \begin{itemize} 4 | \item Some statement {\color{gray} [Some citation]} 5 | \item Another statement {\color{gray} [Another citation]} 6 | \item A final statement {\color{gray} [The last citation]} 7 | \end{itemize} 8 | 9 | \vspace{3ex} 10 | \begin{center} 11 | \scriptsize (a small note) 12 | \end{center} 13 | 14 | \end{frame} 15 | 16 | -------------------------------------------------------------------------------- /slides/bullets.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{A slide title} 2 | 3 | \begin{itemize} 4 | \item A bulleted item 5 | \item Another item 6 | \begin{itemize} 7 | \item With sub-bullets 8 | \item And another, with some \textbf{bold} text 9 | \end{itemize} 10 | \item And another, at the top level, with \textit{italic} text 11 | \end{itemize} 12 | 13 | \note{ 14 | Here's a note for this slide. 15 | } 16 | 17 | \end{frame} 18 | -------------------------------------------------------------------------------- /slides/centered.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{A slide with centered text} 2 | 3 | \begin{center} 4 | Some statement that is centered. 
5 | \end{center} 6 | 7 | \vspace{2ex} 8 | \begin{center} 9 | \scriptsize (a small note) 10 | \end{center} 11 | 12 | \end{frame} 13 | -------------------------------------------------------------------------------- /slides/figure.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{Full-slide figure} 2 | 3 | \begin{figure} 4 | \centering 5 | \begin{tikzpicture}[scale=0.5] 6 | \begin{axis}[ 7 | scale only axis, 8 | no markers, 9 | domain=0:2*pi, 10 | samples=100, 11 | axis lines=center, 12 | axis line style={-}, 13 | ticks=none] 14 | \addplot[red] {sin(deg(x))}; 15 | \addplot[blue] {cos(deg(x))}; 16 | \end{axis} 17 | \end{tikzpicture} 18 | \end{figure} 19 | \blfootnote{[Here is a citation]} 20 | 21 | 22 | \end{frame} 23 | -------------------------------------------------------------------------------- /slides/link.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{A slide with some text and a link} 2 | 3 | \begin{itemize} 4 | \item This slide has some text along with a link 5 | \begin{itemize} 6 | \item \textbf{Some bold text}: followed by an explanation 7 | \item \textbf{More bold text}: followed by more text 8 | \end{itemize} 9 | \item Another bullet, with sub-bullets 10 | \begin{itemize} 11 | \item A sub-bullet 12 | \item Another sub-bullet, with more text 13 | \end{itemize} 14 | \end{itemize} 15 | 16 | \vspace{2ex} 17 | \begin{center} 18 | \color{blue} \href{https://github.com/anishathalye/auriga}{github.com/anishathalye/auriga} 19 | \end{center} 20 | 21 | \end{frame} 22 | -------------------------------------------------------------------------------- /slides/monospace.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}[fragile]{A slide with some code} 2 | 3 | \begin{columns} 4 | \begin{column}{0.5\linewidth} 5 | \footnotesize 6 | \begin{Verbatim}[commandchars=\\\{\}] 7 | /* some code */ 8 | def foo(x): 9 | return x**0.5 + 2*x 10 | 11 | \color{blue}/* some can be highlighted */ 12 | \color{blue}foo(3) 13 | \end{Verbatim} 14 | \end{column} 15 | \begin{column}{0.5\linewidth} 16 | {\color{red} Some explanatory text, in red, with some \texttt{monospace} text.} 17 | There might be some math, too: 18 | 19 | $$\sqrt{x} + 2x$$ 20 | \end{column} 21 | \end{columns} 22 | 23 | \end{frame} 24 | -------------------------------------------------------------------------------- /slides/split.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{A 50-50 split slide} 2 | 3 | \begin{columns} 4 | \begin{column}{0.5\linewidth} 5 | \begin{itemize} 6 | \item This side has a bullet 7 | \item And another bullet, with text that wraps if it's long 8 | \end{itemize} 9 | \end{column} 10 | \begin{column}{0.5\linewidth} 11 | \begin{figure} 12 | \centering 13 | \begin{tikzpicture}[scale=2] 14 | \draw[step=0.25cm,color=gray] (-1,-1) grid (1,1); 15 | \draw[color=red] (1,0) -- (0.2,0.2) -- (0,1) -- (-0.2,0.2) -- (-1,0) 16 | -- (-0.2,-0.2) -- (0,-1) -- (0.2,-0.2) -- cycle; 17 | \end{tikzpicture} 18 | \caption{A figure caption} 19 | \end{figure} 20 | \end{column} 21 | \end{columns} 22 | 23 | \note{ 24 | This slide has notes too. 
25 | } 26 | 27 | \end{frame} 28 | -------------------------------------------------------------------------------- /ssm.bib: -------------------------------------------------------------------------------- 1 | @ARTICLE{Orvieto2023-an, 2 | title = "Resurrecting Recurrent Neural Networks for Long Sequences", 3 | author = "Orvieto, Antonio and Smith, Samuel L and Gu, Albert and 4 | Fernando, Anushan and Gulcehre, Caglar and Pascanu, Razvan 5 | and De, Soham", 6 | abstract = "Recurrent Neural Networks (RNNs) offer fast inference on 7 | long sequences but are hard to optimize and slow to train. 8 | Deep state-space models (SSMs) have recently been shown to 9 | perform remarkably well on long sequence modeling tasks, and 10 | have the added benefits of fast parallelizable training and 11 | RNN-like fast inference. However, while SSMs are 12 | superficially similar to RNNs, there are important 13 | differences that make it unclear where their performance 14 | boost over RNNs comes from. In this paper, we show that 15 | careful design of deep RNNs using standard signal 16 | propagation arguments can recover the impressive performance 17 | of deep SSMs on long-range reasoning tasks, while also 18 | matching their training speed. To achieve this, we analyze 19 | and ablate a series of changes to standard RNNs including 20 | linearizing and diagonalizing the recurrence, using better 21 | parameterizations and initializations, and ensuring proper 22 | normalization of the forward pass. Our results provide new 23 | insights on the origins of the impressive performance of 24 | deep SSMs, while also introducing an RNN block called the 25 | Linear Recurrent Unit that matches both their performance on 26 | the Long Range Arena benchmark and their computational 27 | efficiency.", 28 | month = mar, 29 | year = 2023, 30 | keywords = "SSM", 31 | archivePrefix = "arXiv", 32 | primaryClass = "cs.LG", 33 | eprint = "2303.06349" 34 | } 35 | 36 | @ARTICLE{Zhai2021-gz, 37 | title = "An Attention Free Transformer", 38 | author = "Zhai, Shuangfei and Talbott, Walter and Srivastava, Nitish 39 | and Huang, Chen and Goh, Hanlin and Zhang, Ruixiang and 40 | Susskind, Josh", 41 | abstract = "We introduce Attention Free Transformer (AFT), an efficient 42 | variant of Transformers that eliminates the need for dot 43 | product self attention. In an AFT layer, the key and value 44 | are first combined with a set of learned position biases, 45 | the result of which is multiplied with the query in an 46 | element-wise fashion. This new operation has a memory 47 | complexity linear w.r.t. both the context size and the 48 | dimension of features, making it compatible to both large 49 | input and model sizes. We also introduce AFT-local and 50 | AFT-conv, two model variants that take advantage of the idea 51 | of locality and spatial weight sharing while maintaining 52 | global connectivity. We conduct extensive experiments on two 53 | autoregressive modeling tasks (CIFAR10 and Enwik8) as well 54 | as an image recognition task (ImageNet-1K classification). 
55 | We show that AFT demonstrates competitive performance on all 56 | the benchmarks, while providing excellent efficiency at the 57 | same time.", 58 | month = may, 59 | year = 2021, 60 | keywords = "SSM", 61 | archivePrefix = "arXiv", 62 | primaryClass = "cs.LG", 63 | eprint = "2105.14103" 64 | } 65 | 66 | @ARTICLE{Poli2023-ag, 67 | title = "Hyena Hierarchy: Towards Larger Convolutional Language 68 | Models", 69 | author = "Poli, Michael and Massaroli, Stefano and Nguyen, Eric and 70 | Fu, Daniel Y and Dao, Tri and Baccus, Stephen and Bengio, 71 | Yoshua and Ermon, Stefano and R{\'e}, Christopher", 72 | abstract = "Recent advances in deep learning have relied heavily on the 73 | use of large Transformers due to their ability to learn at 74 | scale. However, the core building block of Transformers, the 75 | attention operator, exhibits quadratic cost in sequence 76 | length, limiting the amount of context accessible. Existing 77 | subquadratic methods based on low-rank and sparse 78 | approximations need to be combined with dense attention 79 | layers to match Transformers, indicating a gap in 80 | capability. In this work, we propose Hyena, a subquadratic 81 | drop-in replacement for attention constructed by 82 | interleaving implicitly parametrized long convolutions and 83 | data-controlled gating. In recall and reasoning tasks on 84 | sequences of thousands to hundreds of thousands of tokens, 85 | Hyena improves accuracy by more than 50 points over 86 | operators relying on state-spaces and other implicit and 87 | explicit methods, matching attention-based models. We set a 88 | new state-of-the-art for dense-attention-free architectures 89 | on language modeling in standard datasets (WikiText103 and 90 | The Pile), reaching Transformer quality with a 20\% 91 | reduction in training compute required at sequence length 92 | 2K. Hyena operators are twice as fast as highly optimized 93 | attention at sequence length 8K, and 100x faster at sequence 94 | length 64K.", 95 | month = feb, 96 | year = 2023, 97 | keywords = "SSM", 98 | archivePrefix = "arXiv", 99 | primaryClass = "cs.LG", 100 | eprint = "2302.10866" 101 | } 102 | 103 | @ARTICLE{Li2022-pn, 104 | title = "What Makes Convolutional Models Great on Long Sequence 105 | Modeling?", 106 | author = "Li, Yuhong and Cai, Tianle and Zhang, Yi and Chen, Deming 107 | and Dey, Debadeepta", 108 | abstract = "Convolutional models have been widely used in multiple 109 | domains. However, most existing models only use local 110 | convolution, making the model unable to handle long-range 111 | dependency efficiently. Attention overcomes this problem by 112 | aggregating global information but also makes the 113 | computational complexity quadratic to the sequence length. 114 | Recently, Gu et al. [2021] proposed a model called S4 115 | inspired by the state space model. S4 can be efficiently 116 | implemented as a global convolutional model whose kernel 117 | size equals the input sequence length. S4 can model much 118 | longer sequences than Transformers and achieve significant 119 | gains over SoTA on several long-range tasks. Despite its 120 | empirical success, S4 is involved. It requires sophisticated 121 | parameterization and initialization schemes. As a result, S4 122 | is less intuitive and hard to use. Here we aim to demystify 123 | S4 and extract basic principles that contribute to the 124 | success of S4 as a global convolutional model. 
We focus on 125 | the structure of the convolution kernel and identify two 126 | critical but intuitive principles enjoyed by S4 that are 127 | sufficient to make up an effective global convolutional 128 | model: 1) The parameterization of the convolutional kernel 129 | needs to be efficient in the sense that the number of 130 | parameters should scale sub-linearly with sequence length. 131 | 2) The kernel needs to satisfy a decaying structure that the 132 | weights for convolving with closer neighbors are larger than 133 | the more distant ones. Based on the two principles, we 134 | propose a simple yet effective convolutional model called 135 | Structured Global Convolution (SGConv). SGConv exhibits 136 | strong empirical performance over several tasks: 1) With 137 | faster speed, SGConv surpasses S4 on Long Range Arena and 138 | Speech Command datasets. 2) When plugging SGConv into 139 | standard language and vision models, it shows the potential 140 | to improve both efficiency and performance.", 141 | month = oct, 142 | year = 2022, 143 | keywords = "SSM", 144 | archivePrefix = "arXiv", 145 | primaryClass = "cs.LG", 146 | eprint = "2210.09298" 147 | } 148 | 149 | @ARTICLE{Fu2022-bw, 150 | title = "Hungry Hungry Hippos: Towards Language Modeling with State 151 | Space Models", 152 | author = "Fu, Daniel Y and Dao, Tri and Saab, Khaled K and Thomas, 153 | Armin W and Rudra, Atri and R{\'e}, Christopher", 154 | abstract = "State space models (SSMs) have demonstrated state-of-the-art 155 | sequence modeling performance in some modalities, but 156 | underperform attention in language modeling. Moreover, 157 | despite scaling nearly linearly in sequence length instead 158 | of quadratically, SSMs are still slower than Transformers 159 | due to poor hardware utilization. In this paper, we make 160 | progress on understanding the expressivity gap between SSMs 161 | and attention in language modeling, and on reducing the 162 | hardware barrier between SSMs and attention. First, we use 163 | synthetic language modeling tasks to understand the gap 164 | between SSMs and attention. We find that existing SSMs 165 | struggle with two capabilities: recalling earlier tokens in 166 | the sequence and comparing tokens across the sequence. To 167 | understand the impact on language modeling, we propose a new 168 | SSM layer, H3, that is explicitly designed for these 169 | abilities. H3 matches attention on the synthetic languages 170 | and comes within 0.4 PPL of Transformers on OpenWebText. 171 | Furthermore, a hybrid 125M-parameter H3-attention model that 172 | retains two attention layers surprisingly outperforms 173 | Transformers on OpenWebText by 1.0 PPL. Next, to improve the 174 | efficiency of training SSMs on modern hardware, we propose 175 | FlashConv. FlashConv uses a fused block FFT algorithm to 176 | improve efficiency on sequences up to 8K, and introduces a 177 | novel state passing algorithm that exploits the recurrent 178 | properties of SSMs to scale to longer sequences. FlashConv 179 | yields 2$\times$ speedup on the long-range arena benchmark 180 | and allows hybrid language models to generate text 181 | 2.4$\times$ faster than Transformers. 
Using FlashConv, we 182 | scale hybrid H3-attention language models up to 2.7B 183 | parameters on the Pile and find promising initial results, 184 | achieving lower perplexity than Transformers and 185 | outperforming Transformers in zero- and few-shot learning on 186 | a majority of tasks in the SuperGLUE benchmark.", 187 | month = dec, 188 | year = 2022, 189 | keywords = "SSM", 190 | archivePrefix = "arXiv", 191 | primaryClass = "cs.LG", 192 | eprint = "2212.14052" 193 | } 194 | 195 | @ARTICLE{Mehta2022-pz, 196 | title = "Long Range Language Modeling via Gated State Spaces", 197 | author = "Mehta, Harsh and Gupta, Ankit and Cutkosky, Ashok and 198 | Neyshabur, Behnam", 199 | abstract = "State space models have shown to be effective at modeling 200 | long range dependencies, specially on sequence 201 | classification tasks. In this work we focus on 202 | autoregressive sequence modeling over English books, Github 203 | source code and ArXiv mathematics articles. Based on recent 204 | developments around the effectiveness of gated activation 205 | functions, we propose a new layer named Gated State Space 206 | (GSS) and show that it trains significantly faster than the 207 | diagonal version of S4 (i.e. DSS) on TPUs, is fairly 208 | competitive with several well-tuned Transformer-based 209 | baselines and exhibits zero-shot generalization to longer 210 | inputs while being straightforward to implement. Finally, we 211 | show that leveraging self-attention to model local 212 | dependencies improves the performance of GSS even further.", 213 | month = jun, 214 | year = 2022, 215 | keywords = "SSM", 216 | archivePrefix = "arXiv", 217 | primaryClass = "cs.LG", 218 | eprint = "2206.13947" 219 | } 220 | 221 | @ARTICLE{Smith2022-at, 222 | title = "Simplified State Space Layers for Sequence Modeling", 223 | author = "Smith, Jimmy T H and Warrington, Andrew and Linderman, Scott 224 | W", 225 | abstract = "Models using structured state space sequence (S4) layers 226 | have achieved state-of-the-art performance on long-range 227 | sequence modeling tasks. An S4 layer combines linear state 228 | space models (SSMs), the HiPPO framework, and deep learning 229 | to achieve high performance. We build on the design of the 230 | S4 layer and introduce a new state space layer, the S5 231 | layer. Whereas an S4 layer uses many independent 232 | single-input, single-output SSMs, the S5 layer uses one 233 | multi-input, multi-output SSM. We establish a connection 234 | between S5 and S4, and use this to develop the 235 | initialization and parameterization used by the S5 model. 236 | The result is a state space layer that can leverage 237 | efficient and widely implemented parallel scans, allowing S5 238 | to match the computational efficiency of S4, while also 239 | achieving state-of-the-art performance on several long-range 240 | sequence modeling tasks. 
S5 averages 87.4\% on the long 241 | range arena benchmark, and 98.5\% on the most difficult 242 | Path-X task.", 243 | month = aug, 244 | year = 2022, 245 | keywords = "SSM", 246 | archivePrefix = "arXiv", 247 | primaryClass = "cs.LG", 248 | eprint = "2208.04933" 249 | } 250 | 251 | @ARTICLE{Ma2022-xw, 252 | title = "Mega: Moving Average Equipped Gated Attention", 253 | author = "Ma, Xuezhe and Zhou, Chunting and Kong, Xiang and He, 254 | Junxian and Gui, Liangke and Neubig, Graham and May, 255 | Jonathan and Zettlemoyer, Luke", 256 | abstract = "The design choices in the Transformer attention mechanism, 257 | including weak inductive bias and quadratic computational 258 | complexity, have limited its application for modeling long 259 | sequences. In this paper, we introduce Mega, a simple, 260 | theoretically grounded, single-head gated attention 261 | mechanism equipped with (exponential) moving average to 262 | incorporate inductive bias of position-aware local 263 | dependencies into the position-agnostic attention mechanism. 264 | We further propose a variant of Mega that offers linear time 265 | and space complexity yet yields only minimal quality loss, 266 | by efficiently splitting the whole sequence into multiple 267 | chunks with fixed length. Extensive experiments on a wide 268 | range of sequence modeling benchmarks, including the Long 269 | Range Arena, neural machine translation, auto-regressive 270 | language modeling, and image and speech classification, show 271 | that Mega achieves significant improvements over other 272 | sequence models, including variants of Transformers and 273 | recent state space models.", 274 | month = sep, 275 | year = 2022, 276 | keywords = "SSM", 277 | archivePrefix = "arXiv", 278 | primaryClass = "cs.LG", 279 | eprint = "2209.10655" 280 | } 281 | 282 | @ARTICLE{Peng2023-yp, 283 | title = "{RWKV}: Reinventing {RNNs} for the Transformer Era", 284 | author = "Peng, Bo and Alcaide, Eric and Anthony, Quentin and Albalak, 285 | Alon and Arcadinho, Samuel and Cao, Huanqi and Cheng, Xin 286 | and Chung, Michael and Grella, Matteo and Gv, Kranthi Kiran 287 | and He, Xuzheng and Hou, Haowen and Kazienko, Przemyslaw and 288 | Kocon, Jan and Kong, Jiaming and Koptyra, Bartlomiej and 289 | Lau, Hayden and Mantri, Krishna Sri Ipsit and Mom, Ferdinand 290 | and Saito, Atsushi and Tang, Xiangru and Wang, Bolun and 291 | Wind, Johan S and Wozniak, Stansilaw and Zhang, Ruichong and 292 | Zhang, Zhenyuan and Zhao, Qihang and Zhou, Peng and Zhu, 293 | Jian and Zhu, Rui-Jie", 294 | abstract = "Transformers have revolutionized almost all natural language 295 | processing (NLP) tasks but suffer from memory and 296 | computational complexity that scales quadratically with 297 | sequence length. In contrast, recurrent neural networks 298 | (RNNs) exhibit linear scaling in memory and computational 299 | requirements but struggle to match the same performance as 300 | Transformers due to limitations in parallelization and 301 | scalability. We propose a novel model architecture, 302 | Receptance Weighted Key Value (RWKV), that combines the 303 | efficient parallelizable training of Transformers with the 304 | efficient inference of RNNs. 
Our approach leverages a linear 305 | attention mechanism and allows us to formulate the model as 306 | either a Transformer or an RNN, which parallelizes 307 | computations during training and maintains constant 308 | computational and memory complexity during inference, 309 | leading to the first non-transformer architecture to be 310 | scaled to tens of billions of parameters. Our experiments 311 | reveal that RWKV performs on par with similarly sized 312 | Transformers, suggesting that future work can leverage this 313 | architecture to create more efficient models. This work 314 | presents a significant step towards reconciling the 315 | trade-offs between computational efficiency and model 316 | performance in sequence processing tasks.", 317 | month = may, 318 | year = 2023, 319 | keywords = "SSM", 320 | archivePrefix = "arXiv", 321 | primaryClass = "cs.CL", 322 | eprint = "2305.13048" 323 | } 324 | 325 | @UNPUBLISHED{Martin2018-bq, 326 | title = "Parallelizing Linear Recurrent Neural Nets Over Sequence Length", 327 | author = "Martin, Eric and Cundy, Chris", 328 | abstract = "Recurrent neural networks (RNNs) are widely used to model 329 | sequential data but their non-linear dependencies between 330 | sequence elements prevent parallelizing training over sequence 331 | length. We show the training of RNNs with only linear sequential 332 | dependencies can be parallelized over the sequence length using 333 | the parallel scan algorithm, leading to rapid training on long 334 | sequences even with small minibatch size. We develop a parallel 335 | linear recurrence CUDA kernel and show that it can be applied to 336 | immediately speed up training and inference of several state of 337 | the art RNN architectures by up to 9x. We abstract recent work on 338 | linear RNNs into a new framework of linear surrogate RNNs and 339 | develop a linear surrogate model for the long short-term memory 340 | unit, the GILR-LSTM, that utilizes parallel linear recurrence. We 341 | extend sequence learning to new extremely long sequence regimes 342 | that were previously out of reach by successfully training a 343 | GILR-LSTM on a synthetic sequence classification task with a one 344 | million timestep dependency.", 345 | month = feb, 346 | year = 2018, 347 | keywords = "SSM" 348 | } 349 | 350 | @ARTICLE{Wang2022-un, 351 | title = "Pretraining Without Attention", 352 | author = "Wang, Junxiong and Yan, Jing Nathan and Gu, Albert and Rush, 353 | Alexander M", 354 | abstract = "Transformers have been essential to pretraining success in 355 | NLP. While other architectures have been used, downstream 356 | accuracy is either significantly worse, or requires 357 | attention layers to match standard benchmarks such as GLUE. 358 | This work explores pretraining without attention by using 359 | recent advances in sequence routing based on state-space 360 | models (SSMs). Our proposed model, Bidirectional Gated SSM 361 | (BiGS), combines SSM layers with a multiplicative gating 362 | architecture that has been effective in simplified sequence 363 | modeling architectures. The model learns static layers that 364 | do not consider pair-wise interactions. Even so, BiGS is 365 | able to match BERT pretraining accuracy on GLUE and can be 366 | extended to long-form pretraining of 4096 tokens without 367 | approximation. Analysis shows that while the models have 368 | similar average accuracy, the approach has different 369 | inductive biases than BERT in terms of interactions and 370 | syntactic representations. 
All models from this work are 371 | available at https://github.com/jxiw/BiGS.", 372 | month = dec, 373 | year = 2022, 374 | keywords = "SSM", 375 | archivePrefix = "arXiv", 376 | primaryClass = "cs.CL", 377 | eprint = "2212.10544" 378 | } 379 | 380 | @ARTICLE{Gupta2022-vp, 381 | title = "Diagonal State Spaces are as Effective as Structured State 382 | Spaces", 383 | author = "Gupta, Ankit and Gu, Albert and Berant, Jonathan", 384 | abstract = "Modeling long range dependencies in sequential data is a 385 | fundamental step towards attaining human-level performance 386 | in many modalities such as text, vision, audio and video. 387 | While attention-based models are a popular and effective 388 | choice in modeling short-range interactions, their 389 | performance on tasks requiring long range reasoning has been 390 | largely inadequate. In an exciting result, Gu et al. (ICLR 391 | 2022) proposed the $\textit\{Structured State Space\}$ (S4) 392 | architecture delivering large gains over state-of-the-art 393 | models on several long-range tasks across various 394 | modalities. The core proposition of S4 is the 395 | parameterization of state matrices via a diagonal plus low 396 | rank structure, allowing efficient computation. In this 397 | work, we show that one can match the performance of S4 even 398 | without the low rank correction and thus assuming the state 399 | matrices to be diagonal. Our $\textit\{Diagonal State 400 | Space\}$ (DSS) model matches the performance of S4 on Long 401 | Range Arena tasks, speech classification on Speech Commands 402 | dataset, while being conceptually simpler and 403 | straightforward to implement.", 404 | month = mar, 405 | year = 2022, 406 | keywords = "SSM", 407 | archivePrefix = "arXiv", 408 | primaryClass = "cs.LG", 409 | eprint = "2203.14343" 410 | } 411 | 412 | @MISC{Blelloch1990-yo, 413 | title = "Prefix sums and their applications", 414 | author = "Blelloch, Guy E and Reif, John H", 415 | abstract = "Experienced algorithm designers rely heavily on a set of 416 | building blocks and on the tools needed to put the blocks 417 | together into an algorithm. The understanding of these basic 418 | blocks and tools is therefore critical to the understanding 419 | of algorithms. Many of the blocks and tools needed for 420 | parallel algorithms extend from sequential algorithms, such 421 | as dynamic-programming and divide-and-conquer, but others are 422 | new. This paper introduces one of the simplest and most 423 | useful building blocks for parallel algorithms: the 424 | all-prefixsums operation. The paper defines the operation, 425 | shows how to implement it on a PRAM and illustrates many 426 | applications of the operation. 
In addition to being a useful 427 | building block, the all-prefix-sums operation is a good 428 | example of a computation that seems inherently sequential, 429 | but for which there is an efficient parallel algorithm.", 430 | publisher = "shelf2.library.cmu.edu", 431 | year = 1990, 432 | howpublished = "\url{http://shelf2.library.cmu.edu/Tech/23445461.pdf}", 433 | note = "Accessed: 2023-5-30", 434 | keywords = "SSM" 435 | } 436 | 437 | @ARTICLE{Gu2022-jz, 438 | title = "On the Parameterization and Initialization of Diagonal State 439 | Space Models", 440 | author = "Gu, Albert and Gupta, Ankit and Goel, Karan and R{\'e}, 441 | Christopher", 442 | abstract = "State space models (SSM) have recently been shown to be very 443 | effective as a deep learning layer as a promising 444 | alternative to sequence models such as RNNs, CNNs, or 445 | Transformers. The first version to show this potential was 446 | the S4 model, which is particularly effective on tasks 447 | involving long-range dependencies by using a prescribed 448 | state matrix called the HiPPO matrix. While this has an 449 | interpretable mathematical mechanism for modeling long 450 | dependencies, it introduces a custom representation and 451 | algorithm that can be difficult to implement. On the other 452 | hand, a recent variant of S4 called DSS showed that 453 | restricting the state matrix to be fully diagonal can still 454 | preserve the performance of the original model when using a 455 | specific initialization based on approximating S4's matrix. 456 | This work seeks to systematically understand how to 457 | parameterize and initialize such diagonal state space 458 | models. While it follows from classical results that almost 459 | all SSMs have an equivalent diagonal form, we show that the 460 | initialization is critical for performance. We explain why 461 | DSS works mathematically, by showing that the diagonal 462 | restriction of S4's matrix surprisingly recovers the same 463 | kernel in the limit of infinite state dimension. We also 464 | systematically describe various design choices in 465 | parameterizing and computing diagonal SSMs, and perform a 466 | controlled empirical study ablating the effects of these 467 | choices. Our final model S4D is a simple diagonal version of 468 | S4 whose kernel computation requires just 2 lines of code 469 | and performs comparably to S4 in almost all settings, with 470 | state-of-the-art results for image, audio, and medical 471 | time-series domains, and averaging 85\% on the Long Range 472 | Arena benchmark.", 473 | month = jun, 474 | year = 2022, 475 | keywords = "SSM", 476 | archivePrefix = "arXiv", 477 | primaryClass = "cs.LG", 478 | eprint = "2206.11893" 479 | } 480 | 481 | @ARTICLE{Goel2022-lv, 482 | title = "It's Raw! Audio Generation with {State-Space} Models", 483 | author = "Goel, Karan and Gu, Albert and Donahue, Chris and R{\'e}, 484 | Christopher", 485 | abstract = "Developing architectures suitable for modeling raw audio is 486 | a challenging problem due to the high sampling rates of 487 | audio waveforms. Standard sequence modeling approaches like 488 | RNNs and CNNs have previously been tailored to fit the 489 | demands of audio, but the resultant architectures make 490 | undesirable computational tradeoffs and struggle to model 491 | waveforms effectively. We propose SaShiMi, a new multi-scale 492 | architecture for waveform modeling built around the recently 493 | introduced S4 model for long sequence modeling. 
We identify 494 | that S4 can be unstable during autoregressive generation, 495 | and provide a simple improvement to its parameterization by 496 | drawing connections to Hurwitz matrices. SaShiMi yields 497 | state-of-the-art performance for unconditional waveform 498 | generation in the autoregressive setting. Additionally, 499 | SaShiMi improves non-autoregressive generation performance 500 | when used as the backbone architecture for a diffusion 501 | model. Compared to prior architectures in the autoregressive 502 | generation setting, SaShiMi generates piano and speech 503 | waveforms which humans find more musical and coherent 504 | respectively, e.g. 2x better mean opinion scores than 505 | WaveNet on an unconditional speech generation task. On a 506 | music generation task, SaShiMi outperforms WaveNet on 507 | density estimation and speed at both training and inference 508 | even when using 3x fewer parameters. Code can be found at 509 | https://github.com/HazyResearch/state-spaces and samples at 510 | https://hazyresearch.stanford.edu/sashimi-examples.", 511 | month = feb, 512 | year = 2022, 513 | keywords = "SSM", 514 | archivePrefix = "arXiv", 515 | primaryClass = "cs.SD", 516 | eprint = "2202.09729" 517 | } 518 | 519 | @ARTICLE{Lu2023-ov, 520 | title = "Structured State Space Models for {In-Context} Reinforcement 521 | Learning", 522 | author = "Lu, Chris and Schroecker, Yannick and Gu, Albert and 523 | Parisotto, Emilio and Foerster, Jakob and Singh, Satinder 524 | and Behbahani, Feryal", 525 | abstract = "Structured state space sequence (S4) models have recently 526 | achieved state-of-the-art performance on long-range sequence 527 | modeling tasks. These models also have fast inference speeds 528 | and parallelisable training, making them potentially useful 529 | in many reinforcement learning settings. We propose a 530 | modification to a variant of S4 that enables us to 531 | initialise and reset the hidden state in parallel, allowing 532 | us to tackle reinforcement learning tasks. We show that our 533 | modified architecture runs asymptotically faster than 534 | Transformers and performs better than LSTM models on a 535 | simple memory-based task. Then, by leveraging the model's 536 | ability to handle long-range sequences, we achieve strong 537 | performance on a challenging meta-learning task in which the 538 | agent is given a randomly-sampled continuous control 539 | environment, combined with a randomly-sampled linear 540 | projection of the environment's observations and actions. 541 | Furthermore, we show the resulting model can adapt to 542 | out-of-distribution held-out tasks. Overall, the results 543 | presented in this paper suggest that the S4 models are a 544 | strong contender for the default architecture used for 545 | in-context reinforcement learning", 546 | month = mar, 547 | year = 2023, 548 | keywords = "SSM", 549 | archivePrefix = "arXiv", 550 | primaryClass = "cs.LG", 551 | eprint = "2303.03982" 552 | } 553 | 554 | @ARTICLE{Nguyen2022-qi, 555 | title = "{S4ND}: Modeling Images and Videos as Multidimensional 556 | Signals Using State Spaces", 557 | author = "Nguyen, Eric and Goel, Karan and Gu, Albert and Downs, 558 | Gordon W and Shah, Preey and Dao, Tri and Baccus, Stephen A 559 | and R{\'e}, Christopher", 560 | abstract = "Visual data such as images and videos are typically modeled 561 | as discretizations of inherently continuous, 562 | multidimensional signals. 
Existing continuous-signal models 563 | attempt to exploit this fact by modeling the underlying 564 | signals of visual (e.g., image) data directly. However, 565 | these models have not yet been able to achieve competitive 566 | performance on practical vision tasks such as large-scale 567 | image and video classification. Building on a recent line of 568 | work on deep state space models (SSMs), we propose S4ND, a 569 | new multidimensional SSM layer that extends the 570 | continuous-signal modeling ability of SSMs to 571 | multidimensional data including images and videos. We show 572 | that S4ND can model large-scale visual data in $1$D, $2$D, 573 | and $3$D as continuous multidimensional signals and 574 | demonstrates strong performance by simply swapping Conv2D 575 | and self-attention layers with S4ND layers in existing 576 | state-of-the-art models. On ImageNet-1k, S4ND exceeds the 577 | performance of a Vision Transformer baseline by $1.5\%$ when 578 | training with a $1$D sequence of patches, and matches 579 | ConvNeXt when modeling images in $2$D. For videos, S4ND 580 | improves on an inflated $3$D ConvNeXt in activity 581 | classification on HMDB-51 by $4\%$. S4ND implicitly learns 582 | global, continuous convolutional kernels that are resolution 583 | invariant by construction, providing an inductive bias that 584 | enables generalization across multiple resolutions. By 585 | developing a simple bandlimiting modification to S4 to 586 | overcome aliasing, S4ND achieves strong zero-shot (unseen at 587 | training time) resolution performance, outperforming a 588 | baseline Conv2D by $40\%$ on CIFAR-10 when trained on $8 589 | \times 8$ and tested on $32 \times 32$ images. When trained 590 | with progressive resizing, S4ND comes within $\sim 1\%$ of a 591 | high-resolution model while training $22\%$ faster.", 592 | month = oct, 593 | year = 2022, 594 | keywords = "SSM", 595 | archivePrefix = "arXiv", 596 | primaryClass = "cs.CV", 597 | eprint = "2210.06583" 598 | } 599 | -------------------------------------------------------------------------------- /temp.tex: -------------------------------------------------------------------------------- 1 | \documentclass[tikz,border=2mm]{standalone} 2 | \usepackage{tikz} 3 | \usetikzlibrary{positioning} 4 | \begin{document} 5 | \begin{tikzpicture}[scale=0.8] 6 | \tikzset{layer/.style={draw,minimum width=1.5cm,minimum height=1.5cm}} 7 | \tikzset{dot/.style={circle,fill,inner sep=1.5pt}} 8 | \tikzset{vec/.style={draw,thick,-latex}} 9 | 10 | \node[dot,label=left:$x_1$] (x1) at (0,0) {}; 11 | \node[dot,label=left:$x_2$,below=0.5cm of x1] (x2) {}; 12 | \node[dot,label=left:$x_3$,below=0.5cm of x2] (x3) {}; 13 | 14 | \node[layer,right=1.5cm of x2,align=center] (self-att) {Self-\\Attention}; 15 | 16 | \node[dot,label=right:$y_1$,right=1.5cm of self-att] (y1) {}; 17 | \node[dot,label=right:$y_2$,below=0.5cm of y1] (y2) {}; 18 | \node[dot,label=right:$y_3$,below=0.5cm of y2] (y3) {}; 19 | 20 | \foreach \i in {1,...,3} { 21 | \draw[vec] (x\i) -- (self-att.west |- x\i); 22 | \draw[vec] (self-att.east |- y\i) -- (y\i); 23 | } 24 | 25 | \draw[vec] (self-att) -- (self-att); 26 | \end{tikzpicture} 27 | \end{document} 28 | --------------------------------------------------------------------------------