├── .gitignore
├── LICENSE
├── README.md
├── latex
│   ├── dl4nlp-bibliography.bib
│   ├── lecture01
│   │   ├── .gitignore
│   │   ├── compile-pdf.sh
│   │   ├── dl4nlp2023-lecture01.tex
│   │   └── img
│   │       ├── arct.png
│   │       ├── cc-by-sa-icon.pdf
│   │       ├── dial1.png
│   │       ├── hfdata.png
│   │       ├── logo-trusthlt.pdf
│   │       ├── mt2.png
│   │       ├── mtex.jpg
│   │       ├── nlg1.png
│   │       └── nlg2.png
│   ├── lecture02
│   │   ├── .gitignore
│   │   ├── compile-pdf.sh
│   │   ├── dl4nlp2023-lecture02.tex
│   │   └── img
│   │       ├── backprop-my.pdf
│   │       ├── backprop-my.svg
│   │       ├── backprop01.pdf
│   │       ├── backprop02.pdf
│   │       ├── backprop03.pdf
│   │       ├── backprop04.pdf
│   │       ├── backprop05.pdf
│   │       ├── backprop06.pdf
│   │       ├── backprop07.pdf
│   │       ├── backprop08.pdf
│   │       ├── backprop09.pdf
│   │       ├── backprop10.pdf
│   │       ├── cc-by-sa-icon.pdf
│   │       ├── desmos-graph1.pdf
│   │       ├── desmos-graph1.svg
│   │       ├── gradient1.pdf
│   │       ├── logo-trusthlt.pdf
│   │       ├── parent-child.pdf
│   │       ├── parent-child.svg
│   │       └── rosenbrock.pdf
│   ├── lecture03
│   │   ├── .gitignore
│   │   ├── compile-pdf.sh
│   │   ├── dl4nlp2023-lecture03.tex
│   │   └── img
│   │       ├── cc-by-sa-icon.pdf
│   │       └── logo-trusthlt.pdf
│   ├── lecture04
│   │   ├── .gitignore
│   │   ├── compile-pdf.sh
│   │   ├── dl4nlp2023-lecture04.tex
│   │   └── img
│   │       ├── cc-by-sa-icon.pdf
│   │       ├── logo-trusthlt.pdf
│   │       └── temperatures.png
│   ├── lecture05
│   │   ├── .gitignore
│   │   ├── compile-pdf.sh
│   │   ├── dl4nlp2023-lecture05.tex
│   │   └── img
│   │       ├── cc-by-sa-icon.pdf
│   │       ├── linear1.png
│   │       ├── linear2.png
│   │       ├── linear3.png
│   │       ├── linear4.png
│   │       ├── logo-trusthlt.pdf
│   │       ├── xor1.pdf
│   │       └── xor1.svg
│   ├── lecture06
│   │   ├── .gitignore
│   │   ├── compile-pdf.sh
│   │   ├── dl4nlp2023-lecture06.tex
│   │   └── img
│   │       ├── cc-by-sa-icon.pdf
│   │       ├── logo-trusthlt.pdf
│   │       └── rewe.png
│   ├── lecture07
│   │   ├── .gitignore
│   │   ├── compile-pdf.sh
│   │   ├── dl4nlp2023-lecture07.tex
│   │   └── img
│   │       ├── cc-by-sa-icon.pdf
│   │       └── logo-trusthlt.pdf
│   ├── lecture08
│   │   ├── .gitignore
│   │   ├── compile-pdf.sh
│   │   ├── dl4nlp2023-lecture08.tex
│   │   └── img
│   │       ├── cc-by-sa-icon.pdf
│   │       ├── logo-trusthlt.pdf
│   │       ├── seq2seq.pdf
│   │       ├── seq2seq_attention_motivation.pdf
│   │       ├── seq2seq_attention_t1.pdf
│   │       ├── seq2seq_attn_encdec.pdf
│   │       ├── seq2seq_selfattn.pdf
│   │       ├── sequence_classification.pdf
│   │       ├── sequence_labeling.pdf
│   │       ├── sequence_length.png
│   │       ├── sequence_to_sequence.pdf
│   │       ├── sequence_to_sequence_anno.pdf
│   │       ├── sequence_to_sequence_boxed.pdf
│   │       ├── translation_heatmap.png
│   │       ├── ukp_logo.png
│   │       └── variable_input_output.pdf
│   ├── lecture09
│   │   ├── .gitignore
│   │   ├── compile-pdf.sh
│   │   ├── dl4nlp2023-lecture09.tex
│   │   └── img
│   │       ├── aiayn.png
│   │       ├── anno_transformer.png
│   │       ├── anno_transformer_attn_block.png
│   │       ├── anno_trf_hlattn.png
│   │       ├── anno_trf_hllinear.png
│   │       ├── anno_trf_multihead.png
│   │       ├── cc-by-sa-icon.pdf
│   │       ├── positional_embs.png
│   │       ├── seq2seq_attention_t1.pdf
│   │       ├── seq2seq_attention_t2.pdf
│   │       ├── seq2seq_attention_t3.pdf
│   │       ├── seq2seq_attn_encdec.pdf
│   │       └── ukp_logo.png
│   ├── lecture10
│   │   ├── .gitignore
│   │   ├── compile-pdf.sh
│   │   ├── dl4nlp2023-lecture10.tex
│   │   └── img
│   │       ├── BERT-language-modeling-masked-lm.png
│   │       ├── aiayn.png
│   │       ├── anno_transformer.png
│   │       ├── anno_transformer_attn_block.png
│   │       ├── anno_trf_hlattn.png
│   │       ├── anno_trf_hllinear.png
│   │       ├── anno_trf_multihead.png
│   │       ├── bart-pretraining-tasks.png
│   │       ├── bert-decoder-head-hl.png
│   │       ├── bert-google.png
│   │       ├── bert-next-sentence-prediction.png
│   │       ├── bert-pair-classification.png
│   │       ├── bert-paper.png
│   │       ├── bert-results.png
│   │       ├── bert-seq-labeling.png
│   │       ├── bert-single-sentence-clf.png
│   │       ├── bert-spanex-qa.png
│   │       ├── bert-viz.png
│   │       ├── bert_dual_seq.png
│   │       ├── bert_modeling.pdf
│   │       ├── bert_nsp_anno.png
│   │       ├── cc-by-sa-icon.pdf
│   │       ├── gifs
│   │       │   ├── .ipynb_checkpoints
│   │       │   │   └── Untitled-checkpoint.ipynb
│   │       │   ├── transformer_decoding_1.gif
│   │       │   └── transformer_decoding_2.gif
│   │       ├── positional_embs.png
│   │       ├── pretrained-lm-variants.png
│   │       ├── seq2seq_attention_t1.pdf
│   │       ├── seq2seq_attention_t2.pdf
│   │       ├── seq2seq_attention_t3.pdf
│   │       ├── seq2seq_attn_encdec.pdf
│   │       ├── t5-objectives.png
│   │       ├── the_transformer_mt.png
│   │       ├── transformer_blocks.png
│   │       ├── transformer_encoder_decoder_stack.png
│   │       ├── transformer_encoder_decoder_stack_full.png
│   │       ├── transformer_encoders_decoders.png
│   │       ├── transformer_residual_layer_norm.png
│   │       ├── ukp_logo.png
│   │       └── word2vec_cbow.pdf
│   └── lecture11
│       ├── .gitignore
│       ├── compile-pdf.sh
│       ├── dl4nlp2023-lecture11.tex
│       └── img
│           ├── attention-masks-anno.png
│           ├── attention-patterns.png
│           ├── attention-types.png
│           ├── autoregressive_trf_decoder.pdf
│           ├── bidirectional_trf_encoder.pdf
│           ├── cc-by-sa-icon.pdf
│           ├── continuous-prompts.png
│           ├── few-shot-translation-gpt3.png
│           ├── fine-tuning-mlms.png
│           ├── gpt2-demonstrations.png
│           ├── gpt2-paper.png
│           ├── gpt2-prompt-anno.png
│           ├── gpt2-prompting-qa.png
│           ├── gpt2-title-anno.png
│           ├── gpt2-zeroshot-qa.png
│           ├── gpt3-translation-results.png
│           ├── in-context-learning.png
│           ├── language-modeling-types.png
│           ├── lm-scaling.png
│           ├── one-shot-translation-gpt3.png
│           ├── prompting_mlms.png
│           ├── t5-anno-prompts.png
│           ├── t5-objectives.png
│           ├── transformer_enc_dec.pdf
│           ├── ukp_logo.png
│           └── zero-shot-translation-gpt3.png
├── pdf
│   ├── DL4NLP Lecture 12_ Contemporary LLMs.pptx
│   ├── dl4nlp2023-lecture01.pdf
│   ├── dl4nlp2023-lecture02.pdf
│   ├── dl4nlp2023-lecture03.pdf
│   ├── dl4nlp2023-lecture04.pdf
│   ├── dl4nlp2023-lecture05.pdf
│   ├── dl4nlp2023-lecture06.pdf
│   ├── dl4nlp2023-lecture07.pdf
│   ├── dl4nlp2023-lecture08.pdf
│   ├── dl4nlp2023-lecture09.pdf
│   ├── dl4nlp2023-lecture10.pdf
│   ├── dl4nlp2023-lecture11.pdf
│   ├── dl4nlp2023-lecture12-recap.pdf
│   ├── dl4nlp2023-lecture13.pdf
│   └── dl4nlp2023-lecture13.pptx
└── subtitles
    ├── DL4NLP23 06_ Text classification 3_ Learning word embeddings.srt
    ├── DL4NLP23-01-medium.srt
    ├── DL4NLP23-02-large.srt
    ├── DL4NLP23-03.srt
    ├── DL4NLP23-04.srt
    ├── DL4NLP23-05.srt
    └── DL4NLP23-07.srt
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Core latex/pdflatex auxiliary files:
2 | *.aux
3 | *.lof
4 | *.log
5 | *.lot
6 | *.fls
7 | *.out
8 | *.toc
9 | *.fmt
10 | *.fot
11 | *.cb
12 | *.cb2
13 | .*.lb
14 |
15 | ## Intermediate documents:
16 | *.dvi
17 | *.xdv
18 | *-converted-to.*
19 | # these rules might exclude image files for figures etc.
20 | # *.ps
21 | # *.eps
22 | # *.pdf
23 |
24 | ## Generated if empty string is given at "Please type another file name for output:"
25 | .pdf
26 |
27 | ## Bibliography auxiliary files (bibtex/biblatex/biber):
28 | *.bbl
29 | *.bcf
30 | *.blg
31 | *-blx.aux
32 | *-blx.bib
33 | *.run.xml
34 |
35 | ## Build tool auxiliary files:
36 | *.fdb_latexmk
37 | *.synctex
38 | *.synctex(busy)
39 | *.synctex.gz
40 | *.synctex.gz(busy)
41 | *.pdfsync
42 |
43 | ## Build tool directories for auxiliary files
44 | # latexrun
45 | latex.out/
46 |
47 | ## Auxiliary and intermediate files from other packages:
48 | # algorithms
49 | *.alg
50 | *.loa
51 |
52 | # achemso
53 | acs-*.bib
54 |
55 | # amsthm
56 | *.thm
57 |
58 | # beamer
59 | *.nav
60 | *.pre
61 | *.snm
62 | *.vrb
63 |
64 | # changes
65 | *.soc
66 |
67 | # comment
68 | *.cut
69 |
70 | # cprotect
71 | *.cpt
72 |
73 | # elsarticle (documentclass of Elsevier journals)
74 | *.spl
75 |
76 | # endnotes
77 | *.ent
78 |
79 | # fixme
80 | *.lox
81 |
82 | # feynmf/feynmp
83 | *.mf
84 | *.mp
85 | *.t[1-9]
86 | *.t[1-9][0-9]
87 | *.tfm
88 |
89 | #(r)(e)ledmac/(r)(e)ledpar
90 | *.end
91 | *.?end
92 | *.[1-9]
93 | *.[1-9][0-9]
94 | *.[1-9][0-9][0-9]
95 | *.[1-9]R
96 | *.[1-9][0-9]R
97 | *.[1-9][0-9][0-9]R
98 | *.eledsec[1-9]
99 | *.eledsec[1-9]R
100 | *.eledsec[1-9][0-9]
101 | *.eledsec[1-9][0-9]R
102 | *.eledsec[1-9][0-9][0-9]
103 | *.eledsec[1-9][0-9][0-9]R
104 |
105 | # glossaries
106 | *.acn
107 | *.acr
108 | *.glg
109 | *.glo
110 | *.gls
111 | *.glsdefs
112 | *.lzo
113 | *.lzs
114 |
115 | # uncomment this for glossaries-extra (will ignore makeindex's style files!)
116 | # *.ist
117 |
118 | # gnuplottex
119 | *-gnuplottex-*
120 |
121 | # gregoriotex
122 | *.gaux
123 | *.gtex
124 |
125 | # htlatex
126 | *.4ct
127 | *.4tc
128 | *.idv
129 | *.lg
130 | *.trc
131 | *.xref
132 |
133 | # hyperref
134 | *.brf
135 |
136 | # knitr
137 | *-concordance.tex
138 | # TODO Comment the next line if you want to keep your tikz graphics files
139 | *.tikz
140 | *-tikzDictionary
141 |
142 | # listings
143 | *.lol
144 |
145 | # luatexja-ruby
146 | *.ltjruby
147 |
148 | # makeidx
149 | *.idx
150 | *.ilg
151 | *.ind
152 |
153 | # minitoc
154 | *.maf
155 | *.mlf
156 | *.mlt
157 | *.mtc[0-9]*
158 | *.slf[0-9]*
159 | *.slt[0-9]*
160 | *.stc[0-9]*
161 |
162 | # minted
163 | _minted*
164 | *.pyg
165 |
166 | # morewrites
167 | *.mw
168 |
169 | # nomencl
170 | *.nlg
171 | *.nlo
172 | *.nls
173 |
174 | # pax
175 | *.pax
176 |
177 | # pdfpcnotes
178 | *.pdfpc
179 |
180 | # sagetex
181 | *.sagetex.sage
182 | *.sagetex.py
183 | *.sagetex.scmd
184 |
185 | # scrwfile
186 | *.wrt
187 |
188 | # sympy
189 | *.sout
190 | *.sympy
191 | sympy-plots-for-*.tex/
192 |
193 | # pdfcomment
194 | *.upa
195 | *.upb
196 |
197 | # pythontex
198 | *.pytxcode
199 | pythontex-files-*/
200 |
201 | # tcolorbox
202 | *.listing
203 |
204 | # thmtools
205 | *.loe
206 |
207 | # TikZ & PGF
208 | *.dpth
209 | *.md5
210 | *.auxlock
211 |
212 | # todonotes
213 | *.tdo
214 |
215 | # vhistory
216 | *.hst
217 | *.ver
218 |
219 | # easy-todo
220 | *.lod
221 |
222 | # xcolor
223 | *.xcp
224 |
225 | # xmpincl
226 | *.xmpi
227 |
228 | # xindy
229 | *.xdy
230 |
231 | # xypic precompiled matrices and outlines
232 | *.xyc
233 | *.xyd
234 |
235 | # endfloat
236 | *.ttt
237 | *.fff
238 |
239 | # Latexian
240 | TSWLatexianTemp*
241 |
242 | ## Editors:
243 | # WinEdt
244 | *.bak
245 | *.sav
246 |
247 | # Texpad
248 | .texpadtmp
249 |
250 | # LyX
251 | *.lyx~
252 |
253 | # Kile
254 | *.backup
255 |
256 | # gummi
257 | .*.swp
258 |
259 | # KBibTeX
260 | *~[0-9]*
261 |
262 | # TeXnicCenter
263 | *.tps
264 |
265 | # auto folder when using emacs and auctex
266 | ./auto/*
267 | *.el
268 |
269 | # expex forward references with \gathertags
270 | *-tags.tex
271 |
272 | # standalone packages
273 | *.sta
274 |
275 | # Makeindex log files
276 | *.lpz
277 |
278 | .idea/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Attribution-ShareAlike 4.0 International
2 |
3 | =======================================================================
4 |
5 | Creative Commons Corporation ("Creative Commons") is not a law firm and
6 | does not provide legal services or legal advice. Distribution of
7 | Creative Commons public licenses does not create a lawyer-client or
8 | other relationship. Creative Commons makes its licenses and related
9 | information available on an "as-is" basis. Creative Commons gives no
10 | warranties regarding its licenses, any material licensed under their
11 | terms and conditions, or any related information. Creative Commons
12 | disclaims all liability for damages resulting from their use to the
13 | fullest extent possible.
14 |
15 | Using Creative Commons Public Licenses
16 |
17 | Creative Commons public licenses provide a standard set of terms and
18 | conditions that creators and other rights holders may use to share
19 | original works of authorship and other material subject to copyright
20 | and certain other rights specified in the public license below. The
21 | following considerations are for informational purposes only, are not
22 | exhaustive, and do not form part of our licenses.
23 |
24 | Considerations for licensors: Our public licenses are
25 | intended for use by those authorized to give the public
26 | permission to use material in ways otherwise restricted by
27 | copyright and certain other rights. Our licenses are
28 | irrevocable. Licensors should read and understand the terms
29 | and conditions of the license they choose before applying it.
30 | Licensors should also secure all rights necessary before
31 | applying our licenses so that the public can reuse the
32 | material as expected. Licensors should clearly mark any
33 | material not subject to the license. This includes other CC-
34 | licensed material, or material used under an exception or
35 | limitation to copyright. More considerations for licensors:
36 | wiki.creativecommons.org/Considerations_for_licensors
37 |
38 | Considerations for the public: By using one of our public
39 | licenses, a licensor grants the public permission to use the
40 | licensed material under specified terms and conditions. If
41 | the licensor's permission is not necessary for any reason--for
42 | example, because of any applicable exception or limitation to
43 | copyright--then that use is not regulated by the license. Our
44 | licenses grant only permissions under copyright and certain
45 | other rights that a licensor has authority to grant. Use of
46 | the licensed material may still be restricted for other
47 | reasons, including because others have copyright or other
48 | rights in the material. A licensor may make special requests,
49 | such as asking that all changes be marked or described.
50 | Although not required by our licenses, you are encouraged to
51 | respect those requests where reasonable. More considerations
52 | for the public:
53 | wiki.creativecommons.org/Considerations_for_licensees
54 |
55 | =======================================================================
56 |
57 | Creative Commons Attribution-ShareAlike 4.0 International Public
58 | License
59 |
60 | By exercising the Licensed Rights (defined below), You accept and agree
61 | to be bound by the terms and conditions of this Creative Commons
62 | Attribution-ShareAlike 4.0 International Public License ("Public
63 | License"). To the extent this Public License may be interpreted as a
64 | contract, You are granted the Licensed Rights in consideration of Your
65 | acceptance of these terms and conditions, and the Licensor grants You
66 | such rights in consideration of benefits the Licensor receives from
67 | making the Licensed Material available under these terms and
68 | conditions.
69 |
70 |
71 | Section 1 -- Definitions.
72 |
73 | a. Adapted Material means material subject to Copyright and Similar
74 | Rights that is derived from or based upon the Licensed Material
75 | and in which the Licensed Material is translated, altered,
76 | arranged, transformed, or otherwise modified in a manner requiring
77 | permission under the Copyright and Similar Rights held by the
78 | Licensor. For purposes of this Public License, where the Licensed
79 | Material is a musical work, performance, or sound recording,
80 | Adapted Material is always produced where the Licensed Material is
81 | synched in timed relation with a moving image.
82 |
83 | b. Adapter's License means the license You apply to Your Copyright
84 | and Similar Rights in Your contributions to Adapted Material in
85 | accordance with the terms and conditions of this Public License.
86 |
87 | c. BY-SA Compatible License means a license listed at
88 | creativecommons.org/compatiblelicenses, approved by Creative
89 | Commons as essentially the equivalent of this Public License.
90 |
91 | d. Copyright and Similar Rights means copyright and/or similar rights
92 | closely related to copyright including, without limitation,
93 | performance, broadcast, sound recording, and Sui Generis Database
94 | Rights, without regard to how the rights are labeled or
95 | categorized. For purposes of this Public License, the rights
96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar
97 | Rights.
98 |
99 | e. Effective Technological Measures means those measures that, in the
100 | absence of proper authority, may not be circumvented under laws
101 | fulfilling obligations under Article 11 of the WIPO Copyright
102 | Treaty adopted on December 20, 1996, and/or similar international
103 | agreements.
104 |
105 | f. Exceptions and Limitations means fair use, fair dealing, and/or
106 | any other exception or limitation to Copyright and Similar Rights
107 | that applies to Your use of the Licensed Material.
108 |
109 | g. License Elements means the license attributes listed in the name
110 | of a Creative Commons Public License. The License Elements of this
111 | Public License are Attribution and ShareAlike.
112 |
113 | h. Licensed Material means the artistic or literary work, database,
114 | or other material to which the Licensor applied this Public
115 | License.
116 |
117 | i. Licensed Rights means the rights granted to You subject to the
118 | terms and conditions of this Public License, which are limited to
119 | all Copyright and Similar Rights that apply to Your use of the
120 | Licensed Material and that the Licensor has authority to license.
121 |
122 | j. Licensor means the individual(s) or entity(ies) granting rights
123 | under this Public License.
124 |
125 | k. Share means to provide material to the public by any means or
126 | process that requires permission under the Licensed Rights, such
127 | as reproduction, public display, public performance, distribution,
128 | dissemination, communication, or importation, and to make material
129 | available to the public including in ways that members of the
130 | public may access the material from a place and at a time
131 | individually chosen by them.
132 |
133 | l. Sui Generis Database Rights means rights other than copyright
134 | resulting from Directive 96/9/EC of the European Parliament and of
135 | the Council of 11 March 1996 on the legal protection of databases,
136 | as amended and/or succeeded, as well as other essentially
137 | equivalent rights anywhere in the world.
138 |
139 | m. You means the individual or entity exercising the Licensed Rights
140 | under this Public License. Your has a corresponding meaning.
141 |
142 |
143 | Section 2 -- Scope.
144 |
145 | a. License grant.
146 |
147 | 1. Subject to the terms and conditions of this Public License,
148 | the Licensor hereby grants You a worldwide, royalty-free,
149 | non-sublicensable, non-exclusive, irrevocable license to
150 | exercise the Licensed Rights in the Licensed Material to:
151 |
152 | a. reproduce and Share the Licensed Material, in whole or
153 | in part; and
154 |
155 | b. produce, reproduce, and Share Adapted Material.
156 |
157 | 2. Exceptions and Limitations. For the avoidance of doubt, where
158 | Exceptions and Limitations apply to Your use, this Public
159 | License does not apply, and You do not need to comply with
160 | its terms and conditions.
161 |
162 | 3. Term. The term of this Public License is specified in Section
163 | 6(a).
164 |
165 | 4. Media and formats; technical modifications allowed. The
166 | Licensor authorizes You to exercise the Licensed Rights in
167 | all media and formats whether now known or hereafter created,
168 | and to make technical modifications necessary to do so. The
169 | Licensor waives and/or agrees not to assert any right or
170 | authority to forbid You from making technical modifications
171 | necessary to exercise the Licensed Rights, including
172 | technical modifications necessary to circumvent Effective
173 | Technological Measures. For purposes of this Public License,
174 | simply making modifications authorized by this Section 2(a)
175 | (4) never produces Adapted Material.
176 |
177 | 5. Downstream recipients.
178 |
179 | a. Offer from the Licensor -- Licensed Material. Every
180 | recipient of the Licensed Material automatically
181 | receives an offer from the Licensor to exercise the
182 | Licensed Rights under the terms and conditions of this
183 | Public License.
184 |
185 | b. Additional offer from the Licensor -- Adapted Material.
186 | Every recipient of Adapted Material from You
187 | automatically receives an offer from the Licensor to
188 | exercise the Licensed Rights in the Adapted Material
189 | under the conditions of the Adapter's License You apply.
190 |
191 | c. No downstream restrictions. You may not offer or impose
192 | any additional or different terms or conditions on, or
193 | apply any Effective Technological Measures to, the
194 | Licensed Material if doing so restricts exercise of the
195 | Licensed Rights by any recipient of the Licensed
196 | Material.
197 |
198 | 6. No endorsement. Nothing in this Public License constitutes or
199 | may be construed as permission to assert or imply that You
200 | are, or that Your use of the Licensed Material is, connected
201 | with, or sponsored, endorsed, or granted official status by,
202 | the Licensor or others designated to receive attribution as
203 | provided in Section 3(a)(1)(A)(i).
204 |
205 | b. Other rights.
206 |
207 | 1. Moral rights, such as the right of integrity, are not
208 | licensed under this Public License, nor are publicity,
209 | privacy, and/or other similar personality rights; however, to
210 | the extent possible, the Licensor waives and/or agrees not to
211 | assert any such rights held by the Licensor to the limited
212 | extent necessary to allow You to exercise the Licensed
213 | Rights, but not otherwise.
214 |
215 | 2. Patent and trademark rights are not licensed under this
216 | Public License.
217 |
218 | 3. To the extent possible, the Licensor waives any right to
219 | collect royalties from You for the exercise of the Licensed
220 | Rights, whether directly or through a collecting society
221 | under any voluntary or waivable statutory or compulsory
222 | licensing scheme. In all other cases the Licensor expressly
223 | reserves any right to collect such royalties.
224 |
225 |
226 | Section 3 -- License Conditions.
227 |
228 | Your exercise of the Licensed Rights is expressly made subject to the
229 | following conditions.
230 |
231 | a. Attribution.
232 |
233 | 1. If You Share the Licensed Material (including in modified
234 | form), You must:
235 |
236 | a. retain the following if it is supplied by the Licensor
237 | with the Licensed Material:
238 |
239 | i. identification of the creator(s) of the Licensed
240 | Material and any others designated to receive
241 | attribution, in any reasonable manner requested by
242 | the Licensor (including by pseudonym if
243 | designated);
244 |
245 | ii. a copyright notice;
246 |
247 | iii. a notice that refers to this Public License;
248 |
249 | iv. a notice that refers to the disclaimer of
250 | warranties;
251 |
252 | v. a URI or hyperlink to the Licensed Material to the
253 | extent reasonably practicable;
254 |
255 | b. indicate if You modified the Licensed Material and
256 | retain an indication of any previous modifications; and
257 |
258 | c. indicate the Licensed Material is licensed under this
259 | Public License, and include the text of, or the URI or
260 | hyperlink to, this Public License.
261 |
262 | 2. You may satisfy the conditions in Section 3(a)(1) in any
263 | reasonable manner based on the medium, means, and context in
264 | which You Share the Licensed Material. For example, it may be
265 | reasonable to satisfy the conditions by providing a URI or
266 | hyperlink to a resource that includes the required
267 | information.
268 |
269 | 3. If requested by the Licensor, You must remove any of the
270 | information required by Section 3(a)(1)(A) to the extent
271 | reasonably practicable.
272 |
273 | b. ShareAlike.
274 |
275 | In addition to the conditions in Section 3(a), if You Share
276 | Adapted Material You produce, the following conditions also apply.
277 |
278 | 1. The Adapter's License You apply must be a Creative Commons
279 | license with the same License Elements, this version or
280 | later, or a BY-SA Compatible License.
281 |
282 | 2. You must include the text of, or the URI or hyperlink to, the
283 | Adapter's License You apply. You may satisfy this condition
284 | in any reasonable manner based on the medium, means, and
285 | context in which You Share Adapted Material.
286 |
287 | 3. You may not offer or impose any additional or different terms
288 | or conditions on, or apply any Effective Technological
289 | Measures to, Adapted Material that restrict exercise of the
290 | rights granted under the Adapter's License You apply.
291 |
292 |
293 | Section 4 -- Sui Generis Database Rights.
294 |
295 | Where the Licensed Rights include Sui Generis Database Rights that
296 | apply to Your use of the Licensed Material:
297 |
298 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right
299 | to extract, reuse, reproduce, and Share all or a substantial
300 | portion of the contents of the database;
301 |
302 | b. if You include all or a substantial portion of the database
303 | contents in a database in which You have Sui Generis Database
304 | Rights, then the database in which You have Sui Generis Database
305 | Rights (but not its individual contents) is Adapted Material,
306 | including for purposes of Section 3(b); and
307 |
308 | c. You must comply with the conditions in Section 3(a) if You Share
309 | all or a substantial portion of the contents of the database.
310 |
311 | For the avoidance of doubt, this Section 4 supplements and does not
312 | replace Your obligations under this Public License where the Licensed
313 | Rights include other Copyright and Similar Rights.
314 |
315 |
316 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
317 |
318 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
319 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
320 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
321 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
322 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
323 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
324 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
325 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
326 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
327 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
328 |
329 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
330 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
331 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
332 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
333 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
334 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
335 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
336 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
337 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
338 |
339 | c. The disclaimer of warranties and limitation of liability provided
340 | above shall be interpreted in a manner that, to the extent
341 | possible, most closely approximates an absolute disclaimer and
342 | waiver of all liability.
343 |
344 |
345 | Section 6 -- Term and Termination.
346 |
347 | a. This Public License applies for the term of the Copyright and
348 | Similar Rights licensed here. However, if You fail to comply with
349 | this Public License, then Your rights under this Public License
350 | terminate automatically.
351 |
352 | b. Where Your right to use the Licensed Material has terminated under
353 | Section 6(a), it reinstates:
354 |
355 | 1. automatically as of the date the violation is cured, provided
356 | it is cured within 30 days of Your discovery of the
357 | violation; or
358 |
359 | 2. upon express reinstatement by the Licensor.
360 |
361 | For the avoidance of doubt, this Section 6(b) does not affect any
362 | right the Licensor may have to seek remedies for Your violations
363 | of this Public License.
364 |
365 | c. For the avoidance of doubt, the Licensor may also offer the
366 | Licensed Material under separate terms or conditions or stop
367 | distributing the Licensed Material at any time; however, doing so
368 | will not terminate this Public License.
369 |
370 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
371 | License.
372 |
373 |
374 | Section 7 -- Other Terms and Conditions.
375 |
376 | a. The Licensor shall not be bound by any additional or different
377 | terms or conditions communicated by You unless expressly agreed.
378 |
379 | b. Any arrangements, understandings, or agreements regarding the
380 | Licensed Material not stated herein are separate from and
381 | independent of the terms and conditions of this Public License.
382 |
383 |
384 | Section 8 -- Interpretation.
385 |
386 | a. For the avoidance of doubt, this Public License does not, and
387 | shall not be interpreted to, reduce, limit, restrict, or impose
388 | conditions on any use of the Licensed Material that could lawfully
389 | be made without permission under this Public License.
390 |
391 | b. To the extent possible, if any provision of this Public License is
392 | deemed unenforceable, it shall be automatically reformed to the
393 | minimum extent necessary to make it enforceable. If the provision
394 | cannot be reformed, it shall be severed from this Public License
395 | without affecting the enforceability of the remaining terms and
396 | conditions.
397 |
398 | c. No term or condition of this Public License will be waived and no
399 | failure to comply consented to unless expressly agreed to by the
400 | Licensor.
401 |
402 | d. Nothing in this Public License constitutes or may be interpreted
403 | as a limitation upon, or waiver of, any privileges and immunities
404 | that apply to the Licensor or You, including from the legal
405 | processes of any jurisdiction or authority.
406 |
407 |
408 | =======================================================================
409 |
410 | Creative Commons is not a party to its public
411 | licenses. Notwithstanding, Creative Commons may elect to apply one of
412 | its public licenses to material it publishes and in those instances
413 | will be considered the “Licensor.” The text of the Creative Commons
414 | public licenses is dedicated to the public domain under the CC0 Public
415 | Domain Dedication. Except for the limited purpose of indicating that
416 | material is shared under a Creative Commons public license or as
417 | otherwise permitted by the Creative Commons policies published at
418 | creativecommons.org/policies, Creative Commons does not authorize the
419 | use of the trademark "Creative Commons" or any other trademark or logo
420 | of Creative Commons without its prior written consent including,
421 | without limitation, in connection with any unauthorized modifications
422 | to any of its public licenses or any other arrangements,
423 | understandings, or agreements concerning use of licensed material. For
424 | the avoidance of doubt, this paragraph does not form part of the
425 | public licenses.
426 |
427 | Creative Commons may be contacted at creativecommons.org.
428 |
429 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Deep Learning for Natural Language Processing - Lectures 2023
2 |
3 | This repository contains slides for the course "20-00-0947: Deep Learning for Natural Language Processing" (Technical University of Darmstadt, Summer term 2023).
4 |
5 | This course is jointly lectured by [Ivan Habernal](https://www.trusthlt.org) and [Martin Tutek](https://www.informatik.tu-darmstadt.de/ukp/ukp_home/staff_ukp/ukp_home_content_staff_1_details_124480.en.jsp).
6 |
7 | The slides are available as PDF as well as LaTeX source code (we've used Beamer because typesetting mathematics in PowerPoint or similar tools is painful). See the instructions below if you want to compile the slides yourselves.
8 |
9 | 
10 |
11 | The content is licensed under [Creative Commons CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/) which means that you can re-use, adapt, modify, or publish it further, provided you keep the license and give proper credits.
12 |
13 | **Note:** The following content is continuously updated as the summer term progresses. If you're interested in the full previous 2022 content, check out the latest [2022 Git commit](https://github.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/tree/a59910534ac600a6e8c22fbcde6ae8223a87cda9).
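
For example, to browse the full 2022 materials locally, clone the repository and check out that commit (the hash is taken from the link above):

```plain
$ git clone https://github.com/dl4nlp-tuda/deep-learning-for-nlp-lectures.git
$ cd deep-learning-for-nlp-lectures
$ git checkout a59910534ac600a6e8c22fbcde6ae8223a87cda9
```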
14 |
15 | ## YouTube Playlist
16 |
17 | Subscribe to the YouTube playlist to get updates on new lectures: https://youtube.com/playlist?list=PL6WLGVNe6ZcA4gUr5MaAKdrGxYzYAETK3
18 |
19 | ## Lecture 1: NLP tasks and evaluation
20 |
21 | April 11, 2023
22 |
23 | * [Slides as PDF](/pdf/dl4nlp2023-lecture01.pdf), [YouTube recording](https://www.youtube.com/watch?v=-cku_A34-qE)
24 |
25 | ## Lecture 2: Mathematical foundations of deep learning
26 |
27 | April 18, 2023
28 |
29 | * [Slides as PDF](/pdf/dl4nlp2023-lecture02.pdf), [YouTube recording](https://www.youtube.com/watch?v=XbFNcvWdCTw)
30 |
31 |
32 | ## Lecture 3: Text classification 1: Log-linear models
33 |
34 | April 25, 2023
35 |
36 | * [Slides as PDF](/pdf/dl4nlp2023-lecture03.pdf), [YouTube recording](https://www.youtube.com/watch?v=t7YZ7OgtD5o)
37 |
38 | ## Lecture 4: Text classification 2: Deep neural networks
39 |
40 | May 2, 2023
41 |
42 | * [Slides as PDF](/pdf/dl4nlp2023-lecture04.pdf), [YouTube recording](https://www.youtube.com/watch?v=Fk1Y4ycO3aY)
43 |
44 | ## Lecture 5: Text generation 1: Language models and word embeddings
45 |
46 | May 9, 2023
47 |
48 | * [Slides as PDF](/pdf/dl4nlp2023-lecture05.pdf), [YouTube recording](https://www.youtube.com/watch?v=hqcFkKymRdw)
49 |
50 | ## Lecture 6: Text classification 3: Learning word embeddings
51 |
52 | May 16, 2023
53 |
54 | * [Slides as PDF](/pdf/dl4nlp2023-lecture06.pdf), [YouTube recording](https://www.youtube.com/watch?v=fClxXB8-m8I)
55 |
56 | ## Lecture 7: Text classification 4: Recurrent neural networks
57 |
58 | May 30, 2023
59 |
60 | * [Slides as PDF](/pdf/dl4nlp2023-lecture07.pdf), [YouTube recording](https://www.youtube.com/watch?v=sgjKJRoYx4s)
61 |
62 | ## Lecture 8: Text generation 2: Autoregressive encoder-decoder with RNNs and attention
63 |
64 | June 6, 2023
65 |
66 | * [Slides as PDF](/pdf/dl4nlp2023-lecture08.pdf), [YouTube recording](https://www.youtube.com/watch?v=tOmYTC3XaEo)
67 |
68 | ## Lecture 9: Text generation 3: Transformers
69 |
70 | June 13, 2023
71 |
72 | * [Slides as PDF](/pdf/dl4nlp2023-lecture09.pdf), [YouTube recording](https://youtu.be/yg5QrKOe0V4)
73 |
74 | ## Lecture 10: Text classification 4: self-attention and BERT
75 |
76 | June 20, 2023
77 |
78 | * [Slides as PDF](/pdf/dl4nlp2023-lecture10.pdf), [YouTube recording](https://youtu.be/NOD9irGv9Xg)
79 |
80 | ## Lecture 11: Text generation 4: Decoder-only Models and GPT
81 |
82 | June 27, 2023
83 |
84 | * [Slides as PDF](/pdf/dl4nlp2023-lecture11.pdf), [YouTube recording](https://youtu.be/t3J534JyE-E)
85 |
86 | ## Lecture 12: Contemporary LLMs: Prompting and in-context learning
87 |
88 | July 4, 2023
89 |
90 | * [Recap slides as PDF](/pdf/dl4nlp2023-lecture12-recap.pdf), [PPTX lecture slides](/pdf/DL4NLP%20Lecture%2012_%20Contemporary%20LLMs.pptx)
91 |
92 | ## Lecture 13: Guest lecture by Dr. Thomas Arnold: Ethics of generative AI
93 |
94 | July 11, 2023
95 |
96 | * [Slides as PDF](/pdf/dl4nlp2023-lecture13.pdf), [PPTX lecture slides](/pdf/dl4nlp2023-lecture13.pptx), [YouTube recording](https://www.youtube.com/watch?v=lO2-W5l2y40)
97 |
98 |
99 | ## Subtitles/Closed captions
100 |
101 | Thanks to Jan Kühnemund for generating the closed captions for YouTube with OpenAI's Whisper. We track the subtitles here under `subtitles`, so if you spot an error there (there are many, such as "tanh" -> "10h"), just open an issue or PR.
102 |
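We don't record the exact transcription command in this repository; a hypothetical invocation of the stock `whisper` CLI (the `-medium`/`-large` suffixes in the subtitle file names hint at the model size used; flags may vary across Whisper versions) could look like:

```plain
$ pip install openai-whisper
$ whisper DL4NLP23-01.mp4 --model medium --language en --output_format srt
```
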
103 | ## FAQ
104 |
105 | * What are some essential prerequisites?
106 | * Math: Derivatives and partial derivatives. We cover them in Lecture 2. If you need more, I would recommend these sources:
107 | * *Jeremy Kun: A Programmer's Introduction to Mathematics.* Absolutely amazing book. Pay what you want for the PDF. https://pimbook.org/
108 | * *Marc Peter Deisenroth, A. Aldo Faisal, and Cheng Soon Ong: Mathematics for Machine Learning.* Excellent resource, freely available. Might be a bit dense. https://mml-book.github.io/
109 | * Can I have the slide deck without "unfolding" the content over multiple pages?
110 | * You can compile the slides with the `handout` parameter; see the section [Compiling handouts](#compiling-handouts) below.
111 | * Where do I find the code for plotting the functions?
112 | * Most of the plots are generated in Python/Jupyter (in Colab). The links are included as comments in the respective LaTeX sources for the slides.
113 |
114 | ## Compiling slides to PDF
115 |
116 | If you run a Linux distribution (e.g., Ubuntu 20.04 or newer), all packages are provided as part of `texlive`. Install the following packages
117 |
118 | ```plain
119 | $ sudo apt-get install texlive-latex-recommended texlive-pictures texlive-latex-extra \
120 | texlive-fonts-extra texlive-bibtex-extra texlive-humanities texlive-science \
121 | texlive-luatex biber wget -y
122 | ```
123 |
124 | Install Fira Sans fonts required by the beamer template locally
125 |
126 | ```plain
127 | $ wget https://github.com/mozilla/Fira/archive/refs/tags/4.106.zip -O 4.106.zip \
128 | && unzip -o 4.106.zip && mkdir -p ~/.fonts/FiraSans && cp Fira-4.106/otf/Fira* \
129 | ~/.fonts/FiraSans/ && rm -rf Fira-4.106 && rm 4.106.zip && fc-cache -f -v && mktexlsr
130 | ```
131 |
132 | Compile each lecture's slides using `lualatex`
133 |
134 | ```plain
135 | $ lualatex dl4nlp2023-lecture*.tex && biber dl4nlp2023-lecture*.bcf && \
136 | lualatex dl4nlp2023-lecture*.tex && lualatex dl4nlp2023-lecture*.tex
137 | ```
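
Each lecture folder also ships a `compile-pdf.sh` helper. Its contents aren't reproduced here; a minimal sketch of such a wrapper, assuming it simply chains the commands above (the actual script may differ), is:

```plain
#!/bin/bash
# Hypothetical compile-pdf.sh sketch: run lualatex twice around biber
# so that citations and cross-references resolve.
set -e
for tex in dl4nlp2023-lecture*.tex; do
  lualatex "$tex"
  biber "${tex%.tex}.bcf"
  lualatex "$tex"
  lualatex "$tex"
done
```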
138 |
139 | ### Compiling slides using Docker
140 |
141 | If you don't run a Linux system or don't want to mess up your LaTeX packages, I've tested compiling the slides in a Docker container.
142 |
143 | Install Docker (https://docs.docker.com/engine/install/)
144 |
145 | Create a folder into which you clone this repository (for example, `$ mkdir -p /tmp/slides`)
146 |
147 | Run Docker with Ubuntu 20.04 interactively; mount your slides directory under `/mnt` in this Docker container
148 |
149 | ```plain
150 | $ docker run -it --rm --mount type=bind,source=/tmp/slides,target=/mnt \
151 | ubuntu:20.04 /bin/bash
152 | ```
153 |
154 | Once the container is running, update the system and install the packages and fonts as above
155 |
156 | ```plain
157 | # apt-get update && apt-get dist-upgrade -y && apt-get install texlive-latex-recommended \
158 | texlive-pictures texlive-latex-extra texlive-fonts-extra texlive-bibtex-extra \
159 | texlive-humanities texlive-science texlive-luatex biber wget -y
160 | ```
161 |
162 | Fonts
163 |
164 | ```plain
165 | # wget https://github.com/mozilla/Fira/archive/refs/tags/4.106.zip -O 4.106.zip \
166 | && unzip -o 4.106.zip && mkdir -p ~/.fonts/FiraSans && cp Fira-4.106/otf/Fira* \
167 | ~/.fonts/FiraSans/ && rm -rf Fira-4.106 && rm 4.106.zip && fc-cache -f -v && mktexlsr
168 | ```
169 |
170 | And compile
171 |
172 | ```plain
173 | # cd /mnt/dl4nlp/latex/lecture01
174 | # lualatex dl4nlp2023-lecture*.tex && biber dl4nlp2023-lecture*.bcf && \
175 | lualatex dl4nlp2023-lecture*.tex && lualatex dl4nlp2023-lecture*.tex
176 | ```
177 |
178 | which generates the PDF in your local folder (e.g., `/tmp/slides`).
179 |
180 | ### Compiling handouts
181 |
182 | We're uploading the PDFs as presented in the lecture. You can compile the slides in a more concise way using the `handout` setting: just comment/uncomment the respective line at the beginning of each lecture's tex file, as sketched below.
183 |
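The exact line differs per deck, but the usual beamer pattern (illustrative, not copied from the actual tex files) looks like this:

```latex
% As presented in the lecture, with content unfolding over multiple pages:
\documentclass{beamer}
% Concise handout (one page per slide); swap the comments to enable:
%\documentclass[handout]{beamer}
```
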
--------------------------------------------------------------------------------
/latex/dl4nlp-bibliography.bib:
--------------------------------------------------------------------------------
1 | @inproceedings{Wang.et.al.2019.NeurIPS,
2 | address = {Vancouver, Canada},
3 | author = {Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.},
4 | booktitle = {Proceedings of the 33rd International Conference on Neural Information Processing Systems},
5 | pages = {3266--3280},
6 | publisher = {Curran Associates, Inc.},
7 | title = {{SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems}},
8 | year = {2019}
9 | }
10 |
11 |
12 | @inproceedings{Levesque.et.al.2012,
13 | address = {Rome, Italy},
14 | author = {Levesque, Hector J. and Davis, Ernest and Morgenstern, Leora},
15 | booktitle = {Proceedings of the Thirteenth International Conference on Principles of Knowledge Representation and Reasoning},
16 | pages = {552--561},
17 | publisher = {Association for the Advancement of Artificial Intelligence},
18 | title = {{The Winograd Schema Challenge}},
19 | year = {2012}
20 | }
21 |
22 | @article{Dagan.et.al.2009.NLE,
23 | author = {Dagan, Ido and Dolan, Bill and Magnini, Bernardo and Roth, Dan},
24 | doi = {10.1017/S1351324909990209},
25 | journal = {Natural Language Engineering},
26 | number = {4},
27 | pages = {1--27},
28 | title = {{Recognizing textual entailment: Rational, evaluation and approaches}},
29 | volume = {15},
30 | year = {2009}
31 | }
32 |
33 | @inproceedings{Maas.et.al.2011,
34 | address = {Portland, Oregon},
35 | author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},
36 | booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
37 | pages = {142--150},
38 | publisher = {Association for Computational Linguistics},
39 | title = {{Learning Word Vectors for Sentiment Analysis}},
40 | url = {https://aclanthology.org/P11-1015},
41 | year = {2011}
42 | }
43 |
44 |
45 | @inproceedings{Bowman.et.al.2015,
46 | address = {Lisbon, Portugal},
47 | author = {Bowman, Samuel R. and Angeli, Gabor and Potts, Christopher and Manning, Christopher D.},
48 | booktitle = {Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing},
49 | doi = {10.18653/v1/D15-1075},
50 | pages = {632--642},
51 | publisher = {Association for Computational Linguistics},
52 | title = {{A large annotated corpus for learning natural language inference}},
53 | url = {http://aclweb.org/anthology/D15-1075},
54 | year = {2015}
55 | }
56 |
57 | @article{Habernal.et.al.2023.AILaw,
58 | title = {{Mining Legal Arguments in Court Decisions}},
59 | author = {\textbf{Habernal}, \textbf{Ivan} and Faber, Daniel and Recchia, Nicola and Bretthauer, Sebastian and Gurevych, Iryna and Spiecker genannt Döhmann, Indra and Burchard, Christoph},
60 | year = 2023,
61 | journal = {Artificial Intelligence \& Law},
62 | pages = {(to appear)}
63 | }
64 |
65 | @article{Artstein.Poesio.2008.CoLi,
66 | author = {Artstein, Ron and Poesio, Massimo},
67 | doi = {10.1162/coli.07-034-R2},
68 | journal = {Computational Linguistics},
69 | number = {4},
70 | pages = {555--596},
71 | title = {{Inter-Coder Agreement for Computational Linguistics}},
72 | volume = {34},
73 | year = {2008}
74 | }
75 |
76 |
77 | @inproceedings{TjongKimSang.DeMeulder.2003,
78 | author = {{Tjong Kim Sang}, Erik F. and {De Meulder}, Fien},
79 | booktitle = {Proceedings of the Seventh Conference on Natural Language Learning at HLT-NAACL 2003},
80 | pages = {142--147},
81 | publisher = {Association for Computational Linguistics}, url = {https://aclanthology.org/W03-0419},
82 | title = {{Introduction to the CoNLL-2003 Shared Task: Language-Independent Named Entity Recognition}},
83 | year = {2003}
84 | }
85 |
86 |
87 | @inproceedings{Clark.et.al.2019.NAACL,
88 | address = {Minneapolis, Minnesota},
89 | author = {Clark, Christopher and Lee, Kenton and Chang, Ming-wei and Kwiatkowski, Tom and Collins, Michael and Toutanova, Kristina},
90 | booktitle = {Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
91 | doi = {10.18653/v1/N19-1300},
92 | pages = {2924--2936},
93 | publisher = {Association for Computational Linguistics},
94 | title = {{BoolQ: Exploring the Surprising Difficulty of Natural Yes/No Questions}},
95 | year = {2019}
96 | }
97 |
98 |
99 | @inproceedings{Khashabi.et.al.2018.NAACL,
100 | address = {New Orleans, LA},
101 | author = {Khashabi, Daniel and Chaturvedi, Snigdha and Roth, Michael and Upadhyay, Shyam and Roth, Dan},
102 | booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
103 | doi = {10.18653/v1/N18-1023},
104 | pages = {252--262},
105 | publisher = {Association for Computational Linguistics},
106 | title = {{Looking Beyond the Surface: A Challenge Set for Reading Comprehension over Multiple Sentences}},
107 | year = {2018}
108 | }
109 |
110 |
111 | @inproceedings{Bojar.et.al.2018.WMT,
112 | address = {Brussels, Belgium},
113 | author = {Bojar, Ondřej and Federmann, Christian and Fishel, Mark and Graham, Yvette and Haddow, Barry and Koehn, Philipp and Monz, Christof},
114 | booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
115 | doi = {10.18653/v1/W18-6401},
116 | pages = {272--303},
117 | publisher = {Association for Computational Linguistics},
118 | title = {{Findings of the 2018 Conference on Machine Translation (WMT18)}},
119 | volume = {2},
120 | year = {2018}
121 | }
122 |
123 |
124 | @book{Koehn.2020,
125 | author = {Philipp Koehn},
126 | title = {Neural Machine Translation},
127 | publisher = {Cambridge University Press},
128 | year = {2020},
129 | note = {(not freely available)}
130 | }
131 |
132 | @inproceedings{Hermann.et.al.2015.NeurIPS,
133 | author = {Hermann, Karl Moritz and Kocisky, Tomas and Grefenstette, Edward and Espeholt, Lasse and Kay, Will and Suleyman, Mustafa and Blunsom, Phil},
134 | booktitle = {Proceedings of NeurIPS},
135 | pages = {1--9},
136 | publisher = {Curran Associates, Inc.},
137 | title = {{Teaching Machines to Read and Comprehend}},
138 | year = {2015}
139 | }
140 |
141 |
142 | @article{Raffel.et.al.2020.JMLR,
143 | author = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J.},
144 | journal = {Journal of Machine Learning Research},
145 | keywords = {attention-,multi-task learning,natural language processing,transfer learning},
146 | number = {140},
147 | pages = {1--67},
148 | title = {{Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer}},
149 | volume = {21},
150 | year = {2020}
151 | }
152 |
153 | @book{Japkowicz.Shah.2011,
154 | title = {{Evaluating Learning Algorithms: A Classification Perspective}},
155 | author = {Nathalie Japkowicz and Mohak Shah},
156 | year = {2011},
157 | publisher = {Cambridge University Press},
158 | note = {(not freely available)},
159 | }
160 |
161 | @inproceedings{Papineni.et.al.2002.ACL,
162 | address = {Philadelphia, PA},
163 | author = {Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
164 | booktitle = {Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics},
165 | doi = {10.3115/1073083.1073135},
166 | pages = {311--318},
167 | publisher = {Association for Computational Linguistics},
168 | title = {{BLEU: a Method for Automatic Evaluation of Machine Translation}},
169 | year = {2002}
170 | }
171 |
172 | @inproceedings{Lin.2004,
173 | title = "{ROUGE}: A Package for Automatic Evaluation of Summaries",
174 | author = "Lin, Chin-Yew",
175 | booktitle = "Text Summarization Branches Out",
176 | year = "2004",
177 | address = "Barcelona, Spain",
178 | publisher = "Association for Computational Linguistics",
179 | url = "https://aclanthology.org/W04-1013",
180 | pages = "74--81",
181 | }
182 |
183 | @inproceedings{Plank.2022.EMNLP,
184 | address = {Abu Dhabi, United Arab Emirates},
185 | author = {Plank, Barbara},
186 | booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing},
187 | pages = {10671--10682},
188 | publisher = {Association for Computational Linguistics},
189 | title = {{The “Problem” of Human Label Variation: On Ground Truth in Data, Modeling and Evaluation}},
190 | url = {https://aclanthology.org/2022.emnlp-main.731},
191 | year = {2022}
192 | }
193 |
194 |
195 | @inproceedings{Geva.et.al.2019.EMNLP,
196 | address = {Hong Kong, China},
197 | author = {Geva, Mor and Goldberg, Yoav and Berant, Jonathan},
198 | booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
199 | doi = {10.18653/v1/D19-1107},
200 | pages = {1161--1166},
201 | publisher = {Association for Computational Linguistics},
202 | title = {{Are We Modeling the Task or the Annotator? An Investigation of Annotator Bias in Natural Language Understanding Datasets}},
203 | year = {2019}
204 | }
205 |
206 |
207 | @article{Sai.et.al.2023.CSUR,
208 | author = {Sai, Ananya B. and Mohankumar, Akash Kumar and Khapra, Mitesh M.},
209 | doi = {10.1145/3485766},
210 | journal = {ACM Computing Surveys},
211 | number = {2},
212 | pages = {1--39},
213 | title = {{A Survey of Evaluation Metrics Used for NLG Systems}},
214 | volume = {55},
215 | year = {2023}
216 | }
217 |
218 |
219 | @inproceedings{Habernal.et.al.2018.NAACL.ARCT,
220 | author = {\textbf{Habernal}, \textbf{Ivan} and Wachsmuth, Henning and Gurevych, Iryna and Stein, Benno},
221 | booktitle = {Proceedings of NAACL},
222 | pages = {1930--1940},
223 | title = {{The Argument Reasoning Comprehension Task: Identification and Reconstruction of Implicit Warrants}},
224 | url = {http://aclweb.org/anthology/N18-1175},
225 | address = {New Orleans, LA},
226 | year = {2018}
227 | }
228 |
229 |
230 | @inproceedings{Niven.Kao.2019.ACL,
231 | address = {Florence, Italy},
232 | author = {Niven, Timothy and Kao, Hung-Yu},
233 | booktitle = {Proceedings of ACL},
234 | pages = {4658--4664},
235 | title = {{Probing Neural Network Comprehension of Natural Language Arguments}},
236 | url = {https://www.aclweb.org/anthology/P19-1459},
237 | year = {2019}
238 | }
239 |
240 |
241 |
242 | @article{Forman.Scholz.2009.SIGKDD,
243 | annote = {fundamental article for reporting f-measure},
244 | author = {Forman, George and Scholz, Martin},
246 | journal = {ACM SIGKDD Explorations Newsletter},
248 | number = {1},
249 | pages = {49--57},
250 | title = {{Apples-to-Apples in Cross-Validation Studies: Pitfalls in Classifier Performance Measurement}},
251 | volume = {12},
252 | year = {2010}
253 | }
254 |
255 |
256 | @article{Sokolova.Lapalme.2009,
257 | author = {Sokolova, Marina and Lapalme, Guy},
258 | doi = {10.1016/j.ipm.2009.03.002},
259 | journal = {Information Processing and Management},
260 | number = {4},
261 | pages = {427--437},
262 | publisher = {Elsevier Ltd},
263 | title = {{A systematic analysis of performance measures for classification tasks}},
264 | volume = {45},
265 | year = {2009}
266 | }
267 |
268 |
269 | @inproceedings{caglayan-etal-2020-curious,
270 | title = "Curious Case of Language Generation Evaluation Metrics: A Cautionary Tale",
271 | author = "Caglayan, Ozan and
272 | Madhyastha, Pranava and
273 | Specia, Lucia",
274 | booktitle = "Proceedings of COLING",
275 | year = "2020",
276 | doi = "10.18653/v1/2020.coling-main.210",
277 | pages = "2322--2328",
278 | }
279 |
280 |
281 | @inproceedings{Rajpurkar.et.al.2018.ACL,
282 | address = {Melbourne, Australia},
283 | author = {Rajpurkar, Pranav and Jia, Robin and Liang, Percy},
284 | booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
285 | doi = {10.18653/v1/P18-2124},
286 | pages = {784--789},
287 | publisher = {Association for Computational Linguistics},
288 | title = {{Know What You Don't Know: Unanswerable Questions for SQuAD}},
289 | year = {2018}
290 | }
291 |
292 |
293 | @inproceedings{Zhang.et.al.2018.ACL,
294 | address = {Melbourne, Australia},
295 | author = {Zhang, Saizheng and Dinan, Emily and Urbanek, Jack and Szlam, Arthur and Kiela, Douwe and Weston, Jason},
296 | booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
297 | doi = {10.18653/v1/P18-1205},
298 | pages = {2204--2213},
299 | publisher = {Association for Computational Linguistics},
300 | title = {{Personalizing Dialogue Agents: I have a dog, do you have pets too?}},
301 | year = {2018}
302 | }
303 |
304 |
305 |
306 | @book{Deisenroth.et.al.2021.book,
307 | title = {Mathematics for Machine Learning},
308 | author = {Deisenroth, Marc Peter and Faisal, Aldo and Ong, Cheng Soon},
309 | year = {2021},
310 | publisher = {Cambridge University Press},
311 | url = {mml-book.com},
312 | }
313 |
314 | @book{Koller.Friedman.2009.book,
315 | title = {Probabilistic Graphical Models: Principles and Techniques},
316 | author = {Koller, Daphne and Friedman, Nir},
317 | publisher = {MIT Press},
318 | year = {2009},
319 | }
320 |
321 | @book{Goodfellow.et.al.2016.book,
322 | title={Deep Learning},
323 | author={Ian Goodfellow and Yoshua Bengio and Aaron Courville},
324 | publisher={MIT Press},
325 | url={www.deeplearningbook.org},
326 | year={2016}
327 | }
328 |
329 | @inproceedings{Iacobacci.et.al.2015.ACL,
330 | address = {Beijing, China},
331 | author = {Iacobacci, Ignacio and Pilehvar, Mohammad Taher and Navigli, Roberto},
332 | booktitle = {Proceedings of ACL},
333 | doi = {10.3115/v1/P15-1010},
334 | pages = {95--105},
335 | publisher = {Association for Computational Linguistics},
336 | title = {{SensEmbed: Learning Sense Embeddings for Word and Relational Similarity}},
337 | year = {2015}
338 | }
339 |
340 |
341 | @inproceedings{Upadhyay.et.al.2016.ACL,
342 | address = {Berlin, Germany},
343 | author = {Upadhyay, Shyam and Faruqui, Manaal and Dyer, Chris and Roth, Dan},
344 | booktitle = {Proceedings of ACL},
345 | doi = {10.18653/v1/P16-1157},
346 | pages = {1661--1670},
347 | title = {{Cross-lingual Models of Word Embeddings: An Empirical Comparison}},
348 | year = {2016}
349 | }
350 |
351 | @inproceedings{Glavas.et.al.2019.ACL,
352 | address = {Florence, Italy},
353 | author = {Glava{\v{s}}, Goran and Litschko, Robert and Ruder, Sebastian and Vuli{\'{c}}, Ivan},
354 | booktitle = {Proceedings of ACL},
355 | doi = {10.18653/v1/P19-1070},
356 | pages = {710--721},
357 | title = {{How to (Properly) Evaluate Cross-Lingual Word Embeddings: On Strong Baselines, Comparative Analyses, and Some Misconceptions}},
358 | year = {2019}
359 | }
360 |
361 |
362 | @inproceedings{Vulic.Moens.2015.ACL,
363 | address = {Beijing, China},
364 | author = {Vuli{\'{c}}, Ivan and Moens, Marie-Francine},
365 | booktitle = {Proceedings of ACL (Volume 2: Short Papers)},
366 | doi = {10.3115/v1/P15-2118},
367 | pages = {719--725},
368 | title = {{Bilingual Word Embeddings from Non-Parallel Document-Aligned Data Applied to Bilingual Lexicon Induction}},
369 | year = {2015}
370 | }
371 |
372 |
373 | @inproceedings{Artetxe.et.al.2017.ACL,
374 | address = {Vancouver, Canada},
375 | author = {Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko},
376 | booktitle = {Proceedings of ACL},
377 | doi = {10.18653/v1/P17-1042},
378 | pages = {451--462},
379 | title = {{Learning bilingual word embeddings with (almost) no bilingual data}},
380 | year = {2017}
381 | }
382 |
383 | @inproceedings{Ling.et.al.2015.NAACL,
384 | address = {Denver, Colorado},
385 | author = {Ling, Wang and Dyer, Chris and Black, Alan W and Trancoso, Isabel},
386 | booktitle = {Proceedings of NAACL},
387 | doi = {10.3115/v1/N15-1142},
388 | pages = {1299--1304},
389 | title = {{Two/Too Simple Adaptations of Word2Vec for Syntax Problems}},
390 | year = {2015}
391 | }
392 |
393 |
394 | @inproceedings{Levy.Goldberg.2014.ACL,
395 | address = {Baltimore, MD, USA},
396 | author = {Levy, Omer and Goldberg, Yoav},
397 | booktitle = {Proceedings of ACL},
398 | doi = {10.3115/v1/P14-2050},
399 | pages = {302--308},
400 | title = {{Dependency-Based Word Embeddings}},
401 | year = {2014}
402 | }
403 |
404 | @article{Bojanowski.et.al.2017.TACL,
405 | author = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
406 | doi = {10.1162/tacl_a_00051},
407 | journal = {Transactions of the ACL},
408 | pages = {135--146},
409 | title = {{Enriching Word Vectors with Subword Information}},
410 | volume = {5},
411 | year = {2017}
412 | }
413 |
414 |
415 | @inproceedings{Madasu.AnveshRao.2019.EMNLP,
416 | address = {Hong Kong, China},
417 | author = {Madasu, Avinash and {Anvesh Rao}, Vijjini},
418 | booktitle = {Proceedings of EMNLP-IJCNLP},
419 | doi = {10.18653/v1/D19-1567},
420 | pages = {5657--5666},
421 | publisher = {Association for Computational Linguistics},
422 | title = {{Sequential Learning of Convolutional Features for Effective Text Classification}},
423 | year = {2019}
424 | }
425 |
426 |
427 | @inproceedings{Kim.2014.EMNLP,
428 | address = {Doha, Qatar},
429 | author = {Kim, Yoon},
430 | booktitle = {Proceedings of EMNLP},
431 | doi = {10.3115/v1/D14-1181},
432 | pages = {1746--1751},
433 | publisher = {Association for Computational Linguistics},
434 | title = {{Convolutional Neural Networks for Sentence Classification}},
435 | year = {2014}
436 | }
437 |
438 | @inproceedings{Devlin.et.al.2019.NAACL,
439 | address = {Minneapolis, Minnesota},
440 | author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
441 | booktitle = {Proceedings of NAACL},
442 | doi = {10.18653/v1/N19-1423},
443 | pages = {4171--4186},
444 | publisher = {Association for Computational Linguistics},
445 | title = {{BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}},
446 | year = {2019}
447 | }
448 |
449 | @inproceedings{Gururangan.et.al.2018.NAACL.short,
450 | address = {New Orleans, Louisiana},
451 | author = {Gururangan, Suchin and Swayamdipta, Swabha and Levy, Omer and Schwartz, Roy and Bowman, Samuel and Smith, Noah A.},
452 | booktitle = {Proceedings of NAACL},
453 | doi = {10.18653/v1/N18-2017},
454 | pages = {107--112},
455 | publisher = {Association for Computational Linguistics},
456 | title = {{Annotation Artifacts in Natural Language Inference Data}},
457 | year = {2018}
458 | }
459 |
460 | @article{Goldberg.2016,
461 | author = {Goldberg, Yoav},
462 | doi = {10.1613/jair.4992},
463 | journal = {Journal of Artificial Intelligence Research},
464 | pages = {345--420},
465 | title = {{A Primer on Neural Network Models for Natural Language Processing}},
466 | volume = {57},
467 | year = {2016}
468 | }
469 |
470 |
471 | @inproceedings{Gehring.et.al.2017a.ICML,
472 | address = {Sydney, Australia},
473 | author = {Gehring, Jonas and Auli, Michael and Grangier, David and Yarats, Denis and Dauphin, Yann N.},
474 | booktitle = {Proceedings of the 34th International Conference on Machine Learning},
475 | editor = {Precup, Doina and Teh, Yee Whye},
476 | pages = {1243--1252},
477 | publisher = {PMLR},
478 | title = {{Convolutional Sequence to Sequence Learning}},
479 | year = {2017}
480 | }
481 |
482 |
483 | @inproceedings{Krishnan.Manning.2006,
484 | address = {Sydney, Australia},
485 | author = {Krishnan, Vijay and Manning, Christopher D.},
486 | booktitle = {Proceedings of ACL},
487 | doi = {10.3115/1220175.1220316},
488 | pages = {1121--1128},
489 | publisher = {Association for Computational Linguistics},
490 | title = {{An Effective Two-Stage Model for Exploiting Non-Local Dependencies in Named Entity Recognition}},
491 | year = {2006}
492 | }
493 |
494 |
495 | @inproceedings{artemova-etal-2021-teaching,
496 | title = "Teaching a Massive Open Online Course on Natural Language Processing",
497 | author = "Artemova, Ekaterina and
498 | Apishev, Murat and
499 | Kirianov, Denis and
500 | Sarkisyan, Veronica and
501 | Aksenov, Sergey and
502 | Serikov, Oleg",
503 | booktitle = "Proceedings of the Fifth Workshop on Teaching NLP",
504 | year = "2021",
505 | address = "Online",
506 | publisher = "Association for Computational Linguistics",
507 | url = "https://www.aclweb.org/anthology/2021.teachingnlp-1.2",
508 | pages = "13--27",
509 | }
510 |
511 |
512 | @inproceedings{Vaswani.et.al.2017,
513 | address = {Long Beach, CA, USA},
514 | author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia},
515 | booktitle = {Advances in Neural Information Processing Systems 30},
516 | pages = {5998--6008},
517 | publisher = {Curran Associates, Inc.},
518 | title = {{Attention Is All You Need}},
519 | year = {2017}
520 | }
521 |
522 | @article{Koehn.2017,
523 | author = {Koehn, Philipp},
524 | title = {Neural Machine Translation},
525 | journal = {arXiv preprint},
526 | year = {2017},
527 | url = {http://arxiv.org/abs/1709.07809}
528 | }
529 |
530 |
531 | @inproceedings{Schuster.Nakajima.2012,
532 | address = {Kyoto, Japan},
533 | author = {Schuster, Mike and Nakajima, Kaisuke},
534 | booktitle = {2012 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
535 | doi = {10.1109/ICASSP.2012.6289079},
536 | pages = {5149--5152},
537 | publisher = {IEEE},
538 | title = {{Japanese and Korean voice search}},
539 | year = {2012}
540 | }
541 |
542 | @article{Wu.et.al.2016.GoogleMT,
543 | author = {Wu, Yonghui and Schuster, Mike and Chen, Zhifeng and Le, Quoc V. and Norouzi, Mohammad and Macherey, Wolfgang and Krikun, Maxim and Cao, Yuan and Gao, Qin and Macherey, Klaus and Klingner, Jeff and Shah, Apurva and Johnson, Melvin and Liu, Xiaobing and Kaiser, {\L}ukasz and Gouws, Stephan and Kato, Yoshikiyo and Kudo, Taku and Kazawa, Hideto and Stevens, Keith and Kurian, George and Patil, Nishant and Wang, Wei and Young, Cliff and Smith, Jason and Riesa, Jason and Rudnick, Alex and Vinyals, Oriol and Corrado, Greg and Hughes, Macduff and Dean, Jeffrey},
544 | pages = {1--23},
545 | title = {{Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation}},
546 | url = {http://arxiv.org/abs/1609.08144},
547 | year = {2016},
548 | journal = {arXiv preprint},
549 | }
550 |
551 |
552 | @inproceedings{Sennrich.et.al.2016.ACL,
553 | address = {Berlin, Germany},
554 | author = {Sennrich, Rico and Haddow, Barry and Birch, Alexandra},
555 | booktitle = {Proceedings of ACL},
556 | doi = {10.18653/v1/P16-1162},
557 | pages = {1715--1725},
558 | publisher = {Association for Computational Linguistics},
559 | title = {{Neural Machine Translation of Rare Words with Subword Units}},
560 | year = {2016}
561 | }
562 |
563 | @article{Caruana.1997,
564 | author = {Caruana, Rich},
565 | doi = {10.1023/A:1007379606734},
566 | journal = {Machine Learning},
567 | number = {1},
568 | pages = {41--75},
569 | title = {{Multi-task Learning}},
570 | volume = {28},
571 | year = {1997}
572 | }
573 |
574 | @inproceedings{Sogaard.Goldberg.2016,
575 | address = {Berlin, Germany},
576 | author = {S{\o}gaard, Anders and Goldberg, Yoav},
577 | booktitle = {Proceedings of ACL},
578 | doi = {10.18653/v1/P16-2038},
579 | pages = {231--235},
580 | publisher = {Association for Computational Linguistics},
581 | title = {{Deep multi-task learning with low level tasks supervised at lower layers}},
582 | year = {2016}
583 | }
584 |
585 | @inproceedings{Conneau.et.al.2017.EMNLP,
586 | address = {Copenhagen, Denmark},
587 | author = {Conneau, Alexis and Kiela, Douwe and Schwenk, Holger and Barrault, Lo{\"{i}}c and Bordes, Antoine},
588 | booktitle = {Proceedings of EMNLP},
589 | pages = {670--680},
590 | title = {{Supervised Learning of Universal Sentence Representations from Natural Language Inference Data}},
591 | year = {2017}
592 | }
593 |
594 | @article{Rogers.et.al.2020.BERT,
595 | author = {Rogers, Anna and Kovaleva, Olga and Rumshisky, Anna},
596 | doi = {10.1162/tacl_a_00349},
597 | journal = {Transactions of the Association for Computational Linguistics},
598 | pages = {842--866},
599 | title = {{A Primer in BERTology: What We Know About How BERT Works}},
600 | volume = {8},
601 | year = {2020}
602 | }
603 |
604 |
605 | @inproceedings{Kingma.Ba.2015,
606 | address = {San Diego, CA, USA},
607 | author = {Kingma, Diederik P. and Ba, Jimmy Lei},
608 | booktitle = {3rd International Conference on Learning Representations, ICLR 2015},
609 | editor = {Bengio, Yoshua and LeCun, Yann},
610 | pages = {1--15},
611 | title = {{Adam: A Method for Stochastic Optimization}},
612 | year = {2015},
613 | url = {https://arxiv.org/abs/1412.6980},
614 | }
615 |
616 | @article{Bengio.et.al.2003.JMLR,
617 | author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal and Jauvin, Christian},
618 | journal = {Journal of Machine Learning Research},
619 | pages = {1137--1155},
620 | title = {{A Neural Probabilistic Language Model}},
621 | volume = {3},
622 | year = {2003},
623 | url = {https://research.jmlr.org/papers/v3/bengio03a.html},
624 | }
625 |
626 |
627 | @book{Kun.2020,
628 | author = {Jeremy Kun},
629 | edition = {2},
630 | title = {A Programmer’s Introduction to Mathematics},
631 | url = {https://pimbook.org},
632 | year = {2020},
633 | }
634 |
635 | @book{Goldberg.2017,
636 | author = {Goldberg, Yoav},
637 | title = {Neural Network Methods for Natural Language Processing},
638 | year = {2017},
639 | publisher = {Morgan \& Claypool},
640 |
641 | }
642 |
643 |
644 | @inproceedings{Kudo.Richardson.2018.EMNLP,
645 | title = "{S}entence{P}iece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing",
646 | author = "Kudo, Taku and
647 | Richardson, John",
648 | booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
649 | year = "2018",
650 | address = "Brussels, Belgium",
651 | publisher = "Association for Computational Linguistics",
652 | doi = "10.18653/v1/D18-2012",
653 | pages = "66--71",
654 | }
655 |
656 | @article{kudo2018subword,
657 | title={Subword regularization: Improving neural network translation models with multiple subword candidates},
658 | author={Kudo, Taku},
659 | journal={arXiv preprint arXiv:1804.10959},
660 | year={2018}
661 | }
662 |
663 | @article{bahdanau2014neural,
664 | title={Neural machine translation by jointly learning to align and translate},
665 | author={Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
666 | journal={arXiv preprint arXiv:1409.0473},
667 | year={2014}
668 | }
--------------------------------------------------------------------------------
/latex/lecture01/.gitignore:
--------------------------------------------------------------------------------
1 | dl4nlp2023-lecture01.pdf
2 | flags.tex
3 | pdf
4 |
--------------------------------------------------------------------------------
/latex/lecture01/compile-pdf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Inspired by: https://tex.stackexchange.com/a/1501
4 |
5 | # current lecture file name
6 | lecture_filename="dl4nlp2023-lecture01"
7 |
8 | # create the output folder (might already exist)
9 | mkdir -p pdf
10 |
11 | # Compile the lecture version with pauses
12 |
13 | # set empty flags
14 | echo "" > flags.tex
15 |
16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
17 | biber pdf/${lecture_filename}-lecture.bcf
18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
20 |
21 | # Compile the handout (no slide unfolding)
22 |
23 | # set the flag
24 | echo "\handouttrue" > flags.tex
25 |
26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
27 | biber pdf/${lecture_filename}-handout.bcf
28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
30 |
31 | # Cleaning up temporary latex files
32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache
33 |
--------------------------------------------------------------------------------
/latex/lecture01/img/arct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/arct.png
--------------------------------------------------------------------------------
/latex/lecture01/img/cc-by-sa-icon.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/cc-by-sa-icon.pdf
--------------------------------------------------------------------------------
/latex/lecture01/img/dial1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/dial1.png
--------------------------------------------------------------------------------
/latex/lecture01/img/hfdata.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/hfdata.png
--------------------------------------------------------------------------------
/latex/lecture01/img/logo-trusthlt.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/logo-trusthlt.pdf
--------------------------------------------------------------------------------
/latex/lecture01/img/mt2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/mt2.png
--------------------------------------------------------------------------------
/latex/lecture01/img/mtex.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/mtex.jpg
--------------------------------------------------------------------------------
/latex/lecture01/img/nlg1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/nlg1.png
--------------------------------------------------------------------------------
/latex/lecture01/img/nlg2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/nlg2.png
--------------------------------------------------------------------------------
/latex/lecture02/.gitignore:
--------------------------------------------------------------------------------
1 | dl4nlp2023-lecture02.pdf
2 | flags.tex
3 | pdf
4 |
--------------------------------------------------------------------------------
/latex/lecture02/compile-pdf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Inspired by: https://tex.stackexchange.com/a/1501
4 |
5 | # current lecture file name
6 | lecture_filename="dl4nlp2023-lecture02"
7 |
8 | # create the output folder (might already exist)
9 | mkdir -p pdf
10 |
11 | # Compile the lecture version with pauses
12 |
13 | # set empty flags
14 | echo "" > flags.tex
15 |
16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
17 | biber pdf/${lecture_filename}-lecture.bcf
18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
20 |
21 | # Compile the handout (no slide unfolding)
22 |
23 | # set the flag
24 | echo "\handouttrue" > flags.tex
25 |
26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
27 | biber pdf/${lecture_filename}-handout.bcf
28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
30 |
31 | # Cleaning up temporary latex files
32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache
33 |
--------------------------------------------------------------------------------
/latex/lecture02/img/backprop-my.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop-my.pdf
--------------------------------------------------------------------------------
/latex/lecture02/img/backprop01.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop01.pdf
--------------------------------------------------------------------------------
/latex/lecture02/img/backprop02.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop02.pdf
--------------------------------------------------------------------------------
/latex/lecture02/img/backprop03.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop03.pdf
--------------------------------------------------------------------------------
/latex/lecture02/img/backprop04.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop04.pdf
--------------------------------------------------------------------------------
/latex/lecture02/img/backprop05.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop05.pdf
--------------------------------------------------------------------------------
/latex/lecture02/img/backprop06.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop06.pdf
--------------------------------------------------------------------------------
/latex/lecture02/img/backprop07.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop07.pdf
--------------------------------------------------------------------------------
/latex/lecture02/img/backprop08.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop08.pdf
--------------------------------------------------------------------------------
/latex/lecture02/img/backprop09.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop09.pdf
--------------------------------------------------------------------------------
/latex/lecture02/img/backprop10.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop10.pdf
--------------------------------------------------------------------------------
/latex/lecture02/img/cc-by-sa-icon.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/cc-by-sa-icon.pdf
--------------------------------------------------------------------------------
/latex/lecture02/img/desmos-graph1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/desmos-graph1.pdf
--------------------------------------------------------------------------------
/latex/lecture02/img/desmos-graph1.svg:
--------------------------------------------------------------------------------
1 |
2 |
349 |
--------------------------------------------------------------------------------
/latex/lecture02/img/gradient1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/gradient1.pdf
--------------------------------------------------------------------------------
/latex/lecture02/img/logo-trusthlt.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/logo-trusthlt.pdf
--------------------------------------------------------------------------------
/latex/lecture02/img/parent-child.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/parent-child.pdf
--------------------------------------------------------------------------------
/latex/lecture02/img/rosenbrock.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/rosenbrock.pdf
--------------------------------------------------------------------------------
/latex/lecture03/.gitignore:
--------------------------------------------------------------------------------
1 | dl4nlp2023-lecture03.pdf
2 | flags.tex
3 | pdf
4 |
--------------------------------------------------------------------------------
/latex/lecture03/compile-pdf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Inspired by: https://tex.stackexchange.com/a/1501
4 |
5 | # current lecture file name
6 | lecture_filename="dl4nlp2023-lecture03"
7 |
8 | # create the output folder (might already exist)
9 | mkdir -p pdf
10 |
11 | # Compile the lecture version with pauses
12 |
13 | # set empty flags
14 | echo "" > flags.tex
15 |
16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
17 | biber pdf/${lecture_filename}-lecture.bcf
18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
20 |
21 | # Compile the handout (no slide unfolding)
22 |
23 | # set the flag
24 | echo "\handouttrue" > flags.tex
25 |
26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
27 | biber pdf/${lecture_filename}-handout.bcf
28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
30 |
31 | # Cleaning up temporary latex files
32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache
33 |
--------------------------------------------------------------------------------
/latex/lecture03/img/cc-by-sa-icon.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture03/img/cc-by-sa-icon.pdf
--------------------------------------------------------------------------------
/latex/lecture03/img/logo-trusthlt.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture03/img/logo-trusthlt.pdf
--------------------------------------------------------------------------------
/latex/lecture04/.gitignore:
--------------------------------------------------------------------------------
1 | dl4nlp2023-lecture04.pdf
2 | flags.tex
3 | pdf
4 |
--------------------------------------------------------------------------------
/latex/lecture04/compile-pdf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Inspired by: https://tex.stackexchange.com/a/1501
4 |
5 | # current lecture file name
6 | lecture_filename="dl4nlp2023-lecture04"
7 |
8 | # create the output folder (might already exist)
9 | mkdir -p pdf
10 |
11 | # Compile the lecture version with pauses
12 |
13 | # set empty flags
14 | echo "" > flags.tex
15 |
16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
17 | biber pdf/${lecture_filename}-lecture.bcf
18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
20 |
21 | # Compile the handout (no slide unfolding)
22 |
23 | # set the flag
24 | echo "\handouttrue" > flags.tex
25 |
26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
27 | biber pdf/${lecture_filename}-handout.bcf
28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
30 |
31 | # Cleaning up temporary latex files
32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache
33 |
--------------------------------------------------------------------------------
/latex/lecture04/dl4nlp2023-lecture04.tex:
--------------------------------------------------------------------------------
1 | % !TeX program = lualatex
2 | % !BIB program = biber
3 | % Lualatex is important to render Fira fonts; with pdflatex it's just the regular one
4 | % ratio 16:9 -- https://tex.stackexchange.com/questions/14336/
5 |
6 | % compile two versions, inspired by https://tex.stackexchange.com/a/1501
7 | % use the script "compile-pdf.sh"
8 | \newif\ifhandout
9 | % if flags.tex does not exist, create an empty file to be able to compile in TeXstudio
10 | \input{flags}
11 |
12 | \ifhandout
13 | \documentclass[12pt,aspectratio=169,handout]{beamer}
14 | \else
15 | \documentclass[12pt,aspectratio=169]{beamer}
16 | \fi
17 |
18 | % adjust for 16:9
19 | % https://tex.stackexchange.com/questions/354022/modifying-the-margins-of-all-slides-in-beamer
20 | \setbeamersize{text margin left=0.3cm,text margin right=4.5cm}
21 |
22 | %\usepackage{xcolor}
23 |
24 | %%% better TOC
25 | \usetheme[subsectionpage=progressbar]{metropolis}
26 |
27 | % name in footer
28 | \setbeamertemplate{frame numbering}{\insertframenumber ~ | Dr.\ Ivan Habernal}
29 |
30 | % blocks with background globally
31 | \metroset{block=fill}
32 |
33 | % adjust the background to be completely white
34 | \setbeamercolor{background canvas}{bg=white}
35 |
36 | % typeset mathematics on serif
37 | \usefonttheme[onlymath]{serif}
38 |
39 | % better bibliography using biber as backend
40 | \usepackage[natbib=true,backend=biber,style=authoryear-icomp,maxbibnames=30,maxcitenames=9,uniquelist=false,giveninits=true,doi=false,url=false,dashed=false,isbn=false]{biblatex}
41 | % shared bibliography
42 | \addbibresource{../dl4nlp-bibliography.bib}
43 | % disable "ibid" for repeated citations
44 | \boolfalse{citetracker}
45 |
46 | \definecolor{76abdf}{RGB}{118, 171, 223}
47 |
48 | \setbeamercolor{frametitle}{bg=76abdf, fg=white}
49 |
50 | \usepackage{xspace}
51 |
52 |
53 | % for derivatives, https://tex.stackexchange.com/a/412442
54 | \usepackage{physics}
55 |
56 | \usepackage{tikz}
57 | \usetikzlibrary{matrix, positioning}
58 | \usetikzlibrary{angles,quotes} % for angles
59 | \usetikzlibrary{backgrounds} % background
60 | \usetikzlibrary{decorations.pathreplacing} % curly braces
61 | \usetikzlibrary{calligraphy}
62 | \usetikzlibrary{calc} % for neural nets
63 |
64 | % for plotting functions
65 | \usepackage{pgfplots}
66 | \usepgfplotslibrary{dateplot}
67 |
68 | % sub-figures
69 | \usepackage{caption}
70 | \usepackage{subcaption}
71 |
72 | % book tabs
73 | \usepackage{booktabs}
74 |
75 |
76 | % show TOC at every section start
77 | \AtBeginSection{
78 | \frame{
79 | \vspace{2em}
80 | \sectionpage
81 | \hspace*{2.2em}\begin{minipage}{10cm}
82 | \tableofcontents[currentsection]
83 | \end{minipage}
84 | }
85 | }
86 |
87 | % argmin, argmax
88 | \usepackage{amsmath}
89 | \DeclareMathOperator*{\argmax}{arg\!\max}
90 | \DeclareMathOperator*{\argmin}{arg\!\min}
91 | % softmax
92 | \DeclareMathOperator*{\softmax}{soft\!\max}
93 |
94 | % bold math
95 | \usepackage{bm}
96 |
97 | % for \mathclap
98 | \usepackage{mathtools}
99 |
100 | % algorithms
101 | \usepackage[noend]{algpseudocode}
102 |
103 |
104 | % for neurons and layers in tikz
105 | \tikzset{
106 | neuron/.style={draw, circle, inner sep=0pt, minimum width=0.75cm, fill=blue!20},
107 | param/.style={draw, circle, inner sep=0pt, minimum width=0.75cm, fill=green!20},
108 | constant/.style={draw, circle, inner sep=0pt, minimum width=0.75cm, fill=black!15},
109 | }
110 |
111 |
112 | \title{Deep Learning for Natural Language Processing}
113 | \subtitle{Lecture 4 --- Text classification 2: Deep neural networks}
114 | \date{May 2, 2023}
115 | \author{Dr.\ Ivan Habernal}
116 | \institute{Trustworthy Human Language Technologies \hfill \includegraphics[height=.8cm]{img/logo-trusthlt.pdf} \\
117 | Department of Computer Science\\
118 | Technical University of Darmstadt \hfill \texttt{www.trusthlt.org} }
119 | %\titlegraphic{\hfill }
120 |
121 | \begin{document}
122 |
123 | \maketitle
124 |
125 |
126 | \section{Where we finished last time}
127 |
128 | \begin{frame}{Our binary text classification function}
129 |
130 | Linear function through sigmoid --- log-linear model
131 | $$
132 | \hat{y} = \sigma(f(\bm{x})) = \frac{1}{1 + \exp(- (\bm{x} \cdot \bm{w} + b))}
133 | $$
134 |
135 | \begin{figure}
136 | \begin{tikzpicture}
137 | %\node (a1) [draw, circle, inner sep=0pt, minimum width=0.75cm, fill=green!20] {$a_1$};
138 | \node (x) [constant] {$\bm{x}$};
139 | \node (w) [param, below of=x] {$\bm{w}$};
140 | \node (b) [param, below of=w] {$b$};
141 |
142 | \node (f) [neuron, right of=w, xshift=1.5cm] {$\bm{x} \cdot \bm{w} + b$};
143 | \node (s) [neuron, right of=f, xshift=1.5cm] {$\sigma$};
144 |
145 | \begin{scope}[thick, black, ->, >=latex]
146 | \draw (x) -- (f);
147 | \draw (w) -- (f);
148 | \draw (b) -- (f);
149 | \draw (f) -- (s);
150 | \end{scope}
151 | \end{tikzpicture}
152 | \caption{Computational graph; green circles are trainable parameters, gray are inputs}
153 | \end{figure}
154 |
155 | \end{frame}
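
% A minimal NumPy sketch of this slide's log-linear prediction (an
% illustration only; the shapes and variable names are our own toy choices,
% not part of the lecture):
%
%   import numpy as np
%
%   def predict_proba(x, w, b):
%       """sigma(x . w + b): probability that the label is 1."""
%       return 1.0 / (1.0 + np.exp(-(x @ w + b)))
%
%   rng = np.random.default_rng(0)
%   x = rng.normal(size=5)         # input feature vector
%   w = rng.normal(size=5)         # trainable weight vector
%   b = 0.0                        # trainable bias
%   print(predict_proba(x, w, b))  # a value in (0, 1)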
156 |
157 | \begin{frame}{Decision rule of log-linear model}
158 |
159 | Log-linear model
160 | $
161 | \hat{y} = \sigma(f(\bm{x})) = \frac{1}{1 + \exp(- (\bm{x} \cdot \bm{w} + b))}
162 | $
163 |
164 | \begin{itemize}
165 | \item Prediction = 1 if $\hat{y} > 0.5$
166 | \item Prediction = 0 if $\hat{y} < 0.5$
167 | \end{itemize}
168 |
169 | \bigskip
170 |
171 | Natural interpretation: Conditional probability of prediction = 1 given the input $\bm{x}$
172 | $$
173 | \begin{aligned}
174 | \sigma(f(\bm{x})) &= \Pr(\text{prediction} = 1 | \bm{x}) \\
175 | 1 - \sigma(f(\bm{x})) &= \Pr(\text{prediction} = 0 | \bm{x})
176 | \end{aligned}
177 | $$
178 |
179 | \end{frame}
180 |
181 | \section{Finding the best model's parameters}
182 |
183 | \begin{frame}{The loss function}
184 |
185 | Loss function: Quantifies the loss suffered when predicting $\hat{y}$ while the true label is $y$ for a single example. In binary classification: \pause
186 | $$
187 | L(\hat{y}, y): \mathbb{R}^2 \to \mathbb{R}
188 | $$
189 |
190 | \pause
191 | Given a labeled training set
192 | $(\bm{x}_{1:n}, \bm{y}_{1:n})$,
193 | a per-instance loss function $L$ and a
194 | parameterized function $f(\bm{x}; \Theta)$ we define the corpus-wide loss with respect to the parameters $\Theta$ as the average loss over all training examples \pause
195 | $$
196 | \mathcal{L}(\Theta) = \frac{1}{n} \sum_{i =1}^{n} L (f(\bm{x}_i; \Theta), y_i)
197 | $$
198 | \end{frame}
199 |
200 | \begin{frame}{Training as optimization}
201 | $$
202 | \mathcal{L}(\Theta) = \frac{1}{n} \sum_{i =1}^{n} L (f(\bm{x}_i; \Theta), y_i)
203 | $$
204 |
205 | The training examples are fixed, and the values of the parameters determine the loss
206 |
207 | \pause
208 | The goal of the training algorithm is to set the values of the parameters $\Theta$ such that
209 | the value of $\mathcal{L}$ is minimized \pause
210 | $$
211 | \hat{\Theta} = \argmin_{\Theta} \mathcal{L}(\Theta) = \argmin_{\Theta} \frac{1}{n} \sum_{i =1}^{n} L (f(\bm{x}_i; \Theta), y_i)
212 | $$
213 |
214 |
215 | \end{frame}
216 |
217 | \begin{frame}{Binary cross-entropy loss (logistic loss)}
218 | $$
219 | L_{\text{logistic}} = - y \log \hat{y} - (1 - y) \log (1 - \hat{y})
220 | $$
221 |
222 | \pause
223 | \begin{block}{Partial derivative wrt.\ input $\hat{y}$}
224 | $$
225 | \dv{L_{\text{logistic}}}{\hat{y}} =
226 | - \left(
227 | \frac{y}{\hat{y}} - \frac{1 - y}{1 - \hat{y}}
228 | \right)
229 | =
230 | - \frac{y - \hat{y}}{ \hat{y} (1 - \hat{y})}
231 | $$
232 | \end{block}
233 |
234 | \end{frame}
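
% Worked step (a standard result, not spelled out on the slide): chaining the
% derivative above with the sigmoid derivative
% $\dv{\sigma}{z} = \sigma(z) (1 - \sigma(z))$, where $z = \bm{x} \cdot \bm{w} + b$
% and $\hat{y} = \sigma(z)$, the gradient wrt the pre-activation collapses to
% $$
% \dv{L_{\text{logistic}}}{z}
% = - \frac{y - \hat{y}}{\hat{y} (1 - \hat{y})} \cdot \hat{y} (1 - \hat{y})
% = \hat{y} - y
% $$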
235 |
236 | \begin{frame}{Full computational graph}
237 | \begin{figure}
238 | \begin{tikzpicture}
239 | %\node (a1) [draw, circle, inner sep=0pt, minimum width=0.75cm, fill=green!20] {$a_1$};
240 | \node (x) [constant] {$\bm{x}$};
241 | \node (w) [param, below of=x] {$\bm{w}$};
242 | \node (b) [param, below of=w] {$b$};
243 |
244 | \node (f) [neuron, right of=w, xshift=1.5cm] {$\bm{x} \cdot \bm{w} + b$};
245 | \node (s) [neuron, right of=f, xshift=1.5cm] {$\sigma$};
246 |
247 | \node (l) [neuron, right of=s, xshift=1cm] {$L$};
248 | \node (y) [constant, below of=s] {$y$};
249 |
250 | \begin{scope}[thick, black, ->, >=latex]
251 | \draw (x) -- (f);
252 | \draw (w) -- (f);
253 | \draw (b) -- (f);
254 | \draw (f) -- (s);
255 | \draw (s) -- (l);
256 | \draw (y) -- (l);
257 | \end{scope}
258 | \end{tikzpicture}
259 | \caption{Computational graph; green circles are trainable parameters, gray are constant inputs}
260 | \end{figure}
261 |
262 | How can we minimize this function?
263 |
264 | \pause
265 | \begin{itemize}
266 | \item Recall Lecture 2: (a) Gradient descent and (b) backpropagation
267 | \end{itemize}
268 |
269 | \end{frame}
270 |
271 | \begin{frame}{(Online) Stochastic Gradient Descent}
272 |
273 | \begin{algorithmic}[1]
274 | \Function{SGD}{$f(\bm{x}; \Theta)$, $(\bm{x}_1, \ldots, \bm{x}_n)$, $(\bm{y}_1, \ldots, \bm{y}_n)$, $L$}
275 | \While{stopping criteria not met}
276 | \State Sample a training example $\bm{x}_i, \bm{y}_i$
277 | \State Compute the loss $L(f(\bm{x}_i; \Theta), \bm{y}_i)$
278 | \State $\hat{\bm{g}} \gets$ gradient of $L(f(\bm{x}_i; \Theta), \bm{y}_i)$ wrt.\ $\Theta$
279 | \State $\Theta \gets \Theta - \eta_t \hat{\bm{g}}$
280 | \EndWhile
281 | \State \Return $\Theta$
282 | \EndFunction
283 | \end{algorithmic}
284 |
285 | \pause
286 | Loss in line 4 is based on a \textbf{single training example} $\to$ a rough estimate of the corpus loss $\mathcal{L}$ we aim to minimize
287 |
288 | \pause
289 | The noise in the loss computation may result in inaccurate gradients
290 |
291 | \end{frame}
292 |
293 |
294 |
295 | \begin{frame}{Minibatch Stochastic Gradient Descent}
296 |
297 | \begin{algorithmic}[1]
298 | \Function{mbSGD}{$f(\bm{x}; \Theta)$, $(\bm{x}_1, \ldots, \bm{x}_n)$, $(\bm{y}_1, \ldots, \bm{y}_n)$, $L$}
299 | \While{stopping criteria not met}
300 | \State Sample $m$ examples $\{ (\bm{x}_1, \bm{y}_1), \ldots (\bm{x}_m, \bm{y}_m) \}$
301 | \State $\hat{\bm{g}} \gets 0$
302 | \For{$i = 1$ to $m$}
303 | \State Compute the loss $L(f(\bm{x}_i; \Theta), \bm{y}_i)$
304 | \State $\hat{\bm{g}} \gets \hat{\bm{g}}\ + $ gradient of $\frac{1}{m} L(f(\bm{x}_i; \Theta), \bm{y}_i)$ wrt.\ $\Theta$
305 | \EndFor
306 | \State $\Theta \gets \Theta - \eta_t \hat{\bm{g}}$
307 | \EndWhile
308 | \State \Return $\Theta$
309 | \EndFunction
310 | \end{algorithmic}
311 |
312 |
313 | \end{frame}
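
% A compact NumPy sketch of mbSGD for the log-linear model above (our own toy
% setup: fixed learning rate, fixed step count, logistic loss); setting m=1
% recovers the online SGD variant from the previous slide:
%
%   import numpy as np
%
%   def mb_sgd(X, y, m=8, eta=0.1, steps=1000, seed=0):
%       rng = np.random.default_rng(seed)
%       w, b = np.zeros(X.shape[1]), 0.0
%       for _ in range(steps):
%           idx = rng.choice(len(X), size=m, replace=False)
%           y_hat = 1.0 / (1.0 + np.exp(-(X[idx] @ w + b)))
%           err = (y_hat - y[idx]) / m   # gradient of the averaged loss wrt z
%           w -= eta * (X[idx].T @ err)  # line 9: Theta <- Theta - eta * g
%           b -= eta * err.sum()
%       return w, b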
314 |
315 | \begin{frame}{Properties of Minibatch Stochastic Gradient Descent}
316 |
317 | The minibatch size can vary in size from $m = 1$ to $m = n$
318 |
319 | Higher values provide better estimates of the corpus-wide gradients, while smaller values allow more updates and in turn faster convergence
320 |
321 | Lines 6--7 may be easily parallelized
322 |
323 | \end{frame}
324 |
325 |
326 | \section{Log-linear multi-class classification}
327 |
328 | \begin{frame}{From binary to multi-class labels}
329 |
330 | So far we mapped our gold label $y \in \{0, 1\}$
331 |
332 | What if we classify into distinct categorical classes?
333 |
334 | \begin{itemize}
335 | \item Categorical: There is no `ordering'
336 | \item Example: Classify the language of a document into 6 languages (En, Fr, De, It, Es, Other)
337 | \end{itemize}
338 |
339 | \pause
340 | \begin{block}{One-hot encoding of labels}
341 | $$
342 | \begin{aligned}
343 | \text{En} &= \begin{pmatrix}1 & 0 & 0 & 0 & 0 & 0\end{pmatrix} \qquad
344 | \text{Fr} = \begin{pmatrix}0 & 1 & 0 & 0 & 0 & 0\end{pmatrix} \\
345 | \text{De} &= \begin{pmatrix}0 & 0 & 1 & 0 & 0 & 0\end{pmatrix} \qquad \ldots \\
346 | \bm{y} &\in \mathbb{R}^{d_{out}} \quad \text{where } d_{out} \text{ is the number of classes}
347 | \end{aligned}
348 | $$
349 | \end{block}
350 |
351 | \end{frame}
352 |
353 | \begin{frame}{Possible solution: Six weight vectors and biases}
354 |
355 | Consider for each language $\ell \in \{\text{En}, \text{Fr}, \text{De}, \text{It}, \text{Es}, \text{Other}\}$
356 | \begin{itemize}
357 | \item Weight vector $\bm{w}^{\ell}$ (e.g., $\bm{w}^{\text{Fr}})$
358 | \item Bias $b^{\ell}$ (e.g., $b^{\text{Fr}})$
359 | \end{itemize}
360 | \pause We can predict the language that yields the highest score
361 | $$
362 | \hat{y} = f(\bm{x}) = \argmax_{
363 | \ell \in \{\text{En}, \text{Fr}, \text{De}, \text{It}, \text{Es}, \text{Other}\}
364 | }
365 | \bm{x} \cdot \bm{w}^{\ell} + b^{\ell}
366 | $$
367 |
368 | \pause
369 | But we can re-arrange the $\bm{w} \in \mathbb{R}^{d_{in}}$ vectors into columns of a matrix $\bm{W} \in \mathbb{R}^{d_{in} \times 6}$ and $\bm{b} \in \mathbb{R}^6$, to get
370 | $$f(\bm{x}) = \bm{x} \bm{W} + \bm{b}$$
371 |
372 | \end{frame}
373 |
374 |
375 | \begin{frame}{Projecting input vector to output vector $f(\bm{x}) : \mathbb{R}^{d_{in}} \to \mathbb{R}^{d_{out}}$}
376 |
377 | \pause
378 | \begin{block}{Recall from lecture 3: High-dimensional linear functions}
379 | Function $f(\bm{x}) : \mathbb{R}^{d_{in}} \to \mathbb{R}^{d_{out}}$
380 | $$f(\bm{x}) = \bm{x} \bm{W} + \bm{b}$$
381 | where
382 | $\bm{x} \in \mathbb{R}^{d_{in}} \qquad
383 | \bm{W} \in \mathbb{R}^{d_{in} \times d_{out}} \qquad
384 | \bm{b} \in \mathbb{R}^{d_{out}}$
385 | \end{block}
386 |
387 | The simplest neural network --- a perceptron (simply a linear model)
388 |
389 | \begin{itemize}
390 | \item How to find the prediction $\hat{y}$?
391 | \end{itemize}
392 |
393 | \end{frame}
394 |
395 | \begin{frame}{Prediction of multi-class classifier}
396 | Project the input $\bm{x}$ to an output $\bm{y}$
397 | $$\bm{\hat{y}} = f(\bm{x}) = \bm{x} \bm{W} + \bm{b}$$
398 | and pick the element of $\bm{\hat{y}}$ with the highest value
399 | $$
400 | \text{prediction} = \hat{y} = \argmax_{i} \bm{\hat{y}}_{[i]}
401 | $$
402 |
403 | \begin{block}{Sanity check}
404 | What is $\hat{y}$?
405 |
406 | \pause
407 | Index of $1$ in the one-hot
408 |
409 | For example, if $\hat{y} = 3$, then the document is in German
410 | $\text{De} = \begin{pmatrix}0 & 0 & 1 & 0 & 0 & 0\end{pmatrix}$
411 | \end{block}
412 |
413 | \end{frame}
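
% The prediction rule above in NumPy (toy shapes of our choosing):
%
%   import numpy as np
%   rng = np.random.default_rng(0)
%   x = rng.normal(size=50)        # document representation, d_in = 50
%   W = rng.normal(size=(50, 6))   # one column per language
%   b = np.zeros(6)
%   y_hat = x @ W + b              # scores, one per class
%   prediction = int(np.argmax(y_hat))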
414 |
415 | \subsection{Representations}
416 |
417 |
418 | \begin{frame}{Two representations of the input document}
419 | $$\bm{\hat{y}} = \bm{x} \bm{W} + \bm{b}$$
420 |
421 | Vector $\bm{x}$ is a document representation
422 | \begin{itemize}
423 | \item Bag of words, for example ($d_{in} = |V|$ dimensions, sparse)
424 | \end{itemize}
425 |
426 | Vector $\bm{\hat{y}}$ is \textbf{also} a document representation
427 | \begin{itemize}
428 | \item More compact (only 6 dimensions)
429 | \item More specialized for the language prediction task
430 | \end{itemize}
431 |
432 | \end{frame}
433 |
434 | \begin{frame}{Matrix $\bm{W}$ as learned representation --- columns}
435 | $\bm{\hat{y}} = \bm{x} \bm{W} + \bm{b} \quad \to$ two views of $\bm{W}$, as rows or as columns
436 |
437 | \begin{tabular}{r|cccccc}
438 | & En & Fr & De & It & Es & Ot \\ \midrule
439 | a & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ \\
440 | at & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ \\
441 | ... & & & & & & \\
442 | zoo & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ \\
443 | \end{tabular}
444 |
445 | \pause
446 | Each of the 6 columns (corresponding to a language) is a $d_{in}$-dimensional vector representation of this language in terms of its characteristic word unigram patterns (e.g., we can then cluster the 6 language vectors according to their similarity)
447 |
448 |
449 | \end{frame}
450 |
451 | \begin{frame}{Matrix $\bm{W}$ as learned representation --- rows}
452 | $\bm{\hat{y}} = \bm{x} \bm{W} + \bm{b}$
453 |
454 | \begin{tabular}{r|cccccc}
455 | & En & Fr & De & It & Es & Ot \\ \midrule
456 | a & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ \\
457 | at & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ \\
458 | ... & & & & & & \\
459 | zoo & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ \\
460 | \end{tabular}
461 |
462 | Each of the $d_{in}$ rows corresponds to a particular unigram, and provides a 6-dimensional vector
463 | representation of that unigram in terms of the languages it prompts
464 |
465 | \end{frame}
466 |
467 | \begin{frame}{From bag-of-words to continuous bag-of-words}
468 | \begin{block}{Recall from lecture 3 --- Averaged bag of words}
469 | $$\bm{x} = \frac{1}{|D|} \sum_{i =1}^{|D|} \bm{x}^{D_{[i]}}$$
470 | $D_{[i]}$ --- word in doc $D$ at position $i$, $\bm{x}^{D_{[i]}}$ --- one-hot vector
471 | \end{block}
472 | $$
473 | \begin{aligned}
474 | \bm{\hat{y}} &= \bm{x} \bm{W} = \pause
475 | \left (\frac{1}{|D|} \sum_{i =1}^{|D|} \bm{x}^{D_{[i]}} \right) \bm{W}
476 | \pause = \frac{1}{|D|} \sum_{i =1}^{|D|} \left ( \bm{x}^{D_{[i]}} \bm{W} \right) \\
477 | &= \pause \frac{1}{|D|} \sum_{i =1}^{|D|} \bm{W}^{D_{[i]}}
478 | \end{aligned}
479 | $$
480 | (we ignore the bias $\bm{b}$ here)
481 |
482 | \end{frame}
483 |
484 | \begin{frame}{From bag-of-words to continuous bag-of-words (CBOW)}
485 | \begin{block}{Two equivalent views; $\bm{W}^{D_{[i]}}$ is the $D_{[i]}$-th row of matrix $\bm{W}$}
486 | $$
487 | \bm{\hat{y}} = \frac{1}{|D|} \sum_{i =1}^{|D|} \bm{W}^{D_{[i]}}
488 | \qquad
489 | \bm{\hat{y}} = \left (\frac{1}{|D|} \sum_{i =1}^{|D|} \bm{x}^{D_{[i]}} \right) \bm{W}
490 | $$
491 | \end{block}
492 |
493 | \pause
494 | The continuous-bag-of-words (CBOW) representation
495 | \begin{itemize}
496 | \item \pause Either by summing word-representation vectors
497 | \item \pause Or by multiplying a bag-of-words vector by a matrix in which each row corresponds to a dense word representation (also called \textbf{embedding matrix})
498 | \end{itemize}
499 |
500 | \end{frame}
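
% A quick numerical check of the two equivalent CBOW views above (toy
% vocabulary and word indices, chosen only for illustration):
%
%   import numpy as np
%   rng = np.random.default_rng(0)
%   V, d = 10, 6
%   W = rng.normal(size=(V, d))       # embedding matrix, one row per word
%   doc = [3, 7, 3, 1]                # word indices D[i]
%   by_rows = W[doc].mean(axis=0)     # average the embedding rows
%   by_onehots = np.eye(V)[doc].mean(axis=0) @ W
%   assert np.allclose(by_rows, by_onehots)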
501 |
502 | \begin{frame}{Learned representations --- central to deep learning}
503 | Representations are central to deep learning
504 |
505 | One could argue that the main power of deep learning is the ability to learn good representations
506 | \end{frame}
507 |
508 |
509 | \subsection{From multi-dimensional linear transformation to probabilities}
510 |
511 | \begin{frame}{Turning output vector into probabilities of classes}
512 |
513 | \begin{block}{Recap: Categorical probability distribution}
514 | Categorical random variable $X$ is defined over $K$ categories, typically mapped to natural numbers $1, 2, \ldots, K$, for example En = 1, De = 2, $\ldots$
515 |
516 | \pause
517 | Each category parametrized with probability $\Pr(X = k) = p_k$
518 |
519 | \pause
520 | Must be valid probability distribution: $\sum_{i =1}^{K} \Pr(X = i) = 1$
521 | \end{block}
522 |
523 | \pause
524 | How to turn an \textbf{unbounded} vector in $\mathbb{R}^K$ into a categorical probability distribution?
525 |
526 | \end{frame}
527 |
528 | \begin{frame}{The softmax function $\softmax (\bm{x}): \mathbb{R}^K \to \mathbb{R}^K$}
529 |
530 | \begin{block}{Softmax}
531 | Applied element-wise, for each element $\bm{x}_{[i]}$ we have
532 | $$
533 | \softmax (\bm{x}_{[i]}) = \frac{\exp(\bm{x}_{[i]})}{
534 | \sum_{k=1}^{K} \exp(\bm{x}_{[k]})
535 | }
536 | $$
537 | \end{block}
538 |
539 | \pause
540 | \begin{itemize}
541 | \item Numerator: Non-linear bijection from $\mathbb{R}$ to $(0; \infty)$
542 | \item Denominator: Normalizing constant to ensure $\sum_{j = 1}^{K} \softmax (\bm{x}_{[j]}) = 1$
543 | \end{itemize}
544 |
545 | \pause
546 | We also need to know how to compute the partial derivative of $\softmax (\bm{x}_{[i]})$ wrt.\ each argument $\bm{x}_{[k]}$: $\pdv{\softmax (\bm{x}_{[i]})}{\bm{x}_{[k]}}$
547 |
548 | \end{frame}
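
% The derivative asked for above has a well-known closed form (with
% $\delta_{ik}$ the Kronecker delta):
% $$
% \pdv{\softmax (\bm{x}_{[i]})}{\bm{x}_{[k]}}
% = \softmax (\bm{x}_{[i]}) \left( \delta_{ik} - \softmax (\bm{x}_{[k]}) \right)
% $$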
549 |
550 |
551 | \begin{frame}{Softmax can be smoothed with a `temperature' $T$}
552 | \vspace{-1em}
553 | $$
554 | \softmax (\bm{x}_{[i]}; T) = \frac{
555 | \exp(\frac{\bm{x}_{[i]}}{T})
556 | }{
557 | \sum_{k=1}^{K} \exp(
558 | \frac{\bm{x}_{[k]}}{T})
559 | }
560 | $$
561 |
562 | \pause
563 | \begin{block}{Example: Softmax of $\bm{x} = (3, 0, 1)$ at different $T$}
564 | \includegraphics[width=0.95\linewidth]{img/temperatures.png}
565 |
566 | High temperature $\to$ uniform distribution
567 |
568 | Low temperature $\to$ `spiky' distribution, all mass on the largest element
569 |
570 | \end{block}
571 |
572 | \begin{tikzpicture}[overlay, remember picture]
573 | \node at (current page.north east)[anchor = north east, text width=4cm, yshift=-1.3cm] {\scriptsize Figure: \fullcite[p.~103]{Murphy.2012} \par};
574 | \end{tikzpicture}
575 |
576 |
577 | \end{frame}
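
% A numerically stable implementation of the tempered softmax, run on the
% slide's example $\bm{x} = (3, 0, 1)$ (our own sketch; the temperatures are
% chosen for illustration):
%
%   import numpy as np
%
%   def softmax(x, T=1.0):
%       z = np.asarray(x, dtype=float) / T
%       z -= z.max()              # shift before exp for numerical stability
%       e = np.exp(z)
%       return e / e.sum()
%
%   for T in (100.0, 1.0, 0.1):
%       print(T, softmax([3, 0, 1], T=T))
%   # high T: near-uniform; low T: almost all mass on the largest element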
578 |
579 |
580 | \section{Loss function for softmax}
581 |
582 | \begin{frame}{Categorical cross-entropy loss (aka.\ negative log likelihood)}
583 |
584 | Vector representing the gold-standard categorical distribution over the classes/labels $1, \ldots, K$:
585 | $$
586 | \bm{y} = (\bm{y}_{[1]}, \bm{y}_{[2]}, \ldots, \bm{y}_{[K]})
587 | $$
588 | Output from softmax:
589 | $$
590 | \bm{\hat{y}} = (\bm{\hat{y}}_{[1]}, \bm{\hat{y}}_{[2]}, \ldots, \bm{\hat{y}}_{[K]})
591 | $$
592 | which is in fact $\bm{\hat{y}}_{[i]} = \Pr(y = i \mid \bm{x})$
593 |
594 |
595 | \begin{block}{Cross entropy loss}
596 | $$
597 | L_{\text{cross-entropy}} (\bm{\hat{y}}, \bm{y}) =
598 | - \sum_{k = 1}^{K} \bm{y}_{[k]} \log \left( \bm{\hat{y}}_{[k]} \right)
599 | $$
600 | \end{block}
601 | \end{frame}
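
% Special case worth noting: for a one-hot gold vector $\bm{y}$ with gold
% class $t$, every term but one vanishes and the loss reduces to the negative
% log likelihood of the gold class:
% $$
% L_{\text{cross-entropy}} (\bm{\hat{y}}, \bm{y}) = - \log \bm{\hat{y}}_{[t]}
% $$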
602 |
603 | \begin{frame}{Background: K-L divergence (also known as \emph{relative entropy})}
604 |
605 | Let $Y$ and $\hat{Y}$ be categorical random variables over same categories, with probability distributions $P(Y)$ and $Q(\hat{Y})$
606 | \begin{align*}
607 | \mathbb{D}(P(Y) || Q(\hat{Y})) &= \mathbb{E}_{P(Y)} \left[ \log \frac{P(Y)}{Q(\hat{Y})} \right] \\
608 | &= \mathbb{E}_{P(Y)} \left[ \log P(Y) - \log Q(\hat{Y}) \right] \\
609 | &= \mathbb{E}_{P(Y)} \left[ \log P(Y)\right] - \mathbb{E}_{P(Y)} \left[ \log Q(\hat{Y}) \right] \\
610 | &= - \mathbb{E}_{P(Y)} \left[ \log \frac{1}{P(Y)}\right] - \mathbb{E}_{P(Y)} \left[ \log Q(\hat{Y}) \right] \\
611 | &= - \mathbb{H}_{P} (Y) - \mathbb{E}_{P(Y)} \left[ \log Q(\hat{Y}) \right]
612 | \end{align*}
613 |
614 | \end{frame}
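
% Reading off the last line: $\mathbb{D}(P(Y) || Q(\hat{Y})) = - \mathbb{H}_P (Y)
% + \left( - \mathbb{E}_{P(Y)} \left[ \log Q(\hat{Y}) \right] \right)$, and the
% second term is exactly the cross entropy. Since $\mathbb{H}_P (Y)$ does not
% depend on the model parameters, minimizing the cross-entropy loss also
% minimizes the K-L divergence between the gold and predicted distributions.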
615 |
616 |
617 |
618 | \section{Stacking transformations and non-linearity}
619 |
620 | \begin{frame}{Stacking linear layers on top of each other --- still linear!}
621 | \vspace{-1em}
622 | $$
623 | \bm{x} \in \mathbb{R}^{d_{in}} \qquad
624 | \bm{W^1} \in \mathbb{R}^{d_{in} \times d_1} \qquad
625 | \bm{b^1} \in \mathbb{R}^{d_1} \qquad
626 | \bm{W^2} \in \mathbb{R}^{d_1 \times d_{out}} \qquad
627 | \bm{b^2} \in \mathbb{R}^{d_{out}} \qquad
628 | $$
629 | $$
630 | f(\bm{x}) = \left(
631 | \bm{x} \bm{W^1} + \bm{b^1}
632 | \right)
633 | \bm{W^2} + \bm{b^2}
634 | $$
635 |
636 | \begin{figure}
637 | \begin{tikzpicture}
638 | %\node (a1) [draw, circle, inner sep=0pt, minimum width=0.75cm, fill=green!20] {$a_1$};
639 | \node (x) [constant] {$\bm{x}$};
640 | \node (w) [param, below of=x] {$\bm{W^1}$};
641 | \node (b) [param, below of=w] {$\bm{b^1}$};
642 |
643 | \node (f1) [neuron, right of=w, xshift=1.5cm] {$\bm{x} \bm{W^1} + \bm{b^1}$};
644 | \node (f2) [neuron, right of=f1, xshift=1.5cm] {$\bm{h^1} \bm{W^2} + \bm{b^2}$};
645 |
646 | \node (w2) [param, below of=f2, xshift=-1.5cm, yshift=0cm] {$\bm{W^2}$};
647 | \node (b2) [param, below of=f2, xshift=-0.5cm, yshift=-0.5cm] {$\bm{b^2}$};
648 |
649 | \node (l) [neuron, right of=f2, xshift=1cm] {$L$};
650 | \node (y) [constant, below of=f2, xshift=1.5cm] {$\bm{y}$};
651 |
652 | \begin{scope}[thick, black, ->, >=latex]
653 | \draw (x) -- (f1);
654 | \draw (w) -- (f1);
655 | \draw (b) -- (f1);
656 | \draw (f1) -- (f2);
657 | \draw (f2) -- (l);
658 | \draw (w2) -- (f2);
659 | \draw (b2) -- (f2);
660 | \draw (y) -- (l);
661 | \end{scope}
662 | \end{tikzpicture}
663 | \caption{Computational graph; green circles are trainable parameters, gray are constant inputs}
664 | \end{figure}
665 |
666 | \end{frame}
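
% Why stacking stays linear, in one expansion step:
% $$
% f(\bm{x}) = \left( \bm{x} \bm{W^1} + \bm{b^1} \right) \bm{W^2} + \bm{b^2}
% = \bm{x} \left( \bm{W^1} \bm{W^2} \right) + \left( \bm{b^1} \bm{W^2} + \bm{b^2} \right)
% = \bm{x} \bm{W}' + \bm{b}'
% $$
% i.e., a single linear layer with $\bm{W}' \in \mathbb{R}^{d_{in} \times d_{out}}$
% expresses exactly the same family of functions.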
667 |
668 |
669 | \begin{frame}{Adding non-linear function $g: \mathbb{R}^{d_1} \to \mathbb{R}^{d_1}$}
670 | \vspace{-1em}
671 | $$
672 | f(\bm{x}) = g \left(
673 | \bm{x} \bm{W^1} + \bm{b^1}
674 | \right)
675 | \bm{W^2} + \bm{b^2}
676 | $$
677 |
678 | \begin{figure}
679 | \begin{tikzpicture}
680 | %\node (a1) [draw, circle, inner sep=0pt, minimum width=0.75cm, fill=green!20] {$a_1$};
681 | \node (x) [constant] {$\bm{x}$};
682 | \node (w) [param, below of=x] {$\bm{W^1}$};
683 | \node (b) [param, below of=w] {$\bm{b^1}$};
684 |
685 | \node (f1) [neuron, right of=w, xshift=1.5cm] {$\bm{x} \bm{W^1} + \bm{b^1}$};
686 |
687 | \node (g) [neuron, right of=f1, xshift=1.5cm] {$g$};
688 | \node (f2) [neuron, right of=g, xshift=1.5cm] {$\bm{h^1} \bm{W^2} + \bm{b^2}$};
689 |
690 | \node (w2) [param, below of=f2, xshift=-1.5cm, yshift=0cm] {$\bm{W^2}$};
691 | \node (b2) [param, below of=f2, xshift=-0.5cm, yshift=-0.5cm] {$\bm{b^2}$};
692 |
693 | \node (l) [neuron, right of=f2, xshift=1cm] {$L$};
694 | \node (y) [constant, below of=f2, xshift=1.5cm] {$\bm{y}$};
695 |
696 | \begin{scope}[thick, black, ->, >=latex]
697 | \draw (x) -- (f1);
698 | \draw (w) -- (f1);
699 | \draw (b) -- (f1);
700 | \draw (f1) -- (g);
701 | \draw (g) -- (f2);
702 | \draw (f2) -- (l);
703 | \draw (w2) -- (f2);
704 | \draw (b2) -- (f2);
705 | \draw (y) -- (l);
706 | \end{scope}
707 | \end{tikzpicture}
708 | \caption{Computational graph; green circles are trainable parameters, gray are constant inputs}
709 | \end{figure}
710 |
711 | \end{frame}
712 |
713 |
714 | \begin{frame}{Non-linear function $g$: Rectified linear unit (ReLU) activation}
715 |
716 |
717 | \begin{columns}
718 |
719 | \begin{column}{0.6\linewidth}
720 |
721 | $$
722 | \mathrm{ReLU}(z) =
723 | \begin{cases}
724 | 0 & \quad \text{if } z < 0\\
725 | z & \quad \text{if } z \geq 0
726 | \end{cases}
727 | $$
728 |
729 | or \hspace{0.4em} $\mathrm{ReLU}(z) = \max(0, z)$
730 |
731 |
732 |
733 |
734 | \end{column}
735 |
736 | \begin{column}{0.4\linewidth}
737 | \begin{figure}
738 | \begin{tikzpicture}
739 |
740 | \begin{axis}[
741 | xmin = -5, xmax = 5,
742 | ymin = -5, ymax = 5,
743 | xtick distance = 5,
744 | ytick distance = 5,
745 | grid = both,
746 | minor tick num = 5,
747 | major grid style = {lightgray},
748 | minor grid style = {lightgray!25},
749 | width = \textwidth,
750 | height = \textwidth,
751 | legend pos = north west
752 | ]
753 |
754 | \addplot[
755 | domain = -5:0,
756 | samples = 10,
757 | smooth,
758 | thick,
759 | blue,
760 | ] {0};
761 |
762 | \addplot[
763 | domain = 0:5,
764 | samples = 10,
765 | smooth,
766 | thick,
767 | blue,
768 | ] {x};
769 |
770 |
771 | \end{axis}
772 |
773 | \end{tikzpicture}
774 | \caption{ReLU function}
775 | \end{figure}
776 | \end{column}
777 | \end{columns}
778 |
779 |
780 | \end{frame}
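
% Forward pass of the one-hidden-layer MLP from this section in NumPy (toy
% dimensions of our choosing; in practice the weights come from training,
% not from random initialization):
%
%   import numpy as np
%   rng = np.random.default_rng(0)
%   d_in, d_1, d_out = 50, 32, 6
%   W1, b1 = rng.normal(size=(d_in, d_1)), np.zeros(d_1)
%   W2, b2 = rng.normal(size=(d_1, d_out)), np.zeros(d_out)
%
%   def relu(z):
%       return np.maximum(0.0, z)
%
%   def mlp(x):
%       h1 = relu(x @ W1 + b1)     # g(x W1 + b1)
%       return h1 @ W2 + b2        # h1 W2 + b2
%
%   print(mlp(rng.normal(size=d_in)))  # six unnormalized class scores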
781 |
782 |
783 |
784 | \section*{Recap}
785 |
786 | \begin{frame}{Takeaways}
787 |
788 | \begin{itemize}
789 | \item Binary classification as a linear function of words and a sigmoid
790 | \item Binary cross-entropy (logistic) loss
791 | \item Training as minimizing the loss using minibatch SGD and backpropagation
792 | \item Stacking layers and non-linear functions: MLP
793 | \item ReLU as a go-to activation function in NLP
794 | \end{itemize}
795 |
796 | \end{frame}
797 |
798 |
799 |
800 | \begin{frame}{License and credits}
801 |
802 | \begin{columns}
803 | \begin{column}{0.7\textwidth}
804 | Licensed under Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)
805 | \end{column}
806 | \begin{column}{0.2\textwidth}
807 | \includegraphics[width=0.9\linewidth]{img/cc-by-sa-icon.pdf}
808 | \end{column}
809 | \end{columns}
810 |
811 | \bigskip
812 |
813 | Credits
814 |
815 | \begin{scriptsize}
816 |
817 | Ivan Habernal
818 |
819 | Content from ACL Anthology papers licensed under CC-BY \url{https://www.aclweb.org/anthology}
820 |
821 | \end{scriptsize}
822 |
823 | \end{frame}
824 |
825 |
826 |
827 | \end{document}
828 |
829 |
--------------------------------------------------------------------------------
/latex/lecture04/img/cc-by-sa-icon.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture04/img/cc-by-sa-icon.pdf
--------------------------------------------------------------------------------
/latex/lecture04/img/logo-trusthlt.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture04/img/logo-trusthlt.pdf
--------------------------------------------------------------------------------
/latex/lecture04/img/temperatures.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture04/img/temperatures.png
--------------------------------------------------------------------------------
/latex/lecture05/.gitignore:
--------------------------------------------------------------------------------
1 | dl4nlp2023-lecture05.pdf
2 | flags.tex
3 | pdf
4 |
--------------------------------------------------------------------------------
/latex/lecture05/compile-pdf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Inspired by: https://tex.stackexchange.com/a/1501
4 |
5 | # current lecture file name
6 | lecture_filename="dl4nlp2023-lecture05"
7 |
8 | # create the output folder (might already exist)
9 | mkdir -p pdf
10 |
11 | # Compile the lecture version with pauses
12 |
13 | # set empty flags
14 | echo "" > flags.tex
15 |
16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
17 | biber pdf/${lecture_filename}-lecture.bcf
18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
20 |
21 | # Compile the handout (no slide unfolding)
22 |
23 | # set the flag
24 | echo "\handouttrue" > flags.tex
25 |
26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
27 | biber pdf/${lecture_filename}-handout.bcf
28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
30 |
31 | # Cleaning up temporary latex files
32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache
33 |
--------------------------------------------------------------------------------
/latex/lecture05/img/cc-by-sa-icon.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture05/img/cc-by-sa-icon.pdf
--------------------------------------------------------------------------------
/latex/lecture05/img/linear1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture05/img/linear1.png
--------------------------------------------------------------------------------
/latex/lecture05/img/linear2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture05/img/linear2.png
--------------------------------------------------------------------------------
/latex/lecture05/img/linear3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture05/img/linear3.png
--------------------------------------------------------------------------------
/latex/lecture05/img/linear4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture05/img/linear4.png
--------------------------------------------------------------------------------
/latex/lecture05/img/logo-trusthlt.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture05/img/logo-trusthlt.pdf
--------------------------------------------------------------------------------
/latex/lecture05/img/xor1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture05/img/xor1.pdf
--------------------------------------------------------------------------------
/latex/lecture05/img/xor1.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
246 |
--------------------------------------------------------------------------------
/latex/lecture06/.gitignore:
--------------------------------------------------------------------------------
1 | dl4nlp2023-lecture06.pdf
2 | flags.tex
3 | pdf
4 |
--------------------------------------------------------------------------------
/latex/lecture06/compile-pdf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Inspired by: https://tex.stackexchange.com/a/1501
4 |
5 | # current lecture file name
6 | lecture_filename="dl4nlp2023-lecture06"
7 |
8 | # create the output folder (might already exist)
9 | mkdir -p pdf
10 |
11 | # Compile the lecture version with pauses
12 |
13 | # set empty flags
14 | echo "" > flags.tex
15 |
16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
17 | biber pdf/${lecture_filename}-lecture.bcf
18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
20 |
21 | # Compile the handout (no slide unfolding)
22 |
23 | # set the flag
24 | echo "\handouttrue" > flags.tex
25 |
26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
27 | biber pdf/${lecture_filename}-handout.bcf
28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
30 |
31 | # Cleaning up temporary latex files
32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache
33 |
--------------------------------------------------------------------------------
/latex/lecture06/img/cc-by-sa-icon.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture06/img/cc-by-sa-icon.pdf
--------------------------------------------------------------------------------
/latex/lecture06/img/logo-trusthlt.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture06/img/logo-trusthlt.pdf
--------------------------------------------------------------------------------
/latex/lecture06/img/rewe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture06/img/rewe.png
--------------------------------------------------------------------------------
/latex/lecture07/.gitignore:
--------------------------------------------------------------------------------
1 | dl4nlp2023-lecture07.pdf
2 | flags.tex
3 | pdf
4 |
--------------------------------------------------------------------------------
/latex/lecture07/compile-pdf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Inspired by: https://tex.stackexchange.com/a/1501
4 |
5 | # current lecture file name
6 | lecture_filename="dl4nlp2023-lecture07"
7 |
8 | # create the output folder (might already exist)
9 | mkdir -p pdf
10 |
11 | # Compile the lecture version with pauses
12 |
13 | # set empty flags
14 | echo "" > flags.tex
15 |
16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
17 | biber pdf/${lecture_filename}-lecture.bcf
18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
20 |
21 | # Compile the handout (no slide unfolding)
22 |
23 | # set the flag
24 | echo "\handouttrue" > flags.tex
25 |
26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
27 | biber pdf/${lecture_filename}-handout.bcf
28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
30 |
31 | # Cleaning up temporary latex files
32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache
33 |
--------------------------------------------------------------------------------
/latex/lecture07/img/cc-by-sa-icon.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture07/img/cc-by-sa-icon.pdf
--------------------------------------------------------------------------------
/latex/lecture07/img/logo-trusthlt.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture07/img/logo-trusthlt.pdf
--------------------------------------------------------------------------------
/latex/lecture08/.gitignore:
--------------------------------------------------------------------------------
1 | dl4nlp2023-lecture08.pdf
2 | flags.tex
3 | pdf
4 |
--------------------------------------------------------------------------------
/latex/lecture08/compile-pdf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Inspired by: https://tex.stackexchange.com/a/1501
4 |
5 | # current lecture file name
6 | lecture_filename="dl4nlp2023-lecture08"
7 |
8 | # create the output folder (might already exist)
9 | mkdir -p pdf
10 |
11 | # Compile the lecture version with pauses
12 |
13 | # set empty flags
14 | echo "" > flags.tex
15 |
16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
17 | biber pdf/${lecture_filename}-lecture.bcf
18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
20 |
21 | # Compile the handout (no slide unfolding)
22 |
23 | # set the flag
24 | echo "\handouttrue" > flags.tex
25 |
26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
27 | biber pdf/${lecture_filename}-handout.bcf
28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
30 |
31 | # Cleaning up temporary latex files
32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache
33 |
--------------------------------------------------------------------------------
/latex/lecture08/img/cc-by-sa-icon.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/cc-by-sa-icon.pdf
--------------------------------------------------------------------------------
/latex/lecture08/img/logo-trusthlt.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/logo-trusthlt.pdf
--------------------------------------------------------------------------------
/latex/lecture08/img/seq2seq.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/seq2seq.pdf
--------------------------------------------------------------------------------
/latex/lecture08/img/seq2seq_attention_motivation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/seq2seq_attention_motivation.pdf
--------------------------------------------------------------------------------
/latex/lecture08/img/seq2seq_attention_t1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/seq2seq_attention_t1.pdf
--------------------------------------------------------------------------------
/latex/lecture08/img/seq2seq_attn_encdec.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/seq2seq_attn_encdec.pdf
--------------------------------------------------------------------------------
/latex/lecture08/img/seq2seq_selfattn.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/seq2seq_selfattn.pdf
--------------------------------------------------------------------------------
/latex/lecture08/img/sequence_classification.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/sequence_classification.pdf
--------------------------------------------------------------------------------
/latex/lecture08/img/sequence_labeling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/sequence_labeling.pdf
--------------------------------------------------------------------------------
/latex/lecture08/img/sequence_length.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/sequence_length.png
--------------------------------------------------------------------------------
/latex/lecture08/img/sequence_to_sequence.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/sequence_to_sequence.pdf
--------------------------------------------------------------------------------
/latex/lecture08/img/sequence_to_sequence_anno.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/sequence_to_sequence_anno.pdf
--------------------------------------------------------------------------------
/latex/lecture08/img/sequence_to_sequence_boxed.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/sequence_to_sequence_boxed.pdf
--------------------------------------------------------------------------------
/latex/lecture08/img/translation_heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/translation_heatmap.png
--------------------------------------------------------------------------------
/latex/lecture08/img/ukp_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/ukp_logo.png
--------------------------------------------------------------------------------
/latex/lecture08/img/variable_input_output.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/variable_input_output.pdf
--------------------------------------------------------------------------------
/latex/lecture09/.gitignore:
--------------------------------------------------------------------------------
1 | dl4nlp2023-lecture09.pdf
2 | flags.tex
3 | pdf
4 |
--------------------------------------------------------------------------------
/latex/lecture09/compile-pdf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Inspired by: https://tex.stackexchange.com/a/1501
4 |
5 | # current lecture file name
6 | lecture_filename="dl4nlp2023-lecture09"
7 |
8 | # create the output folder (might already exist)
9 | mkdir -p pdf
10 |
11 | # Compile the lecture version with pauses
12 |
13 | # set empty flags
14 | echo "" > flags.tex
15 |
16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
17 | biber pdf/${lecture_filename}-lecture.bcf
18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
20 |
21 | # Compile the handout (no slide unfolding)
22 |
23 | # set the flag
24 | echo "\handouttrue" > flags.tex
25 |
26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
27 | biber pdf/${lecture_filename}-handout.bcf
28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
30 |
31 | # Cleaning up temporary latex files
32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache
33 |
--------------------------------------------------------------------------------
/latex/lecture09/dl4nlp2023-lecture09.tex:
--------------------------------------------------------------------------------
1 | % !TeX program = lualatex
2 | % !BIB program = biber
3 | % Lualatex is important to render Fira fonts; with pdflatex it's just the regular one
4 | % ratio 16:9 -- https://tex.stackexchange.com/questions/14336/
5 |
6 | % compile two versions, inspired by https://tex.stackexchange.com/a/1501
7 | % use the script "compile-pdf.sh"
8 | \newif\ifhandout
9 | % if flags.tex does not exist, create an empty file to be able to compile in TeXstudio
10 | \input{flags}
11 |
12 | \ifhandout
13 | \documentclass[12pt,aspectratio=169,handout]{beamer}
14 | \else
15 | \documentclass[12pt,aspectratio=169]{beamer}
16 | \fi
17 |
18 | % adjust for 16:9
19 | % https://tex.stackexchange.com/questions/354022/modifying-the-margins-of-all-slides-in-beamer
20 | \setbeamersize{text margin left=0.3cm,text margin right=1.0cm}
21 |
22 | %\usepackage{xcolor}
23 |
24 | %%% better TOC
25 | \usetheme[subsectionpage=progressbar]{metropolis}
26 |
27 | % name in footer
28 | \setbeamertemplate{frame numbering}{\insertframenumber ~ | Dr.\ Martin Tutek}
29 |
30 | % blocks with background globally
31 | \metroset{block=fill}
32 |
33 | % adjust the background to be completely white
34 | \setbeamercolor{background canvas}{bg=white}
35 |
36 | % typeset mathematics on serif
37 | \usefonttheme[onlymath]{serif}
38 |
39 | % better bibliography using biber as backend
40 | \usepackage[natbib=true,backend=biber,style=authoryear-icomp,maxbibnames=30,maxcitenames=2,uniquelist=false,giveninits=true,doi=false,url=false,dashed=false,isbn=false]{biblatex}
41 | % shared bibliography
42 | \addbibresource{../dl4nlp-bibliography.bib}
43 | % disable "ibid" for repeated citations
44 | \boolfalse{citetracker}
45 |
46 | \definecolor{76abdf}{RGB}{118, 171, 223}
47 |
48 | \setbeamercolor{frametitle}{bg=76abdf, fg=white}
49 |
50 | \newcounter{saveenumi}
51 | \newcommand{\seti}{\setcounter{saveenumi}{\value{enumi}}}
52 | \newcommand{\conti}{\setcounter{enumi}{\value{saveenumi}}}
53 |
54 | \resetcounteronoverlays{saveenumi}
55 |
56 | \usepackage{xspace}
57 | % Emojis
58 | \usepackage{emoji}
59 | % Figs
60 | \usepackage{graphicx}
61 | \graphicspath{ {./img/} }
62 |
63 |
64 | % for derivatives, https://tex.stackexchange.com/a/412442
65 | \usepackage{physics}
66 |
67 | \usepackage{tikz}
68 | \usetikzlibrary{matrix, positioning}
69 | \usetikzlibrary{angles,quotes} % for angles
70 | \usetikzlibrary{backgrounds} % background
71 | \usetikzlibrary{decorations.pathreplacing} % curly braces
72 | \usetikzlibrary{calligraphy}
73 | \usetikzlibrary{calc} % for neural nets
74 |
75 | % for plotting functions
76 | \usepackage{pgfplots}
77 | \usepgfplotslibrary{dateplot}
78 |
79 | % sub-figures
80 | \usepackage{caption}
81 | \usepackage{subcaption}
82 |
83 | % Checkmark, xmark
84 | \usepackage{pifont}% http://ctan.org/pkg/pifont
85 |
86 | % book tabs
87 | \usepackage{booktabs}
88 |
89 | % caption*
90 | \usepackage{caption}
91 |
92 |
93 | % show TOC at every section start
94 | \AtBeginSection{
95 | \frame{
96 | \vspace{2em}
97 | \sectionpage
98 | \hspace*{2.2em}\begin{minipage}{10cm}
99 | \tableofcontents[currentsection]
100 | \end{minipage}
101 | }
102 | }
103 |
104 | % argmin, argmax
105 | \usepackage{amssymb}% http://ctan.org/pkg/amssymb
106 | \usepackage{amsmath}
107 |
108 | \DeclareMathOperator*{\argmax}{arg\!\max}
109 | \DeclareMathOperator*{\argmin}{arg\!\min}
110 | % softmax
111 | \DeclareMathOperator*{\softmax}{soft\!\max}
112 | % RNN
113 | \DeclareMathOperator*{\rnn}{RNN}
114 | % RNN star
115 | \DeclareMathOperator*{\rnnstar}{RNN^{*}}
116 | % bi-RNN
117 | \DeclareMathOperator*{\birnn}{biRNN}
118 |
119 | % bold math
120 | \usepackage{bm}
121 |
122 | % for \mathclap
123 | \usepackage{mathtools}
124 |
125 | % algorithms
126 | \usepackage[noend]{algpseudocode}
127 |
128 |
129 | % for neurons and layers in tikz
130 | \tikzset{
131 | neuron/.style={draw, rectangle, inner sep=2pt, minimum width=0.75cm, fill=blue!20},
132 | param/.style={draw, rectangle, inner sep=2pt, minimum width=0.75cm, fill=green!20},
133 | constant/.style={draw, rectangle, inner sep=2pt, minimum width=0.75cm, fill=black!15},
134 | state/.style={rectangle, inner sep=2pt, minimum width=0.75cm, fill=black!5},
135 | }
136 |
137 | % for strike-through text
138 | \usepackage[normalem]{ulem}
139 |
140 |
141 | \title{Deep Learning for Natural Language Processing}
142 | \subtitle{Lecture 9 -- Text generation 3: Transformers}
143 | \date{June 13, 2023}
144 | \author{Dr.\ Martin Tutek}
145 | \institute{Ubiquitous Knowledge Processing \hfill \includegraphics[height=1.cm]{img/ukp_logo.png} \\
146 | Department of Computer Science\\
147 | Technical University of Darmstadt \hfill \href{https://www.informatik.tu-darmstadt.de/ukp/ukp_home/index.en.jsp}{\underline{UKP Web}}}
148 | %\titlegraphic{\hfill }
149 |
150 | \begin{document}
151 |
152 | \maketitle
153 |
154 | \begin{frame}{Recap}
155 | In the previous lecture we:
156 | \begin{itemize}
157 | \item Introduced the encoder-decoder architecture \& why we need it
158 | \item Defined the three broad classes of NLP problems
159 | \item Showed that RNNs have problems modeling long-range dependencies
160 | \item Introduced the attention mechanism, its abstraction and design choices
161 | \end{itemize}
162 | \end{frame}
163 |
164 |
165 | \begin{frame}{Recap: Encoder-decoder with attention}
166 | \begin{center}
167 | \begin{figure}[h]
168 | \includegraphics[height=7cm]{seq2seq_attn_encdec.pdf}
169 | \end{figure}
170 | \end{center}
171 | \end{frame}
172 |
173 |
174 | \begin{frame}{Motivation}
175 |
176 | MLP -- fixed input sequence length
177 |
178 | RNN -- works well with \textbf{shorter} sequences
179 |
180 | RNN + attention -- works well with both \textbf{shorter and longer} sequences
181 |
182 | \pause
183 |
184 | \begin{itemize}
185 | \item Why not use \textbf{only} attention?
186 | \end{itemize}
187 |
188 | \pause
189 |
190 | \begin{center}
191 | \begin{figure}[h]
192 | \includegraphics[height=2cm]{aiayn}
193 | \end{figure}
194 | \end{center}
195 |
196 | \end{frame}
197 |
198 | \begin{frame}{Prerequisites for attention-only networks}
199 | What do we \textbf{gain} from recurrent networks?
200 | \pause
201 |
202 | \begin{itemize}
203 | \item \textbf{Memory cells}: contain summaries of the sequence read \textit{so far}
204 | \pause
205 | \begin{itemize}
206 | \item \textbf{However}, they have \textbf{limited} capacity -- we complement them with attention
207 | \end{itemize}
208 | \pause
209 | \item \textbf{Position} of a word in sequence
210 | \pause
211 | \begin{itemize}
212 | \item For each hidden state $s_{i}$, the current word embedding $x_i$ is combined with the previous state $s_{i-1}$ -- the network can distinguish \textbf{word order}
213 | \pause
214 | \item \textbf{However}, it takes $n$ recurrence operations to process a sequence
215 | \end{itemize}
216 |
217 | \end{itemize}
218 | \pause
219 |
220 | Do recurrent networks have any other \textbf{drawbacks}?
221 |
222 | \pause
223 |
224 | \begin{itemize}
225 | \item They \textbf{scale poorly} -- LSTMs are difficult to scale deeper than 4--8 layers
226 | \item \textbf{Closed vocabulary} -- so far, we assumed one word = one vector (no BPE)
227 | \end{itemize}
228 | \pause
229 |
230 | How to make attention-only networks work?
231 |
232 | \end{frame}
233 |
234 | \section{The Transformer}
235 |
236 | \begin{frame}{The Transformer (\cite{Vaswani.et.al.2017})}
237 | \begin{columns}[T] % align columns
238 |
239 | \begin{column}{.48\textwidth}
240 |
241 | \begin{figure}[h]
242 | \includegraphics[height=7cm]{anno_transformer}
243 | \end{figure}
244 | \end{column}
245 |
246 | \begin{column}{.48\textwidth}
247 | What are the unknown elements?
248 | \pause
249 | \begin{itemize}
250 | \item \textbf{Multi-head} attention
251 | \item Add \& Norm
252 | \item \textbf{Positional} embeddings
253 | \pause
254 | \item \textbf{Open vocabulary} through BPE
255 | \end{itemize}
256 | \end{column}
257 |
258 | \end{columns}
259 |
260 | \end{frame}
261 |
262 | \subsection{Contextualized representations}
263 |
264 | \begin{frame}{Contextualized representations}
265 |
266 | Recall: \textbf{limitations} of word embeddings
267 | \begin{block}{Polysemy, context independent representation}
268 | Some words have obvious multiple senses
269 |
270 | A \emph{bank} may refer to a financial institution or to the side of a river; a \emph{star} may be an abstract shape, a celebrity, or an astronomical entity
271 | \end{block}
272 |
273 | \pause
274 |
275 | How do recurrent networks handle contextualization?
276 |
277 | \pause
278 | $$
279 | s_i = f_{\text{rnn}} (s_{i-1}, x_i)
280 | $$
281 |
282 | \begin{itemize}
283 | \item Each state acts as a representation of the sequence \textbf{so far}
284 | \pause
285 | \end{itemize}
286 |
287 | \end{frame}
288 |
289 | \begin{frame}{Contextualized representations}
290 | $$
291 | s_i = f_{\text{rnn}} (s_{i-1}, x_i)
292 | $$
293 |
294 | \begin{itemize}
295 | \item Each state acts as a representation of the sequence \textbf{so far}
296 | \pause
297 | \begin{itemize}
298 | \item Recall: \textbf{bidirectional} RNNs (left- and right-hand context)
299 | \item A state contains \textbf{cues} about the meaning of the current word \textbf{in context}
300 | \pause
301 | \end{itemize}
302 | \vspace{1em}
303 | \item \textbf{However}, the state has to act as both
304 | \begin{enumerate}
305 | \item A summary of the entire sequence
306 | \item The meaning of the current word in context
307 | \end{enumerate}
308 | \end{itemize}
309 |
310 | \end{frame}
311 |
312 | \begin{frame}{Contextualized representations}
313 |
314 | \begin{columns}[T] % align columns
315 |
316 | \begin{column}{.48\textwidth}
317 |
318 | \begin{figure}[h]
319 | \includegraphics[height=7cm]{seq2seq_attention_t1.pdf}
320 | \end{figure}
321 | \end{column}
322 |
323 | \begin{column}{.48\textwidth}
324 | Step $1$ of encoder-decoder attention:
325 | \pause
326 | \begin{itemize}
327 | \item We obtain relevant information \textbf{for current state} from input sequence
328 | \pause
329 | \item The result of the attention operator should also contain \textbf{contextual cues}
330 | \end{itemize}
331 | \end{column}
332 |
333 | \end{columns}
334 |
335 | \end{frame}
336 |
337 |
338 | \begin{frame}{Contextualized representations}
339 |
340 | \begin{columns}[T] % align columns
341 |
342 | \begin{column}{.48\textwidth}
343 |
344 | \begin{figure}[h]
345 | \includegraphics[height=7cm]{seq2seq_attention_t2.pdf}
346 | \end{figure}
347 | \end{column}
348 | \pause
349 | \begin{column}{.48\textwidth}
350 | \begin{figure}[h]
351 | \includegraphics[height=7cm]{seq2seq_attention_t3.pdf}
352 | \end{figure}
353 | \end{column}
354 |
355 | \end{columns}
356 |
357 | \end{frame}
358 |
359 | \begin{frame}{Contextualized representations}
360 | Why not \textbf{cut out the middleman} (RNN)?
361 | \pause
362 | \begin{itemize}
363 | \item We use the RNN state as the \textbf{query} for attention
364 | \pause
365 | \item We could instead use the input \textbf{word representation}
366 | \end{itemize}
367 | \pause
368 |
369 | Recall: scaled dot-product attention
370 |
371 | \noindent\begin{minipage}{0.4\textwidth}
372 | $$
373 | a = \sum_i^n \alpha_i v_i
374 | $$
375 | \end{minipage}%
376 | \begin{minipage}{0.2\textwidth}
377 | \end{minipage}
378 | \begin{minipage}{0.4\textwidth}
379 | $$
380 | \hat{\alpha}_i = \frac{q^T \cdot k_i}{\sqrt{d_{\text{k} } } }
381 | $$
382 | \end{minipage}
383 | \pause
384 |
385 | Recall: what are the query, keys \& values (in encoder-decoder attention)?
386 |
387 | \noindent\begin{minipage}{0.29\textwidth}
388 | \vspace{1em}
389 | $$
390 | q = f_q(s^{\text{dec}}_t)
391 | $$
392 | \end{minipage}%
393 | \begin{minipage}{0.29\textwidth}
394 | $$
395 | K = f_k(\{s^{\text{enc}}_i\}_{i=1}^n)
396 | $$
397 | \end{minipage}
398 | \begin{minipage}{0.29\textwidth}
399 | $$
400 | V = f_v(\{s^{\text{enc}}_i\}_{i=1}^n)
401 | $$
402 | \end{minipage}
403 |
404 | \pause
405 | Where $f_q, f_k, f_v$ are arbitrary functions (neural network layers).
406 |
407 | \end{frame}
408 |
409 | \subsection{The Transformer attention block}
410 |
411 | \begin{frame}{The Transformer attention block}
412 |
413 | \begin{columns}[T]
414 | \begin{column}{.48\textwidth}
415 |
416 | \begin{figure}[h]
417 | \includegraphics[height=7cm]{anno_transformer}
418 | \end{figure}
419 | \end{column}
420 | \begin{column}{.48\textwidth}
421 | \textbf{Encoder} part of the Transformer block
422 |
423 | \begin{itemize}
424 | \item Inputs: $\{\bm{x}^l_i\}_{i=1}^n; \quad \bm{x}_i \in \mathbb{R}^{d_m}$
425 | \item $x^0_i \to \text{word embeddings}$
426 | \pause
427 | \end{itemize}
428 |
429 | Goal: \textbf{contextualize} word embeds.
430 |
431 | \begin{enumerate}
432 | \pause
433 | \item Transform \textbf{each} embedding to its query, key and value reprs.
434 | \pause
435 | \item Apply \textbf{pairwise} attention between all inputs
436 | \pause
437 | \item Use the outputs as word embeddings for \textbf{next layer}
438 | \end{enumerate}
439 | \end{column}
440 | \end{columns}
441 | \end{frame}
442 |
443 | \begin{frame}{The Transformer attention block}
444 |
445 | \begin{enumerate}
446 | \item Each layer $l$ has its own query, key and value linear transformation
447 | $$
448 | \bm{W}^l_q, \bm{W}^l_k, \bm{W}^l_v \in \mathbb{R}^{d_m \times d_m}
449 | $$
450 | \pause
451 | \item Transform the inputs of the current layer $\{\bm{x}^l_i\}$ into the keys, queries and values
452 | $$
453 | \bm{Q} = \bm{W}_q (\{\bm{x}^l_i\}) \quad \bm{K} = \bm{W}_k (\{\bm{x}^l_i\}) \quad \bm{V} = \bm{W}_v (\{\bm{x}^l_i\})
454 | $$
455 | \pause
456 | \item Apply scaled dot-product attention
457 | $$
458 | \text{Attention} (\bm{Q},\bm{K},\bm{V}) = \text{softmax} \left( \frac{\bm{Q}\bm{K}^T}{\sqrt{d_m}}\right) \bm{V}
459 | $$
460 |
461 | \end{enumerate}
462 |
463 |
464 | \end{frame}
465 |
466 |
467 | \begin{frame}{The Transformer attention block: scaled dot-product}
468 | \begin{columns}[T] % align columns
469 |
470 | \begin{column}{.48\textwidth}
471 |
472 | \begin{figure}[h]
473 | \includegraphics[height=5cm]{anno_transformer_attn_block}
474 | \caption*{Figure from \cite{Vaswani.et.al.2017}}
475 | \end{figure}
476 | \end{column}
477 |
478 | \begin{column}{.48\textwidth}
479 | $$
480 | \text{Attention} (\bm{Q},\bm{K},\bm{V}) = \text{softmax} \left( \frac{\bm{Q}\bm{K}^T}{\sqrt{d_m}} \right) \bm{V}
481 | $$
482 | \pause
483 | \begin{itemize}
484 | \item Matmul between $\bm{Q}$ and $\bm{K} \to$ \textbf{energy}
485 | \pause
486 | \item Masking (why?)
487 | \begin{itemize}
488 | \item We might not want to attend to \textbf{all} tokens
489 | \end{itemize}
490 | \pause
491 | \item Output $=$ weighted sum
492 | \end{itemize}
493 | \end{column}
494 |
495 | \end{columns}
496 |
497 | \end{frame}
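
A minimal NumPy sketch of scaled dot-product attention with an optional mask (our own illustration under the notation above; the function and variable names are invented):

import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)  # subtract max for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

def attention(Q, K, V, mask=None):
    """Q, K, V: (n, d) arrays; mask: (n, n) boolean, True where attending is allowed."""
    d = Q.shape[-1]
    energy = Q @ K.T / np.sqrt(d)              # matmul between Q and K -> energy
    if mask is not None:
        energy = np.where(mask, energy, -1e9)  # blocked positions get ~zero weight
    alpha = softmax(energy, axis=-1)           # attention weights
    return alpha @ V                           # output = weighted sum of values

rng = np.random.default_rng(0)
Q = K = V = rng.normal(size=(4, 8))  # self-attention: all three come from the same tokens
out = attention(Q, K, V)             # (4, 8) contextualized representations
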
498 |
499 |
500 | \begin{frame}{The Transformer attention block: multi-head attention}
501 |
502 | \begin{columns}[T]
503 | \begin{column}{.48\textwidth}
504 |
505 | \begin{figure}[h]
506 | \includegraphics[height=7cm]{anno_transformer}
507 | \end{figure}
508 | \end{column}
509 | \begin{column}{.48\textwidth}
510 | However: we are using \textbf{multi-head} attention!
511 | \vspace{1em}
512 | \pause
513 |
514 | Idea: there could be \textbf{multiple aspects} in which two tokens can be similar
515 | \pause
516 | \begin{itemize}
517 | \item Intuition: \textit{each} hidden dimension $\approx$ one linguistic feature
518 | \item $\to$ perform \textbf{multiple} energy computations
519 | \end{itemize}
520 | \end{column}
521 | \end{columns}
522 | \end{frame}
523 |
524 |
525 | \begin{frame}{The Transformer attention block: multi-head attention}
526 |
527 | \textbf{Recall:} Transform the inputs of the current layer $\{\bm{x}^l_i\}$ into the keys, queries and values
528 | $$
529 | \bm{Q} = \bm{W}_q (\{\bm{x}^l_i\}) \quad \bm{K} = \bm{W}_k (\{\bm{x}^l_i\}) \quad \bm{V} = \bm{W}_v (\{\bm{x}^l_i\})
530 | $$
531 |
532 | \pause
533 | Each matrix $\bm{Q}, \bm{K}, \bm{V} \in \mathbb{R}^{n \times d_m}$, where $d_m$ is the \textit{model dimension}.
534 |
535 | \pause
536 | \textbf{Split} each query/key/value into $h$ \textbf{heads} (aspects) by \textit{reshaping}.
537 |
538 | $$
539 | \bm{Q}, \bm{K}, \bm{V} \in \mathbb{R}^{n \times d_m} \to \bm{Q}, \bm{K}, \bm{V} \in \mathbb{R}^{n \times h \times d_m/h}
540 | $$
541 | \pause
542 | \begin{itemize}
543 | \item \textbf{Note}: $d_m$ \textbf{has} to be divisible by $h$
544 | \end{itemize}
545 | \pause
546 | The remaining process continues as usual.
547 |
548 | \end{frame}
549 |
550 | \begin{frame}{The Transformer attention block: multi-head attention}
551 |
552 | \textbf{Recall}: 3. Apply scaled dot-product attention
553 | $$
554 | \text{Attention} (\bm{Q},\bm{K},\bm{V}) = \text{softmax} \left( \frac{\bm{Q}\bm{K}^T}{\sqrt{d_m}}\right) \bm{V}
555 | $$
556 |
557 | \pause
558 |
559 | Apply attention $h$ times \textbf{in parallel}, then \textbf{concatenate} the results.
560 |
561 | \pause
562 |
563 |
564 | $$
565 | \text{Attention}_j (\bm{Q}_j, \bm{K}_j, \bm{V}_j) = \text{softmax} \left( \frac{\bm{Q}_j \bm{K}_j^T}{ \sqrt{d_m / h} } \right) \bm{V}_j
566 | $$
567 |
568 | \pause
569 |
570 | where $\{ \bm{Q}_j, \bm{K}_j, \bm{V}_j \}_{j=1}^{h}$ are the individual \textit{heads}.
571 |
572 | \end{frame}
573 |
574 | \begin{frame}{The Transformer attention block: multi-head attention}
575 |
576 | \begin{columns}[T]
577 | \begin{column}{.48\textwidth}
578 |
579 | \begin{figure}[h]
580 | \includegraphics[height=7cm]{anno_trf_multihead}
581 | \end{figure}
582 | \end{column}
583 | \begin{column}{.48\textwidth}
584 | Although this entire process happens behind the scenes, we will still refer to (multi-head) attention as
585 |
586 | $$
587 | \text{Attention} (\bm{Q},\bm{K},\bm{V}) = \text{softmax} \left( \frac{\bm{Q}\bm{K}^T}{\sqrt{d_m}}\right) \bm{V}
588 | $$
589 |
590 | for brevity.
591 | \end{column}
592 | \end{columns}
593 |
594 | \end{frame}
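
The head splitting and concatenation can be made concrete with a short NumPy sketch (again our own illustration; multi_head_attention and its arguments are hypothetical names, and we omit the extra output projection that many implementations apply after concatenation):

import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

def multi_head_attention(X, Wq, Wk, Wv, h):
    """X: (n, d_m); Wq, Wk, Wv: (d_m, d_m); d_m has to be divisible by h."""
    n, d_m = X.shape
    d_h = d_m // h
    # 1. project the inputs, 2. split the last dimension into h heads: (h, n, d_h)
    Q = (X @ Wq).reshape(n, h, d_h).transpose(1, 0, 2)
    K = (X @ Wk).reshape(n, h, d_h).transpose(1, 0, 2)
    V = (X @ Wv).reshape(n, h, d_h).transpose(1, 0, 2)
    # 3. h scaled dot-product attentions in parallel, scaled by sqrt(d_m / h)
    energy = Q @ K.transpose(0, 2, 1) / np.sqrt(d_h)  # (h, n, n)
    alpha = softmax(energy, axis=-1)
    out = alpha @ V                                   # (h, n, d_h)
    # 4. concatenate the heads back into (n, d_m)
    return out.transpose(1, 0, 2).reshape(n, d_m)

rng = np.random.default_rng(0)
X = rng.normal(size=(5, 16))
Wq, Wk, Wv = (rng.normal(size=(16, 16)) for _ in range(3))
out = multi_head_attention(X, Wq, Wk, Wv, h=4)  # (5, 16)
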
595 |
596 | \begin{frame}{The Transformer attention block: residual connection}
597 |
598 | \begin{columns}[T]
599 | \begin{column}{.48\textwidth}
600 |
601 | \begin{figure}[h]
602 | \includegraphics[height=7cm]{anno_trf_hlattn.png}
603 | \end{figure}
604 | \end{column}
605 | \begin{column}{.48\textwidth}
606 | We use \textit{residual connections} with the input of the layer
607 | \begin{enumerate}
608 | \item $\hat{x}^l$ is the output of attention
609 | $$
610 | \hat{x}^l = \text{Attention} (\bm{Q}^l,\bm{K}^l,\bm{V}^l)
611 | $$
612 | \item We apply the residual connection and normalize
613 | $$
614 | x^{l*} = \text{LayerNorm} ( x^l + \hat{x}^l )
615 | $$
616 | \seti
617 | \end{enumerate}
618 | \end{column}
619 | \end{columns}
620 |
621 | \end{frame}
622 |
623 | \begin{frame}{The Transformer attention block: position-wise linear layer}
624 |
625 | \begin{columns}[T]
626 | \begin{column}{.48\textwidth}
627 |
628 | \begin{figure}[h]
629 | \includegraphics[height=7cm]{anno_trf_hllinear.png}
630 | \end{figure}
631 | \end{column}
632 | \begin{column}{.48\textwidth}
633 | \begin{enumerate}
634 | \conti
635 | \item We apply an extra \textbf{linear transformation} to each individual representation
636 |
637 | $$
638 | x^{l+1} = \text{LayerNorm} (x^{l*} + f^l_{hh} (x^{l*}))
639 | $$
640 |
641 | Where $f_{hh}$ is an arbitrary transformation (single hidden layer NN)
642 |
643 | \item We use $x^{l+1}$ as the input to the \textbf{next} layer $l+1$
644 | \end{enumerate}
645 | \end{column}
646 | \end{columns}
647 |
648 | \end{frame}
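
The residual connections, LayerNorm and position-wise linear layer from the two frames above can be sketched as follows (a simplified illustration: the learned gain and bias of LayerNorm are omitted, and the attention output is replaced by a random stand-in):

import numpy as np

def layer_norm(x, eps=1e-5):
    mu = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps)  # gain/bias omitted for brevity

def ffn(x, W1, b1, W2, b2):
    # position-wise: the same single-hidden-layer NN applied to every token row
    return np.maximum(0.0, x @ W1 + b1) @ W2 + b2

rng = np.random.default_rng(0)
n, d_m, d_ff = 5, 8, 32            # hypothetical toy sizes
x = rng.normal(size=(n, d_m))      # layer input x^l
x_hat = rng.normal(size=(n, d_m))  # stand-in for the attention output \hat{x}^l
W1, b1 = rng.normal(size=(d_m, d_ff)), np.zeros(d_ff)
W2, b2 = rng.normal(size=(d_ff, d_m)), np.zeros(d_m)

x_star = layer_norm(x + x_hat)                             # residual + LayerNorm
x_next = layer_norm(x_star + ffn(x_star, W1, b1, W2, b2))  # input to layer l+1
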
649 |
650 | \subsection{Byte-pair encodings}
651 |
652 | \begin{frame}{Byte-pair encodings}
653 | \textbf{Recall}: sub-word embeddings
654 |
655 | \begin{block}{Sub-word embeddings}
656 | Each character $n$-gram has its own embedding.
657 |
658 | Resolves the issues of \textbf{rare words} and \textbf{typos}, and does not ignore the \textbf{morphology} of each word.
659 |
660 | However -- it scales poorly (there are \textbf{many} character $n$-grams)
661 | \end{block}
662 |
663 | \pause
664 |
665 | \textbf{Byte pair encodings} -- characters ($1$-grams / \textit{bytes}) can represent \textbf{any} word.
666 |
667 |
668 | \end{frame}
669 |
670 | \begin{frame}{Byte-pair encodings}
671 |
672 | \begin{block}{Byte-pair encodings}
673 | Start at \textbf{character} level.
674 |
675 | Merge the two \textbf{most frequently co-occurring} characters into a \textbf{new character}.
676 |
677 | Continue until you reach the desired vocabulary size.
678 | \textbf{Each word} will always be represented.
679 |
680 | \end{block}
681 |
682 | \pause
683 |
684 | \textbf{Variants}: WordPiece, SentencePiece, subword-nmt (\href{https://github.com/google/sentencepiece}{\underline{GitHub}})
685 |
686 | \pause
687 |
688 | The differences are in the \textbf{merging criterion}:
689 | \pause
690 | \begin{itemize}
691 | \item \cite{Sennrich.et.al.2016.ACL} use \textbf{frequency} of co-occurrence;
692 | \item \cite{kudo2018subword} trains a \textbf{unigram language model}.
693 | \end{itemize}
694 |
695 | \end{frame}
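
A toy sketch of the frequency-based merging loop, in the spirit of Sennrich et al. (2016) but heavily simplified (the corpus, the "_" end-of-word marker and all names are invented for illustration):

import collections

# Toy corpus: words as tuples of symbols, with corpus frequencies.
vocab = {("l", "o", "w", "_"): 5,
         ("l", "o", "w", "e", "r", "_"): 2,
         ("n", "e", "w", "e", "s", "t", "_"): 6,
         ("w", "i", "d", "e", "s", "t", "_"): 3}

def merge(vocab, pair):
    """Replace every occurrence of the symbol pair with a single merged symbol."""
    merged = {}
    for word, freq in vocab.items():
        out, i = [], 0
        while i < len(word):
            if i < len(word) - 1 and (word[i], word[i + 1]) == pair:
                out.append(word[i] + word[i + 1]); i += 2
            else:
                out.append(word[i]); i += 1
        merged[tuple(out)] = freq
    return merged

for _ in range(10):  # merge budget ~ desired vocabulary size
    pairs = collections.Counter()
    for word, freq in vocab.items():
        for a, b in zip(word, word[1:]):
            pairs[(a, b)] += freq  # co-occurrence frequency, weighted by word count
    if not pairs:  # every word is already a single symbol
        break
    vocab = merge(vocab, pairs.most_common(1)[0][0])
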
696 |
697 | \subsection{Positional embeddings}
698 |
699 | \begin{frame}{Positional embeddings}
700 | The Transformer processes all tokens \textbf{in parallel} -- there is \textbf{no information} about word order, which in RNNs originated from the recurrence.
701 |
702 | \pause
703 |
704 | \textbf{Idea}: use functions that depend on the \textbf{position of a token in the sequence}. The closer two tokens are, the more similar the function values.
705 |
706 | \pause
707 |
708 | \begin{itemize}
709 | \item Sine and cosine waves
710 | $$
711 | PE_{(pos, 2i)} = \underbrace{\text{sin} ( \text{pos} / 10000^{2i / d_m})}_{\text{Even dimensions}}
712 | $$
713 | \pause
714 | $$
715 | PE_{(pos, 2i+1)} = \underbrace{\text{cos} ( \text{pos} / 10000^{2i / d_m})}_{\text{Odd dimensions}}
716 | $$
717 | \pause
718 | \item We \textbf{sum} the positional embedding vector to the token embedding
719 | \end{itemize}
720 |
721 | \end{frame}
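
The sine/cosine construction is easy to compute directly; a small NumPy sketch (our illustration, assuming an even model dimension d_m):

import numpy as np

def positional_embeddings(n_pos, d_m):
    """Sinusoidal positional embeddings; assumes d_m is even."""
    pos = np.arange(n_pos)[:, None]   # token positions 0..n_pos-1
    i = np.arange(d_m // 2)[None, :]  # index over dimension pairs
    angle = pos / 10000 ** (2 * i / d_m)
    pe = np.zeros((n_pos, d_m))
    pe[:, 0::2] = np.sin(angle)       # even dimensions
    pe[:, 1::2] = np.cos(angle)       # odd dimensions
    return pe

# Summed onto the token embeddings, e.g. X = token_embeddings + positional_embeddings(n, d_m)
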
722 |
723 | \begin{frame}{Positional embeddings}
724 | \begin{center}
725 | \begin{figure}[h]
726 | \includegraphics[height=5cm]{positional_embs}
727 | \end{figure}
728 | \end{center}
729 | \end{frame}
730 |
731 |
732 | \begin{frame}{Positional embeddings}
733 | Alternative: \textbf{trained} positional embeddings
734 |
735 | \pause
736 | \begin{itemize}
737 | \item Similar to word embeddings (byte pair embeddings)
738 | \item We randomly initialize a \textbf{position embedding matrix} and train it along with our model
739 | \pause
740 | \begin{itemize}
741 | \item \underline{Issues}?
742 | \pause
743 | \item How \textbf{large} is this position embedding matrix?
744 | \item What if test data contains sequences \textbf{longer} than training data?
745 | \end{itemize}
746 | \end{itemize}
747 |
748 | \end{frame}
749 |
750 |
751 | \section*{Recap}
752 |
766 |
767 | \begin{frame}{Takeaways}
768 |
769 | \begin{itemize}
770 | \item Transformer networks are \textbf{fully attentional networks}
771 | \begin{itemize}
772 | \item More efficient than RNNs (process tokens in parallel)
773 | \item Scale better than RNNs (deeper networks)
774 | \end{itemize}
775 | \item Multi-head attention
776 | \begin{itemize}
777 | \item Split each token representation into $h$ parts, perform $h$ attention operations in parallel
778 | \item Increased expressivity
779 | \end{itemize}
780 |
781 | \item They require \textbf{positional embeddings}
782 | \begin{itemize}
783 | \item Parallel processing $=$ no information about word position
784 | \end{itemize}
785 | \item Byte pair encoding allows for \textbf{open vocabulary}
786 | \end{itemize}
787 |
788 | \end{frame}
789 |
790 |
791 |
792 | \begin{frame}{License and credits}
793 |
794 | \begin{columns}
795 | \begin{column}{0.7\textwidth}
796 | Licensed under Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)
797 | \end{column}
798 | \begin{column}{0.2\textwidth}
799 | \includegraphics[width=0.9\linewidth]{img/cc-by-sa-icon.pdf}
800 | \end{column}
801 | \end{columns}
802 |
803 | \bigskip
804 |
805 | Credits
806 |
807 | \begin{scriptsize}
808 |
809 | Martin Tutek
810 |
811 | Content from ACL Anthology papers licensed under CC-BY \url{https://www.aclweb.org/anthology}
812 |
813 |
814 | \end{scriptsize}
815 |
816 | \end{frame}
817 |
818 |
819 |
820 | \end{document}
821 |
822 |
--------------------------------------------------------------------------------
/latex/lecture09/img/aiayn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/aiayn.png
--------------------------------------------------------------------------------
/latex/lecture09/img/anno_transformer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/anno_transformer.png
--------------------------------------------------------------------------------
/latex/lecture09/img/anno_transformer_attn_block.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/anno_transformer_attn_block.png
--------------------------------------------------------------------------------
/latex/lecture09/img/anno_trf_hlattn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/anno_trf_hlattn.png
--------------------------------------------------------------------------------
/latex/lecture09/img/anno_trf_hllinear.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/anno_trf_hllinear.png
--------------------------------------------------------------------------------
/latex/lecture09/img/anno_trf_multihead.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/anno_trf_multihead.png
--------------------------------------------------------------------------------
/latex/lecture09/img/cc-by-sa-icon.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/cc-by-sa-icon.pdf
--------------------------------------------------------------------------------
/latex/lecture09/img/positional_embs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/positional_embs.png
--------------------------------------------------------------------------------
/latex/lecture09/img/seq2seq_attention_t1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/seq2seq_attention_t1.pdf
--------------------------------------------------------------------------------
/latex/lecture09/img/seq2seq_attention_t2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/seq2seq_attention_t2.pdf
--------------------------------------------------------------------------------
/latex/lecture09/img/seq2seq_attention_t3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/seq2seq_attention_t3.pdf
--------------------------------------------------------------------------------
/latex/lecture09/img/seq2seq_attn_encdec.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/seq2seq_attn_encdec.pdf
--------------------------------------------------------------------------------
/latex/lecture09/img/ukp_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/ukp_logo.png
--------------------------------------------------------------------------------
/latex/lecture10/.gitignore:
--------------------------------------------------------------------------------
1 | dl4nlp2023-lecture10.pdf
2 | flags.tex
3 | pdf
4 |
--------------------------------------------------------------------------------
/latex/lecture10/compile-pdf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Inspired by: https://tex.stackexchange.com/a/1501
4 |
5 | # current lecture file name
6 | lecture_filename="dl4nlp2023-lecture10"
7 |
8 | # create the output folder (might already exist)
9 | mkdir -p pdf
10 |
11 | # Compile the lecture version with pauses
12 |
13 | # set empty flags
14 | echo "" > flags.tex
15 |
16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
17 | biber pdf/${lecture_filename}-lecture.bcf
18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
20 |
21 | # Compile the handout (no slide unfolding)
22 |
23 | # set the flag
24 | echo "\handouttrue" > flags.tex
25 |
26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
27 | biber pdf/${lecture_filename}-handout.bcf
28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
30 |
31 | # Cleaning up temporary latex files
32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache
33 |
--------------------------------------------------------------------------------
/latex/lecture10/img/BERT-language-modeling-masked-lm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/BERT-language-modeling-masked-lm.png
--------------------------------------------------------------------------------
/latex/lecture10/img/aiayn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/aiayn.png
--------------------------------------------------------------------------------
/latex/lecture10/img/anno_transformer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/anno_transformer.png
--------------------------------------------------------------------------------
/latex/lecture10/img/anno_transformer_attn_block.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/anno_transformer_attn_block.png
--------------------------------------------------------------------------------
/latex/lecture10/img/anno_trf_hlattn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/anno_trf_hlattn.png
--------------------------------------------------------------------------------
/latex/lecture10/img/anno_trf_hllinear.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/anno_trf_hllinear.png
--------------------------------------------------------------------------------
/latex/lecture10/img/anno_trf_multihead.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/anno_trf_multihead.png
--------------------------------------------------------------------------------
/latex/lecture10/img/bart-pretraining-tasks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bart-pretraining-tasks.png
--------------------------------------------------------------------------------
/latex/lecture10/img/bert-decoder-head-hl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-decoder-head-hl.png
--------------------------------------------------------------------------------
/latex/lecture10/img/bert-google.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-google.png
--------------------------------------------------------------------------------
/latex/lecture10/img/bert-next-sentence-prediction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-next-sentence-prediction.png
--------------------------------------------------------------------------------
/latex/lecture10/img/bert-pair-classification.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-pair-classification.png
--------------------------------------------------------------------------------
/latex/lecture10/img/bert-paper.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-paper.png
--------------------------------------------------------------------------------
/latex/lecture10/img/bert-results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-results.png
--------------------------------------------------------------------------------
/latex/lecture10/img/bert-seq-labeling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-seq-labeling.png
--------------------------------------------------------------------------------
/latex/lecture10/img/bert-single-sentence-clf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-single-sentence-clf.png
--------------------------------------------------------------------------------
/latex/lecture10/img/bert-spanex-qa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-spanex-qa.png
--------------------------------------------------------------------------------
/latex/lecture10/img/bert-viz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-viz.png
--------------------------------------------------------------------------------
/latex/lecture10/img/bert_dual_seq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert_dual_seq.png
--------------------------------------------------------------------------------
/latex/lecture10/img/bert_modeling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert_modeling.pdf
--------------------------------------------------------------------------------
/latex/lecture10/img/bert_nsp_anno.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert_nsp_anno.png
--------------------------------------------------------------------------------
/latex/lecture10/img/cc-by-sa-icon.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/cc-by-sa-icon.pdf
--------------------------------------------------------------------------------
/latex/lecture10/img/gifs/.ipynb_checkpoints/Untitled-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "bc9c5062",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "ename": "SyntaxError",
11 | "evalue": "unmatched ')' (2784638048.py, line 8)",
12 | "output_type": "error",
13 | "traceback": [
14 | "\u001b[0;36m Cell \u001b[0;32mIn [1], line 8\u001b[0;36m\u001b[0m\n\u001b[0;31m os.rename(os.path.join(path, file), os.path.join(path, file[:9]+\".png\")))\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m unmatched ')'\n"
15 | ]
16 | }
17 | ],
18 | "source": [
19 | "import os\n",
20 | "path = 'transformer-decoding-frames'\n",
21 | "files = os.listdir(path)\n",
22 | "\n",
23 | "\n",
24 | "for index, file in enumerate(files):\n",
25 | " \n",
26 | " os.rename(os.path.join(path, file), os.path.join(path, file[:9]+\".png\"))"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "id": "5279e0ac",
33 | "metadata": {},
34 | "outputs": [],
35 | "source": []
36 | }
37 | ],
38 | "metadata": {
39 | "kernelspec": {
40 | "display_name": "Python 3 (ipykernel)",
41 | "language": "python",
42 | "name": "python3"
43 | },
44 | "language_info": {
45 | "codemirror_mode": {
46 | "name": "ipython",
47 | "version": 3
48 | },
49 | "file_extension": ".py",
50 | "mimetype": "text/x-python",
51 | "name": "python",
52 | "nbconvert_exporter": "python",
53 | "pygments_lexer": "ipython3",
54 | "version": "3.9.12"
55 | }
56 | },
57 | "nbformat": 4,
58 | "nbformat_minor": 5
59 | }
60 |
--------------------------------------------------------------------------------
/latex/lecture10/img/gifs/transformer_decoding_1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/gifs/transformer_decoding_1.gif
--------------------------------------------------------------------------------
/latex/lecture10/img/gifs/transformer_decoding_2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/gifs/transformer_decoding_2.gif
--------------------------------------------------------------------------------
/latex/lecture10/img/positional_embs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/positional_embs.png
--------------------------------------------------------------------------------
/latex/lecture10/img/pretrained-lm-variants.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/pretrained-lm-variants.png
--------------------------------------------------------------------------------
/latex/lecture10/img/seq2seq_attention_t1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/seq2seq_attention_t1.pdf
--------------------------------------------------------------------------------
/latex/lecture10/img/seq2seq_attention_t2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/seq2seq_attention_t2.pdf
--------------------------------------------------------------------------------
/latex/lecture10/img/seq2seq_attention_t3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/seq2seq_attention_t3.pdf
--------------------------------------------------------------------------------
/latex/lecture10/img/seq2seq_attn_encdec.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/seq2seq_attn_encdec.pdf
--------------------------------------------------------------------------------
/latex/lecture10/img/t5-objectives.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/t5-objectives.png
--------------------------------------------------------------------------------
/latex/lecture10/img/the_transformer_mt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/the_transformer_mt.png
--------------------------------------------------------------------------------
/latex/lecture10/img/transformer_blocks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/transformer_blocks.png
--------------------------------------------------------------------------------
/latex/lecture10/img/transformer_encoder_decoder_stack.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/transformer_encoder_decoder_stack.png
--------------------------------------------------------------------------------
/latex/lecture10/img/transformer_encoder_decoder_stack_full.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/transformer_encoder_decoder_stack_full.png
--------------------------------------------------------------------------------
/latex/lecture10/img/transformer_encoders_decoders.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/transformer_encoders_decoders.png
--------------------------------------------------------------------------------
/latex/lecture10/img/transformer_residual_layer_norm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/transformer_residual_layer_norm.png
--------------------------------------------------------------------------------
/latex/lecture10/img/ukp_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/ukp_logo.png
--------------------------------------------------------------------------------
/latex/lecture10/img/word2vec_cbow.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/word2vec_cbow.pdf
--------------------------------------------------------------------------------
/latex/lecture11/.gitignore:
--------------------------------------------------------------------------------
1 | dl4nlp2023-lecture11.pdf
2 | flags.tex
3 | pdf
4 |
--------------------------------------------------------------------------------
/latex/lecture11/compile-pdf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Inspired by: https://tex.stackexchange.com/a/1501
4 |
5 | # current lecture file name
6 | lecture_filename="dl4nlp2023-lecture11"
7 |
8 | # create the output folder (might already exist)
9 | mkdir -p pdf
10 |
11 | # Compile the lecture version with pauses
12 |
13 | # set empty flags
14 | echo "" > flags.tex
15 |
16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
17 | biber pdf/${lecture_filename}-lecture.bcf
18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex
20 |
21 | # Compile the handout (no slide unfolding)
22 |
23 | # set the flag
24 | echo "\handouttrue" > flags.tex
25 |
26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
27 | biber pdf/${lecture_filename}-handout.bcf
28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex
30 |
31 | # Cleaning up temporary latex files
32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache
33 |
--------------------------------------------------------------------------------
/latex/lecture11/dl4nlp2023-lecture11.tex:
--------------------------------------------------------------------------------
1 | % !TeX program = lualatex
2 | % !BIB program = biber
3 | % Lualatex is important to render Fira fonts; with pdflatex it's just the regular one
4 | % ratio 16:9 -- https://tex.stackexchange.com/questions/14336/
5 |
6 | % compile two versions, inspired by https://tex.stackexchange.com/a/1501
7 | % use the script "compile-pdf.sh"
8 | \newif\ifhandout
9 | % if flags.tex does not exist, create an empty file to be able to compile in TeXstudio
10 | \input{flags}
11 |
12 | \ifhandout
13 | \documentclass[12pt,aspectratio=169,handout]{beamer}
14 | \else
15 | \documentclass[12pt,aspectratio=169]{beamer}
16 | \fi
17 |
18 | % adjust for 16:9
19 | % https://tex.stackexchange.com/questions/354022/modifying-the-margins-of-all-slides-in-beamer
20 | \setbeamersize{text margin left=0.3cm,text margin right=1.0cm}
21 |
22 | %\usepackage{xcolor}
23 |
24 | %%% better TOC
25 | \usetheme[subsectionpage=progressbar]{metropolis}
26 |
27 | % name in footer
28 | \setbeamertemplate{frame numbering}{\insertframenumber ~ | Dr.\ Martin Tutek}
29 |
30 | % blocks with background globally
31 | \metroset{block=fill}
32 |
33 | % adjust the background to be completely white
34 | \setbeamercolor{background canvas}{bg=white}
35 |
36 | % typeset mathematics on serif
37 | \usefonttheme[onlymath]{serif}
38 |
39 | % better bibliography using biber as backend
40 | \usepackage[natbib=true,backend=biber,style=authoryear-icomp,maxbibnames=30,maxcitenames=2,uniquelist=false,giveninits=true,doi=false,url=false,dashed=false,isbn=false]{biblatex}
41 | % shared bibliography
42 | \addbibresource{../dl4nlp-bibliography.bib}
43 | % disable "ibid" for repeated citations
44 | \boolfalse{citetracker}
45 |
46 | \definecolor{76abdf}{RGB}{118, 171, 223}
47 |
48 | \setbeamercolor{frametitle}{bg=76abdf, fg=white}
49 |
50 | \newcounter{saveenumi}
51 | \newcommand{\seti}{\setcounter{saveenumi}{\value{enumi}}}
52 | \newcommand{\conti}{\setcounter{enumi}{\value{saveenumi}}}
53 |
54 | \resetcounteronoverlays{saveenumi}
55 | % \usepackage{movie15}
56 | \usepackage{animate}
57 |
58 | \usepackage{xspace}
59 | % Emojis
60 | \usepackage{emoji}
61 | % Figs
62 | \usepackage{graphicx}
63 | \graphicspath{ {./img/} }
64 |
65 |
66 | % for derivatives, https://tex.stackexchange.com/a/412442
67 | \usepackage{physics}
68 |
69 | \usepackage{tikz}
70 | \usetikzlibrary{matrix, positioning}
71 | \usetikzlibrary{angles,quotes} % for angles
72 | \usetikzlibrary{backgrounds} % background
73 | \usetikzlibrary{decorations.pathreplacing} % curly braces
74 | \usetikzlibrary{calligraphy}
75 | \usetikzlibrary{calc} % for neural nets
76 |
77 | % for plotting functions
78 | \usepackage{pgfplots}
79 | \usepgfplotslibrary{dateplot}
80 |
81 | % sub-figures
82 | \usepackage{caption}
83 | \usepackage{subcaption}
84 |
85 | % Checkmark, xmark
86 | \usepackage{pifont}% http://ctan.org/pkg/pifont
87 |
88 | % book tabs
89 | \usepackage{booktabs}
90 |
91 | % caption*
92 | \usepackage{caption}
93 |
94 |
95 | % show TOC at every section start
96 | \AtBeginSection{
97 | \frame{
98 | \vspace{2em}
99 | \sectionpage
100 | \hspace*{2.2em}\begin{minipage}{10cm}
101 | \tableofcontents[currentsection]
102 | \end{minipage}
103 | }
104 | }
105 |
106 | % argmin, argmax
107 | \usepackage{amssymb}% http://ctan.org/pkg/amssymb
108 | \usepackage{amsmath}
109 |
110 | \DeclareMathOperator*{\argmax}{arg\!\max}
111 | \DeclareMathOperator*{\argmin}{arg\!\min}
112 | % softmax
113 | \DeclareMathOperator*{\softmax}{soft\!\max}
114 | % RNN
115 | \DeclareMathOperator*{\rnn}{RNN}
116 | % RNN star
117 | \DeclareMathOperator*{\rnnstar}{RNN^{*}}
118 | % bi-RNN
119 | \DeclareMathOperator*{\birnn}{biRNN}
120 |
121 | % bold math
122 | \usepackage{bm}
123 |
124 | % for \mathclap
125 | \usepackage{mathtools}
126 |
127 | % algorithms
128 | \usepackage[noend]{algpseudocode}
129 |
130 |
131 | % for neurons and layers in tikz
132 | \tikzset{
133 | neuron/.style={draw, rectangle, inner sep=2pt, minimum width=0.75cm, fill=blue!20},
134 | param/.style={draw, rectangle, inner sep=2pt, minimum width=0.75cm, fill=green!20},
135 | constant/.style={draw, rectangle, inner sep=2pt, minimum width=0.75cm, fill=black!15},
136 | state/.style={rectangle, inner sep=2pt, minimum width=0.75cm, fill=black!5},
137 | }
138 |
139 | % for strike-through text
140 | \usepackage[normalem]{ulem}
141 |
142 |
143 | \title{Deep Learning for Natural Language Processing}
144 | \subtitle{Lecture 11 -- Text generation 4: Decoder-only Models and GPT}
145 | \date{June 27, 2023}
146 | \author{Dr.\ Martin Tutek}
147 | \institute{Ubiquitous Knowledge Processing \hfill \includegraphics[height=1.cm]{img/ukp_logo.png} \\
148 | Department of Computer Science\\
149 | Technical University of Darmstadt \hfill \href{https://www.informatik.tu-darmstadt.de/ukp/ukp_home/index.en.jsp}{\underline{UKP Web}}}
150 | %\titlegraphic{\hfill }
151 |
152 | \begin{document}
153 |
154 | \maketitle
155 |
156 | \begin{frame}{Recap}
157 | In the previous lecture we:
158 | \begin{itemize}
159 | \item Introduced the \textbf{BERT model}
160 | \item Introduced the two pretraining tasks for BERT: \textbf{MLM} and \textbf{NSP}
161 | \item Explained the connection between MLM and CBOW-style training
162 | \item Explained the purpose of NSP -- learning a \textbf{sentence embedding}
163 | \item Analyzed how to \textbf{apply BERT} to various \textbf{downstream tasks} such as classification and QA
164 | \item Gave an overview of various other pretraining tasks for LLMs
165 | \end{itemize}
166 | \end{frame}
167 |
168 | \begin{frame}{Motivation}
169 | Recall: using the \textbf{same model} for \textbf{multiple tasks} without task-specific decoder heads
170 | \begin{figure}[h]
171 | \includegraphics[height=4.5cm]{t5-objectives}
172 | \caption*{Image from \href{https://jmlr.org/papers/volume21/20-074/20-074.pdf}{\underline{T5 paper}}}
173 | \end{figure}
174 | \end{frame}
175 |
176 | \begin{frame}{Motivation}
177 | Recall: using the \textbf{same model} for \textbf{multiple tasks} without task-specific decoder heads
178 | \begin{figure}[h]
179 | \includegraphics[height=4.5cm]{t5-anno-prompts}
180 | \caption*{Image from \href{https://jmlr.org/papers/volume21/20-074/20-074.pdf}{\underline{T5 paper}}}
181 | \end{figure}
182 | \end{frame}
183 |
184 | \section{Types of Transformer Architectures}
185 |
186 | \begin{frame}{Encoder-Decoder Transformer}
187 |
188 | \begin{figure}[h]
189 | \includegraphics[height=4.5cm]{transformer_enc_dec.pdf}
190 | \end{figure}
191 |
192 | \end{frame}
193 |
194 |
195 | \begin{frame}{Bidirectional Encoder-only Transformer}
196 | \begin{columns}[T] % align columns
197 | \begin{column}{.48\textwidth}
198 | \begin{figure}[h]
199 | \includegraphics[height=4.5cm]{bidirectional_trf_encoder.pdf}
200 | \end{figure}
201 | \end{column}
202 |
203 | \begin{column}{.48\textwidth}
204 | \begin{itemize}
205 | \item Efficient encoding \emoji{check-mark}
206 | \item Versatile base for downstream tasks \emoji{check-mark}
207 | \pause
208 | \item Can't \textbf{really} generate text \emoji{cross-mark}
209 | \end{itemize}
210 | \end{column}
211 |
212 | \end{columns}
213 | \end{frame}
214 |
215 |
216 | \begin{frame}{Autoregressive Decoder-only Transformer}
217 | \begin{columns}[T] % align columns
218 | \begin{column}{.48\textwidth}
219 | \begin{figure}[h]
220 | \includegraphics[height=4.5cm]{autoregressive_trf_decoder.pdf}
221 | \end{figure}
222 | \end{column}
223 |
224 | \begin{column}{.48\textwidth}
225 | An \textbf{autoregressive} (causal) language model uses \textbf{past} values of a time series to predict future values.
226 | \pause
227 | \begin{itemize}
228 | \item Didn't we decide not to use these because they were inefficient?
229 | \pause \begin{center} \textbf{(RNNs)} \end{center}
230 | \pause
231 | \item Yes, but...
232 | \begin{enumerate}
233 | \item Hardware has improved
234 | \item Autoregressive models are \textit{really} good at generating text
235 | \end{enumerate}
236 | \end{itemize}
237 | \end{column}
238 | \end{columns}
239 |
240 | \end{frame}
241 |
242 | \begin{frame}{Differences between attention masks}
243 | \begin{figure}[h]
244 | \includegraphics[height=4.5cm]{attention-types}
245 | \end{figure}
246 | Read: y axis $\to$ tokens attending, x axis $\to$ tokens attended to.
247 |
248 | Black cell $\to$ token visible, white cell $\to$ token \textbf{masked}
249 | \end{frame}
250 |
251 | \begin{frame}{Differences between attention masks}
252 | \begin{figure}[h]
253 | \includegraphics[height=4.5cm]{attention-masks-anno}
254 | \end{figure}
255 | Read: y axis $\to$ tokens attending, x axis $\to$ tokens attended to.
256 |
257 | Black cell $\to$ token visible, white cell $\to$ token \textbf{masked}
258 | \end{frame}
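% A worked instance of the convention above: a 4-token causal mask, with rows
% as the attending tokens and columns as the attended-to tokens (1 = visible,
% 0 = masked). The matrix is our own illustration, not a figure from the deck:
%   \begin{pmatrix} 1&0&0&0 \\ 1&1&0&0 \\ 1&1&1&0 \\ 1&1&1&1 \end{pmatrix}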
259 |
260 | \begin{frame}{Attention masks}
261 |
262 | Recall: the attention mechanism
263 |
264 | \noindent\begin{minipage}{0.4\textwidth}
265 | \begin{equation*}
266 | a = \sum_{i=1}^{n} \alpha_i v_i
267 | \end{equation*}
268 | \end{minipage}%
269 | \begin{minipage}{0.2\textwidth}
270 | \end{minipage}
271 | \begin{minipage}{0.4\textwidth}
272 | \begin{equation*}
273 | \hat{\alpha}_i = \frac{q^T \cdot k_i}{\sqrt{d_{\text{model}}}}
274 | \end{equation*}
275 | \end{minipage}\vskip1em
276 |
277 | \pause
278 | How do we do \textbf{masking}?
279 |
280 | In the \textbf{causal} scenario (each token can only attend to itself and \textbf{past} tokens):
281 | \pause
282 |
283 | For a query $q = W_q(s_j)$ computed from the hidden state $s_j$ at position $j$:
284 | \pause
285 |
286 | $$
287 | \alpha_i =
288 | \begin{cases}
289 | \alpha_i,& \text{if } j\geq i\\
290 | 0, & \text{otherwise}
291 | \end{cases}
292 | $$
293 | \pause
294 | \textbf{NB: actually}, we set $\hat{\alpha}_i$ to $-\infty$ (before the softmax)
295 |
296 | \end{frame}
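% A minimal PyTorch-style sketch of the masking step described above; the
% variable names, toy shapes, and single-head setting are our assumptions:
%   import torch
%   T, d_model = 4, 8                         # toy sequence length / width
%   q = torch.randn(T, d_model)               # one query per position
%   k = torch.randn(T, d_model)               # one key per position
%   scores = q @ k.T / d_model ** 0.5         # \hat{\alpha}: scaled dot products
%   future = torch.triu(torch.ones(T, T, dtype=torch.bool), diagonal=1)
%   scores = scores.masked_fill(future, float("-inf"))  # hide future tokens
%   alpha = scores.softmax(dim=-1)            # masked entries get weight 0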
297 |
298 |
299 | \begin{frame}{Differences between attention masks}
300 | \begin{figure}[h]
301 | \includegraphics[height=5.5cm]{attention-patterns}
302 | \end{figure}
303 | \end{frame}
304 |
305 | \section{Autoregressive decoder-only Models}
306 |
307 | \begin{frame}{Variants of language modeling}
308 | \begin{figure}[h]
309 | \includegraphics[height=3.5cm]{language-modeling-types}
310 | \end{figure}
311 |
312 | \begin{itemize}
313 | \item (Full) language modeling $\to$ given previous tokens, predict next token, for \textbf{every token} in sequence
314 | \pause
315 | \item Prefix language modeling $\to$ (1) feed a prefix (where mask \textbf{does not have to be causal}), (2) full LM starting after prefix
316 | \pause
317 | \item Masked language modeling $\to$ \textbf{reconstruct masked} tokens/spans
318 | \end{itemize}
319 |
320 | \end{frame}
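% The three variants written out as (our own, hedged) training objectives, for
% a sequence x_1 .. x_T with prefix length k and masked position set M:
%   full LM:   maximize \sum_{t=1}^{T} \log p(x_t \mid x_{<t})
%   prefix LM: maximize \sum_{t=k+1}^{T} \log p(x_t \mid x_{\leq k}, x_{k+1..t-1}),
%              with a non-causal mask allowed over the prefix x_{\leq k}
%   masked LM: maximize \sum_{t \in M} \log p(x_t \mid x_{\setminus M})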
321 |
322 | \begin{frame}{Autoregressive decoder-only models}
323 | \begin{figure}[h]
324 | \includegraphics[height=3cm]{gpt2-paper}
325 | \end{figure}
326 |
327 | Introduction of \textbf{GPT-2}, an autoregressive Transformer decoder-only model trained on full language modeling.
328 |
329 | \pause
330 |
331 | \textbf{GPT-3} is \textit{``just''} a \textbf{larger} version of GPT-2
332 |
333 | \end{frame}
334 |
335 | \begin{frame}{Autoregressive decoder-only models}
336 | \begin{figure}[h]
337 | \includegraphics[height=3cm]{gpt2-title-anno}
338 | \end{figure}
339 |
340 | Introduction of \textbf{GPT-2}, an autoregressive Transformer decoder-only model trained on full language modeling.
341 |
342 | \hspace{1em} What does ``unsupervised multitask learners'' mean \emoji{thinking}?
343 | \end{frame}
344 |
345 | \subsection{Zero-shot, one-shot and few-shot learning}
346 |
347 | \begin{frame}{Zero-shot, one-shot and few-shot learning}
348 |
349 | \textbf{Recall:} T5 was able to perform \textbf{multiple tasks} at the same time
350 |
351 | ... but it was trained on them and on keywords that indicate the task.
352 | \vspace{1em}
353 | \pause
354 |
355 | For a model that has \textbf{not been trained on the downstream task}:
356 | \begin{itemize}
357 | \item \textbf{Few-shot} learning: tune the pretrained model on a \textbf{small number} of target-task instances, \textbf{then perform the task (!)}
358 | \pause
359 | \item \textbf{One-shot} learning: tune the pretrained model on \textbf{one instance (!)} \textit{per class}, then perform the task
360 | \pause
361 | \item \textbf{Zero-shot} learning: \textbf{don't tune the pretrained model (!!!)}, then perform the task
362 | \end{itemize}
363 |
364 | \end{frame}
365 |
366 | \begin{frame}{Zero-shot learning}
367 | Zero-shot learning $\approx$ unsupervised learning
368 | \pause
369 |
370 | \vspace{1em}
371 | Why $\approx$?
372 | \pause
373 |
374 | \textbf{Assumption:} when trained on a \textbf{massive} corpus of text, the language model is likely to \textbf{see some tasks naturally} occur (e.g. question answering).
375 |
376 | \pause
377 | \begin{itemize}
378 | \item We want to \textbf{transform} our task into a \textbf{generative one} by providing the model with a \textbf{prompt} that makes the label of the input instance the \textbf{most likely generated sequence}.
379 | \end{itemize}
380 |
381 | \end{frame}
382 |
383 | \begin{frame}%{Zero-shot learning}
384 | \begin{columns}[T] % align columns
385 | \begin{column}{.48\textwidth}
386 | \begin{figure}[h]
387 | \includegraphics[height=7.3cm]{gpt2-demonstrations}
388 | \caption*{Image from \href{https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf}{\underline{GPT2 paper}}}
389 | \end{figure}
390 | \end{column}
391 |
392 | \begin{column}{.48\textwidth}
393 | \vspace{1.5em}
394 | The internet \textbf{does} contain samples of various NLP tasks
395 | \pause
396 | \begin{itemize}
397 | \item ... and a large language model (LLM) \textbf{can} remember them;
398 | \pause
399 | \item ... and, when \textbf{prompted} to perform a task, \textbf{recall them} without having seen that prompt before;
400 | \pause
401 | \item ... and \textbf{perform them accurately}.
402 | \end{itemize}
403 | \end{column}
404 | \end{columns}
405 |
406 | \end{frame}
407 |
408 | \begin{frame}{GPT-2: Zero-shot question answering}
409 | \begin{figure}[h]
410 | \includegraphics[height=3.5cm]{gpt2-zeroshot-qa}
411 | \caption*{Image from \href{https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf}{\underline{GPT2 paper}}}
412 | \end{figure}
413 | \end{frame}
414 |
415 |
416 | \begin{frame}{GPT-2: Prompted one-shot question answering}
417 | \begin{figure}[h]
418 | \includegraphics[height=6.5cm]{gpt2-prompting-qa}
419 | \caption*{Image from \href{https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf}{\underline{GPT2 paper}}}
420 | \end{figure}
421 | \end{frame}
422 |
423 | \begin{frame}
424 | \begin{figure}[h]
425 | \includegraphics[height=8.5cm]{gpt2-prompting-qa}
426 | % \caption*{Image from \href{https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf}{\underline{GPT2 paper}}}
427 | \end{figure}
428 | \end{frame}
429 |
430 | \begin{frame}
431 | \begin{figure}[h]
432 | \includegraphics[height=8.5cm]{gpt2-prompt-anno}
433 | % \caption*{Image from \href{https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf}{\underline{GPT2 paper}}}
434 | \end{figure}
435 | \end{frame}
436 |
437 | \subsection{Prompting}
438 |
439 | \begin{frame}{Prompting}
440 | \textbf{A prompt} is a piece of text inserted in the input examples, so that the original task \textbf{can be formulated as} a (masked) \textbf{language modeling} problem.
441 |
442 | \pause
443 |
444 | \begin{figure}[h]
445 | \includegraphics[height=5cm]{zero-shot-translation-gpt3}
446 | \caption*{Image from \href{https://arxiv.org/pdf/2005.14165.pdf}{\underline{GPT3 paper}}}
447 | \end{figure}
448 |
449 | \end{frame}
450 |
451 |
452 | \begin{frame}{Prompting}
453 | \textbf{A prompt} is a piece of text inserted in the input examples, so that the original task \textbf{can be formulated as} a (masked) \textbf{language modeling} problem.
454 |
455 | \begin{figure}[h]
456 | \includegraphics[height=5cm]{one-shot-translation-gpt3}
457 | \caption*{Image from \href{https://arxiv.org/pdf/2005.14165.pdf}{\underline{GPT3 paper}}}
458 | \end{figure}
459 |
460 | \end{frame}
461 |
462 |
463 | \begin{frame}{Prompting}
464 | \textbf{A prompt} is a piece of text inserted in the input examples, so that the original task \textbf{can be formulated as} a (masked) \textbf{language modeling} problem.
465 |
466 | \begin{figure}[h]
467 | \includegraphics[height=5cm]{few-shot-translation-gpt3}
468 | \caption*{Image from \href{https://arxiv.org/pdf/2005.14165.pdf}{\underline{GPT3 paper}}}
469 | \end{figure}
470 |
471 | \end{frame}
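% Illustrative zero-/one-/few-shot prompt formats in the style of the GPT-3
% figures above; the exact strings are our paraphrase, not quoted slides:
%   zero-shot: "Translate English to French:\ncheese =>"
%   one-shot:  "Translate English to French:\nsea otter => loutre de mer\ncheese =>"
%   few-shot:  as one-shot, but with several example pairs before "cheese =>"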
472 |
473 | \begin{frame}{Prompting works well}
474 | \begin{figure}[h]
475 | \includegraphics[height=4.5cm]{gpt3-translation-results}
476 | \caption*{Image from \href{https://arxiv.org/pdf/2005.14165.pdf}{\underline{GPT3 paper}}}
477 | \end{figure}
478 | GPT-3 \textbf{without fine-tuning} performs better than \textbf{unsupervised} alternatives, and sometimes even \textbf{better} than the supervised state of the art!
479 | \end{frame}
480 |
481 | \begin{frame}{In-context learning}
482 | \textbf{In-context learning} is the paradigm in which an LLM learns to solve a new task at inference time \textbf{without any change to its weights}, based only on examples in the \textbf{prompt}.
483 | \pause
484 |
485 | \hspace{1em}$\approx$ umbrella term for zero-, one- and few-shot learning with the task description also contained in the prompt.
486 | \vspace{1em}
487 |
488 | \pause
489 | \textit{"During \textbf{unsupervised pre-training}, a language model develops a broad set of skills and pattern recognition abilities. It then uses these abilities at inference time to rapidly adapt to or recognize the desired task. We use the term “in-context learning” to describe the inner loop of this process, which occurs \textbf{within the forward-pass} upon each sequence."} -- from GPT3 paper
490 |
491 | \end{frame}
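% A hedged Python sketch of few-shot in-context classification; `lm.logprob`
% is a hypothetical stand-in for scoring a continuation under any causal LM:
%   def icl_predict(lm, demos, x, labels):
%       # build the prompt from demonstrations; no weights are updated anywhere
%       prompt = "".join(f"Input: {d}\nLabel: {y}\n" for d, y in demos)
%       prompt += f"Input: {x}\nLabel:"
%       # pick the label whose verbalization is the most likely continuation
%       return max(labels, key=lambda y: lm.logprob(prompt + " " + y))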
492 |
493 | \begin{frame}%{In-context learning}
494 | \begin{figure}[h]
495 | \includegraphics[height=7.5cm]{in-context-learning}
496 | \end{figure}
497 |
498 | \end{frame}
499 |
500 | \section{Prompt-tuning MLMs}
501 |
502 | \begin{frame}{Prompt-tuning MLMs}
503 | Can we only use prompting with autoregressive models?
504 | \pause
505 | \vspace{1em}
506 | \begin{itemize}
507 | \item No -- we can also use it with bidirectional encoder-only models!
508 | \begin{itemize}
509 | \pause
510 | \item ... but it is \textbf{more difficult} because they have not been trained to generate text
511 | \pause
512 | \item ... because the downstream task is \textbf{less natural} to the model (further from the pretraining task)
513 | \end{itemize}
514 |
515 | \end{itemize}
516 | \pause
517 | How to overcome this gap between the \textbf{pretraining task} and the \textbf{prompting-transformed downstream task}?
518 |
519 | \end{frame}
520 |
521 | \begin{frame}{Prompt-tuning MLMs}
522 | So far, we have \textbf{fine-tuned} masked language models
523 | \pause
524 | \begin{figure}[h]
525 | \includegraphics[height=4cm]{fine-tuning-mlms}
526 | \caption*{Figure from \href{https://thegradient.pub/prompting/}{\underline{The Gradient}}}
527 | \end{figure}
528 | \pause
529 | Can we frame our downstream task \textbf{as MLM}?
530 |
531 | \end{frame}
532 |
533 | \begin{frame}{Prompt-tuning MLMs}
534 | \begin{figure}[h]
535 | \includegraphics[height=6cm]{prompting_mlms}
536 | \end{figure}
537 | \end{frame}
538 |
539 |
540 | \begin{frame}{Prompt-tuning MLMs}
541 | We transform the target task (e.g. sentiment analysis) to \textbf{masked language modeling}.
542 | \begin{enumerate}
543 | \item Choose the prompt and word/token used for each label
544 | \pause
545 | \begin{itemize}
546 | \item Choice of label token \textbf{important}
547 | \item Template design also \textbf{important}
548 | \end{itemize}
549 | \pause
550 | \item Demonstrate task through a few samples
551 | \pause
552 | \begin{itemize}
553 | \item Usually through \textbf{fine-tuning}
554 | \end{itemize}
555 | \pause
556 | \item \textbf{No new parameters needed} to perform task!
557 | \end{enumerate}
558 | \end{frame}
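% An illustrative MLM-style template for sentiment analysis; the template and
% label words below are our assumption, chosen in the spirit of this slide:
%   input:      "No reason to watch. It was [MASK]."
%   verbalizer: [MASK] = "great"    -> label: positive
%               [MASK] = "terrible" -> label: negative
%   prediction: compare p([MASK] = w | input) for the two label words w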
559 |
560 |
561 | \begin{frame}{Discrete and continuous prompts}
562 | So far, we have shown \textbf{discrete prompts}: actual text that we prepend/append to existing data, which triggers the LLM to perform our task.
563 | \pause
564 | \vspace{1em}
565 |
566 | Can we learn \textbf{continuous prompts}?
567 | \pause
568 | (dense vectors which we prepend, e.g. as a token)
569 | \pause
570 | \begin{figure}[h]
571 | \includegraphics[height=3.5cm]{continuous-prompts}
572 | \caption*{Figure from \href{https://aclanthology.org/2022.naacl-main.266.pdf}{\underline{Prompt Waywardness}}}
573 | \end{figure}
574 | \end{frame}
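% A hedged PyTorch-style sketch of a continuous (soft) prompt; the prompt
% length, width, and a frozen `lm` accepting an `inputs_embeds` argument are
% our assumptions about the setup:
%   import torch, torch.nn as nn
%   n_prompt, d_model = 20, 768
%   soft_prompt = nn.Parameter(torch.randn(n_prompt, d_model) * 0.02)
%   def forward(input_embs):  # input_embs: (T, d_model) token embeddings
%       x = torch.cat([soft_prompt, input_embs], dim=0)
%       return lm(inputs_embeds=x)  # lm stays frozen; only soft_prompt trains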
575 |
576 |
577 | \section{A step back}
578 |
579 | \begin{frame}{Incredible Performance of Large Language Models}
580 | So... what made LLMs \textbf{so good} all of a sudden?
581 | \pause
582 | \begin{itemize}
583 | \item More available data (more data $\to$ better models)
584 | \pause
585 | \item Training tricks (from experience)
586 | \pause
587 | \item Hardware advancements (faster training of larger models)
588 | \end{itemize}
589 | \begin{figure}[h]
590 | \includegraphics[height=4cm]{lm-scaling}
591 | \end{figure}
592 | \end{frame}
593 |
594 | \begin{frame}{Takeaways}
595 |
596 | \begin{itemize}
597 | \item Three types of Transformer-based architectures for LLM pretraining:
598 | \begin{itemize}
599 | \item \textbf{Encoder-decoder} (T5)
600 | \item \textbf{Bidirectional encoder-only} (BERT)
601 | \item \textbf{Autoregressive decoder-only} (GPT-2)
602 | \end{itemize}
603 | \item The \textbf{attention masks} of these models differ
604 | \item There are three variants of language modeling for pretraining LLMs
605 | \item GPT-2 (and GPT-3) are autoregressive decoder-only Transformers
606 | \item We introduced zero-, one- and few-shot learning
607 | \item We introduced prompting and its variants
608 | \begin{itemize}
609 | \item Autoregressive vs MLM prompting
610 | \item Continuous vs discrete prompts
611 | \item In-context learning
612 | \end{itemize}
613 | \end{itemize}
614 |
615 | \end{frame}
616 |
617 | \begin{frame}{Useful resources}
618 |
619 | \begin{itemize}
620 | \item \href{https://thegradient.pub/prompting/}{\underline{The Gradient: Prompting}} by Tianyu Gao
621 | \item \href{https://thegradient.pub/in-context-learning-in-context/}{\underline{The Gradient: In Context Learning}} by Daniel Bashir
622 | \item \href{http://ai.stanford.edu/blog/understanding-incontext/}{\underline{Understanding in-context learning}} by Sang Michael Xie and Sewon Min
623 | \end{itemize}
624 |
625 | \end{frame}
626 |
627 |
628 |
629 | \begin{frame}{License and credits}
630 |
631 | \begin{columns}
632 | \begin{column}{0.7\textwidth}
633 | Licensed under Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)
634 | \end{column}
635 | \begin{column}{0.2\textwidth}
636 | \includegraphics[width=0.9\linewidth]{img/cc-by-sa-icon.pdf}
637 | \end{column}
638 | \end{columns}
639 |
640 | \bigskip
641 |
642 | Credits
643 |
644 | \begin{scriptsize}
645 |
646 | Martin Tutek
647 |
648 | Content from ACL Anthology papers licensed under CC-BY \url{https://www.aclweb.org/anthology}
649 |
650 |
651 | \end{scriptsize}
652 |
653 | \end{frame}
654 |
655 |
656 |
657 | \end{document}
658 |
659 |
--------------------------------------------------------------------------------
/latex/lecture11/img/attention-masks-anno.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/attention-masks-anno.png
--------------------------------------------------------------------------------
/latex/lecture11/img/attention-patterns.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/attention-patterns.png
--------------------------------------------------------------------------------
/latex/lecture11/img/attention-types.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/attention-types.png
--------------------------------------------------------------------------------
/latex/lecture11/img/autoregressive_trf_decoder.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/autoregressive_trf_decoder.pdf
--------------------------------------------------------------------------------
/latex/lecture11/img/bidirectional_trf_encoder.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/bidirectional_trf_encoder.pdf
--------------------------------------------------------------------------------
/latex/lecture11/img/cc-by-sa-icon.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/cc-by-sa-icon.pdf
--------------------------------------------------------------------------------
/latex/lecture11/img/continuous-prompts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/continuous-prompts.png
--------------------------------------------------------------------------------
/latex/lecture11/img/few-shot-translation-gpt3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/few-shot-translation-gpt3.png
--------------------------------------------------------------------------------
/latex/lecture11/img/fine-tuning-mlms.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/fine-tuning-mlms.png
--------------------------------------------------------------------------------
/latex/lecture11/img/gpt2-demonstrations.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/gpt2-demonstrations.png
--------------------------------------------------------------------------------
/latex/lecture11/img/gpt2-paper.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/gpt2-paper.png
--------------------------------------------------------------------------------
/latex/lecture11/img/gpt2-prompt-anno.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/gpt2-prompt-anno.png
--------------------------------------------------------------------------------
/latex/lecture11/img/gpt2-prompting-qa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/gpt2-prompting-qa.png
--------------------------------------------------------------------------------
/latex/lecture11/img/gpt2-title-anno.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/gpt2-title-anno.png
--------------------------------------------------------------------------------
/latex/lecture11/img/gpt2-zeroshot-qa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/gpt2-zeroshot-qa.png
--------------------------------------------------------------------------------
/latex/lecture11/img/gpt3-translation-results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/gpt3-translation-results.png
--------------------------------------------------------------------------------
/latex/lecture11/img/in-context-learning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/in-context-learning.png
--------------------------------------------------------------------------------
/latex/lecture11/img/language-modeling-types.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/language-modeling-types.png
--------------------------------------------------------------------------------
/latex/lecture11/img/lm-scaling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/lm-scaling.png
--------------------------------------------------------------------------------
/latex/lecture11/img/one-shot-translation-gpt3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/one-shot-translation-gpt3.png
--------------------------------------------------------------------------------
/latex/lecture11/img/prompting_mlms.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/prompting_mlms.png
--------------------------------------------------------------------------------
/latex/lecture11/img/t5-anno-prompts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/t5-anno-prompts.png
--------------------------------------------------------------------------------
/latex/lecture11/img/t5-objectives.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/t5-objectives.png
--------------------------------------------------------------------------------
/latex/lecture11/img/transformer_enc_dec.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/transformer_enc_dec.pdf
--------------------------------------------------------------------------------
/latex/lecture11/img/ukp_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/ukp_logo.png
--------------------------------------------------------------------------------
/latex/lecture11/img/zero-shot-translation-gpt3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/zero-shot-translation-gpt3.png
--------------------------------------------------------------------------------
/pdf/DL4NLP Lecture 12_ Contemporary LLMs.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/DL4NLP Lecture 12_ Contemporary LLMs.pptx
--------------------------------------------------------------------------------
/pdf/dl4nlp2023-lecture01.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture01.pdf
--------------------------------------------------------------------------------
/pdf/dl4nlp2023-lecture02.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture02.pdf
--------------------------------------------------------------------------------
/pdf/dl4nlp2023-lecture03.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture03.pdf
--------------------------------------------------------------------------------
/pdf/dl4nlp2023-lecture04.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture04.pdf
--------------------------------------------------------------------------------
/pdf/dl4nlp2023-lecture05.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture05.pdf
--------------------------------------------------------------------------------
/pdf/dl4nlp2023-lecture06.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture06.pdf
--------------------------------------------------------------------------------
/pdf/dl4nlp2023-lecture07.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture07.pdf
--------------------------------------------------------------------------------
/pdf/dl4nlp2023-lecture08.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture08.pdf
--------------------------------------------------------------------------------
/pdf/dl4nlp2023-lecture09.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture09.pdf
--------------------------------------------------------------------------------
/pdf/dl4nlp2023-lecture10.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture10.pdf
--------------------------------------------------------------------------------
/pdf/dl4nlp2023-lecture11.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture11.pdf
--------------------------------------------------------------------------------
/pdf/dl4nlp2023-lecture12-recap.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture12-recap.pdf
--------------------------------------------------------------------------------
/pdf/dl4nlp2023-lecture13.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture13.pdf
--------------------------------------------------------------------------------
/pdf/dl4nlp2023-lecture13.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture13.pptx
--------------------------------------------------------------------------------