├── .gitignore ├── LICENSE ├── README.md ├── latex ├── dl4nlp-bibliography.bib ├── lecture01 │ ├── .gitignore │ ├── compile-pdf.sh │ ├── dl4nlp2023-lecture01.tex │ └── img │ │ ├── arct.png │ │ ├── cc-by-sa-icon.pdf │ │ ├── dial1.png │ │ ├── hfdata.png │ │ ├── logo-trusthlt.pdf │ │ ├── mt2.png │ │ ├── mtex.jpg │ │ ├── nlg1.png │ │ └── nlg2.png ├── lecture02 │ ├── .gitignore │ ├── compile-pdf.sh │ ├── dl4nlp2023-lecture02.tex │ └── img │ │ ├── backprop-my.pdf │ │ ├── backprop-my.svg │ │ ├── backprop01.pdf │ │ ├── backprop02.pdf │ │ ├── backprop03.pdf │ │ ├── backprop04.pdf │ │ ├── backprop05.pdf │ │ ├── backprop06.pdf │ │ ├── backprop07.pdf │ │ ├── backprop08.pdf │ │ ├── backprop09.pdf │ │ ├── backprop10.pdf │ │ ├── cc-by-sa-icon.pdf │ │ ├── desmos-graph1.pdf │ │ ├── desmos-graph1.svg │ │ ├── gradient1.pdf │ │ ├── logo-trusthlt.pdf │ │ ├── parent-child.pdf │ │ ├── parent-child.svg │ │ └── rosenbrock.pdf ├── lecture03 │ ├── .gitignore │ ├── compile-pdf.sh │ ├── dl4nlp2023-lecture03.tex │ └── img │ │ ├── cc-by-sa-icon.pdf │ │ └── logo-trusthlt.pdf ├── lecture04 │ ├── .gitignore │ ├── compile-pdf.sh │ ├── dl4nlp2023-lecture04.tex │ └── img │ │ ├── cc-by-sa-icon.pdf │ │ ├── logo-trusthlt.pdf │ │ └── temperatures.png ├── lecture05 │ ├── .gitignore │ ├── compile-pdf.sh │ ├── dl4nlp2023-lecture05.tex │ └── img │ │ ├── cc-by-sa-icon.pdf │ │ ├── linear1.png │ │ ├── linear2.png │ │ ├── linear3.png │ │ ├── linear4.png │ │ ├── logo-trusthlt.pdf │ │ ├── xor1.pdf │ │ └── xor1.svg ├── lecture06 │ ├── .gitignore │ ├── compile-pdf.sh │ ├── dl4nlp2023-lecture06.tex │ └── img │ │ ├── cc-by-sa-icon.pdf │ │ ├── logo-trusthlt.pdf │ │ └── rewe.png ├── lecture07 │ ├── .gitignore │ ├── compile-pdf.sh │ ├── dl4nlp2023-lecture07.tex │ └── img │ │ ├── cc-by-sa-icon.pdf │ │ └── logo-trusthlt.pdf ├── lecture08 │ ├── .gitignore │ ├── compile-pdf.sh │ ├── dl4nlp2023-lecture08.tex │ └── img │ │ ├── cc-by-sa-icon.pdf │ │ ├── logo-trusthlt.pdf │ │ ├── seq2seq.pdf │ │ ├── seq2seq_attention_motivation.pdf │ │ ├── seq2seq_attention_t1.pdf │ │ ├── seq2seq_attn_encdec.pdf │ │ ├── seq2seq_selfattn.pdf │ │ ├── sequence_classification.pdf │ │ ├── sequence_labeling.pdf │ │ ├── sequence_length.png │ │ ├── sequence_to_sequence.pdf │ │ ├── sequence_to_sequence_anno.pdf │ │ ├── sequence_to_sequence_boxed.pdf │ │ ├── translation_heatmap.png │ │ ├── ukp_logo.png │ │ └── variable_input_output.pdf ├── lecture09 │ ├── .gitignore │ ├── compile-pdf.sh │ ├── dl4nlp2023-lecture09.tex │ └── img │ │ ├── aiayn.png │ │ ├── anno_transformer.png │ │ ├── anno_transformer_attn_block.png │ │ ├── anno_trf_hlattn.png │ │ ├── anno_trf_hllinear.png │ │ ├── anno_trf_multihead.png │ │ ├── cc-by-sa-icon.pdf │ │ ├── positional_embs.png │ │ ├── seq2seq_attention_t1.pdf │ │ ├── seq2seq_attention_t2.pdf │ │ ├── seq2seq_attention_t3.pdf │ │ ├── seq2seq_attn_encdec.pdf │ │ └── ukp_logo.png ├── lecture10 │ ├── .gitignore │ ├── compile-pdf.sh │ ├── dl4nlp2023-lecture10.tex │ └── img │ │ ├── BERT-language-modeling-masked-lm.png │ │ ├── aiayn.png │ │ ├── anno_transformer.png │ │ ├── anno_transformer_attn_block.png │ │ ├── anno_trf_hlattn.png │ │ ├── anno_trf_hllinear.png │ │ ├── anno_trf_multihead.png │ │ ├── bart-pretraining-tasks.png │ │ ├── bert-decoder-head-hl.png │ │ ├── bert-google.png │ │ ├── bert-next-sentence-prediction.png │ │ ├── bert-pair-classification.png │ │ ├── bert-paper.png │ │ ├── bert-results.png │ │ ├── bert-seq-labeling.png │ │ ├── bert-single-sentence-clf.png │ │ ├── bert-spanex-qa.png │ │ ├── bert-viz.png │ │ ├── bert_dual_seq.png │ │ ├── 
bert_modeling.pdf │ │ ├── bert_nsp_anno.png │ │ ├── cc-by-sa-icon.pdf │ │ ├── gifs │ │ ├── .ipynb_checkpoints │ │ │ └── Untitled-checkpoint.ipynb │ │ ├── transformer_decoding_1.gif │ │ └── transformer_decoding_2.gif │ │ ├── positional_embs.png │ │ ├── pretrained-lm-variants.png │ │ ├── seq2seq_attention_t1.pdf │ │ ├── seq2seq_attention_t2.pdf │ │ ├── seq2seq_attention_t3.pdf │ │ ├── seq2seq_attn_encdec.pdf │ │ ├── t5-objectives.png │ │ ├── the_transformer_mt.png │ │ ├── transformer_blocks.png │ │ ├── transformer_encoder_decoder_stack.png │ │ ├── transformer_encoder_decoder_stack_full.png │ │ ├── transformer_encoders_decoders.png │ │ ├── transformer_residual_layer_norm.png │ │ ├── ukp_logo.png │ │ └── word2vec_cbow.pdf └── lecture11 │ ├── .gitignore │ ├── compile-pdf.sh │ ├── dl4nlp2023-lecture11.tex │ └── img │ ├── attention-masks-anno.png │ ├── attention-patterns.png │ ├── attention-types.png │ ├── autoregressive_trf_decoder.pdf │ ├── bidirectional_trf_encoder.pdf │ ├── cc-by-sa-icon.pdf │ ├── continuous-prompts.png │ ├── few-shot-translation-gpt3.png │ ├── fine-tuning-mlms.png │ ├── gpt2-demonstrations.png │ ├── gpt2-paper.png │ ├── gpt2-prompt-anno.png │ ├── gpt2-prompting-qa.png │ ├── gpt2-title-anno.png │ ├── gpt2-zeroshot-qa.png │ ├── gpt3-translation-results.png │ ├── in-context-learning.png │ ├── language-modeling-types.png │ ├── lm-scaling.png │ ├── one-shot-translation-gpt3.png │ ├── prompting_mlms.png │ ├── t5-anno-prompts.png │ ├── t5-objectives.png │ ├── transformer_enc_dec.pdf │ ├── ukp_logo.png │ └── zero-shot-translation-gpt3.png ├── pdf ├── DL4NLP Lecture 12_ Contemporary LLMs.pptx ├── dl4nlp2023-lecture01.pdf ├── dl4nlp2023-lecture02.pdf ├── dl4nlp2023-lecture03.pdf ├── dl4nlp2023-lecture04.pdf ├── dl4nlp2023-lecture05.pdf ├── dl4nlp2023-lecture06.pdf ├── dl4nlp2023-lecture07.pdf ├── dl4nlp2023-lecture08.pdf ├── dl4nlp2023-lecture09.pdf ├── dl4nlp2023-lecture10.pdf ├── dl4nlp2023-lecture11.pdf ├── dl4nlp2023-lecture12-recap.pdf ├── dl4nlp2023-lecture13.pdf └── dl4nlp2023-lecture13.pptx └── subtitles ├── DL4NLP23 06_ Text classification 3_ Learning word embeddings.srt ├── DL4NLP23-01-medium.srt ├── DL4NLP23-02-large.srt ├── DL4NLP23-03.srt ├── DL4NLP23-04.srt ├── DL4NLP23-05.srt └── DL4NLP23-07.srt /.gitignore: -------------------------------------------------------------------------------- 1 | ## Core latex/pdflatex auxiliary files: 2 | *.aux 3 | *.lof 4 | *.log 5 | *.lot 6 | *.fls 7 | *.out 8 | *.toc 9 | *.fmt 10 | *.fot 11 | *.cb 12 | *.cb2 13 | .*.lb 14 | 15 | ## Intermediate documents: 16 | *.dvi 17 | *.xdv 18 | *-converted-to.* 19 | # these rules might exclude image files for figures etc. 
20 | # *.ps 21 | # *.eps 22 | # *.pdf 23 | 24 | ## Generated if empty string is given at "Please type another file name for output:" 25 | .pdf 26 | 27 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 28 | *.bbl 29 | *.bcf 30 | *.blg 31 | *-blx.aux 32 | *-blx.bib 33 | *.run.xml 34 | 35 | ## Build tool auxiliary files: 36 | *.fdb_latexmk 37 | *.synctex 38 | *.synctex(busy) 39 | *.synctex.gz 40 | *.synctex.gz(busy) 41 | *.pdfsync 42 | 43 | ## Build tool directories for auxiliary files 44 | # latexrun 45 | latex.out/ 46 | 47 | ## Auxiliary and intermediate files from other packages: 48 | # algorithms 49 | *.alg 50 | *.loa 51 | 52 | # achemso 53 | acs-*.bib 54 | 55 | # amsthm 56 | *.thm 57 | 58 | # beamer 59 | *.nav 60 | *.pre 61 | *.snm 62 | *.vrb 63 | 64 | # changes 65 | *.soc 66 | 67 | # comment 68 | *.cut 69 | 70 | # cprotect 71 | *.cpt 72 | 73 | # elsarticle (documentclass of Elsevier journals) 74 | *.spl 75 | 76 | # endnotes 77 | *.ent 78 | 79 | # fixme 80 | *.lox 81 | 82 | # feynmf/feynmp 83 | *.mf 84 | *.mp 85 | *.t[1-9] 86 | *.t[1-9][0-9] 87 | *.tfm 88 | 89 | #(r)(e)ledmac/(r)(e)ledpar 90 | *.end 91 | *.?end 92 | *.[1-9] 93 | *.[1-9][0-9] 94 | *.[1-9][0-9][0-9] 95 | *.[1-9]R 96 | *.[1-9][0-9]R 97 | *.[1-9][0-9][0-9]R 98 | *.eledsec[1-9] 99 | *.eledsec[1-9]R 100 | *.eledsec[1-9][0-9] 101 | *.eledsec[1-9][0-9]R 102 | *.eledsec[1-9][0-9][0-9] 103 | *.eledsec[1-9][0-9][0-9]R 104 | 105 | # glossaries 106 | *.acn 107 | *.acr 108 | *.glg 109 | *.glo 110 | *.gls 111 | *.glsdefs 112 | *.lzo 113 | *.lzs 114 | 115 | # uncomment this for glossaries-extra (will ignore makeindex's style files!) 116 | # *.ist 117 | 118 | # gnuplottex 119 | *-gnuplottex-* 120 | 121 | # gregoriotex 122 | *.gaux 123 | *.gtex 124 | 125 | # htlatex 126 | *.4ct 127 | *.4tc 128 | *.idv 129 | *.lg 130 | *.trc 131 | *.xref 132 | 133 | # hyperref 134 | *.brf 135 | 136 | # knitr 137 | *-concordance.tex 138 | # TODO Comment the next line if you want to keep your tikz graphics files 139 | *.tikz 140 | *-tikzDictionary 141 | 142 | # listings 143 | *.lol 144 | 145 | # luatexja-ruby 146 | *.ltjruby 147 | 148 | # makeidx 149 | *.idx 150 | *.ilg 151 | *.ind 152 | 153 | # minitoc 154 | *.maf 155 | *.mlf 156 | *.mlt 157 | *.mtc[0-9]* 158 | *.slf[0-9]* 159 | *.slt[0-9]* 160 | *.stc[0-9]* 161 | 162 | # minted 163 | _minted* 164 | *.pyg 165 | 166 | # morewrites 167 | *.mw 168 | 169 | # nomencl 170 | *.nlg 171 | *.nlo 172 | *.nls 173 | 174 | # pax 175 | *.pax 176 | 177 | # pdfpcnotes 178 | *.pdfpc 179 | 180 | # sagetex 181 | *.sagetex.sage 182 | *.sagetex.py 183 | *.sagetex.scmd 184 | 185 | # scrwfile 186 | *.wrt 187 | 188 | # sympy 189 | *.sout 190 | *.sympy 191 | sympy-plots-for-*.tex/ 192 | 193 | # pdfcomment 194 | *.upa 195 | *.upb 196 | 197 | # pythontex 198 | *.pytxcode 199 | pythontex-files-*/ 200 | 201 | # tcolorbox 202 | *.listing 203 | 204 | # thmtools 205 | *.loe 206 | 207 | # TikZ & PGF 208 | *.dpth 209 | *.md5 210 | *.auxlock 211 | 212 | # todonotes 213 | *.tdo 214 | 215 | # vhistory 216 | *.hst 217 | *.ver 218 | 219 | # easy-todo 220 | *.lod 221 | 222 | # xcolor 223 | *.xcp 224 | 225 | # xmpincl 226 | *.xmpi 227 | 228 | # xindy 229 | *.xdy 230 | 231 | # xypic precompiled matrices and outlines 232 | *.xyc 233 | *.xyd 234 | 235 | # endfloat 236 | *.ttt 237 | *.fff 238 | 239 | # Latexian 240 | TSWLatexianTemp* 241 | 242 | ## Editors: 243 | # WinEdt 244 | *.bak 245 | *.sav 246 | 247 | # Texpad 248 | .texpadtmp 249 | 250 | # LyX 251 | *.lyx~ 252 | 253 | # Kile 254 | *.backup 255 | 256 | # gummi 257 | .*.swp 258 | 259 | # 
KBibTeX 260 | *~[0-9]* 261 | 262 | # TeXnicCenter 263 | *.tps 264 | 265 | # auto folder when using emacs and auctex 266 | ./auto/* 267 | *.el 268 | 269 | # expex forward references with \gathertags 270 | *-tags.tex 271 | 272 | # standalone packages 273 | *.sta 274 | 275 | # Makeindex log files 276 | *.lpz 277 | 278 | .idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. 
More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-ShareAlike 4.0 International Public 58 | License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-ShareAlike 4.0 International Public License ("Public 63 | License"). To the extent this Public License may be interpreted as a 64 | contract, You are granted the Licensed Rights in consideration of Your 65 | acceptance of these terms and conditions, and the Licensor grants You 66 | such rights in consideration of benefits the Licensor receives from 67 | making the Licensed Material available under these terms and 68 | conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution and ShareAlike. 112 | 113 | h. Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. 
Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. Share means to provide material to the public by any means or 126 | process that requires permission under the Licensed Rights, such 127 | as reproduction, public display, public performance, distribution, 128 | dissemination, communication, or importation, and to make material 129 | available to the public including in ways that members of the 130 | public may access the material from a place and at a time 131 | individually chosen by them. 132 | 133 | l. Sui Generis Database Rights means rights other than copyright 134 | resulting from Directive 96/9/EC of the European Parliament and of 135 | the Council of 11 March 1996 on the legal protection of databases, 136 | as amended and/or succeeded, as well as other essentially 137 | equivalent rights anywhere in the world. 138 | 139 | m. You means the individual or entity exercising the Licensed Rights 140 | under this Public License. Your has a corresponding meaning. 141 | 142 | 143 | Section 2 -- Scope. 144 | 145 | a. License grant. 146 | 147 | 1. Subject to the terms and conditions of this Public License, 148 | the Licensor hereby grants You a worldwide, royalty-free, 149 | non-sublicensable, non-exclusive, irrevocable license to 150 | exercise the Licensed Rights in the Licensed Material to: 151 | 152 | a. reproduce and Share the Licensed Material, in whole or 153 | in part; and 154 | 155 | b. produce, reproduce, and Share Adapted Material. 156 | 157 | 2. Exceptions and Limitations. For the avoidance of doubt, where 158 | Exceptions and Limitations apply to Your use, this Public 159 | License does not apply, and You do not need to comply with 160 | its terms and conditions. 161 | 162 | 3. Term. The term of this Public License is specified in Section 163 | 6(a). 164 | 165 | 4. Media and formats; technical modifications allowed. The 166 | Licensor authorizes You to exercise the Licensed Rights in 167 | all media and formats whether now known or hereafter created, 168 | and to make technical modifications necessary to do so. The 169 | Licensor waives and/or agrees not to assert any right or 170 | authority to forbid You from making technical modifications 171 | necessary to exercise the Licensed Rights, including 172 | technical modifications necessary to circumvent Effective 173 | Technological Measures. For purposes of this Public License, 174 | simply making modifications authorized by this Section 2(a) 175 | (4) never produces Adapted Material. 176 | 177 | 5. Downstream recipients. 178 | 179 | a. Offer from the Licensor -- Licensed Material. Every 180 | recipient of the Licensed Material automatically 181 | receives an offer from the Licensor to exercise the 182 | Licensed Rights under the terms and conditions of this 183 | Public License. 184 | 185 | b. Additional offer from the Licensor -- Adapted Material. 186 | Every recipient of Adapted Material from You 187 | automatically receives an offer from the Licensor to 188 | exercise the Licensed Rights in the Adapted Material 189 | under the conditions of the Adapter's License You apply. 190 | 191 | c. No downstream restrictions. You may not offer or impose 192 | any additional or different terms or conditions on, or 193 | apply any Effective Technological Measures to, the 194 | Licensed Material if doing so restricts exercise of the 195 | Licensed Rights by any recipient of the Licensed 196 | Material. 197 | 198 | 6. No endorsement. 
Nothing in this Public License constitutes or 199 | may be construed as permission to assert or imply that You 200 | are, or that Your use of the Licensed Material is, connected 201 | with, or sponsored, endorsed, or granted official status by, 202 | the Licensor or others designated to receive attribution as 203 | provided in Section 3(a)(1)(A)(i). 204 | 205 | b. Other rights. 206 | 207 | 1. Moral rights, such as the right of integrity, are not 208 | licensed under this Public License, nor are publicity, 209 | privacy, and/or other similar personality rights; however, to 210 | the extent possible, the Licensor waives and/or agrees not to 211 | assert any such rights held by the Licensor to the limited 212 | extent necessary to allow You to exercise the Licensed 213 | Rights, but not otherwise. 214 | 215 | 2. Patent and trademark rights are not licensed under this 216 | Public License. 217 | 218 | 3. To the extent possible, the Licensor waives any right to 219 | collect royalties from You for the exercise of the Licensed 220 | Rights, whether directly or through a collecting society 221 | under any voluntary or waivable statutory or compulsory 222 | licensing scheme. In all other cases the Licensor expressly 223 | reserves any right to collect such royalties. 224 | 225 | 226 | Section 3 -- License Conditions. 227 | 228 | Your exercise of the Licensed Rights is expressly made subject to the 229 | following conditions. 230 | 231 | a. Attribution. 232 | 233 | 1. If You Share the Licensed Material (including in modified 234 | form), You must: 235 | 236 | a. retain the following if it is supplied by the Licensor 237 | with the Licensed Material: 238 | 239 | i. identification of the creator(s) of the Licensed 240 | Material and any others designated to receive 241 | attribution, in any reasonable manner requested by 242 | the Licensor (including by pseudonym if 243 | designated); 244 | 245 | ii. a copyright notice; 246 | 247 | iii. a notice that refers to this Public License; 248 | 249 | iv. a notice that refers to the disclaimer of 250 | warranties; 251 | 252 | v. a URI or hyperlink to the Licensed Material to the 253 | extent reasonably practicable; 254 | 255 | b. indicate if You modified the Licensed Material and 256 | retain an indication of any previous modifications; and 257 | 258 | c. indicate the Licensed Material is licensed under this 259 | Public License, and include the text of, or the URI or 260 | hyperlink to, this Public License. 261 | 262 | 2. You may satisfy the conditions in Section 3(a)(1) in any 263 | reasonable manner based on the medium, means, and context in 264 | which You Share the Licensed Material. For example, it may be 265 | reasonable to satisfy the conditions by providing a URI or 266 | hyperlink to a resource that includes the required 267 | information. 268 | 269 | 3. If requested by the Licensor, You must remove any of the 270 | information required by Section 3(a)(1)(A) to the extent 271 | reasonably practicable. 272 | 273 | b. ShareAlike. 274 | 275 | In addition to the conditions in Section 3(a), if You Share 276 | Adapted Material You produce, the following conditions also apply. 277 | 278 | 1. The Adapter's License You apply must be a Creative Commons 279 | license with the same License Elements, this version or 280 | later, or a BY-SA Compatible License. 281 | 282 | 2. You must include the text of, or the URI or hyperlink to, the 283 | Adapter's License You apply. 
You may satisfy this condition 284 | in any reasonable manner based on the medium, means, and 285 | context in which You Share Adapted Material. 286 | 287 | 3. You may not offer or impose any additional or different terms 288 | or conditions on, or apply any Effective Technological 289 | Measures to, Adapted Material that restrict exercise of the 290 | rights granted under the Adapter's License You apply. 291 | 292 | 293 | Section 4 -- Sui Generis Database Rights. 294 | 295 | Where the Licensed Rights include Sui Generis Database Rights that 296 | apply to Your use of the Licensed Material: 297 | 298 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 299 | to extract, reuse, reproduce, and Share all or a substantial 300 | portion of the contents of the database; 301 | 302 | b. if You include all or a substantial portion of the database 303 | contents in a database in which You have Sui Generis Database 304 | Rights, then the database in which You have Sui Generis Database 305 | Rights (but not its individual contents) is Adapted Material, 306 | 307 | including for purposes of Section 3(b); and 308 | c. You must comply with the conditions in Section 3(a) if You Share 309 | all or a substantial portion of the contents of the database. 310 | 311 | For the avoidance of doubt, this Section 4 supplements and does not 312 | replace Your obligations under this Public License where the Licensed 313 | Rights include other Copyright and Similar Rights. 314 | 315 | 316 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 317 | 318 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 319 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 320 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 321 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 322 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 323 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 324 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 325 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 326 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 327 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 328 | 329 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 330 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 331 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 332 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 333 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 334 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 335 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 336 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 337 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 338 | 339 | c. The disclaimer of warranties and limitation of liability provided 340 | above shall be interpreted in a manner that, to the extent 341 | possible, most closely approximates an absolute disclaimer and 342 | waiver of all liability. 343 | 344 | 345 | Section 6 -- Term and Termination. 346 | 347 | a. This Public License applies for the term of the Copyright and 348 | Similar Rights licensed here. However, if You fail to comply with 349 | this Public License, then Your rights under this Public License 350 | terminate automatically. 351 | 352 | b. 
Where Your right to use the Licensed Material has terminated under 353 | Section 6(a), it reinstates: 354 | 355 | 1. automatically as of the date the violation is cured, provided 356 | it is cured within 30 days of Your discovery of the 357 | violation; or 358 | 359 | 2. upon express reinstatement by the Licensor. 360 | 361 | For the avoidance of doubt, this Section 6(b) does not affect any 362 | right the Licensor may have to seek remedies for Your violations 363 | of this Public License. 364 | 365 | c. For the avoidance of doubt, the Licensor may also offer the 366 | Licensed Material under separate terms or conditions or stop 367 | distributing the Licensed Material at any time; however, doing so 368 | will not terminate this Public License. 369 | 370 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 371 | License. 372 | 373 | 374 | Section 7 -- Other Terms and Conditions. 375 | 376 | a. The Licensor shall not be bound by any additional or different 377 | terms or conditions communicated by You unless expressly agreed. 378 | 379 | b. Any arrangements, understandings, or agreements regarding the 380 | Licensed Material not stated herein are separate from and 381 | independent of the terms and conditions of this Public License. 382 | 383 | 384 | Section 8 -- Interpretation. 385 | 386 | a. For the avoidance of doubt, this Public License does not, and 387 | shall not be interpreted to, reduce, limit, restrict, or impose 388 | conditions on any use of the Licensed Material that could lawfully 389 | be made without permission under this Public License. 390 | 391 | b. To the extent possible, if any provision of this Public License is 392 | deemed unenforceable, it shall be automatically reformed to the 393 | minimum extent necessary to make it enforceable. If the provision 394 | cannot be reformed, it shall be severed from this Public License 395 | without affecting the enforceability of the remaining terms and 396 | conditions. 397 | 398 | c. No term or condition of this Public License will be waived and no 399 | failure to comply consented to unless expressly agreed to by the 400 | Licensor. 401 | 402 | d. Nothing in this Public License constitutes or may be interpreted 403 | as a limitation upon, or waiver of, any privileges and immunities 404 | that apply to the Licensor or You, including from the legal 405 | processes of any jurisdiction or authority. 406 | 407 | 408 | ======================================================================= 409 | 410 | Creative Commons is not a party to its public 411 | licenses. Notwithstanding, Creative Commons may elect to apply one of 412 | its public licenses to material it publishes and in those instances 413 | will be considered the “Licensor.” The text of the Creative Commons 414 | public licenses is dedicated to the public domain under the CC0 Public 415 | Domain Dedication. Except for the limited purpose of indicating that 416 | material is shared under a Creative Commons public license or as 417 | otherwise permitted by the Creative Commons policies published at 418 | creativecommons.org/policies, Creative Commons does not authorize the 419 | use of the trademark "Creative Commons" or any other trademark or logo 420 | of Creative Commons without its prior written consent including, 421 | without limitation, in connection with any unauthorized modifications 422 | to any of its public licenses or any other arrangements, 423 | understandings, or agreements concerning use of licensed material. 
For 424 | the avoidance of doubt, this paragraph does not form part of the 425 | public licenses. 426 | 427 | Creative Commons may be contacted at creativecommons.org. 428 | 429 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------

# Deep Learning for Natural Language Processing - Lectures 2023

This repository contains slides for the course "20-00-0947: Deep Learning for Natural Language Processing" (Technical University of Darmstadt, Summer term 2023).

This course is jointly taught by [Ivan Habernal](https://www.trusthlt.org) and [Martin Tutek](https://www.informatik.tu-darmstadt.de/ukp/ukp_home/staff_ukp/ukp_home_content_staff_1_details_124480.en.jsp).

The slides are available as PDF as well as LaTeX source code (we've used Beamer because typesetting mathematics in PowerPoint or similar tools is painful). See the instructions below if you want to compile the slides yourself.

![Logo](https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/CC_BY-SA_icon.svg/88px-CC_BY-SA_icon.svg.png)

The content is licensed under [Creative Commons CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/), which means that you can re-use, adapt, modify, or publish it further, provided you keep the license and give proper credit.

**Note:** The following content is continuously updated as the summer term progresses. If you're interested in the full previous 2022 content, check out the latest [2022 Git commit](https://github.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/tree/a59910534ac600a6e8c22fbcde6ae8223a87cda9).
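If you want to browse that 2022 snapshot locally, one way to do it is to clone the repository and pin the commit referenced in the link above (a minimal sketch assuming a standard `git` setup):

```plain
$ git clone https://github.com/dl4nlp-tuda/deep-learning-for-nlp-lectures.git
$ cd deep-learning-for-nlp-lectures
# check out the last 2022 state (the commit from the link above)
$ git checkout a59910534ac600a6e8c22fbcde6ae8223a87cda9
```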
## YouTube Playlist

Subscribe to the YouTube playlist to get updates on new lectures: https://youtube.com/playlist?list=PL6WLGVNe6ZcA4gUr5MaAKdrGxYzYAETK3

## Lecture 1: NLP tasks and evaluation

April 11, 2023

* [Slides as PDF](/pdf/dl4nlp2023-lecture01.pdf), [YouTube recording](https://www.youtube.com/watch?v=-cku_A34-qE)

## Lecture 2: Mathematical foundations of deep learning

April 18, 2023

* [Slides as PDF](/pdf/dl4nlp2023-lecture02.pdf), [YouTube recording](https://www.youtube.com/watch?v=XbFNcvWdCTw)

## Lecture 3: Text classification 1: Log-linear models

April 25, 2023

* [Slides as PDF](/pdf/dl4nlp2023-lecture03.pdf), [YouTube recording](https://www.youtube.com/watch?v=t7YZ7OgtD5o)

## Lecture 4: Text classification 2: Deep neural networks

May 2, 2023

* [Slides as PDF](/pdf/dl4nlp2023-lecture04.pdf), [YouTube recording](https://www.youtube.com/watch?v=Fk1Y4ycO3aY)

## Lecture 5: Text generation 1: Language models and word embeddings

May 9, 2023

* [Slides as PDF](/pdf/dl4nlp2023-lecture05.pdf), [YouTube recording](https://www.youtube.com/watch?v=hqcFkKymRdw)

## Lecture 6: Text classification 3: Learning word embeddings

May 16, 2023

* [Slides as PDF](/pdf/dl4nlp2023-lecture06.pdf), [YouTube recording](https://www.youtube.com/watch?v=fClxXB8-m8I)

## Lecture 7: Text classification 4: Recurrent neural networks

May 30, 2023

* [Slides as PDF](/pdf/dl4nlp2023-lecture07.pdf), [YouTube recording](https://www.youtube.com/watch?v=sgjKJRoYx4s)

## Lecture 8: Text generation 2: Autoregressive encoder-decoder with RNNs and attention

June 6, 2023

* [Slides as PDF](/pdf/dl4nlp2023-lecture08.pdf), [YouTube recording](https://www.youtube.com/watch?v=tOmYTC3XaEo)

## Lecture 9: Text generation 3: Transformers

June 13, 2023

* [Slides as PDF](/pdf/dl4nlp2023-lecture09.pdf), [YouTube recording](https://youtu.be/yg5QrKOe0V4)

## Lecture 10: Text classification 4: self-attention and BERT

June 20, 2023

* [Slides as PDF](/pdf/dl4nlp2023-lecture10.pdf), [YouTube recording](https://youtu.be/NOD9irGv9Xg)

## Lecture 11: Text generation 4: Decoder-only Models and GPT

June 27, 2023

* [Slides as PDF](/pdf/dl4nlp2023-lecture11.pdf), [YouTube recording](https://youtu.be/t3J534JyE-E)

## Lecture 12: Contemporary LLMs: Prompting and in-context learning

July 4, 2023

* [Recap slides as PDF](/pdf/dl4nlp2023-lecture12-recap.pdf), [PPTX lecture slides](/pdf/DL4NLP%20Lecture%2012_%20Contemporary%20LLMs.pptx)

## Lecture 13: Guest lecture by Dr. Thomas Arnold: Ethics of generative AI

July 11, 2023

* [Slides as PDF](/pdf/dl4nlp2023-lecture13.pdf), [PPTX lecture slides](/pdf/dl4nlp2023-lecture13.pptx), [YouTube recording](https://www.youtube.com/watch?v=lO2-W5l2y40)

## Subtitles/Closed captions

Thanks to Jan Kühnemund for generating the closed captions for YouTube with OpenAI's Whisper. We track the subtitles here under `subtitles`, so if you spot an error there (there are many, such as "tanh" -> "10h"), just open an issue or a PR.

## FAQ

* What are some essential prerequisites?
  * Math: Derivatives and partial derivatives. We cover them in Lecture 2. If you need more, I would recommend these sources:
    * *Jeremy Kun: A Programmer's Introduction to Mathematics.* Absolutely amazing book. Pay-what-you-want for the PDF book. https://pimbook.org/
    * *Marc Peter Deisenroth, A. Aldo Faisal, and Cheng Soon Ong: Mathematics for Machine Learning*. Excellent resource, freely available. Might be a bit dense. https://mml-book.github.io/
* Can I have the slide deck without "unfolding" the content over multiple pages?
  * You can compile the slides with the `handout` parameter; see the section [Compiling handouts](#compiling-handouts) below.
* Where do I find the code for plotting the functions?
  * Most of the plots are generated in Python/Jupyter (in Colab). The links are included as comments in the respective LaTeX sources for the slides.

## Compiling slides to PDF

If you run a Linux distribution (e.g., Ubuntu 20.04 or newer), all required packages are provided as part of `texlive`. Install the following packages:

```plain
$ sudo apt-get install texlive-latex-recommended texlive-pictures texlive-latex-extra \
texlive-fonts-extra texlive-bibtex-extra texlive-humanities texlive-science \
texlive-luatex biber wget -y
```

Install the Fira Sans fonts required by the beamer template locally:

```plain
$ wget https://github.com/mozilla/Fira/archive/refs/tags/4.106.zip -O 4.106.zip \
&& unzip -o 4.106.zip && mkdir -p ~/.fonts/FiraSans && cp Fira-4.106/otf/Fira* \
~/.fonts/FiraSans/ && rm -rf Fira-4.106 && rm 4.106.zip && fc-cache -f -v && mktexlsr
```

Compile each lecture's slides using `lualatex`:

```plain
$ lualatex dl4nlp2023-lecture*.tex && biber dl4nlp2023-lecture*.bcf && \
lualatex dl4nlp2023-lecture*.tex && lualatex dl4nlp2023-lecture*.tex
```
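Each lecture folder also ships a `compile-pdf.sh` helper. Its actual contents may differ, but a minimal sketch of such a wrapper, assuming it simply chains the commands above, could look like this:

```plain
#!/bin/bash
# Hypothetical sketch of a per-lecture build script; the compile-pdf.sh in
# this repository may differ. It runs the lualatex/biber sequence from above.
set -e
lualatex dl4nlp2023-lecture*.tex
biber dl4nlp2023-lecture*.bcf
lualatex dl4nlp2023-lecture*.tex
lualatex dl4nlp2023-lecture*.tex
```

Run it from inside the respective lecture folder so that the wildcard matches that lecture's single `.tex` file.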
### Compiling slides using Docker

If you don't run a Linux system or don't want to mess up your LaTeX packages, I've tested compiling the slides in a Docker container.

Install Docker (https://docs.docker.com/engine/install/).

Create a folder into which you clone this repository (for example, `$ mkdir -p /tmp/slides`).

Run Docker with Ubuntu 20.04 interactively and mount your slides directory under `/mnt` in the Docker container:

```plain
$ docker run -it --rm --mount type=bind,source=/tmp/slides,target=/mnt \
ubuntu:20.04 /bin/bash
```

Once the container is running, update the system and install the packages as above:

```plain
# apt-get update && apt-get dist-upgrade -y && apt-get install texlive-latex-recommended \
texlive-pictures texlive-latex-extra texlive-fonts-extra texlive-bibtex-extra \
texlive-humanities texlive-science texlive-luatex biber wget -y
```

Install the fonts:

```plain
# wget https://github.com/mozilla/Fira/archive/refs/tags/4.106.zip -O 4.106.zip \
&& unzip -o 4.106.zip && mkdir -p ~/.fonts/FiraSans && cp Fira-4.106/otf/Fira* \
~/.fonts/FiraSans/ && rm -rf Fira-4.106 && rm 4.106.zip && fc-cache -f -v && mktexlsr
```

And compile:

```plain
# cd /mnt/dl4nlp/latex/lecture01
# lualatex dl4nlp2023-lecture*.tex && biber dl4nlp2023-lecture*.bcf && \
lualatex dl4nlp2023-lecture*.tex && lualatex dl4nlp2023-lecture*.tex
```

This generates the PDF in your local folder (e.g., `/tmp/slides`).

### Compiling handouts

We're uploading the PDFs as presented in the lecture. You can compile the slides in a more concise form using the `handout` setting. Just comment/uncomment the respective line at the beginning of the tex file of the lecture slides.
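What that toggle typically looks like, as a minimal generic beamer sketch (the exact line and class options in the course's `.tex` files may differ):

```plain
% At the top of dl4nlp2023-lectureXX.tex: keep exactly one of these lines
% uncommented. This is a generic beamer sketch, not the repository's exact preamble.
%\documentclass[handout]{beamer}   % concise handout: overlays collapsed
\documentclass{beamer}             % lecture version with incremental "unfolding"
```

With the `handout` class option, beamer collapses the incremental overlays, so each slide appears only once in the resulting PDF.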
183 | -------------------------------------------------------------------------------- /latex/dl4nlp-bibliography.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{Wang.et.al.2019.NeurIPS, 2 | address = {Vancouver, Canada}, 3 | author = {Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.}, 4 | booktitle = {Proceedings of the 33rd International Conference on Neural Information Processing Systems}, 5 | pages = {3266--3280}, 6 | publisher = {Curran Associates, Inc.}, 7 | title = {{SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems}}, 8 | year = {2019} 9 | } 10 | 11 | 12 | @inproceedings{Levesque.et.al.2012, 13 | address = {Rome, Italy}, 14 | author = {Levesque, Hector J. and Davis, Ernest and Morgenstern, Leora}, 15 | booktitle = {Proceedings of the Thirteenth International Conference on Principles of Knowledge Representation and Reasoning}, 16 | pages = {552--561}, 17 | publisher = {Association for the Advancement of Artificial Intelligence}, 18 | title = {{The Winograd Schema Challenge}}, 19 | year = {2012} 20 | } 21 | 22 | @article{Dagan.et.al.2009.NLE, 23 | author = {Dagan, Ido and Dolan, BIll and Magnini, Bernardo and Roth, Dan}, 24 | doi = {10.1017/S1351324909990209}, 25 | journal = {Natural Language Engineering}, 26 | number = {4}, 27 | pages = {1--27}, 28 | title = {{Recognizing textual entailment: Rational, evaluation and approaches}}, 29 | volume = {15}, 30 | year = {2009} 31 | } 32 | 33 | @inproceedings{Maas.et.al.2011, 34 | address = {Portland, Oregon}, 35 | author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher}, 36 | booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies}, 37 | pages = {142--150}, 38 | publisher = {Association for Computational Linguistics}, 39 | title = {{Learning Word Vectors for Sentiment Analysis}}, 40 | url = {https://aclanthology.org/P11-1015}, 41 | year = {2011} 42 | } 43 | 44 | 45 | @inproceedings{Bowman.et.al.2015, 46 | address = {Lisbon, Portugal}, 47 | author = {Bowman, Samuel R. 
and Angeli, Gabor and Potts, Christopher and Manning, Christopher D.}, 48 | booktitle = {Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing}, 49 | doi = {10.18653/v1/D15-1075}, 50 | pages = {632--642}, 51 | publisher = {Association for Computational Linguistics}, 52 | title = {{A large annotated corpus for learning natural language inference}}, 53 | url = {http://aclweb.org/anthology/D15-1075}, 54 | year = {2015} 55 | } 56 | 57 | @article{Habernal.et.al.2023.AILaw, 58 | title = {{Mining Legal Arguments in Court Decisions}}, 59 | author = {\textbf{Habernal}, \textbf{Ivan} and Faber, Daniel and Recchia, Nicola and Bretthauer, Sebastian and Gurevych, Iryna and Spiecker gennant Döhmann, Indra and Burchard, Christoph}, 60 | year = 2023, 61 | journal = {Artificial Intelligence \& Law}, 62 | pages = {(to appear)} 63 | } 64 | 65 | @article{Artstein.Poesio.2008.CoLi, 66 | author = {Artstein, Ron and Poesio, Massimo}, 67 | doi = {10.1162/coli.07-034-R2}, 68 | journal = {Computational Linguistics}, 69 | number = {4}, 70 | pages = {555--596}, 71 | title = {{Inter-Coder Agreement for Computational Linguistics}}, 72 | volume = {34}, 73 | year = {2008} 74 | } 75 | 76 | 77 | @inproceedings{TjongKimSang.DeMeulder.2003, 78 | author = {{Tjong Kim Sang}, Erik F. and {De Meulder}, Fien}, 79 | booktitle = {Proceedings of the Seventh Conference on Natural Language Learning at HLT-NAACL 2003}, 80 | pages = {142--147}, 81 | publisher = {https://aclanthology.org/W03-0419}, 82 | title = {{Introduction to the CoNLL-2003 Shared Task: Language-Independent Named Entity Recognition}}, 83 | year = {2003} 84 | } 85 | 86 | 87 | @inproceedings{Clark.et.al.2019.NAACL, 88 | address = {Minneapolis, Minnesota}, 89 | author = {Clark, Christopher and Lee, Kenton and Chang, Ming-wei and Kwiatkowski, Tom and Collins, Michael and Toutanova, Kristina}, 90 | booktitle = {Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)}, 91 | doi = {10.18653/v1/N19-1300}, 92 | pages = {2924--2936}, 93 | publisher = {Association for Computational Linguistics}, 94 | title = {{BoolQ: Exploring the Surprising Difficulty of Natural Yes/No Questions}}, 95 | year = {2019} 96 | } 97 | 98 | 99 | @inproceedings{Khashabi.et.al.2018.NAACL, 100 | address = {New Orleans, LA}, 101 | author = {Khashabi, Daniel and Chaturvedi, Snigdha and Roth, Michael and Upadhyay, Shyam and Roth, Dan}, 102 | booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)}, 103 | doi = {10.18653/v1/N18-1023}, 104 | pages = {252--262}, 105 | publisher = {Association for Computational Linguistics}, 106 | title = {{Looking Beyond the Surface: A Challenge Set for Reading Comprehension over Multiple Sentences}}, 107 | year = {2018} 108 | } 109 | 110 | 111 | @inproceedings{Bojar.et.al.2018.WMT, 112 | address = {Brussels, Belgium}, 113 | author = {Bojar, Ondřej and Federmann, Christian and Fishel, Mark and Graham, Yvette and Haddow, Barry and Koehn, Philipp and Monz, Christof}, 114 | booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers}, 115 | doi = {10.18653/v1/W18-6401}, 116 | pages = {272--303}, 117 | publisher = {Association for Computational Linguistics}, 118 | title = {{Findings of the 2018 Conference on Machine Translation (WMT18)}}, 119 | volume = {2}, 120 | year = 
{2018} 121 | } 122 | 123 | 124 | @book{Koehn.2020, 125 | author = {Philipp Koehn}, 126 | title = {Neural Machine Translation}, 127 | publisher = {Cambridge University Press}, 128 | year = {2020}, 129 | note = {(not freely available)} 130 | } 131 | 132 | @inproceedings{Hermann.et.al.2015.NeurIPS, 133 | author = {Hermann, Karl Moritz and Kocisky, Tomas and Grefenstette, Edward and Espeholt, Lasse and Kay, Will and Suleyman, Mustafa and Blunsom, Phil}, 134 | booktitle = {Proceedings of NeurIPS}, 135 | pages = {1--9}, 136 | publisher = {Curran Associates, Inc.}, 137 | title = {{Teaching Machines to Read and Comprehend}}, 138 | year = {2015} 139 | } 140 | 141 | 142 | @article{Raffel.et.al.2020.JMLR, 143 | author = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J.}, 144 | journal = {Journal of Machine Learning Research}, 145 | keywords = {attention-,multi-task learning,natural language processing,transfer learning}, 146 | number = {140}, 147 | pages = {1--67}, 148 | title = {{Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer}}, 149 | volume = {21}, 150 | year = {2020} 151 | } 152 | 153 | @book{Japkowicz.Shah.2011, 154 | title = {{Evaluating Learning Algorithms: A Classification Perspective}}, 155 | author = {Nathalie Japkowicz and Mohak Shah}, 156 | year = {2011}, 157 | publisher = {Cambridge University Press}, 158 | note = {(not freely available)}, 159 | } 160 | 161 | @inproceedings{Papineni.et.al.2002.ACL, 162 | address = {Philadelphia, PA}, 163 | author = {Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing}, 164 | booktitle = {Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics}, 165 | doi = {10.3115/1073083.1073135}, 166 | pages = {311--318}, 167 | publisher = {Association for Computational Linguistics}, 168 | title = {{BLEU: a Method for Automatic Evaluation of Machine Translation}}, 169 | year = {2002} 170 | } 171 | 172 | @inproceedings{Lin.2004, 173 | title = "{ROUGE}: A Package for Automatic Evaluation of Summaries", 174 | author = "Lin, Chin-Yew", 175 | booktitle = "Text Summarization Branches Out", 176 | year = "2004", 177 | address = "Barcelona, Spain", 178 | publisher = "Association for Computational Linguistics", 179 | url = "https://aclanthology.org/W04-1013", 180 | pages = "74--81", 181 | } 182 | 183 | @inproceedings{Plank.2022.EMNLP, 184 | address = {Abu Dhabi, United Arab Emirates}, 185 | author = {Plank, Barbara}, 186 | booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing}, 187 | pages = {10671--10682}, 188 | publisher = {Association for Computational Linguistics}, 189 | title = {{The “Problem” of Human Label Variation: On Ground Truth in Data, Modeling and Evaluation}}, 190 | url = {https://aclanthology.org/2022.emnlp-main.731}, 191 | year = {2022} 192 | } 193 | 194 | 195 | @inproceedings{Geva.et.al.2019.EMNLP, 196 | address = {Hong Kong, China}, 197 | author = {Geva, Mor and Goldberg, Yoav and Berant, Jonathan}, 198 | booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)}, 199 | doi = {10.18653/v1/D19-1107}, 200 | pages = {1161--1166}, 201 | publisher = {Association for Computational Linguistics}, 202 | title = {{Are We Modeling the Task or the Annotator? 
An Investigation of Annotator Bias in Natural Language Understanding Datasets}}, 203 | year = {2019} 204 | } 205 | 206 | 207 | @article{Sai.et.al.2023.CSUR, 208 | author = {Sai, Ananya B. and Mohankumar, Akash Kumar and Khapra, Mitesh M.}, 209 | doi = {10.1145/3485766}, 210 | journal = {ACM Computing Surveys}, 211 | number = {2}, 212 | pages = {1--39}, 213 | title = {{A Survey of Evaluation Metrics Used for NLG Systems}}, 214 | volume = {55}, 215 | year = {2023} 216 | } 217 | 218 | 219 | @inproceedings{Habernal.et.al.2018.NAACL.ARCT, 220 | author = {\textbf{Habernal}, \textbf{Ivan} and Wachsmuth, Henning and Gurevych, Iryna and Stein, Benno}, 221 | booktitle = {Proceedings of NAACL}, 222 | pages = {1930--1940}, 223 | title = {{The Argument Reasoning Comprehension Task: Identification and Reconstruction of Implicit Warrants}}, 224 | url = {http://aclweb.org/anthology/N18-1175}, 225 | address = {New Orleans, LA}, 226 | year = {2018} 227 | } 228 | 229 | 230 | @inproceedings{Niven.Kao.2019.ACL, 231 | address = {Florence, Italy}, 232 | author = {Niven, Timothy and Kao, Hung-Yu}, 233 | booktitle = {Proceedings of ACL}, 234 | pages = {4658--4664}, 235 | title = {{Probing Neural Network Comprehension of Natural Language Arguments}}, 236 | url = {https://www.aclweb.org/anthology/P19-1459}, 237 | year = {2019} 238 | } 239 | 240 | 241 | 242 | @article{Forman.Scholz.2009.SIGKDD, 243 | annote = {fundamental article for reporting f-measure}, 244 | author = {Forman, George and Scholz, Martin}, 245 | file = {:home/habi/Dokumenty/Mendeley Desktop/Forman, Scholz - 2010 - Apples-to-Apples in Cross-Validation Studies Pitfalls in Classifier Performance Measurement.pdf:pdf}, 246 | journal = {ACM SIGKDD Explorations Newsletter}, 247 | mendeley-groups = {evaluation}, 248 | number = {1}, 249 | pages = {49--57}, 250 | title = {{Apples-to-Apples in Cross-Validation Studies: Pitfalls in Classifier Performance Measurement}}, 251 | volume = {12}, 252 | year = {2010} 253 | } 254 | 255 | 256 | @article{Sokolova.Lapalme.2009, 257 | author = {Sokolova, Marina and Lapalme, Guy}, 258 | doi = {10.1016/j.ipm.2009.03.002}, 259 | journal = {Information Processing and Management}, 260 | number = {4}, 261 | pages = {427--437}, 262 | publisher = {Elsevier Ltd}, 263 | title = {{A systematic analysis of performance measures for classification tasks}}, 264 | volume = {45}, 265 | year = {2009} 266 | } 267 | 268 | 269 | @inproceedings{caglayan-etal-2020-curious, 270 | title = "Curious Case of Language Generation Evaluation Metrics: A Cautionary Tale", 271 | author = "Caglayan, Ozan and 272 | Madhyastha, Pranava and 273 | Specia, Lucia", 274 | booktitle = "Proceedings of COLING", 275 | year = "2020", 276 | doi = "10.18653/v1/2020.coling-main.210", 277 | pages = "2322--2328", 278 | } 279 | 280 | 281 | @inproceedings{Rajpurkar.et.al.2018.ACL, 282 | address = {Melbourne, Australia}, 283 | author = {Rajpurkar, Pranav and Jia, Robin and Liang, Percy}, 284 | booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)}, 285 | doi = {10.18653/v1/P18-2124}, 286 | pages = {784--789}, 287 | publisher = {Association for Computational Linguistics}, 288 | title = {{Know What You Don't Know: Unanswerable Questions for SQuAD}}, 289 | year = {2018} 290 | } 291 | 292 | 293 | @inproceedings{Zhang.et.al.2018.ACL, 294 | address = {Melbourne, Australia}, 295 | author = {Zhang, Saizheng and Dinan, Emily and Urbanek, Jack and Szlam, Arthur and Kiela, Douwe and Weston, Jason}, 296 | booktitle = 
{Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, 297 | doi = {10.18653/v1/P18-1205}, 298 | pages = {2204--2213}, 299 | publisher = {Association for Computational Linguistics}, 300 | title = {{Personalizing Dialogue Agents: I have a dog, do you have pets too?}}, 301 | year = {2018} 302 | } 303 | 304 | 305 | 306 | @book{Deisenroth.et.al.2021.book, 307 | title = {Mathematics for Machine Learning}, 308 | author = {Deisenroth, Marc Peter and Faisal, Aldo and Ong, Cheng Soon}, 309 | year = {2021}, 310 | publisher = {Cambridge University Press}, 311 | url = {mml-book.com}, 312 | } 313 | 314 | @book{Koller.Friedman.2009.book, 315 | title = {Probabilistic Graphical Models: Principles and Techniques}, 316 | author = {Koller, Daphne and Friedman, Nir}, 317 | publisher = {MIT Press}, 318 | year = {2009}, 319 | } 320 | 321 | @book{Goodfellow.et.al.2016.book, 322 | title={Deep Learning}, 323 | author={Ian Goodfellow and Yoshua Bengio and Aaron Courville}, 324 | publisher={MIT Press}, 325 | url={www.deeplearningbook.org}, 326 | year={2016} 327 | } 328 | 329 | @inproceedings{Iacobacci.et.al.2015.ACL, 330 | address = {Beijing, China}, 331 | author = {Iacobacci, Ignacio and Pilehvar, Mohammad Taher and Navigli, Roberto}, 332 | booktitle = {Proceedings of ACL}, 333 | doi = {10.3115/v1/P15-1010}, 334 | pages = {95--105}, 335 | publisher = {Association for Computational Linguistics}, 336 | title = {{SensEmbed: Learning Sense Embeddings for Word and Relational Similarity}}, 337 | year = {2015} 338 | } 339 | 340 | 341 | @inproceedings{Upadhyay.et.al.2016.ACL, 342 | address = {Berlin, Germany}, 343 | author = {Upadhyay, Shyam and Faruqui, Manaal and Dyer, Chris and Roth, Dan}, 344 | booktitle = {Proceedings of ACL}, 345 | doi = {10.18653/v1/P16-1157}, 346 | pages = {1661--1670}, 347 | title = {{Cross-lingual Models of Word Embeddings: An Empirical Comparison}}, 348 | year = {2016} 349 | } 350 | 351 | @inproceedings{Glavas.et.al.2019.ACL, 352 | address = {Florence, Italy}, 353 | author = {Glava{\v{s}}, Goran and Litschko, Robert and Ruder, Sebastian and Vuli{\'{c}}, Ivan}, 354 | booktitle = {Proceedings of ACL}, 355 | doi = {10.18653/v1/P19-1070}, 356 | pages = {710--721}, 357 | title = {{How to (Properly) Evaluate Cross-Lingual Word Embeddings: On Strong Baselines, Comparative Analyses, and Some Misconceptions}}, 358 | year = {2019} 359 | } 360 | 361 | 362 | @inproceedings{Vulic.Moens.2015.ACL, 363 | address = {Beijing, China}, 364 | author = {Vuli{\'{c}}, Ivan and Moens, Marie-Francine}, 365 | booktitle = {Proceedings of ACL (Volume 2: Short Papers)}, 366 | doi = {10.3115/v1/P15-2118}, 367 | pages = {719--725}, 368 | title = {{Bilingual Word Embeddings from Non-Parallel Document-Aligned Data Applied to Bilingual Lexicon Induction}}, 369 | year = {2015} 370 | } 371 | 372 | 373 | @inproceedings{Artetxe.et.al.2017.ACL, 374 | address = {Vancouver, Canada}, 375 | author = {Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko}, 376 | booktitle = {Proceedings of ACL}, 377 | doi = {10.18653/v1/P17-1042}, 378 | pages = {451--462}, 379 | title = {{Learning bilingual word embeddings with (almost) no bilingual data}}, 380 | year = {2017} 381 | } 382 | 383 | @inproceedings{Ling.et.al.2015.NAACL, 384 | address = {Denver, Colorado}, 385 | author = {Ling, Wang and Dyer, Chris and Black, Alan W and Trancoso, Isabel}, 386 | booktitle = {Proceedings of NAACL}, 387 | doi = {10.3115/v1/N15-1142}, 388 | pages = {1299--1304}, 389 | title = {{Two/Too Simple Adaptations of 
Word2Vec for Syntax Problems}}, 390 | year = {2015} 391 | } 392 | 393 | 394 | @inproceedings{Levy.Goldberg.2014.ACL, 395 | address = {Baltimore, MD, USA}, 396 | author = {Levy, Omer and Goldberg, Yoav}, 397 | booktitle = {Proceedings of ACL}, 398 | doi = {10.3115/v1/P14-2050}, 399 | pages = {302--308}, 400 | title = {{Dependency-Based Word Embeddings}}, 401 | year = {2014} 402 | } 403 | 404 | @article{Bojanowski.et.al.2017.TACL, 405 | author = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas}, 406 | doi = {10.1162/tacl_a_00051}, 407 | journal = {Transactions of the ACL}, 408 | pages = {135--146}, 409 | title = {{Enriching Word Vectors with Subword Information}}, 410 | volume = {5}, 411 | year = {2017} 412 | } 413 | 414 | 415 | @inproceedings{Madasu.AnveshRao.2019.EMNLP, 416 | address = {Hong Kong, China}, 417 | author = {Madasu, Avinash and {Anvesh Rao}, Vijjini}, 418 | booktitle = {Proceedings of EMNLP-IJCNLP}, 419 | doi = {10.18653/v1/D19-1567}, 420 | pages = {5657--5666}, 421 | publisher = {Association for Computational Linguistics}, 422 | title = {{Sequential Learning of Convolutional Features for Effective Text Classification}}, 423 | year = {2019} 424 | } 425 | 426 | 427 | @inproceedings{Kim.2014.EMNLP, 428 | address = {Doha, Qatar}, 429 | author = {Kim, Yoon}, 430 | booktitle = {Proceedings of EMNLP}, 431 | doi = {10.3115/v1/D14-1181}, 432 | pages = {1746--1751}, 433 | publisher = {Association for Computational Linguistics}, 434 | title = {{Convolutional Neural Networks for Sentence Classification}}, 435 | year = {2014} 436 | } 437 | 438 | @inproceedings{Devlin.et.al.2019.NAACL, 439 | address = {Minneapolis, Minnesota}, 440 | author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina}, 441 | booktitle = {Proceedings of NAACL}, 442 | doi = {10.18653/v1/N19-1423}, 443 | pages = {4171--4186}, 444 | publisher = {Association for Computational Linguistics}, 445 | title = {{BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}}, 446 | year = {2019} 447 | } 448 | 449 | @inproceedings{Gururangan.et.al.2018.NAACL.short, 450 | address = {New Orleans, LA}, 451 | author = {Gururangan, Suchin and Swayamdipta, Swabha and Levy, Omer and Schwartz, Roy and Bowman, Samuel and Smith, Noah A.}, 452 | booktitle = {Proceedings of NAACL}, 453 | doi = {10.18653/v1/N18-2017}, 454 | pages = {107--112}, 455 | publisher = {Association for Computational Linguistics}, 456 | title = {{Annotation Artifacts in Natural Language Inference Data}}, 457 | year = {2018} 458 | } 459 | 460 | @article{Goldberg.2016, 461 | author = {Goldberg, Yoav}, 462 | doi = {10.1613/jair.4992}, 463 | journal = {Journal of Artificial Intelligence Research}, 464 | pages = {345--420}, 465 | title = {{A Primer on Neural Network Models for Natural Language Processing}}, 466 | volume = {57}, 467 | year = {2016} 468 | } 469 | 470 | 471 | @inproceedings{Gehring.et.al.2017a.ICML, 472 | address = {Sydney, Australia}, 473 | author = {Gehring, Jonas and Auli, Michael and Grangier, David and Yarats, Denis and Dauphin, Yann N.}, 474 | booktitle = {Proceedings of the 34th International Conference on Machine Learning}, 475 | editor = {Precup, Doina and Teh, Yee Whye}, 476 | pages = {1243--1252}, 477 | publisher = {PMLR}, 478 | title = {{Convolutional Sequence to Sequence Learning}}, 479 | year = {2017} 480 | } 481 | 482 | 483 | @inproceedings{Krishnan.Manning.2006, 484 | address = {Sydney, Australia}, 485 | author = {Krishnan, Vijay and Manning, Christopher D.}, 486 | 
booktitle = {Proceedings of ACL}, 487 | doi = {10.3115/1220175.1220316}, 488 | pages = {1121--1128}, 489 | publisher = {Association for Computational Linguistics}, 490 | title = {{An Effective Two-Stage Model for Exploiting Non-Local Dependencies in Named Entity Recognition}}, 491 | year = {2006} 492 | } 493 | 494 | 495 | @inproceedings{artemova-etal-2021-teaching, 496 | title = "Teaching a Massive Open Online Course on Natural Language Processing", 497 | author = "Artemova, Ekaterina and 498 | Apishev, Murat and 499 | Kirianov, Denis and 500 | Sarkisyan, Veronica and 501 | Aksenov, Sergey and 502 | Serikov, Oleg", 503 | booktitle = "Proceedings of the Fifth Workshop on Teaching NLP", 504 | year = "2021", 505 | address = "Online", 506 | publisher = "Association for Computational Linguistics", 507 | url = "https://www.aclweb.org/anthology/2021.teachingnlp-1.2", 508 | pages = "13--27", 509 | } 510 | 511 | 512 | @inproceedings{Vaswani.et.al.2017, 513 | address = {Long Beach, CA, USA}, 514 | author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia}, 515 | booktitle = {Advances in Neural Information Processing Systems 30}, 516 | pages = {5998--6008}, 517 | publisher = {Curran Associates, Inc.}, 518 | title = {{Attention Is All You Need}}, 519 | year = {2017} 520 | } 521 | 522 | @article{Koehn.2017, 523 | author = {Koehn, Philipp}, 524 | title = {Neural Machine Translation}, 525 | journal = {arXiv preprint}, 526 | date = {2017}, 527 | url = {http://arxiv.org/abs/1709.07809} 528 | } 529 | 530 | 531 | @inproceedings{Schuster.Nakajima.2012, 532 | address = {Kyoto, Japan}, 533 | author = {Schuster, Mike and Nakajima, Kaisuke}, 534 | booktitle = {2012 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 535 | doi = {10.1109/ICASSP.2012.6289079}, 536 | pages = {5149--5152}, 537 | publisher = {IEEE}, 538 | title = {{Japanese and Korean voice search}}, 539 | year = {2012} 540 | } 541 | 542 | @article{Wu.et.al.2016.GoogleMT, 543 | author = {Wu, Yonghui and Schuster, Mike and Chen, Zhifeng and Le, Quoc V. 
and Norouzi, Mohammad and Macherey, Wolfgang and Krikun, Maxim and Cao, Yuan and Gao, Qin and Macherey, Klaus and Klingner, Jeff and Shah, Apurva and Johnson, Melvin and Liu, Xiaobing and Kaiser, {\L}ukasz and Gouws, Stephan and Kato, Yoshikiyo and Kudo, Taku and Kazawa, Hideto and Stevens, Keith and Kurian, George and Patil, Nishant and Wang, Wei and Young, Cliff and Smith, Jason and Riesa, Jason and Rudnick, Alex and Vinyals, Oriol and Corrado, Greg and Hughes, Macduff and Dean, Jeffrey}, 544 | pages = {1--23}, 545 | title = {{Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation}}, 546 | url = {http://arxiv.org/abs/1609.08144}, 547 | year = {2016}, 548 | journal = {arXiv preprint}, 549 | } 550 | 551 | 552 | @inproceedings{Sennrich.et.al.2016.ACL, 553 | address = {Berlin, Germany}, 554 | author = {Sennrich, Rico and Haddow, Barry and Birch, Alexandra}, 555 | booktitle = {Proceedings of ACL}, 556 | doi = {10.18653/v1/P16-1162}, 557 | pages = {1715--1725}, 558 | publisher = {Association for Computational Linguistics}, 559 | title = {{Neural Machine Translation of Rare Words with Subword Units}}, 560 | year = {2016} 561 | } 562 | 563 | @article{Caruana.1997, 564 | author = {Caruana, Rich}, 565 | doi = {10.1023/A:1007379606734}, 566 | journal = {Machine Learning}, 567 | number = {1}, 568 | pages = {41--75}, 569 | title = {{Multi-task Learning}}, 570 | volume = {28}, 571 | year = {1997} 572 | } 573 | 574 | @inproceedings{Sogaard.Goldberg.2016, 575 | address = {Berlin, Germany}, 576 | author = {S{\o}gaard, Anders and Goldberg, Yoav}, 577 | booktitle = {Proceedings of ACL}, 578 | doi = {10.18653/v1/P16-2038}, 579 | pages = {231--235}, 580 | publisher = {Association for Computational Linguistics}, 581 | title = {{Deep multi-task learning with low level tasks supervised at lower layers}}, 582 | year = {2016} 583 | } 584 | 585 | @inproceedings{Conneau.et.al.2017.EMNLP, 586 | address = {Copenhagen, Denmark}, 587 | author = {Conneau, Alexis and Kiela, Douwe and Schwenk, Holger and Barrault, Lo{\"{i}}c and Bordes, Antoine}, 588 | booktitle = {Proceedings of EMNLP}, 589 | pages = {670--680}, 590 | title = {{Supervised Learning of Universal Sentence Representations from Natural Language Inference Data}}, 591 | year = {2017} 592 | } 593 | 594 | @article{Rogers.et.al.2020.BERT, 595 | author = {Rogers, Anna and Kovaleva, Olga and Rumshisky, Anna}, 596 | doi = {10.1162/tacl_a_00349}, 597 | journal = {Transactions of the Association for Computational Linguistics}, 598 | pages = {842--866}, 599 | title = {{A Primer in BERTology: What We Know About How BERT Works}}, 600 | volume = {8}, 601 | year = {2020} 602 | } 603 | 604 | 605 | @inproceedings{Kingma.Ba.2015, 606 | address = {San Diego, CA, USA}, 607 | author = {Kingma, Diederik P.
and Ba, Jimmy Lei}, 608 | booktitle = {3rd International Conference on Learning Representations, ICLR 2015}, 609 | editor = {Bengio, Yoshua and LeCun, Yann}, 610 | pages = {1--15}, 611 | title = {{Adam: A Method for Stochastic Optimization}}, 612 | year = {2015}, 613 | url = {https://arxiv.org/abs/1412.6980}, 614 | } 615 | 616 | @article{Bengio.et.al.2003.JMLR, 617 | author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal and Jauvin, Christian}, 618 | journal = {Journal of Machine Learning Research}, 619 | pages = {1137--1155}, 620 | title = {{A Neural Probabilistic Language Model}}, 621 | volume = {3}, 622 | year = {2003}, 623 | url = {https://research.jmlr.org/papers/v3/bengio03a.html}, 624 | } 625 | 626 | 627 | @book{Kun.2020, 628 | author = {Jeremy Kun}, 629 | edition = {2}, 630 | title = {A Programmer’s Introduction to Mathematics}, 631 | url = {https://pimbook.org}, 632 | year = {2020}, 633 | } 634 | 635 | @book{Goldberg.2017, 636 | author = {Goldberg, Yoav}, 637 | title = {Neural Network Methods for Natural Language Processing}, 638 | year = {2017}, 639 | publisher = {Morgan \& Claypool}, 640 | 641 | } 642 | 643 | 644 | @inproceedings{Kudo.Richardson.2018.EMNLP, 645 | title = "{S}entence{P}iece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing", 646 | author = "Kudo, Taku and 647 | Richardson, John", 648 | booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations", 649 | year = "2018", 650 | address = "Brussels, Belgium", 651 | publisher = "Association for Computational Linguistics", 652 | doi = "10.18653/v1/D18-2012", 653 | pages = "66--71", 654 | } 655 | 656 | @article{kudo2018subword, 657 | title={Subword regularization: Improving neural network translation models with multiple subword candidates}, 658 | author={Kudo, Taku}, 659 | journal={arXiv preprint arXiv:1804.10959}, 660 | year={2018} 661 | } 662 | 663 | @article{bahdanau2014neural, 664 | title={Neural machine translation by jointly learning to align and translate}, 665 | author={Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua}, 666 | journal={arXiv preprint arXiv:1409.0473}, 667 | year={2014} 668 | } -------------------------------------------------------------------------------- /latex/lecture01/.gitignore: -------------------------------------------------------------------------------- 1 | dl4nlp2023-lecture01.pdf 2 | flags.tex 3 | pdf 4 | -------------------------------------------------------------------------------- /latex/lecture01/compile-pdf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Inspired by: https://tex.stackexchange.com/a/1501 4 | 5 | # current lecture file name 6 | lecture_filename="dl4nlp2023-lecture01" 7 | 8 | # create the output folder (might already exist) 9 | mkdir -p pdf 10 | 11 | # Compile the lecture version with pauses 12 | 13 | # set empty flags 14 | echo "" > flags.tex 15 | 16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 17 | biber pdf/${lecture_filename}-lecture.bcf 18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 20 | 21 | # Compile the handout (no slide unfolding) 22 | 23 | # set the flag 24 | echo "\handouttrue" > flags.tex 25 | 26 | lualatex --jobname=${lecture_filename}-handout 
--output-directory=pdf ${lecture_filename}.tex 27 | biber pdf/${lecture_filename}-handout.bcf 28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 30 | 31 | # Cleaning up temporary latex files 32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache 33 | -------------------------------------------------------------------------------- /latex/lecture01/img/arct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/arct.png -------------------------------------------------------------------------------- /latex/lecture01/img/cc-by-sa-icon.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/cc-by-sa-icon.pdf -------------------------------------------------------------------------------- /latex/lecture01/img/dial1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/dial1.png -------------------------------------------------------------------------------- /latex/lecture01/img/hfdata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/hfdata.png -------------------------------------------------------------------------------- /latex/lecture01/img/logo-trusthlt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/logo-trusthlt.pdf -------------------------------------------------------------------------------- /latex/lecture01/img/mt2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/mt2.png -------------------------------------------------------------------------------- /latex/lecture01/img/mtex.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/mtex.jpg -------------------------------------------------------------------------------- /latex/lecture01/img/nlg1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/nlg1.png -------------------------------------------------------------------------------- /latex/lecture01/img/nlg2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture01/img/nlg2.png -------------------------------------------------------------------------------- /latex/lecture02/.gitignore: -------------------------------------------------------------------------------- 1 | dl4nlp2023-lecture02.pdf 2 | flags.tex 3 | pdf 4 | -------------------------------------------------------------------------------- /latex/lecture02/compile-pdf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Inspired by: https://tex.stackexchange.com/a/1501 4 | 5 | # current lecture file name 6 | lecture_filename="dl4nlp2023-lecture02" 7 | 8 | # create the output folder (might already exist) 9 | mkdir -p pdf 10 | 11 | # Compile the lecture version with pauses 12 | 13 | # set empty flags 14 | echo "" > flags.tex 15 | 16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 17 | biber pdf/${lecture_filename}-lecture.bcf 18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 20 | 21 | # Compile the handout (no slide unfolding) 22 | 23 | # set the flag 24 | echo "\handouttrue" > flags.tex 25 | 26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 27 | biber pdf/${lecture_filename}-handout.bcf 28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 30 | 31 | # Cleaning up temporary latex files 32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache 33 | -------------------------------------------------------------------------------- /latex/lecture02/img/backprop-my.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop-my.pdf -------------------------------------------------------------------------------- /latex/lecture02/img/backprop01.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop01.pdf -------------------------------------------------------------------------------- /latex/lecture02/img/backprop02.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop02.pdf -------------------------------------------------------------------------------- /latex/lecture02/img/backprop03.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop03.pdf -------------------------------------------------------------------------------- /latex/lecture02/img/backprop04.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop04.pdf -------------------------------------------------------------------------------- /latex/lecture02/img/backprop05.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop05.pdf -------------------------------------------------------------------------------- /latex/lecture02/img/backprop06.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop06.pdf -------------------------------------------------------------------------------- /latex/lecture02/img/backprop07.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop07.pdf -------------------------------------------------------------------------------- /latex/lecture02/img/backprop08.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop08.pdf -------------------------------------------------------------------------------- /latex/lecture02/img/backprop09.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop09.pdf -------------------------------------------------------------------------------- /latex/lecture02/img/backprop10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/backprop10.pdf -------------------------------------------------------------------------------- /latex/lecture02/img/cc-by-sa-icon.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/cc-by-sa-icon.pdf -------------------------------------------------------------------------------- /latex/lecture02/img/desmos-graph1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/desmos-graph1.pdf -------------------------------------------------------------------------------- /latex/lecture02/img/desmos-graph1.svg: -------------------------------------------------------------------------------- 1 | 2 | 13 | 35 | 37 | 40 | 42 | 44 | 55 | 66 | 77 | 88 | 99 | 110 | 121 | 132 | 143 | 154 | 165 | 176 | 187 | 197 | 207 | 217 | 227 | 237 | 247 | 257 | 268 | 279 | 289 | 291 | 293 | X axis 295 | 297 | 308 | 309 | 310 | 312 | 313 | 314 | 316 | 318 | Expression 1 320 | 327 | 329 | 342 | 343 
| 344 | 345 | 347 | 348 | 349 | -------------------------------------------------------------------------------- /latex/lecture02/img/gradient1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/gradient1.pdf -------------------------------------------------------------------------------- /latex/lecture02/img/logo-trusthlt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/logo-trusthlt.pdf -------------------------------------------------------------------------------- /latex/lecture02/img/parent-child.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/parent-child.pdf -------------------------------------------------------------------------------- /latex/lecture02/img/rosenbrock.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture02/img/rosenbrock.pdf -------------------------------------------------------------------------------- /latex/lecture03/.gitignore: -------------------------------------------------------------------------------- 1 | dl4nlp2023-lecture03.pdf 2 | flags.tex 3 | pdf 4 | -------------------------------------------------------------------------------- /latex/lecture03/compile-pdf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Inspired by: https://tex.stackexchange.com/a/1501 4 | 5 | # current lecture file name 6 | lecture_filename="dl4nlp2023-lecture03" 7 | 8 | # create the output folder (might already exist) 9 | mkdir -p pdf 10 | 11 | # Compile the lecture version with pauses 12 | 13 | # set empty flags 14 | echo "" > flags.tex 15 | 16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 17 | biber pdf/${lecture_filename}-lecture.bcf 18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 20 | 21 | # Compile the handout (no slide unfolding) 22 | 23 | # set the flag 24 | echo "\handouttrue" > flags.tex 25 | 26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 27 | biber pdf/${lecture_filename}-handout.bcf 28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 30 | 31 | # Cleaning up temporary latex files 32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache 33 | -------------------------------------------------------------------------------- /latex/lecture03/img/cc-by-sa-icon.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture03/img/cc-by-sa-icon.pdf -------------------------------------------------------------------------------- /latex/lecture03/img/logo-trusthlt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture03/img/logo-trusthlt.pdf -------------------------------------------------------------------------------- /latex/lecture04/.gitignore: -------------------------------------------------------------------------------- 1 | dl4nlp2023-lecture04.pdf 2 | flags.tex 3 | pdf 4 | -------------------------------------------------------------------------------- /latex/lecture04/compile-pdf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Inspired by: https://tex.stackexchange.com/a/1501 4 | 5 | # current lecture file name 6 | lecture_filename="dl4nlp2023-lecture04" 7 | 8 | # create the output folder (might already exist) 9 | mkdir -p pdf 10 | 11 | # Compile the lecture version with pauses 12 | 13 | # set empty flags 14 | echo "" > flags.tex 15 | 16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 17 | biber pdf/${lecture_filename}-lecture.bcf 18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 20 | 21 | # Compile the handout (no slide unfolding) 22 | 23 | # set the flag 24 | echo "\handouttrue" > flags.tex 25 | 26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 27 | biber pdf/${lecture_filename}-handout.bcf 28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 30 | 31 | # Cleaning up temporary latex files 32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache 33 | -------------------------------------------------------------------------------- /latex/lecture04/dl4nlp2023-lecture04.tex: -------------------------------------------------------------------------------- 1 | % !TeX program = lualatex 2 | % !BIB program = biber 3 | % Lualatex is important to render Fira fonts; with pdflatex it's just the regular one 4 | % ratio 16:9 -- https://tex.stackexchange.com/questions/14336/ 5 | 6 | % compile two versions, inspired by https://tex.stackexchange.com/a/1501 7 | % use the script "compile-pdf.sh" 8 | \newif\ifhandout 9 | % if flags.tex does not exist, create an empty file to be able to compile in TeXstudio 10 | \input{flags} 11 | 12 | \ifhandout 13 | \documentclass[12pt,aspectratio=169,handout]{beamer} 14 | \else 15 | \documentclass[12pt,aspectratio=169]{beamer} 16 | \fi 17 | 18 | % adjust for 16:9 19 | % https://tex.stackexchange.com/questions/354022/modifying-the-margins-of-all-slides-in-beamer 20 | \setbeamersize{text margin left=0.3cm,text margin right=4.5cm} 21 | 22 | %\usepackage{xcolor} 23 | 24 | %%% better TOC 25 | \usetheme[subsectionpage=progressbar]{metropolis} 26 | 27 | % name in footer 28 | \setbeamertemplate{frame 
numbering}{\insertframenumber ~ | Dr.\ Ivan Habernal} 29 | 30 | % blocks with background globally 31 | \metroset{block=fill} 32 | 33 | % adjust the background to be completely white 34 | \setbeamercolor{background canvas}{bg=white} 35 | 36 | % typeset mathematics on serif 37 | \usefonttheme[onlymath]{serif} 38 | 39 | % better bibliography using biber as backend 40 | \usepackage[natbib=true,backend=biber,style=authoryear-icomp,maxbibnames=30,maxcitenames=9,uniquelist=false,giveninits=true,doi=false,url=false,dashed=false,isbn=false]{biblatex} 41 | % shared bibliography 42 | \addbibresource{../dl4nlp-bibliography.bib} 43 | % disable "ibid" for repeated citations 44 | \boolfalse{citetracker} 45 | 46 | \definecolor{76abdf}{RGB}{118, 171, 223} 47 | 48 | \setbeamercolor{frametitle}{bg=76abdf, fg=white} 49 | 50 | \usepackage{xspace} 51 | 52 | 53 | % for derivatives, https://tex.stackexchange.com/a/412442 54 | \usepackage{physics} 55 | 56 | \usepackage{tikz} 57 | \usetikzlibrary{matrix, positioning} 58 | \usetikzlibrary{angles,quotes} % for angles 59 | \usetikzlibrary{backgrounds} % background 60 | \usetikzlibrary{decorations.pathreplacing} % curly braces 61 | \usetikzlibrary{calligraphy} 62 | \usetikzlibrary{calc} % for neural nets 63 | 64 | % for plotting functions 65 | \usepackage{pgfplots} 66 | \usepgfplotslibrary{dateplot} 67 | 68 | % sub-figures 69 | \usepackage{caption} 70 | \usepackage{subcaption} 71 | 72 | % book tabs 73 | \usepackage{booktabs} 74 | 75 | 76 | % show TOC at every section start 77 | \AtBeginSection{ 78 | \frame{ 79 | \vspace{2em} 80 | \sectionpage 81 | \hspace*{2.2em}\begin{minipage}{10cm} 82 | \tableofcontents[currentsection] 83 | \end{minipage} 84 | } 85 | } 86 | 87 | % argmin, argmax 88 | \usepackage{amsmath} 89 | \DeclareMathOperator*{\argmax}{arg\!\max} 90 | \DeclareMathOperator*{\argmin}{arg\!\min} 91 | % softmax 92 | \DeclareMathOperator*{\softmax}{soft\!\max} 93 | 94 | % bold math 95 | \usepackage{bm} 96 | 97 | % for \mathclap 98 | \usepackage{mathtools} 99 | 100 | % algorithms 101 | \usepackage[noend]{algpseudocode} 102 | 103 | 104 | % for neurons and layers in tikz 105 | \tikzset{ 106 | neuron/.style={draw, circle, inner sep=0pt, minimum width=0.75cm, fill=blue!20}, 107 | param/.style={draw, circle, inner sep=0pt, minimum width=0.75cm, fill=green!20}, 108 | constant/.style={draw, circle, inner sep=0pt, minimum width=0.75cm, fill=black!15}, 109 | } 110 | 111 | 112 | \title{Deep Learning for Natural Language Processing} 113 | \subtitle{Lecture 4 --- Text classification 2: Deep neural networks} 114 | \date{May 2, 2023} 115 | \author{Dr.\ Ivan Habernal} 116 | \institute{Trustworthy Human Language Technologies \hfill \includegraphics[height=.8cm]{img/logo-trusthlt.pdf} \\ 117 | Department of Computer Science\\ 118 | Technical University of Darmstadt \hfill \texttt{www.trusthlt.org} } 119 | %\titlegraphic{\hfill } 120 | 121 | \begin{document} 122 | 123 | \maketitle 124 | 125 | 126 | \section{Where we finished last time} 127 | 128 | \begin{frame}{Our binary text classification function} 129 | 130 | Linear function through sigmoid --- log-linear model 131 | $$ 132 | \hat{y} = \sigma(f(\bm{x})) = \frac{1}{1 + \exp(- (\bm{x} \cdot \bm{w} + b))} 133 | $$ 134 | 135 | \begin{figure} 136 | \begin{tikzpicture} 137 | %\node (a1) [draw, circle, inner sep=0pt, minimum width=0.75cm, fill=green!20] {$a_1$}; 138 | \node (x) [constant] {$\bm{x}$}; 139 | \node (w) [param, below of=x] {$\bm{w}$}; 140 | \node (b) [param, below of=w] {$b$}; 141 | 142 | \node (f) [neuron, right of=w, 
xshift=1.5cm] {$\bm{x} \cdot \bm{w} + b$}; 143 | \node (s) [neuron, right of=f, xshift=1.5cm] {$\sigma$}; 144 | 145 | \begin{scope}[thick, black, ->, >=latex] 146 | \draw (x) -- (f); 147 | \draw (w) -- (f); 148 | \draw (b) -- (f); 149 | \draw (f) -- (s); 150 | \end{scope} 151 | \end{tikzpicture} 152 | \caption{Computational graph; green circles are trainable parameters, gray are inputs} 153 | \end{figure} 154 | 155 | \end{frame} 156 | 157 | \begin{frame}{Decision rule of log-linear model} 158 | 159 | Log-linear model 160 | $ 161 | \hat{y} = \sigma(f(\bm{x})) = \frac{1}{1 + \exp(- (\bm{x} \cdot \bm{w} + b))} 162 | $ 163 | 164 | \begin{itemize} 165 | \item Prediction = 1 if $\hat{y} > 0.5$ 166 | \item Prediction = 0 if $\hat{y} < 0.5$ 167 | \end{itemize} 168 | 169 | \bigskip 170 | 171 | Natural interpretation: Conditional probability of prediction = 1 given the input $\bm{x}$ 172 | $$ 173 | \begin{aligned} 174 | \sigma(f(\bm{x})) &= \Pr(\text{prediction} = 1 | \bm{x}) \\ 175 | 1 - \sigma(f(\bm{x})) &= \Pr(\text{prediction} = 0 | \bm{x}) 176 | \end{aligned} 177 | $$ 178 | 179 | \end{frame} 180 | 181 | \section{Finding the best model's parameters} 182 | 183 | \begin{frame}{The loss function} 184 | 185 | Loss function: Quantifies the loss suffered when predicting $\hat{y}$ while the true label is $y$ for a single example. In binary classification: \pause 186 | $$ 187 | L(\hat{y}, y): \mathbb{R}^2 \to \mathbb{R} 188 | $$ 189 | 190 | \pause 191 | Given a labeled training set 192 | $(\bm{x}_{1:n}, \bm{y}_{1:n})$, 193 | a per-instance loss function $L$ and a 194 | parameterized function $f(\bm{x}; \Theta)$ we define the corpus-wide loss with respect to the parameters $\Theta$ as the average loss over all training examples \pause 195 | $$ 196 | \mathcal{L}(\Theta) = \frac{1}{n} \sum_{i =1}^{n} L (f(\bm{x}_i; \Theta), y_i) 197 | $$ 198 | \end{frame} 199 | 200 | \begin{frame}{Training as optimization} 201 | $$ 202 | \mathcal{L}(\Theta) = \frac{1}{n} \sum_{i =1}^{n} L (f(\bm{x}_i; \Theta), y_i) 203 | $$ 204 | 205 | The training examples are fixed, and the values of the parameters determine the loss 206 | 207 | \pause 208 | The goal of the training algorithm is to set the values of the parameters $\Theta$‚ such that 209 | the value of $\mathcal{L}$ is minimized \pause 210 | $$ 211 | \hat{\Theta} = \argmin_{\Theta} \mathcal{L}(\Theta) = \argmin_{\Theta} \frac{1}{n} \sum_{i =1}^{n} L (f(\bm{x}_i; \Theta), y_i) 212 | $$ 213 | 214 | 215 | \end{frame} 216 | 217 | \begin{frame}{Binary cross-entropy loss (logistic loss)} 218 | $$ 219 | L_{\text{logistic}} = - y \log \hat{y} - (1 - y) \log (1 - \hat{y}) 220 | $$ 221 | 222 | \pause 223 | \begin{block}{Partial derivative wrt.\ input $\hat{y}$} 224 | $$ 225 | \dv{L_{\text{Logistic}}}{\hat{y}} = 226 | - \left( 227 | \frac{y}{\hat{y}} - \frac{1 - y}{1 - \hat{y}} 228 | \right) 229 | = 230 | - \frac{y - \hat{y}}{ \hat{y} (1 - \hat{y})} 231 | $$ 232 | \end{block} 233 | 234 | \end{frame} 235 | 236 | \begin{frame}{Full computational graph} 237 | \begin{figure} 238 | \begin{tikzpicture} 239 | %\node (a1) [draw, circle, inner sep=0pt, minimum width=0.75cm, fill=green!20] {$a_1$}; 240 | \node (x) [constant] {$\bm{x}$}; 241 | \node (w) [param, below of=x] {$\bm{w}$}; 242 | \node (b) [param, below of=w] {$b$}; 243 | 244 | \node (f) [neuron, right of=w, xshift=1.5cm] {$\bm{x} \cdot \bm{w} + b$}; 245 | \node (s) [neuron, right of=f, xshift=1.5cm] {$\sigma$}; 246 | 247 | \node (l) [neuron, right of=s, xshift=1cm] {$L$}; 248 | \node (y) [constant, below of=s] {$y$}; 249 | 250 | 
\begin{scope}[thick, black, ->, >=latex] 251 | \draw (x) -- (f); 252 | \draw (w) -- (f); 253 | \draw (b) -- (f); 254 | \draw (f) -- (s); 255 | \draw (s) -- (l); 256 | \draw (y) -- (l); 257 | \end{scope} 258 | \end{tikzpicture} 259 | \caption{Computational graph; green circles are trainable parameters, gray are constant inputs} 260 | \end{figure} 261 | 262 | How can we minimize this function? 263 | 264 | \pause 265 | \begin{itemize} 266 | \item Recall Lecture 2: (a) Gradient descent and (b) backpropagation 267 | \end{itemize} 268 | 269 | \end{frame} 270 | 271 | \begin{frame}{(Online) Stochastic Gradient Descent} 272 | 273 | \begin{algorithmic}[1] 274 | \Function{SGD}{$f(\bm{x}; \Theta)$, $(\bm{x}_1, \ldots, \bm{x}_n)$, $(\bm{y}_1, \ldots, \bm{y}_n)$, $L$} 275 | \While{stopping criteria not met} 276 | \State Sample a training example $\bm{x}_i, \bm{y}_i$ 277 | \State Compute the loss $L(f(\bm{x}_i; \Theta), \bm{y}_i)$ 278 | \State $\hat{\bm{g}} \gets$ gradient of $L(f(\bm{x}_i; \Theta), \bm{y}_i)$ wrt.\ $\Theta$ 279 | \State $\Theta \gets \Theta - \eta_t \hat{\bm{g}}$ 280 | \EndWhile 281 | \State \Return $\Theta$ 282 | \EndFunction 283 | \end{algorithmic} 284 | 285 | \pause 286 | Loss in line 4 is based on a \textbf{single training example} $\to$ a rough estimate of the corpus loss $\mathcal{L}$ we aim to minimize 287 | 288 | \pause 289 | The noise in the loss computation may result in inaccurate gradients 290 | 291 | \end{frame} 292 | 293 | 294 | 295 | \begin{frame}{Minibatch Stochastic Gradient Descent} 296 | 297 | \begin{algorithmic}[1] 298 | \Function{mbSGD}{$f(\bm{x}; \Theta)$, $(\bm{x}_1, \ldots, \bm{x}_n)$, $(\bm{y}_1, \ldots, \bm{y}_n)$, $L$} 299 | \While{stopping criteria not met} 300 | \State Sample $m$ examples $\{ (\bm{x}_1, \bm{y}_1), \ldots (\bm{x}_m, \bm{y}_m) \}$ 301 | \State $\hat{\bm{g}} \gets 0$ 302 | \For{$i = 1$ to $m$} 303 | \State Compute the loss $L(f(\bm{x}_i; \Theta), \bm{y}_i)$ 304 | \State $\hat{\bm{g}} \gets \hat{\bm{g}}\ + $ gradient of $\frac{1}{m} L(f(\bm{x}_i; \Theta), \bm{y}_i)$ wrt.\ $\Theta$ 305 | \EndFor 306 | \State $\Theta \gets \Theta - \eta_t \hat{\bm{g}}$ 307 | \EndWhile 308 | \State \Return $\Theta$ 309 | \EndFunction 310 | \end{algorithmic} 311 | 312 | 313 | \end{frame} 314 | 315 | \begin{frame}{Properties of Minibatch Stochastic Gradient Descent} 316 | 317 | The minibatch size can vary in size from $m = 1$ to $m = n$ 318 | 319 | Higher values provide better estimates of the corpus-wide gradients, while smaller values allow more updates and in turn faster convergence 320 | 321 | Lines 6+7: May be easily parallelized 322 | 323 | \end{frame} 324 | 325 | 326 | \section{Log-linear multi-class classification} 327 | 328 | \begin{frame}{From binary to multi-class labels} 329 | 330 | So far we mapped our gold label $y \in \{0, 1\}$ 331 | 332 | What if we classify into distinct categorical classes? 
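The minibatch SGD procedure above can be made concrete for the binary log-linear model with the logistic loss. The following NumPy sketch is illustrative only and is not part of the lecture code; the names (minibatch_sgd, eta, m) and the toy data are assumptions of this sketch, and the update uses the closed-form gradient (y_hat - y) * x that follows from the loss derivative shown earlier.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def minibatch_sgd(X, y, m=32, eta=0.1, epochs=10, seed=0):
    """Minibatch SGD for y_hat = sigmoid(x . w + b) with the logistic loss."""
    rng = np.random.default_rng(seed)
    n, d = X.shape
    w, b = np.zeros(d), 0.0
    for _ in range(epochs):
        order = rng.permutation(n)                  # reshuffle the training set each epoch
        for start in range(0, n, m):
            batch = order[start:start + m]
            y_hat = sigmoid(X[batch] @ w + b)       # forward pass on the minibatch
            err = (y_hat - y[batch]) / len(batch)   # dL/dz averaged over the minibatch
            w -= eta * (X[batch].T @ err)           # dL/dw = (y_hat - y) x
            b -= eta * err.sum()                    # dL/db = (y_hat - y)
    return w, b

# toy sanity check: a linearly separable AND-like labelling
X = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
y = np.array([0., 0., 0., 1.])
w, b = minibatch_sgd(X, y, m=2, eta=0.5, epochs=500)
print(sigmoid(X @ w + b).round(2))                  # predictions move towards [0, 0, 0, 1]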
333 | 334 | \begin{itemize} 335 | \item Categorical: There is no `ordering' 336 | \item Example: Classify the language of a document into 6 languages (En, Fr, De, It, Es, Other) 337 | \end{itemize} 338 | 339 | \pause 340 | \begin{block}{One-hot encoding of labels} 341 | $$ 342 | \begin{aligned} 343 | \text{En} &= \begin{pmatrix}1 & 0 & 0 & 0 & 0 & 0\end{pmatrix} \qquad 344 | \text{Fr} = \begin{pmatrix}0 & 1 & 0 & 0 & 0 & 0\end{pmatrix} \\ 345 | \text{De} &= \begin{pmatrix}0 & 0 & 1 & 0 & 0 & 0\end{pmatrix} \qquad \ldots \\ 346 | \bm{y} &\in \mathbb{R}^{d_{out}} \quad \text{where } d_{out} \text{ is the number of classes} 347 | \end{aligned} 348 | $$ 349 | \end{block} 350 | 351 | \end{frame} 352 | 353 | \begin{frame}{Possible solution: Six weight vectors and biases} 354 | 355 | Consider for each language $\ell \in \{\text{En}, \text{Fr}, \text{De}, \text{It}, \text{Es}, \text{Other}\}$ 356 | \begin{itemize} 357 | \item Weight vector $\bm{w}^{\ell}$ (e.g., $\bm{w}^{\text{Fr}})$ 358 | \item Bias $b^{\ell}$ (e.g., $b^{\text{Fr}})$ 359 | \end{itemize} 360 | \pause We can predict the language resulting in the highest score 361 | $$ 362 | \hat{y} = f(\bm{x}) = \argmax_{ 363 | \ell \in \{\text{En}, \text{Fr}, \text{De}, \text{It}, \text{Es}, \text{Other}\} 364 | } 365 | \bm{x} \cdot \bm{w}^{\ell} + b^{\ell} 366 | $$ 367 | 368 | \pause 369 | But we can re-arrange the $\bm{w} \in \mathbb{R}^{d_{in}}$ vectors into columns of a matrix $\bm{W} \in \mathbb{R}^{d_{in} \times 6}$ and $\bm{b} \in \mathbb{R}^6$, to get 370 | $$f(\bm{x}) = \bm{x} \bm{W} + \bm{b}$$ 371 | 372 | \end{frame} 373 | 374 | 375 | \begin{frame}{Projecting input vector to output vector $f(\bm{x}) : \mathbb{R}^{d_{in}} \to \mathbb{R}^{d_{out}}$} 376 | 377 | \pause 378 | \begin{block}{Recall from lecture 3: High-dimensional linear functions} 379 | Function $f(\bm{x}) : \mathbb{R}^{d_{in}} \to \mathbb{R}^{d_{out}}$ 380 | $$f(\bm{x}) = \bm{x} \bm{W} + \bm{b}$$ 381 | where 382 | $\bm{x} \in \mathbb{R}^{d_{in}} \qquad 383 | \bm{W} \in \mathbb{R}^{d_{in} \times d_{out}} \qquad 384 | \bm{b} \in \mathbb{R}^{d_{out}}$ 385 | \end{block} 386 | 387 | The simplest neural network --- a perceptron (simply a linear model) 388 | 389 | \begin{itemize} 390 | \item How to find the prediction $\hat{y}$? 391 | \end{itemize} 392 | 393 | \end{frame} 394 | 395 | \begin{frame}{Prediction of multi-class classifier} 396 | Project the input $\bm{x}$ to an output $\bm{y}$ 397 | $$\bm{\hat{y}} = f(\bm{x}) = \bm{x} \bm{W} + \bm{b}$$ 398 | and pick the element of $\bm{\hat{y}}$ with the highest value 399 | $$ 400 | \text{prediction} = \hat{y} = \argmax_{i} \bm{\hat{y}}_{[i]} 401 | $$ 402 | 403 | \begin{block}{Sanity check} 404 | What is $\hat{y}$? 
405 | 406 | \pause 407 | Index of $1$ in the one-hot 408 | 409 | For example, if $\hat{y} = 3$, then the document is in German 410 | $\text{De} = \begin{pmatrix}0 & 0 & 1 & 0 & 0 & 0\end{pmatrix}$ 411 | \end{block} 412 | 413 | \end{frame} 414 | 415 | \subsection{Representations} 416 | 417 | 418 | \begin{frame}{Two representations of the input document} 419 | $$\bm{\hat{y}} = \bm{x} \bm{W} + \bm{b}$$ 420 | 421 | Vector $\bm{x}$ is a document representation 422 | \begin{itemize} 423 | \item Bag of words, for example ($d_{in} = |V|$ dimensions, sparse) 424 | \end{itemize} 425 | 426 | Vector $\bm{\hat{y}}$ is \textbf{also} a document representation 427 | \begin{itemize} 428 | \item More compact (only 6 dimensions) 429 | \item More specialized for the language prediction task 430 | \end{itemize} 431 | 432 | \end{frame} 433 | 434 | \begin{frame}{Matrix $\bm{W}$ as learned representation --- columns} 435 | $\bm{\hat{y}} = \bm{x} \bm{W} + \bm{b} \quad \to$ two views of $\bm{W}$, as rows or as columns 436 | 437 | \begin{tabular}{r|cccccc} 438 | & En & Fr & De & It & Es & Ot \\ \midrule 439 | a & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ \\ 440 | at & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ \\ 441 | ... & & & & & & \\ 442 | zoo & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ \\ 443 | \end{tabular} 444 | 445 | \pause 446 | Each of the 6 columns (corresponding to a language) is a $d_{in}$-dimensional vector representation of this language in terms of its characteristic word unigram patterns (e.g., we can then cluster the 6 language vectors according to their similarity) 447 | 448 | 449 | \end{frame} 450 | 451 | \begin{frame}{Matrix $\bm{W}$ as learned representation --- rows} 452 | $\bm{\hat{y}} = \bm{x} \bm{W} + \bm{b}$ 453 | 454 | \begin{tabular}{r|cccccc} 455 | & En & Fr & De & It & Es & Ot \\ \midrule 456 | a & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ \\ 457 | at & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ \\ 458 | ... 
& & & & & & \\ 459 | zoo & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ & $\bullet$ \\ 460 | \end{tabular} 461 | 462 | Each of the $d_{in}$ rows corresponds to a particular unigram, and provides a 6-dimensional vector 463 | representation of that unigram in terms of the languages it prompts 464 | 465 | \end{frame} 466 | 467 | \begin{frame}{From bag-of-words to continuous bag-of-words} 468 | \begin{block}{Recall from lecture 3 --- Averaged bag of words} 469 | $$\bm{x} = \frac{1}{|D|} \sum_{i =1}^{|D|} \bm{x}^{D_{[i]}}$$ 470 | $D_{[i]}$ --- word in doc $D$ at position $i$, $\bm{x}^{D_{[i]}}$ --- one-hot vector 471 | \end{block} 472 | $$ 473 | \begin{aligned} 474 | \bm{\hat{y}} &= \bm{x} \bm{W} = \pause 475 | \left (\frac{1}{|D|} \sum_{i =1}^{|D|} \bm{x}^{D_{[i]}} \right) \bm{W} 476 | \pause = \frac{1}{|D|} \sum_{i =1}^{|D|} \left ( \bm{x}^{D_{[i]}} \bm{W} \right) \\ 477 | &= \pause \frac{1}{|D|} \sum_{i =1}^{|D|} \bm{W}^{D_{[i]}} 478 | \end{aligned} 479 | $$ 480 | (we ignore the bias $\bm{b}$ here) 481 | 482 | \end{frame} 483 | 484 | \begin{frame}{From bag-of-words to continuous bag-of-words (CBOW)} 485 | \begin{block}{Two equivalent views; $\bm{W}^{D_{[i]}}$ is the $D_{[i]}$-th row of matrix $\bm{W}$} 486 | $$ 487 | \bm{\hat{y}} = \frac{1}{|D|} \sum_{i =1}^{|D|} \bm{W}^{D_{[i]}} 488 | \qquad 489 | \bm{\hat{y}} = \left (\frac{1}{|D|} \sum_{i =1}^{|D|} \bm{x}^{D_{[i]}} \right) \bm{W} 490 | $$ 491 | \end{block} 492 | 493 | \pause 494 | The continuous-bag-of-words (CBOW) representation 495 | \begin{itemize} 496 | \item \pause Either by summing word-representation vectors 497 | \item \pause Or by multiplying a bag-of-words vector by a matrix in which each row corresponds to a dense word representation (also called \textbf{embedding matrix}) 498 | \end{itemize} 499 | 500 | \end{frame} 501 | 502 | \begin{frame}{Learned representations --- central to deep learning} 503 | Representations are central to deep learning 504 | 505 | One could argue that the main power of deep-learning is the ability to learn good representations 506 | \end{frame} 507 | 508 | 509 | \subsection{From multi-dimensional linear transformation to probabilities} 510 | 511 | \begin{frame}{Turning output vector into probabilities of classes} 512 | 513 | \begin{block}{Recap: Categorical probability distribution} 514 | Categorical random variable $X$ is defined over $K$ categories, typically mapped to natural numbers $1, 2, \ldots, K$, for example En = 1, De = 2, $\ldots$ 515 | 516 | \pause 517 | Each category parametrized with probability $\Pr(X = k) = p_k$ 518 | 519 | \pause 520 | Must be valid probability distribution: $\sum_{i =1}^{K} \Pr(X = i) = 1$ 521 | \end{block} 522 | 523 | \pause 524 | How to turn an \textbf{unbounded} vector in $\mathbb{R}^K$ into a categorical probability distribution? 
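The two equivalent CBOW views above can be checked numerically. The following NumPy sketch is illustrative only and not taken from the lecture materials; the vocabulary size, the number of output dimensions, and the document indices are made-up toy values.

import numpy as np

rng = np.random.default_rng(0)
V, d_out = 10, 6                    # toy vocabulary size and output dimension
W = rng.normal(size=(V, d_out))     # each row is a dense representation of one word
doc = [3, 7, 3, 1]                  # word indices D_[i] of a toy document D

# view 1: average the one-hot vectors x^{D_[i]}, then multiply by W
X_onehot = np.eye(V)[doc]           # |D| x V matrix of one-hot rows
y_hat_1 = X_onehot.mean(axis=0) @ W

# view 2: average the rows of W selected by the document's words
y_hat_2 = W[doc].mean(axis=0)

print(np.allclose(y_hat_1, y_hat_2))   # True: both views give the same CBOW vector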
525 | 526 | \end{frame} 527 | 528 | \begin{frame}{The softmax function $\softmax (\bm{x}): \mathbb{R}^K \to \mathbb{R}^K$} 529 | 530 | \begin{block}{Softmax} 531 | Applied element-wise, for each element $\bm{x}_{[i]}$ we have 532 | $$ 533 | \softmax (\bm{x}_{[i]}) = \frac{\exp(\bm{x}_{[i]})}{ 534 | \sum_{k=1}^{K} \exp(\bm{x}_{[k]}) 535 | } 536 | $$ 537 | \end{block} 538 | 539 | \pause 540 | \begin{itemize} 541 | \item Numerator: Non-linear bijection from $\mathbb{R}$ to $(0; \infty)$ 542 | \item Denominator: Normalizing constant to ensure $\sum_{j = 1}^{K} \softmax (\bm{x}_{[j]}) = 1$ 543 | \end{itemize} 544 | 545 | \pause 546 | We also need to know how to compute the partial derivative of $\softmax (\bm{x}_{[i]})$ wrt.\ each argument $\bm{x}_{[k]}$: $\pdv{\softmax (\bm{x}_{[i]})}{\bm{x}_{[k]}}$ 547 | 548 | \end{frame} 549 | 550 | 551 | \begin{frame}{Softmax can be smoothed with a `temperature' $T$} 552 | \vspace{-1em} 553 | $$ 554 | \softmax (\bm{x}_{[i]}; T) = \frac{ 555 | \exp(\frac{\bm{x}_{[i]}}{T}) 556 | }{ 557 | \sum_{k=1}^{K} \exp( 558 | \frac{\bm{x}_{[k]}}{T}) 559 | } 560 | $$ 561 | 562 | \pause 563 | \begin{block}{Example: Softmax of $\bm{x} = (3, 0, 1)$ at different $T$} 564 | \includegraphics[width=0.95\linewidth]{img/temperatures.png} 565 | 566 | High temperature $\to$ uniform distribution 567 | 568 | Low temperature $\to$ `spiky' distribution, all mass on the largest element 569 | 570 | \end{block} 571 | 572 | \begin{tikzpicture}[overlay, remember picture] 573 | \node at (current page.north east)[anchor = north east, text width=4cm, yshift=-1.3cm] {\scriptsize Figure: \fullcite[p.~103]{Murphy.2012} \par}; 574 | \end{tikzpicture} 575 | 576 | 577 | \end{frame} 578 | 579 | 580 | \section{Loss function for softmax} 581 | 582 | \begin{frame}{Categorical cross-entropy loss (aka.\ negative log likelihood)} 583 | 584 | Vector representing the gold-standard categorical distribution over the classes/labels $1, \ldots, K$: 585 | $$ 586 | \bm{y} = (\bm{y_{[1]}}, \bm{y}_{[2]}, \ldots, \bm{y}_{[K]}) 587 | $$ 588 | Output from softmax: 589 | $$ 590 | \bm{\hat{y}} = (\bm{\hat{y}_{[1]}}, \bm{\hat{y}}_{[2]}, \ldots, \bm{\hat{y}}_{[K]}) 591 | $$ 592 | which is in fact $\bm{\hat{y}_{[i]}} = \Pr(y = i| \bm{x})$ 593 | 594 | 595 | \begin{block}{Cross entropy loss} 596 | $$ 597 | L_{\text{cross-entropy}} (\bm{\hat{y}, \bm{y}}) = 598 | - \sum_{k = 1}^{K} \bm{y}_{[k]} \log \left( \bm{\hat{y}}_{[k]} \right) 599 | $$ 600 | \end{block} 601 | \end{frame} 602 | 603 | \begin{frame}{Background: K-L divergence (also known as \emph{relative entropy})} 604 | 605 | Let $Y$ and $\hat{Y}$ be categorical random variables over the same categories, with probability distributions $P(Y)$ and $Q(\hat{Y})$ 606 | \begin{align*} 607 | \mathbb{D}(P(Y) || Q(\hat{Y})) &= \mathbb{E}_{P(Y)} \left[ \log \frac{P(Y)}{Q(\hat{Y})} \right] \\ 608 | &= \mathbb{E}_{P(Y)} \left[ \log P(Y) - \log Q(\hat{Y}) \right] \\ 609 | &= \mathbb{E}_{P(Y)} \left[ \log P(Y)\right] - \mathbb{E}_{P(Y)} \left[ \log Q(\hat{Y}) \right] \\ 610 | &= - \mathbb{E}_{P(Y)} \left[ \log \frac{1}{P(Y)}\right] - \mathbb{E}_{P(Y)} \left[ \log Q(\hat{Y}) \right] \\ 611 | &= - \mathbb{H}_{P} (Y) - \mathbb{E}_{P(Y)} \left[ \log Q(\hat{Y}) \right] \\ 612 | \end{align*} 613 | 614 | \end{frame} 615 | 616 | 617 | 618 | \section{Stacking transformations and non-linearity} 619 | 620 | \begin{frame}{Stacking linear layers on top of each other --- still linear!} 621 | \vspace{-1em} 622 | $$ 623 | \bm{x} \in \mathbb{R}^{d_{in}} \qquad 624 | \bm{W^1} \in \mathbb{R}^{d_{in} \times
d_1} \qquad 625 | \bm{b^1} \in \mathbb{R}^{d_1} \qquad 626 | \bm{W^2} \in \mathbb{R}^{d_1 \times d_{out}} \qquad 627 | \bm{b^2} \in \mathbb{R}^{d_{out}} \qquad 628 | $$ 629 | $$ 630 | f(\bm{x}) = \left( 631 | \bm{x} \bm{W^1} + \bm{b^1} 632 | \right) 633 | \bm{W^2} + \bm{b^2} 634 | $$ 635 | 636 | \begin{figure} 637 | \begin{tikzpicture} 638 | %\node (a1) [draw, circle, inner sep=0pt, minimum width=0.75cm, fill=green!20] {$a_1$}; 639 | \node (x) [constant] {$\bm{x}$}; 640 | \node (w) [param, below of=x] {$\bm{W^1}$}; 641 | \node (b) [param, below of=w] {$\bm{b^1}$}; 642 | 643 | \node (f1) [neuron, right of=w, xshift=1.5cm] {$\bm{x} \bm{W^1} + \bm{b^1}$}; 644 | \node (f2) [neuron, right of=f1, xshift=1.5cm] {$\bm{h^1} \bm{W^2} + \bm{b^2}$}; 645 | 646 | \node (w2) [param, below of=f2, xshift=-1.5cm, yshift=0cm] {$\bm{W^2}$}; 647 | \node (b2) [param, below of=f2, xshift=-0.5cm, yshift=-0.5cm] {$\bm{b^2}$}; 648 | 649 | \node (l) [neuron, right of=f2, xshift=1cm] {$L$}; 650 | \node (y) [constant, below of=f2, xshift=1.5cm] {$\bm{y}$}; 651 | 652 | \begin{scope}[thick, black, ->, >=latex] 653 | \draw (x) -- (f1); 654 | \draw (w) -- (f1); 655 | \draw (b) -- (f1); 656 | \draw (f1) -- (f2); 657 | \draw (f2) -- (l); 658 | \draw (w2) -- (f2); 659 | \draw (b2) -- (f2); 660 | \draw (y) -- (l); 661 | \end{scope} 662 | \end{tikzpicture} 663 | \caption{Computational graph; green circles are trainable parameters, gray are constant inputs} 664 | \end{figure} 665 | 666 | \end{frame} 667 | 668 | 669 | \begin{frame}{Adding non-linear function $g: \mathbb{R}^{d_1} \to \mathbb{R}^{d_1}$} 670 | \vspace{-1em} 671 | $$ 672 | f(\bm{x}) = g \left( 673 | \bm{x} \bm{W^1} + \bm{b^1} 674 | \right) 675 | \bm{W^2} + \bm{b^2} 676 | $$ 677 | 678 | \begin{figure} 679 | \begin{tikzpicture} 680 | %\node (a1) [draw, circle, inner sep=0pt, minimum width=0.75cm, fill=green!20] {$a_1$}; 681 | \node (x) [constant] {$\bm{x}$}; 682 | \node (w) [param, below of=x] {$\bm{W^1}$}; 683 | \node (b) [param, below of=w] {$\bm{b^1}$}; 684 | 685 | \node (f1) [neuron, right of=w, xshift=1.5cm] {$\bm{x} \bm{W^1} + \bm{b^1}$}; 686 | 687 | \node (g) [neuron, right of=f1, xshift=1.5cm] {$g$}; 688 | \node (f2) [neuron, right of=g, xshift=1.5cm] {$\bm{h^1} \bm{W^2} + \bm{b^2}$}; 689 | 690 | \node (w2) [param, below of=f2, xshift=-1.5cm, yshift=0cm] {$\bm{W^2}$}; 691 | \node (b2) [param, below of=f2, xshift=-0.5cm, yshift=-0.5cm] {$\bm{b^2}$}; 692 | 693 | \node (l) [neuron, right of=f2, xshift=1cm] {$L$}; 694 | \node (y) [constant, below of=f2, xshift=1.5cm] {$\bm{y}$}; 695 | 696 | \begin{scope}[thick, black, ->, >=latex] 697 | \draw (x) -- (f1); 698 | \draw (w) -- (f1); 699 | \draw (b) -- (f1); 700 | \draw (f1) -- (g); 701 | \draw (g) -- (f2); 702 | \draw (f2) -- (l); 703 | \draw (w2) -- (f2); 704 | \draw (b2) -- (f2); 705 | \draw (y) -- (l); 706 | \end{scope} 707 | \end{tikzpicture} 708 | \caption{Computational graph; green circles are trainable parameters, gray are constant inputs} 709 | \end{figure} 710 | 711 | \end{frame} 712 | 713 | 714 | \begin{frame}{Non-linear function $g$: Rectified linear unit (ReLU) activation} 715 | 716 | 717 | \begin{columns} 718 | 719 | \begin{column}{0.6\linewidth} 720 | 721 | $$ 722 | \mathrm{ReLU}(z) = 723 | \begin{cases} 724 | 0 & \quad \text{if } z < 0\\ 725 | z & \quad \text{if } z \geq 0 726 | \end{cases} 727 | $$ 728 | 729 | or \hspace{0.4em} $\mathrm{ReLU}(z) = \max(0, z)$ 730 | 731 | 732 | 733 | 734 | \end{column} 735 | 736 | \begin{column}{0.4\linewidth} 737 | \begin{figure} 738 | \begin{tikzpicture} 739 | 740 | 
\begin{axis}[ 741 | xmin = -5, xmax = 5, 742 | ymin = -5, ymax = 5, 743 | xtick distance = 5, 744 | ytick distance = 5, 745 | grid = both, 746 | minor tick num = 5, 747 | major grid style = {lightgray}, 748 | minor grid style = {lightgray!25}, 749 | width = \textwidth, 750 | height = \textwidth, 751 | legend pos = north west 752 | ] 753 | 754 | \addplot[ 755 | domain = -5:0, 756 | samples = 10, 757 | smooth, 758 | thick, 759 | blue, 760 | ] {0}; 761 | 762 | \addplot[ 763 | domain = 0:5, 764 | samples = 10, 765 | smooth, 766 | thick, 767 | blue, 768 | ] {x}; 769 | 770 | 771 | \end{axis} 772 | 773 | \end{tikzpicture} 774 | \caption{ReLU function} 775 | \end{figure} 776 | \end{column} 777 | \end{columns} 778 | 779 | 780 | \end{frame} 781 | 782 | 783 | 784 | \section*{Recap} 785 | 786 | \begin{frame}{Take aways} 787 | 788 | \begin{itemize} 789 | \item Binary classification as a linear function of words and a sigmoid 790 | \item Binary cross-entropy (logistic) loss 791 | \item Training as minimizing the loss using minibatch SGD and backpropagation 792 | \item Stacking layers and non-linear functions: MLP 793 | \item ReLU as a go-to activation function in NLP 794 | \end{itemize} 795 | 796 | \end{frame} 797 | 798 | 799 | 800 | \begin{frame}{License and credits} 801 | 802 | \begin{columns} 803 | \begin{column}{0.7\textwidth} 804 | Licensed under Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) 805 | \end{column} 806 | \begin{column}{0.2\textwidth} 807 | \includegraphics[width=0.9\linewidth]{img/cc-by-sa-icon.pdf} 808 | \end{column} 809 | \end{columns} 810 | 811 | \bigskip 812 | 813 | Credits 814 | 815 | \begin{scriptsize} 816 | 817 | Ivan Habernal 818 | 819 | Content from ACL Anthology papers licensed under CC-BY \url{https://www.aclweb.org/anthology} 820 | 821 | \end{scriptsize} 822 | 823 | \end{frame} 824 | 825 | 826 | 827 | \end{document} 828 | 829 | -------------------------------------------------------------------------------- /latex/lecture04/img/cc-by-sa-icon.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture04/img/cc-by-sa-icon.pdf -------------------------------------------------------------------------------- /latex/lecture04/img/logo-trusthlt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture04/img/logo-trusthlt.pdf -------------------------------------------------------------------------------- /latex/lecture04/img/temperatures.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture04/img/temperatures.png -------------------------------------------------------------------------------- /latex/lecture05/.gitignore: -------------------------------------------------------------------------------- 1 | dl4nlp2023-lecture05.pdf 2 | flags.tex 3 | pdf 4 | -------------------------------------------------------------------------------- /latex/lecture05/compile-pdf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Inspired by: https://tex.stackexchange.com/a/1501 4 | 5 | # current lecture file name 6 | 
lecture_filename="dl4nlp2023-lecture05" 7 | 8 | # create the output folder (might already exist) 9 | mkdir -p pdf 10 | 11 | # Compile the lecture version with pauses 12 | 13 | # set empty flags 14 | echo "" > flags.tex 15 | 16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 17 | biber pdf/${lecture_filename}-lecture.bcf 18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 20 | 21 | # Compile the handout (no slide unfolding) 22 | 23 | # set the flag 24 | echo "\handouttrue" > flags.tex 25 | 26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 27 | biber pdf/${lecture_filename}-handout.bcf 28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 30 | 31 | # Cleaning up temporary latex files 32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache 33 | -------------------------------------------------------------------------------- /latex/lecture05/img/cc-by-sa-icon.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture05/img/cc-by-sa-icon.pdf -------------------------------------------------------------------------------- /latex/lecture05/img/linear1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture05/img/linear1.png -------------------------------------------------------------------------------- /latex/lecture05/img/linear2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture05/img/linear2.png -------------------------------------------------------------------------------- /latex/lecture05/img/linear3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture05/img/linear3.png -------------------------------------------------------------------------------- /latex/lecture05/img/linear4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture05/img/linear4.png -------------------------------------------------------------------------------- /latex/lecture05/img/logo-trusthlt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture05/img/logo-trusthlt.pdf -------------------------------------------------------------------------------- /latex/lecture05/img/xor1.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture05/img/xor1.pdf -------------------------------------------------------------------------------- /latex/lecture05/img/xor1.svg: -------------------------------------------------------------------------------- [SVG markup not preserved in this dump; the recoverable text shows the XOR-style sentiment figure with the tokens "good", "bad", "not", "not good", "not bad" and the values 0, 1, 1, 1] -------------------------------------------------------------------------------- /latex/lecture06/.gitignore: -------------------------------------------------------------------------------- 1 | dl4nlp2023-lecture06.pdf 2 | flags.tex 3 | pdf 4 | -------------------------------------------------------------------------------- /latex/lecture06/compile-pdf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Inspired by: https://tex.stackexchange.com/a/1501 4 | 5 | # current lecture file name 6 | lecture_filename="dl4nlp2023-lecture06" 7 | 8 | # create the output folder (might already exist) 9 | mkdir -p pdf 10 | 11 | # Compile the lecture version with pauses 12 | 13 | # set empty flags 14 | echo "" > flags.tex 15 | 16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 17 | biber pdf/${lecture_filename}-lecture.bcf 18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 20 | 21 | # Compile the handout (no slide unfolding) 22 | 23 | # set the flag 24 | echo "\handouttrue" > flags.tex 25 | 26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 27 | biber pdf/${lecture_filename}-handout.bcf 28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 30 | 31 | # Cleaning up temporary latex files 32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache 33 | -------------------------------------------------------------------------------- /latex/lecture06/img/cc-by-sa-icon.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture06/img/cc-by-sa-icon.pdf -------------------------------------------------------------------------------- /latex/lecture06/img/logo-trusthlt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture06/img/logo-trusthlt.pdf -------------------------------------------------------------------------------- /latex/lecture06/img/rewe.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture06/img/rewe.png -------------------------------------------------------------------------------- /latex/lecture07/.gitignore: -------------------------------------------------------------------------------- 1 | dl4nlp2023-lecture07.pdf 2 | flags.tex 3 | pdf 4 | -------------------------------------------------------------------------------- /latex/lecture07/compile-pdf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Inspired by: https://tex.stackexchange.com/a/1501 4 | 5 | # current lecture file name 6 | lecture_filename="dl4nlp2023-lecture07" 7 | 8 | # create the output folder (might already exist) 9 | mkdir -p pdf 10 | 11 | # Compile the lecture version with pauses 12 | 13 | # set empty flags 14 | echo "" > flags.tex 15 | 16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 17 | biber pdf/${lecture_filename}-lecture.bcf 18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 20 | 21 | # Compile the handout (no slide unfolding) 22 | 23 | # set the flag 24 | echo "\handouttrue" > flags.tex 25 | 26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 27 | biber pdf/${lecture_filename}-handout.bcf 28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 30 | 31 | # Cleaning up temporary latex files 32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache 33 | -------------------------------------------------------------------------------- /latex/lecture07/img/cc-by-sa-icon.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture07/img/cc-by-sa-icon.pdf -------------------------------------------------------------------------------- /latex/lecture07/img/logo-trusthlt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture07/img/logo-trusthlt.pdf -------------------------------------------------------------------------------- /latex/lecture08/.gitignore: -------------------------------------------------------------------------------- 1 | dl4nlp2023-lecture08.pdf 2 | flags.tex 3 | pdf 4 | -------------------------------------------------------------------------------- /latex/lecture08/compile-pdf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Inspired by: https://tex.stackexchange.com/a/1501 4 | 5 | # current lecture file name 6 | lecture_filename="dl4nlp2023-lecture08" 7 | 8 | # create the output folder (might already exist) 9 | mkdir -p pdf 10 | 11 | # Compile the lecture version with pauses 12 | 13 | # set empty flags 14 | echo "" > flags.tex 15 | 16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf 
${lecture_filename}.tex 17 | biber pdf/${lecture_filename}-lecture.bcf 18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 20 | 21 | # Compile the handout (no slide unfolding) 22 | 23 | # set the flag 24 | echo "\handouttrue" > flags.tex 25 | 26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 27 | biber pdf/${lecture_filename}-handout.bcf 28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 30 | 31 | # Cleaning up temporary latex files 32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache 33 | -------------------------------------------------------------------------------- /latex/lecture08/img/cc-by-sa-icon.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/cc-by-sa-icon.pdf -------------------------------------------------------------------------------- /latex/lecture08/img/logo-trusthlt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/logo-trusthlt.pdf -------------------------------------------------------------------------------- /latex/lecture08/img/seq2seq.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/seq2seq.pdf -------------------------------------------------------------------------------- /latex/lecture08/img/seq2seq_attention_motivation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/seq2seq_attention_motivation.pdf -------------------------------------------------------------------------------- /latex/lecture08/img/seq2seq_attention_t1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/seq2seq_attention_t1.pdf -------------------------------------------------------------------------------- /latex/lecture08/img/seq2seq_attn_encdec.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/seq2seq_attn_encdec.pdf -------------------------------------------------------------------------------- /latex/lecture08/img/seq2seq_selfattn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/seq2seq_selfattn.pdf 
-------------------------------------------------------------------------------- /latex/lecture08/img/sequence_classification.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/sequence_classification.pdf -------------------------------------------------------------------------------- /latex/lecture08/img/sequence_labeling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/sequence_labeling.pdf -------------------------------------------------------------------------------- /latex/lecture08/img/sequence_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/sequence_length.png -------------------------------------------------------------------------------- /latex/lecture08/img/sequence_to_sequence.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/sequence_to_sequence.pdf -------------------------------------------------------------------------------- /latex/lecture08/img/sequence_to_sequence_anno.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/sequence_to_sequence_anno.pdf -------------------------------------------------------------------------------- /latex/lecture08/img/sequence_to_sequence_boxed.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/sequence_to_sequence_boxed.pdf -------------------------------------------------------------------------------- /latex/lecture08/img/translation_heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/translation_heatmap.png -------------------------------------------------------------------------------- /latex/lecture08/img/ukp_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/ukp_logo.png -------------------------------------------------------------------------------- /latex/lecture08/img/variable_input_output.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture08/img/variable_input_output.pdf -------------------------------------------------------------------------------- /latex/lecture09/.gitignore: -------------------------------------------------------------------------------- 1 | 
dl4nlp2023-lecture08.pdf 2 | flags.tex 3 | pdf 4 | -------------------------------------------------------------------------------- /latex/lecture09/compile-pdf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Inspired by: https://tex.stackexchange.com/a/1501 4 | 5 | # current lecture file name 6 | lecture_filename="dl4nlp2023-lecture09" 7 | 8 | # create the output folder (might already exist) 9 | mkdir -p pdf 10 | 11 | # Compile the lecture version with pauses 12 | 13 | # set empty flags 14 | echo "" > flags.tex 15 | 16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 17 | biber pdf/${lecture_filename}-lecture.bcf 18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 20 | 21 | # Compile the handout (no slide unfolding) 22 | 23 | # set the flag 24 | echo "\handouttrue" > flags.tex 25 | 26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 27 | biber pdf/${lecture_filename}-handout.bcf 28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 30 | 31 | # Cleaning up temporary latex files 32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache 33 | -------------------------------------------------------------------------------- /latex/lecture09/dl4nlp2023-lecture09.tex: -------------------------------------------------------------------------------- 1 | % !TeX program = lualatex 2 | % !BIB program = biber 3 | % Lualatex is important to render Fira fonts; with pdflatex it's just the regular one 4 | % ratio 16:9 -- https://tex.stackexchange.com/questions/14336/ 5 | 6 | % compile two versions, inspired by https://tex.stackexchange.com/a/1501 7 | % use the script "compile-pdf.sh" 8 | \newif\ifhandout 9 | % if flags.tex does not exist, create an empty file to be able to compile in TeXstudio 10 | \input{flags} 11 | 12 | \ifhandout 13 | \documentclass[12pt,aspectratio=169,handout]{beamer} 14 | \else 15 | \documentclass[12pt,aspectratio=169]{beamer} 16 | \fi 17 | 18 | % adjust for 16:9 19 | % https://tex.stackexchange.com/questions/354022/modifying-the-margins-of-all-slides-in-beamer 20 | \setbeamersize{text margin left=0.3cm,text margin right=1.0cm} 21 | 22 | %\usepackage{xcolor} 23 | 24 | %%% better TOC 25 | \usetheme[subsectionpage=progressbar]{metropolis} 26 | 27 | % name in footer 28 | \setbeamertemplate{frame numbering}{\insertframenumber ~ | Dr.\ Martin Tutek} 29 | 30 | % blocks with background globally 31 | \metroset{block=fill} 32 | 33 | % adjust the background to be completely white 34 | \setbeamercolor{background canvas}{bg=white} 35 | 36 | % typeset mathematics on serif 37 | \usefonttheme[onlymath]{serif} 38 | 39 | % better bibliography using biber as backend 40 | \usepackage[natbib=true,backend=biber,style=authoryear-icomp,maxbibnames=30,maxcitenames=2,uniquelist=false,giveninits=true,doi=false,url=false,dashed=false,isbn=false]{biblatex} 41 | % shared bibliography 42 | \addbibresource{../dl4nlp-bibliography.bib} 43 | % disable "ibid" for repeated citations 44 | \boolfalse{citetracker} 45 | 46 | \definecolor{76abdf}{RGB}{118, 
171, 223} 47 | 48 | \setbeamercolor{frametitle}{bg=76abdf, fg=white} 49 | 50 | \newcounter{saveenumi} 51 | \newcommand{\seti}{\setcounter{saveenumi}{\value{enumi}}} 52 | \newcommand{\conti}{\setcounter{enumi}{\value{saveenumi}}} 53 | 54 | \resetcounteronoverlays{saveenumi} 55 | 56 | \usepackage{xspace} 57 | % Emojis 58 | \usepackage{emoji} 59 | % Figs 60 | \usepackage{graphicx} 61 | \graphicspath{ {./img/} } 62 | 63 | 64 | % for derivatives, https://tex.stackexchange.com/a/412442 65 | \usepackage{physics} 66 | 67 | \usepackage{tikz} 68 | \usetikzlibrary{matrix, positioning} 69 | \usetikzlibrary{angles,quotes} % for angles 70 | \usetikzlibrary{backgrounds} % background 71 | \usetikzlibrary{decorations.pathreplacing} % curly braces 72 | \usetikzlibrary{calligraphy} 73 | \usetikzlibrary{calc} % for neural nets 74 | 75 | % for plotting functions 76 | \usepackage{pgfplots} 77 | \usepgfplotslibrary{dateplot} 78 | 79 | % sub-figures 80 | \usepackage{caption} 81 | \usepackage{subcaption} 82 | 83 | % Checkmark, xmark 84 | \usepackage{pifont}% http://ctan.org/pkg/pifont 85 | 86 | % book tabs 87 | \usepackage{booktabs} 88 | 89 | % caption* 90 | \usepackage{caption} 91 | 92 | 93 | % show TOC at every section start 94 | \AtBeginSection{ 95 | \frame{ 96 | \vspace{2em} 97 | \sectionpage 98 | \hspace*{2.2em}\begin{minipage}{10cm} 99 | \tableofcontents[currentsection] 100 | \end{minipage} 101 | } 102 | } 103 | 104 | % argmin, argmax 105 | \usepackage{amssymb}% http://ctan.org/pkg/amssymb 106 | \usepackage{amsmath} 107 | 108 | \DeclareMathOperator*{\argmax}{arg\!\max} 109 | \DeclareMathOperator*{\argmin}{arg\!\min} 110 | % softmax 111 | \DeclareMathOperator*{\softmax}{soft\!\max} 112 | % RNN 113 | \DeclareMathOperator*{\rnn}{RNN} 114 | % RNN star 115 | \DeclareMathOperator*{\rnnstar}{RNN^{*}} 116 | % bi-RNN 117 | \DeclareMathOperator*{\birnn}{biRNN} 118 | 119 | % bold math 120 | \usepackage{bm} 121 | 122 | % for \mathclap 123 | \usepackage{mathtools} 124 | 125 | % algorithms 126 | \usepackage[noend]{algpseudocode} 127 | 128 | 129 | % for neurons and layers in tikz 130 | \tikzset{ 131 | neuron/.style={draw, rectangle, inner sep=2pt, minimum width=0.75cm, fill=blue!20}, 132 | param/.style={draw, rectangle, inner sep=2pt, minimum width=0.75cm, fill=green!20}, 133 | constant/.style={draw, rectangle, inner sep=2pt, minimum width=0.75cm, fill=black!15}, 134 | state/.style={rectangle, inner sep=2pt, minimum width=0.75cm, fill=black!5}, 135 | } 136 | 137 | % for strike-through text 138 | \usepackage[normalem]{ulem} 139 | 140 | 141 | \title{Deep Learning for Natural Language Processing} 142 | \subtitle{Lecture 9 -- Text generation 3: Transformers} 143 | \date{June 13, 2023} 144 | \author{Dr.\ Martin Tutek} 145 | \institute{Ubiquitous Knowledge Processing \hfill \includegraphics[height=1.cm]{img/ukp_logo.png} \\ 146 | Department of Computer Science\\ 147 | Technical University of Darmstadt \hfill \href{https://www.informatik.tu-darmstadt.de/ukp/ukp_home/index.en.jsp}{\underline{UKP Web}}} 148 | %\titlegraphic{\hfill } 149 | 150 | \begin{document} 151 | 152 | \maketitle 153 | 154 | \begin{frame}{Recap} 155 | In the previous lecture we: 156 | \begin{itemize} 157 | \item Introduced the encoder-decoder architecture \& why we need it 158 | \item Defined the three broad classes of NLP problems 159 | \item Shown that RNNs have problems when modeling long dependencies 160 | \item Introduced the attention mechanism, its abstraction and design choices 161 | \end{itemize} 162 | \end{frame} 163 | 164 | 165 | \begin{frame}{Recap: 
Encoder-decoder with attention} 166 | \begin{center} 167 | \begin{figure}[h] 168 | \includegraphics[height=7cm]{seq2seq_attn_encdec.pdf} 169 | \end{figure} 170 | \end{center} 171 | \end{frame} 172 | 173 | 174 | \begin{frame}{Motivation} 175 | 176 | MLP -- fixed input sequence length 177 | 178 | RNN -- works well with \textbf{shorter} sequences 179 | 180 | RNN + attention -- works well with both \textbf{shorter and longer} sequences 181 | 182 | \pause 183 | 184 | \begin{itemize} 185 | \item Why not use \textbf{only} attention? 186 | \end{itemize} 187 | 188 | \pause 189 | 190 | \begin{center} 191 | \begin{figure}[h] 192 | \includegraphics[height=2cm]{aiayn} 193 | \end{figure} 194 | \end{center} 195 | 196 | \end{frame} 197 | 198 | \begin{frame}{Prerequisites for attention-only networks} 199 | What do we \textbf{gain} from recurrent networks? 200 | \pause 201 | 202 | \begin{itemize} 203 | \item \textbf{Memory cells}: contain summaries of sequence read \textit{so far} 204 | \pause 205 | \begin{itemize} 206 | \item \textbf{However}, they have \textbf{limited} capacity -- we complement them with attention 207 | \end{itemize} 208 | \pause 209 | \item \textbf{Position} of a word in sequence 210 | \pause 211 | \begin{itemize} 212 | \item For each hidden state $s_{i}$, the current word embedding $x_i$ is added to the previous state $s_{i-1}$ -- the network can distinguish \textbf{word order} 213 | \pause 214 | \item \textbf{However}, it takes $n$ recurrence operations to process a sequence 215 | \end{itemize} 216 | 217 | \end{itemize} 218 | \pause 219 | 220 | Do recurrent networks have any other \textbf{drawbacks}? 221 | 222 | \pause 223 | 224 | \begin{itemize} 225 | \item They \textbf{scale poorly} -- LSTMs are problematic to scale deeper than 4-8 layers 226 | \item \textbf{Closed vocabulary} -- so far, we assumed one word = one vector (no BPE) 227 | \end{itemize} 228 | \pause 229 | 230 | How to make attention-only networks work? 231 | 232 | \end{frame} 233 | 234 | \section{The Transformer} 235 | 236 | \begin{frame}{The Transformer (\cite{Vaswani.et.al.2017})} 237 | \begin{columns}[T] % align columns 238 | 239 | \begin{column}{.48\textwidth} 240 | 241 | \begin{figure}[h] 242 | \includegraphics[height=7cm]{anno_transformer} 243 | \end{figure} 244 | \end{column} 245 | 246 | \begin{column}{.48\textwidth} 247 | What are the unknown elements? 248 | \pause 249 | \begin{itemize} 250 | \item \textbf{Multi-head} attention 251 | \item Add \& Norm 252 | \item \textbf{Positional} embeddings 253 | \pause 254 | \item \textbf{Open vocabulary} through BPE 255 | \end{itemize} 256 | \end{column} 257 | 258 | \end{columns} 259 | 260 | \end{frame} 261 | 262 | \subsection{Contextualized representations} 263 | 264 | \begin{frame}{Contextualized representations} 265 | 266 | Recall: \textbf{limitations} of word embeddings 267 | \begin{block}{Polysemy, context independent representation} 268 | Some words have obvious multiple senses 269 | 270 | A \emph{bank} may refer to a financial institution or to the side of a river, a \emph{star} may an abstract shape, a celebrity, an astronomical entity 271 | \end{block} 272 | 273 | \pause 274 | 275 | How do recurrent networks handle contextualization? 
276 | 277 | \pause 278 | $$ 279 | s_i = f_{\text{rnn}} (s_{i-1}, x_i) 280 | $$ 281 | 282 | \begin{itemize} 283 | \item Each state acts as a representation of the sequence \textbf{so far} 284 | \pause 285 | \end{itemize} 286 | 287 | \end{frame} 288 | 289 | \begin{frame}{Contextualized representations} 290 | $$ 291 | s_i = f_{\text{rnn}} (s_{i-1}, x_i) 292 | $$ 293 | 294 | \begin{itemize} 295 | \item Each state acts as a representation of the sequence \textbf{so far} 296 | \pause 297 | \begin{itemize} 298 | \item Recall: \textbf{bidirectional} RNNs (left- and right-hand context) 299 | \item A state contains \textbf{cues} about the meaning of the current word \textbf{in context} 300 | \pause 301 | \end{itemize} 302 | \vspace{1em} 303 | \item \textbf{However}, the state has to act as both 304 | \begin{enumerate} 305 | \item A summary of the entire sequence 306 | \item The meaning of the current word in context 307 | \end{enumerate} 308 | \end{itemize} 309 | 310 | \end{frame} 311 | 312 | \begin{frame}{Contextualized representations} 313 | 314 | \begin{columns}[T] % align columns 315 | 316 | \begin{column}{.48\textwidth} 317 | 318 | \begin{figure}[h] 319 | \includegraphics[height=7cm]{seq2seq_attention_t1.pdf} 320 | \end{figure} 321 | \end{column} 322 | 323 | \begin{column}{.48\textwidth} 324 | Step $1$ of encoder-decoder attention: 325 | \pause 326 | \begin{itemize} 327 | \item We obtain relevant information \textbf{for current state} from input sequence 328 | \pause 329 | \item This result of the attention operator should also contain \textbf{contextual cues} 330 | \end{itemize} 331 | \end{column} 332 | 333 | \end{columns} 334 | 335 | \end{frame} 336 | 337 | 338 | \begin{frame}{Contextualized representations} 339 | 340 | \begin{columns}[T] % align columns 341 | 342 | \begin{column}{.48\textwidth} 343 | 344 | \begin{figure}[h] 345 | \includegraphics[height=7cm]{seq2seq_attention_t2.pdf} 346 | \end{figure} 347 | \end{column} 348 | \pause 349 | \begin{column}{.48\textwidth} 350 | \begin{figure}[h] 351 | \includegraphics[height=7cm]{seq2seq_attention_t3.pdf} 352 | \end{figure} 353 | \end{column} 354 | 355 | \end{columns} 356 | 357 | \end{frame} 358 | 359 | \begin{frame}{Contextualized representations} 360 | Why not \textbf{cut out the middleman} (RNN)? 361 | \pause 362 | \begin{itemize} 363 | \item We use the RNN state as the \textbf{query} for attention 364 | \pause 365 | \item We could instead use the input \textbf{word representation} 366 | \end{itemize} 367 | \pause 368 | 369 | Recall: scaled dot-product attention 370 | 371 | \noindent\begin{minipage}{0.4\textwidth} 372 | $$ 373 | a = \sum_i^n \alpha_i v_i 374 | $$ 375 | \end{minipage}% 376 | \begin{minipage}{0.2\textwidth} 377 | \end{minipage} 378 | \begin{minipage}{0.4\textwidth} 379 | $$ 380 | \hat{\alpha}_i = \frac{q^T \cdot k_i}{\sqrt{d_{\text{k} } } } 381 | $$ 382 | \end{minipage} 383 | \pause 384 | 385 | Recall: what are the query, keys \& values (in encoder-decoder attention)? 386 | 387 | \noindent\begin{minipage}{0.29\textwidth} 388 | \vspace{1em} 389 | $$ 390 | q = f_q(s^{\text{dec}}_t) 391 | $$ 392 | \end{minipage}% 393 | \begin{minipage}{0.29\textwidth} 394 | $$ 395 | K = f_k(\{s^{\text{enc}}_i\}_{i=1}^n) 396 | $$ 397 | \end{minipage} 398 | \begin{minipage}{0.29\textwidth} 399 | $$ 400 | V = f_v(\{s^{\text{enc}}_i\}_{i=1}^n) 401 | $$ 402 | \end{minipage} 403 | 404 | \pause 405 | Where $f_q, f_k, f_v$ are arbitrary functions (neural network layers). 
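% A minimal NumPy sketch of the scaled dot-product attention recalled above, with the
% word representations themselves supplying queries, keys and values through plain
% linear maps (the "cut out the middleman" idea). Toy sizes, the random seed and the
% names n, d_m, W_q, W_k, W_v are illustrative assumptions, not the lecture's own code.
%
% import numpy as np
%
% rng = np.random.default_rng(0)
% n, d_m = 5, 8                        # toy sequence length and model dimension
% X = rng.normal(size=(n, d_m))        # token representations {x_i}
%
% W_q = rng.normal(size=(d_m, d_m))    # stands in for f_q (untrained)
% W_k = rng.normal(size=(d_m, d_m))    # stands in for f_k
% W_v = rng.normal(size=(d_m, d_m))    # stands in for f_v
% Q, K, V = X @ W_q, X @ W_k, X @ W_v  # queries, keys, values, each (n, d_m)
%
% energy = Q @ K.T / np.sqrt(d_m)      # pairwise scaled dot products, shape (n, n)
% # (a mask could set selected entries of `energy` to -inf before the softmax)
% alpha = np.exp(energy - energy.max(axis=-1, keepdims=True))
% alpha /= alpha.sum(axis=-1, keepdims=True)   # softmax over keys, each row sums to 1
% A = alpha @ V                        # weighted sums of values, shape (n, d_m)
%
% Each row of A is one token represented "in context", which is what the Transformer
% block on the following slides computes at every layer.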
406 | 407 | \end{frame} 408 | 409 | \subsection{The Transformer attention block} 410 | 411 | \begin{frame}{The Transformer attention block} 412 | 413 | \begin{columns}[T] 414 | \begin{column}{.48\textwidth} 415 | 416 | \begin{figure}[h] 417 | \includegraphics[height=7cm]{anno_transformer} 418 | \end{figure} 419 | \end{column} 420 | \begin{column}{.48\textwidth} 421 | \textbf{Encoder} part of the Transformer block 422 | 423 | \begin{itemize} 424 | \item Inputs: $\{\bm{x}^l_i\}_{i=1}^n; \quad \bm{x}_i \in \mathbb{R}^{d_m}$ 425 | \item $x^0_i \to \text{word embeddings}$ 426 | \pause 427 | \end{itemize} 428 | 429 | Goal: \textbf{contextualize} word embeds. 430 | 431 | \begin{enumerate} 432 | \pause 433 | \item Transform \textbf{each} embedding to its query, key and value reprs. 434 | \pause 435 | \item Apply \textbf{pairwise} attention between all inputs 436 | \pause 437 | \item Use the outputs as word embeddings for \textbf{next layer} 438 | \end{enumerate} 439 | \end{column} 440 | \end{columns} 441 | \end{frame} 442 | 443 | \begin{frame}{The Transformer attention block} 444 | 445 | \begin{enumerate} 446 | \item Each layer $l$ has its own query, key and value linear transformation 447 | $$ 448 | \bm{W}^l_q, \bm{W}^l_k, \bm{W}^l_v \in \mathbb{R}^{d_m \times d_m} 449 | $$ 450 | \pause 451 | \item Transform the inputs of the current layer $\{\bm{x}^l_i\}$ into the keys, queries and values 452 | $$ 453 | \bm{Q} = \bm{W}_q (\{\bm{x}^l_i\}) \quad \bm{K} = \bm{W}_k (\{\bm{x}^l_i\}) \quad \bm{V} = \bm{W}_v (\{\bm{x}^l_i\}) 454 | $$ 455 | \pause 456 | \item Apply scaled dot-product attention 457 | $$ 458 | \text{Attention} (\bm{Q},\bm{K},\bm{V}) = \text{softmax} \left( \frac{\bm{Q}\bm{K}^T}{\sqrt{d_m}}\right) \bm{V} 459 | $$ 460 | 461 | \end{enumerate} 462 | 463 | 464 | \end{frame} 465 | 466 | 467 | \begin{frame}{The Transformer attention block: scaled dot-product} 468 | \begin{columns}[T] % align columns 469 | 470 | \begin{column}{.48\textwidth} 471 | 472 | \begin{figure}[h] 473 | \includegraphics[height=5cm]{anno_transformer_attn_block} 474 | \caption*{Figure from \cite{Vaswani.et.al.2017}} 475 | \end{figure} 476 | \end{column} 477 | 478 | \begin{column}{.48\textwidth} 479 | $$ 480 | \text{Attention} (\bm{Q},\bm{K},\bm{V}) = \text{softmax} \left( \frac{\bm{Q}\bm{K}^T}{\sqrt{d_m}} \right) \bm{V} 481 | $$ 482 | \pause 483 | \begin{itemize} 484 | \item Matmul between $\bm{Q}$ and $\bm{K} \to$ \textbf{energy} 485 | \pause 486 | \item Masking (why?) 487 | \begin{itemize} 488 | \item We might not want to attend to \textbf{all} tokens 489 | \end{itemize} 490 | \pause 491 | \item Output $=$ weighted sum 492 | \end{itemize} 493 | \end{column} 494 | 495 | \end{columns} 496 | 497 | \end{frame} 498 | 499 | 500 | \begin{frame}{The Transformer attention block: multi-head attention} 501 | 502 | \begin{columns}[T] 503 | \begin{column}{.48\textwidth} 504 | 505 | \begin{figure}[h] 506 | \includegraphics[height=7cm]{anno_transformer} 507 | \end{figure} 508 | \end{column} 509 | \begin{column}{.48\textwidth} 510 | However: we are using \textbf{multi-head} attention! 
511 | \vspace{1em} 512 | \pause 513 | 514 | Idea: there could be \textbf{multiple aspects} in which two tokens can be similar 515 | \pause 516 | \begin{itemize} 517 | \item Intuition: \textit{each} hidden dimension $\approx$ one linguistic feature 518 | \item $\to$ perform \textbf{multiple} energy computations 519 | \end{itemize} 520 | \end{column} 521 | \end{columns} 522 | \end{frame} 523 | 524 | 525 | \begin{frame}{The Transformer attention block: multi-head attention} 526 | 527 | \textbf{Recall:} Transform the inputs of the current layer $\{\bm{x}^l_i\}$ into the keys, queries and values 528 | $$ 529 | \bm{Q} = \bm{W}_q (\{\bm{x}^l_i\}) \quad \bm{K} = \bm{W}_k (\{\bm{x}^l_i\}) \quad \bm{V} = \bm{W}_v (\{\bm{x}^l_i\}) 530 | $$ 531 | 532 | \pause 533 | Each matrix $\bm{Q}, \bm{K}, \bm{V} \in \mathbb{R}^{n \times d_m}$, where $d_m$ is the \textit{model dimension}. 534 | 535 | \pause 536 | \textbf{Split} each query/key/value into $h$ \textbf{heads} (aspects) by \textit{reshaping}. 537 | 538 | $$ 539 | \bm{Q}, \bm{K}, \bm{V} \in \mathbb{R}^{n \times d_m} \to \bm{Q}, \bm{K}, \bm{V} \in \mathbb{R}^{n \times h \times d_m/h} 540 | $$ 541 | \pause 542 | \begin{itemize} 543 | \item \textbf{Note}: $d_m$ \textbf{has} to be divisible by $h$ 544 | \end{itemize} 545 | \pause 546 | Remaining process continues as usual. 547 | 548 | \end{frame} 549 | 550 | \begin{frame}{The Transformer attention block: multi-head attention} 551 | 552 | \textbf{Recall}: 3. Apply scaled dot-product attention 553 | $$ 554 | \text{Attention} (\bm{Q},\bm{K},\bm{V}) = \text{softmax} \left( \frac{\bm{Q}\bm{K}^T}{\sqrt{d_m}}\right) \bm{V} 555 | $$ 556 | 557 | \pause 558 | 559 | Apply attention $h$ times \textbf{in parallel}, then \textbf{concatenate} the results. 560 | 561 | \pause 562 | 563 | 564 | $$ 565 | \text{Attention}_j (\bm{Q}_j, \bm{K}_j, \bm{V}_j) = \text{softmax} \left( \frac{\bm{Q}_j \bm{K}_j^T}{ \sqrt{d_m / h} } \right) \bm{V}_j 566 | $$ 567 | 568 | \pause 569 | 570 | Where $ \{ \bm{Q}, \bm{K}, \bm{V} \}^h_{j=1} $ are different \textit{heads}. 571 | 572 | \end{frame} 573 | 574 | \begin{frame}{The Transformer attention block: multi-head attention} 575 | 576 | \begin{columns}[T] 577 | \begin{column}{.48\textwidth} 578 | 579 | \begin{figure}[h] 580 | \includegraphics[height=7cm]{anno_trf_multihead} 581 | \end{figure} 582 | \end{column} 583 | \begin{column}{.48\textwidth} 584 | Although this entire process happens behind the scenes, we will still refer to (multi-head) attention as 585 | 586 | $$ 587 | \text{Attention} (\bm{Q},\bm{K},\bm{V}) = \text{softmax} \left( \frac{\bm{Q}\bm{K}^T}{\sqrt{d_m}}\right) \bm{V} 588 | $$ 589 | 590 | for brevity. 
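% A NumPy sketch of the head splitting described above, assuming toy sizes (n tokens,
% model dimension d_m, h heads, d_m divisible by h); the variable names are illustrative,
% and the full Transformer additionally applies an output projection after concatenating
% the heads, which this sketch leaves out.
%
% import numpy as np
%
% rng = np.random.default_rng(0)
% n, d_m, h = 5, 8, 2
% d_h = d_m // h                                   # per-head dimension d_m / h
% Q, K, V = (rng.normal(size=(n, d_m)) for _ in range(3))
%
% def softmax(x, axis=-1):
%     e = np.exp(x - x.max(axis=axis, keepdims=True))
%     return e / e.sum(axis=axis, keepdims=True)
%
% # reshape (n, d_m) -> (h, n, d_h): each head sees its own slice of the dimensions
% Qh = Q.reshape(n, h, d_h).transpose(1, 0, 2)
% Kh = K.reshape(n, h, d_h).transpose(1, 0, 2)
% Vh = V.reshape(n, h, d_h).transpose(1, 0, 2)
%
% # h scaled dot-product attentions computed in parallel along the head axis
% alpha = softmax(Qh @ Kh.transpose(0, 2, 1) / np.sqrt(d_h))   # (h, n, n)
% heads = alpha @ Vh                                           # (h, n, d_h)
%
% out = heads.transpose(1, 0, 2).reshape(n, d_m)   # concatenate heads back to (n, d_m)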
591 | \end{column} 592 | \end{columns} 593 | 594 | \end{frame} 595 | 596 | \begin{frame}{The Transformer attention block: residual connection} 597 | 598 | \begin{columns}[T] 599 | \begin{column}{.48\textwidth} 600 | 601 | \begin{figure}[h] 602 | \includegraphics[height=7cm]{anno_trf_hlattn.png} 603 | \end{figure} 604 | \end{column} 605 | \begin{column}{.48\textwidth} 606 | We use \textit{residual connections} with the input of the layer 607 | \begin{enumerate} 608 | \item $\hat{x}^l$ is the output of attention 609 | $$ 610 | \hat{x}^l = \text{Attention} (\bm{Q}^l,\bm{K}^l,\bm{V}^l) 611 | $$ 612 | \item We apply the residual connection and normalize 613 | $$ 614 | x^{l*} = \text{LayerNorm} ( x^l + \hat{x}^l ) 615 | $$ 616 | \seti 617 | \end{enumerate} 618 | \end{column} 619 | \end{columns} 620 | 621 | \end{frame} 622 | 623 | \begin{frame}{The Transformer attention block: position-wise linear layer} 624 | 625 | \begin{columns}[T] 626 | \begin{column}{.48\textwidth} 627 | 628 | \begin{figure}[h] 629 | \includegraphics[height=7cm]{anno_trf_hllinear.png} 630 | \end{figure} 631 | \end{column} 632 | \begin{column}{.48\textwidth} 633 | \begin{enumerate} 634 | \conti 635 | \item We apply an extra \textbf{linear transformation} to each individual representation 636 | 637 | $$ 638 | x^{l+1} = \text{LayerNorm} (x^{l*} + f^l_{hh} (x^{l*})) 639 | $$ 640 | 641 | Where $f_{hh}$ is an arbitrary transformation (single hidden layer NN) 642 | 643 | \item We use $x^{l+1}$ as the input to the \textbf{next} layer $l+1$ 644 | \end{enumerate} 645 | \end{column} 646 | \end{columns} 647 | 648 | \end{frame} 649 | 650 | \subsection{Byte-pair encodings} 651 | 652 | \begin{frame}{Byte-pair encodings} 653 | \textbf{Recall}: sub-word embeddings 654 | 655 | \begin{block}{Sub-word embeddings} 656 | Each character $n-$gram has its own embedding. 657 | 658 | Resolves the issues of \textbf{rare words}, \textbf{typos} and doesn't ignore the \textbf{morphology} of each word. 659 | 660 | However -- it scales poorly (there are \textbf{many} character $n-$grams) 661 | \end{block} 662 | 663 | \pause 664 | 665 | \textbf{Byte pair encodings} -- characters ($1$-grams / \textit{bytes}) can represent \textbf{any} word. 666 | 667 | 668 | \end{frame} 669 | 670 | \begin{frame}{Byte-pair encodings} 671 | 672 | \begin{block}{Byte-pair encodings} 673 | Start at \textbf{character} level. 674 | 675 | Merge the two \textbf{most frequently co-occurring} characters into a \textbf{new character}. 676 | 677 | Continue until you reach desired vocabulary size. 678 | \textbf{Each word} will always be represented. 679 | 680 | \end{block} 681 | 682 | \pause 683 | 684 | \textbf{Variants}: WordPiece, SentencePiece, subword-nmt (\href{https://github.com/google/sentencepiece}{\underline{GitHub}}) 685 | 686 | \pause 687 | 688 | The differences are in the \textbf{merging criterion}: 689 | \pause 690 | \begin{itemize} 691 | \item \cite{Sennrich.et.al.2016.ACL} use \textbf{frequency} of co-occurrence; 692 | \item \cite{kudo2018subword} trains a \textbf{unigram language model}. 693 | \end{itemize} 694 | 695 | \end{frame} 696 | 697 | \subsection{Positional embeddings} 698 | 699 | \begin{frame}{Positional embeddings} 700 | The Transformer processes all tokens \textbf{in parallel} -- there is \textbf{no information} about word order which in RNNs originated from recurrence. 701 | 702 | \pause 703 | 704 | \textbf{Idea}: use functions which depend on \textbf{position of token in sequence}. The closer the tokens, the higher the similarity of the functions. 
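% A runnable NumPy sketch of the sine/cosine construction given just below (sine on the
% even dimensions, cosine on the odd ones). The function name, toy sizes and the
% assumption that d_m is even are illustrative choices, not the lecture's code.
%
% import numpy as np
%
% def sinusoidal_pe(max_len, d_m):
%     pos = np.arange(max_len)[:, None]              # positions 0 .. max_len-1
%     i = np.arange(d_m // 2)[None, :]                # dimension pair index
%     angles = pos / np.power(10000.0, 2 * i / d_m)   # pos / 10000^(2i / d_m)
%     pe = np.zeros((max_len, d_m))
%     pe[:, 0::2] = np.sin(angles)                    # even dimensions
%     pe[:, 1::2] = np.cos(angles)                    # odd dimensions
%     return pe
%
% pe = sinusoidal_pe(max_len=50, d_m=8)
% print(pe[10] @ pe[11], pe[10] @ pe[40])  # nearby positions tend to be more similar
% # the positional vector is then added to the token embedding: x_i + pe[i]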
705 | 706 | \pause 707 | 708 | \begin{itemize} 709 | \item Sine and cosine waves 710 | $$ 711 | PE_{(pos, 2i)} = \underbrace{\text{sin} ( \text{pos} / 10000^{2i / d_m})}_{\text{Even dimensions}} 712 | $$ 713 | \pause 714 | $$ 715 | PE_{(pos, 2i+1)} = \underbrace{\text{cos} ( \text{pos} / 10000^{2i / d_m})}_{\text{Odd dimensions}} 716 | $$ 717 | \pause 718 | \item We \textbf{add} the positional embedding vector to the token embedding 719 | \end{itemize} 720 | 721 | \end{frame} 722 | 723 | \begin{frame}{Positional embeddings} 724 | \begin{center} 725 | \begin{figure}[h] 726 | \includegraphics[height=5cm]{positional_embs} 727 | \end{figure} 728 | \end{center} 729 | \end{frame} 730 | 731 | 732 | \begin{frame}{Positional embeddings} 733 | Alternative: \textbf{trained} positional embeddings 734 | 735 | \pause 736 | \begin{itemize} 737 | \item Similar to word embeddings (byte pair embeddings) 738 | \item We randomly initialize a \textbf{position embedding matrix} and train it along with our model 739 | \pause 740 | \begin{itemize} 741 | \item \underline{Issues}? 742 | \pause 743 | \item How \textbf{large} is this position embedding matrix? 744 | \item What if test data contains sequences \textbf{longer} than training data? 745 | \end{itemize} 746 | \end{itemize} 747 | 748 | \end{frame} 749 | 750 | 751 | \section*{Recap} 752 | 753 | % **Content** 754 | % 755 | %* Vanilla RNNs (and maybe vanishing/exploding gradient?) 756 | %* LSTM cells 757 | %* Bi-Directional LSTMs 758 | %* Domain adaptation and multi-task learning (?) 759 | % 760 | %**Notes** 761 | % 762 | %efficiency, bidirectionality, multi-layer RNNs, how to apply to different tasks, how to ensure no data leakage 763 | %connection to LMs (using RNNs)? 764 | % 765 | %* Domain adaptation and multi-task learning (?)
- should be somewhere early too 766 | 767 | \begin{frame}{Takeaways} 768 | 769 | \begin{itemize} 770 | \item Transformer networks are \textbf{fully attentional networks} 771 | \begin{itemize} 772 | \item More efficient than RNNs (process tokens in parallel) 773 | \item Scale better than RNNs (deeper networks) 774 | \end{itemize} 775 | \item Multi-head attention 776 | \begin{itemize} 777 | \item Split each token representation into $h$ parts, perform $h$ attention operations in parallel 778 | \item Increased expressivity 779 | \end{itemize} 780 | 781 | \item They require \textbf{positional embeddings} 782 | \begin{itemize} 783 | \item Parallel processing $=$ no information about word position 784 | \end{itemize} 785 | \item Byte pair encoding allows for \textbf{open vocabulary} 786 | \end{itemize} 787 | 788 | \end{frame} 789 | 790 | 791 | 792 | \begin{frame}{License and credits} 793 | 794 | \begin{columns} 795 | \begin{column}{0.7\textwidth} 796 | Licensed under Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) 797 | \end{column} 798 | \begin{column}{0.2\textwidth} 799 | \includegraphics[width=0.9\linewidth]{img/cc-by-sa-icon.pdf} 800 | \end{column} 801 | \end{columns} 802 | 803 | \bigskip 804 | 805 | Credits 806 | 807 | \begin{scriptsize} 808 | 809 | Martin Tutek 810 | 811 | Content from ACL Anthology papers licensed under CC-BY \url{https://www.aclweb.org/anthology} 812 | 813 | 814 | \end{scriptsize} 815 | 816 | \end{frame} 817 | 818 | 819 | 820 | \end{document} 821 | 822 | -------------------------------------------------------------------------------- /latex/lecture09/img/aiayn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/aiayn.png -------------------------------------------------------------------------------- /latex/lecture09/img/anno_transformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/anno_transformer.png -------------------------------------------------------------------------------- /latex/lecture09/img/anno_transformer_attn_block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/anno_transformer_attn_block.png -------------------------------------------------------------------------------- /latex/lecture09/img/anno_trf_hlattn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/anno_trf_hlattn.png -------------------------------------------------------------------------------- /latex/lecture09/img/anno_trf_hllinear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/anno_trf_hllinear.png -------------------------------------------------------------------------------- /latex/lecture09/img/anno_trf_multihead.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/anno_trf_multihead.png -------------------------------------------------------------------------------- /latex/lecture09/img/cc-by-sa-icon.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/cc-by-sa-icon.pdf -------------------------------------------------------------------------------- /latex/lecture09/img/positional_embs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/positional_embs.png -------------------------------------------------------------------------------- /latex/lecture09/img/seq2seq_attention_t1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/seq2seq_attention_t1.pdf -------------------------------------------------------------------------------- /latex/lecture09/img/seq2seq_attention_t2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/seq2seq_attention_t2.pdf -------------------------------------------------------------------------------- /latex/lecture09/img/seq2seq_attention_t3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/seq2seq_attention_t3.pdf -------------------------------------------------------------------------------- /latex/lecture09/img/seq2seq_attn_encdec.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/seq2seq_attn_encdec.pdf -------------------------------------------------------------------------------- /latex/lecture09/img/ukp_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture09/img/ukp_logo.png -------------------------------------------------------------------------------- /latex/lecture10/.gitignore: -------------------------------------------------------------------------------- 1 | dl4nlp2023-lecture08.pdf 2 | flags.tex 3 | pdf 4 | -------------------------------------------------------------------------------- /latex/lecture10/compile-pdf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Inspired by: https://tex.stackexchange.com/a/1501 4 | 5 | # current lecture file name 6 | lecture_filename="dl4nlp2023-lecture10" 7 | 8 | # create the output folder (might already exist) 9 | mkdir -p pdf 10 | 11 | # Compile the lecture version with pauses 12 | 13 | # set empty flags 14 | echo "" > flags.tex 15 | 16 | lualatex --jobname=${lecture_filename}-lecture 
--output-directory=pdf ${lecture_filename}.tex 17 | biber pdf/${lecture_filename}-lecture.bcf 18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 20 | 21 | # Compile the handout (no slide unfolding) 22 | 23 | # set the flag 24 | echo "\handouttrue" > flags.tex 25 | 26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 27 | biber pdf/${lecture_filename}-handout.bcf 28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 30 | 31 | # Cleaning up temporary latex files 32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache 33 | -------------------------------------------------------------------------------- /latex/lecture10/img/BERT-language-modeling-masked-lm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/BERT-language-modeling-masked-lm.png -------------------------------------------------------------------------------- /latex/lecture10/img/aiayn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/aiayn.png -------------------------------------------------------------------------------- /latex/lecture10/img/anno_transformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/anno_transformer.png -------------------------------------------------------------------------------- /latex/lecture10/img/anno_transformer_attn_block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/anno_transformer_attn_block.png -------------------------------------------------------------------------------- /latex/lecture10/img/anno_trf_hlattn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/anno_trf_hlattn.png -------------------------------------------------------------------------------- /latex/lecture10/img/anno_trf_hllinear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/anno_trf_hllinear.png -------------------------------------------------------------------------------- /latex/lecture10/img/anno_trf_multihead.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/anno_trf_multihead.png -------------------------------------------------------------------------------- /latex/lecture10/img/bart-pretraining-tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bart-pretraining-tasks.png -------------------------------------------------------------------------------- /latex/lecture10/img/bert-decoder-head-hl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-decoder-head-hl.png -------------------------------------------------------------------------------- /latex/lecture10/img/bert-google.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-google.png -------------------------------------------------------------------------------- /latex/lecture10/img/bert-next-sentence-prediction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-next-sentence-prediction.png -------------------------------------------------------------------------------- /latex/lecture10/img/bert-pair-classification.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-pair-classification.png -------------------------------------------------------------------------------- /latex/lecture10/img/bert-paper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-paper.png -------------------------------------------------------------------------------- /latex/lecture10/img/bert-results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-results.png -------------------------------------------------------------------------------- /latex/lecture10/img/bert-seq-labeling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-seq-labeling.png -------------------------------------------------------------------------------- /latex/lecture10/img/bert-single-sentence-clf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-single-sentence-clf.png 
-------------------------------------------------------------------------------- /latex/lecture10/img/bert-spanex-qa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-spanex-qa.png -------------------------------------------------------------------------------- /latex/lecture10/img/bert-viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert-viz.png -------------------------------------------------------------------------------- /latex/lecture10/img/bert_dual_seq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert_dual_seq.png -------------------------------------------------------------------------------- /latex/lecture10/img/bert_modeling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert_modeling.pdf -------------------------------------------------------------------------------- /latex/lecture10/img/bert_nsp_anno.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/bert_nsp_anno.png -------------------------------------------------------------------------------- /latex/lecture10/img/cc-by-sa-icon.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/cc-by-sa-icon.pdf -------------------------------------------------------------------------------- /latex/lecture10/img/gifs/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "bc9c5062", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "ename": "SyntaxError", 11 | "evalue": "unmatched ')' (2784638048.py, line 8)", 12 | "output_type": "error", 13 | "traceback": [ 14 | "\u001b[0;36m Cell \u001b[0;32mIn [1], line 8\u001b[0;36m\u001b[0m\n\u001b[0;31m os.rename(os.path.join(path, file), os.path.join(path, file[:9]+\".png\")))\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m unmatched ')'\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import os\n", 20 | "path = 'transformer-decoding-frames'\n", 21 | "files = os.listdir(path)\n", 22 | "\n", 23 | "\n", 24 | "for index, file in enumerate(files):\n", 25 | " \n", 26 | " os.rename(os.path.join(path, file), os.path.join(path, file[:9]+\".png\"))" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "5279e0ac", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [] 36 | } 37 | ], 38 | "metadata": { 39 | "kernelspec": { 40 | "display_name": "Python 3 (ipykernel)", 41 | "language": "python", 42 | "name": "python3" 43 | }, 44 | 
"language_info": { 45 | "codemirror_mode": { 46 | "name": "ipython", 47 | "version": 3 48 | }, 49 | "file_extension": ".py", 50 | "mimetype": "text/x-python", 51 | "name": "python", 52 | "nbconvert_exporter": "python", 53 | "pygments_lexer": "ipython3", 54 | "version": "3.9.12" 55 | } 56 | }, 57 | "nbformat": 4, 58 | "nbformat_minor": 5 59 | } 60 | -------------------------------------------------------------------------------- /latex/lecture10/img/gifs/transformer_decoding_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/gifs/transformer_decoding_1.gif -------------------------------------------------------------------------------- /latex/lecture10/img/gifs/transformer_decoding_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/gifs/transformer_decoding_2.gif -------------------------------------------------------------------------------- /latex/lecture10/img/positional_embs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/positional_embs.png -------------------------------------------------------------------------------- /latex/lecture10/img/pretrained-lm-variants.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/pretrained-lm-variants.png -------------------------------------------------------------------------------- /latex/lecture10/img/seq2seq_attention_t1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/seq2seq_attention_t1.pdf -------------------------------------------------------------------------------- /latex/lecture10/img/seq2seq_attention_t2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/seq2seq_attention_t2.pdf -------------------------------------------------------------------------------- /latex/lecture10/img/seq2seq_attention_t3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/seq2seq_attention_t3.pdf -------------------------------------------------------------------------------- /latex/lecture10/img/seq2seq_attn_encdec.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/seq2seq_attn_encdec.pdf -------------------------------------------------------------------------------- /latex/lecture10/img/t5-objectives.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/t5-objectives.png -------------------------------------------------------------------------------- /latex/lecture10/img/the_transformer_mt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/the_transformer_mt.png -------------------------------------------------------------------------------- /latex/lecture10/img/transformer_blocks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/transformer_blocks.png -------------------------------------------------------------------------------- /latex/lecture10/img/transformer_encoder_decoder_stack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/transformer_encoder_decoder_stack.png -------------------------------------------------------------------------------- /latex/lecture10/img/transformer_encoder_decoder_stack_full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/transformer_encoder_decoder_stack_full.png -------------------------------------------------------------------------------- /latex/lecture10/img/transformer_encoders_decoders.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/transformer_encoders_decoders.png -------------------------------------------------------------------------------- /latex/lecture10/img/transformer_residual_layer_norm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/transformer_residual_layer_norm.png -------------------------------------------------------------------------------- /latex/lecture10/img/ukp_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/ukp_logo.png -------------------------------------------------------------------------------- /latex/lecture10/img/word2vec_cbow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture10/img/word2vec_cbow.pdf -------------------------------------------------------------------------------- /latex/lecture11/.gitignore: -------------------------------------------------------------------------------- 1 | dl4nlp2023-lecture08.pdf 2 | flags.tex 3 | pdf 4 | -------------------------------------------------------------------------------- /latex/lecture11/compile-pdf.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Inspired by: https://tex.stackexchange.com/a/1501 4 | 5 | # current lecture file name 6 | lecture_filename="dl4nlp2023-lecture11" 7 | 8 | # create the output folder (might already exist) 9 | mkdir -p pdf 10 | 11 | # Compile the lecture version with pauses 12 | 13 | # set empty flags 14 | echo "" > flags.tex 15 | 16 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 17 | biber pdf/${lecture_filename}-lecture.bcf 18 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 19 | lualatex --jobname=${lecture_filename}-lecture --output-directory=pdf ${lecture_filename}.tex 20 | 21 | # Compile the handout (no slide unfolding) 22 | 23 | # set the flag 24 | echo "\handouttrue" > flags.tex 25 | 26 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 27 | biber pdf/${lecture_filename}-handout.bcf 28 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 29 | lualatex --jobname=${lecture_filename}-handout --output-directory=pdf ${lecture_filename}.tex 30 | 31 | # Cleaning up temporary latex files 32 | rm -f pdf/*.dvi pdf/*.idx pdf/*.aux pdf/*.toc pdf/*.log pdf/*.bbl pdf/*.blg pdf/*.lof pdf/*.out pdf/*.bcf pdf/*.synctex.gz pdf/*.run.xml pdf/*.nav pdf/*.snm pdf/*.vrb pdf/*.rubbercache 33 | -------------------------------------------------------------------------------- /latex/lecture11/dl4nlp2023-lecture11.tex: -------------------------------------------------------------------------------- 1 | % !TeX program = lualatex 2 | % !BIB program = biber 3 | % Lualatex is important to render Fira fonts; with pdflatex it's just the regular one 4 | % ratio 16:9 -- https://tex.stackexchange.com/questions/14336/ 5 | 6 | % compile two versions, inspired by https://tex.stackexchange.com/a/1501 7 | % use the script "compile-pdf.sh" 8 | \newif\ifhandout 9 | % if flags.tex does not exist, create an empty file to be able to compile in TeXstudio 10 | \input{flags} 11 | 12 | \ifhandout 13 | \documentclass[12pt,aspectratio=169,handout]{beamer} 14 | \else 15 | \documentclass[12pt,aspectratio=169]{beamer} 16 | \fi 17 | 18 | % adjust for 16:9 19 | % https://tex.stackexchange.com/questions/354022/modifying-the-margins-of-all-slides-in-beamer 20 | \setbeamersize{text margin left=0.3cm,text margin right=1.0cm} 21 | 22 | %\usepackage{xcolor} 23 | 24 | %%% better TOC 25 | \usetheme[subsectionpage=progressbar]{metropolis} 26 | 27 | % name in footer 28 | \setbeamertemplate{frame numbering}{\insertframenumber ~ | Dr.\ Martin Tutek} 29 | 30 | % blocks with background globally 31 | \metroset{block=fill} 32 | 33 | % adjust the background to be completely white 34 | \setbeamercolor{background canvas}{bg=white} 35 | 36 | % typeset mathematics on serif 37 | \usefonttheme[onlymath]{serif} 38 | 39 | % better bibliography using biber as backend 40 | \usepackage[natbib=true,backend=biber,style=authoryear-icomp,maxbibnames=30,maxcitenames=2,uniquelist=false,giveninits=true,doi=false,url=false,dashed=false,isbn=false]{biblatex} 41 | % shared bibliography 42 | \addbibresource{../dl4nlp-bibliography.bib} 43 | % disable "ibid" for repeated citations 44 | \boolfalse{citetracker} 45 | 46 | \definecolor{76abdf}{RGB}{118, 171, 223} 47 | 48 | \setbeamercolor{frametitle}{bg=76abdf, fg=white} 49 | 50 | \newcounter{saveenumi} 51 | \newcommand{\seti}{\setcounter{saveenumi}{\value{enumi}}} 
52 | \newcommand{\conti}{\setcounter{enumi}{\value{saveenumi}}} 53 | 54 | \resetcounteronoverlays{saveenumi} 55 | % \usepackage{movie15} 56 | \usepackage{animate} 57 | 58 | \usepackage{xspace} 59 | % Emojis 60 | \usepackage{emoji} 61 | % Figs 62 | \usepackage{graphicx} 63 | \graphicspath{ {./img/} } 64 | 65 | 66 | % for derivatives, https://tex.stackexchange.com/a/412442 67 | \usepackage{physics} 68 | 69 | \usepackage{tikz} 70 | \usetikzlibrary{matrix, positioning} 71 | \usetikzlibrary{angles,quotes} % for angles 72 | \usetikzlibrary{backgrounds} % background 73 | \usetikzlibrary{decorations.pathreplacing} % curly braces 74 | \usetikzlibrary{calligraphy} 75 | \usetikzlibrary{calc} % for neural nets 76 | 77 | % for plotting functions 78 | \usepackage{pgfplots} 79 | \usepgfplotslibrary{dateplot} 80 | 81 | % sub-figures 82 | \usepackage{caption} 83 | \usepackage{subcaption} 84 | 85 | % Checkmark, xmark 86 | \usepackage{pifont}% http://ctan.org/pkg/pifont 87 | 88 | % book tabs 89 | \usepackage{booktabs} 90 | 91 | % caption* 92 | \usepackage{caption} 93 | 94 | 95 | % show TOC at every section start 96 | \AtBeginSection{ 97 | \frame{ 98 | \vspace{2em} 99 | \sectionpage 100 | \hspace*{2.2em}\begin{minipage}{10cm} 101 | \tableofcontents[currentsection] 102 | \end{minipage} 103 | } 104 | } 105 | 106 | % argmin, argmax 107 | \usepackage{amssymb}% http://ctan.org/pkg/amssymb 108 | \usepackage{amsmath} 109 | 110 | \DeclareMathOperator*{\argmax}{arg\!\max} 111 | \DeclareMathOperator*{\argmin}{arg\!\min} 112 | % softmax 113 | \DeclareMathOperator*{\softmax}{soft\!\max} 114 | % RNN 115 | \DeclareMathOperator*{\rnn}{RNN} 116 | % RNN star 117 | \DeclareMathOperator*{\rnnstar}{RNN^{*}} 118 | % bi-RNN 119 | \DeclareMathOperator*{\birnn}{biRNN} 120 | 121 | % bold math 122 | \usepackage{bm} 123 | 124 | % for \mathclap 125 | \usepackage{mathtools} 126 | 127 | % algorithms 128 | \usepackage[noend]{algpseudocode} 129 | 130 | 131 | % for neurons and layers in tikz 132 | \tikzset{ 133 | neuron/.style={draw, rectangle, inner sep=2pt, minimum width=0.75cm, fill=blue!20}, 134 | param/.style={draw, rectangle, inner sep=2pt, minimum width=0.75cm, fill=green!20}, 135 | constant/.style={draw, rectangle, inner sep=2pt, minimum width=0.75cm, fill=black!15}, 136 | state/.style={rectangle, inner sep=2pt, minimum width=0.75cm, fill=black!5}, 137 | } 138 | 139 | % for strike-through text 140 | \usepackage[normalem]{ulem} 141 | 142 | 143 | \title{Deep Learning for Natural Language Processing} 144 | \subtitle{Lecture 11 -- Text generation 4: Decoder-only Models and GPT} 145 | \date{June 27, 2023} 146 | \author{Dr.\ Martin Tutek} 147 | \institute{Ubiquitous Knowledge Processing \hfill \includegraphics[height=1.cm]{img/ukp_logo.png} \\ 148 | Department of Computer Science\\ 149 | Technical University of Darmstadt \hfill \href{https://www.informatik.tu-darmstadt.de/ukp/ukp_home/index.en.jsp}{\underline{UKP Web}}} 150 | %\titlegraphic{\hfill } 151 | 152 | \begin{document} 153 | 154 | \maketitle 155 | 156 | \begin{frame}{Recap} 157 | In the previous lecture we: 158 | \begin{itemize} 159 | \item Introduced the \textbf{BERT model} 160 | \item Introduced the two pretraining tasks for BERT: \textbf{MLM} and \textbf{NSP} 161 | \item Explained the connection between MLM and CBOW-style training 162 | \item Explained the purpose of NSP -- learning a \textbf{sentence embedding} 163 | \item Analyzed how to \textbf{apply BERT} to various \textbf{downstream tasks} such as classification and QA 164 | \item Gave an overview of various other 
pretraining tasks for LLMs 165 | \end{itemize} 166 | \end{frame} 167 | 168 | \begin{frame}{Motivation} 169 | Recall: using the \textbf{same model} for \textbf{multiple tasks} without task-specific decoder heads 170 | \begin{figure}[h] 171 | \includegraphics[height=4.5cm]{t5-objectives} 172 | \caption*{Image from \href{https://jmlr.org/papers/volume21/20-074/20-074.pdf}{\underline{T5 paper}}} 173 | \end{figure} 174 | \end{frame} 175 | 176 | \begin{frame}{Motivation} 177 | Recall: using the \textbf{same model} for \textbf{multiple tasks} without task-specific decoder heads 178 | \begin{figure}[h] 179 | \includegraphics[height=4.5cm]{t5-anno-prompts} 180 | \caption*{Image from \href{https://jmlr.org/papers/volume21/20-074/20-074.pdf}{\underline{T5 paper}}} 181 | \end{figure} 182 | \end{frame} 183 | 184 | \section{Types of Transformer Architectures} 185 | 186 | \begin{frame}{Encoder-Decoder Transformer} 187 | 188 | \begin{figure}[h] 189 | \includegraphics[height=4.5cm]{transformer_enc_dec.pdf} 190 | \end{figure} 191 | 192 | \end{frame} 193 | 194 | 195 | \begin{frame}{Bidirectional Encoder-only Transformer} 196 | \begin{columns}[T] % align columns 197 | \begin{column}{.48\textwidth} 198 | \begin{figure}[h] 199 | \includegraphics[height=4.5cm]{bidirectional_trf_encoder.pdf} 200 | \end{figure} 201 | \end{column} 202 | 203 | \begin{column}{.48\textwidth} 204 | \begin{itemize} 205 | \item Efficient encoding \emoji{check-mark} 206 | \item Versatile base for downstream tasks \emoji{check-mark} 207 | \pause 208 | \item Can't \textbf{really} generate text \emoji{cross-mark} 209 | \end{itemize} 210 | \end{column} 211 | 212 | \end{columns} 213 | \end{frame} 214 | 215 | 216 | \begin{frame}{Autoregressive Decoder-only Transformer} 217 | \begin{columns}[T] % align columns 218 | \begin{column}{.48\textwidth} 219 | \begin{figure}[h] 220 | \includegraphics[height=4.5cm]{autoregressive_trf_decoder.pdf} 221 | \end{figure} 222 | \end{column} 223 | 224 | \begin{column}{.48\textwidth} 225 | An \textbf{autoregressive} (causal) language model uses \textbf{past} values of a time series to predict future values. 226 | \pause 227 | \begin{itemize} 228 | \item Didn't we decide not to use these because they were inefficient? 229 | \pause \begin{center} \textbf{(RNNs)} \end{center} 230 | \pause 231 | \item Yes, but... 232 | \begin{enumerate} 233 | \item Hardware has improved 234 | \item Autoregressive models are \textit{really} good at generating text 235 | \end{enumerate} 236 | \end{itemize} 237 | \end{column} 238 | \end{columns} 239 | 240 | \end{frame} 241 | 242 | \begin{frame}{Differences between attention masks} 243 | \begin{figure}[h] 244 | \includegraphics[height=4.5cm]{attention-types} 245 | \end{figure} 246 | Read: y axis $\to$ tokens attending, x axis $\to$ tokens attended to. 247 | 248 | Black cell $\to$ token visible, white cell $\to$ token \textbf{masked} 249 | \end{frame} 250 | 251 | \begin{frame}{Differences between attention masks} 252 | \begin{figure}[h] 253 | \includegraphics[height=4.5cm]{attention-masks-anno} 254 | \end{figure} 255 | Read: y axis $\to$ tokens attending, x axis $\to$ tokens attended to. 
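% A compact way to write the two mask patterns above as an additive mask $M$ on the attention scores -- a sketch using assumed matrix notation (not introduced on the slide): rows index the attending position $j$, columns the attended position $i$, matching the reading convention above.
% \begin{equation*}
% M^{\text{bidirectional}}_{ji} = 0 \quad \forall i, j
% \qquad\qquad
% M^{\text{causal}}_{ji} =
% \begin{cases}
% 0, & \text{if } i \leq j\\
% -\infty, & \text{otherwise}
% \end{cases}
% \end{equation*}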
256 | 257 | Black cell $\to$ token visible, white cell $\to$ token \textbf{masked} 258 | \end{frame} 259 | 260 | \begin{frame}{Attention masks} 261 | 262 | Recall: the attention mechanism 263 | 264 | \noindent\begin{minipage}{0.4\textwidth} 265 | \begin{equation*} 266 | a = \sum_i^n \alpha_i v_i 267 | \end{equation*} 268 | \end{minipage}% 269 | \begin{minipage}{0.2\textwidth} 270 | \end{minipage} 271 | \begin{minipage}{0.4\textwidth} 272 | \begin{equation*} 273 | \hat{\alpha}_i = \frac{q^T \cdot k_i}{\sqrt{d_{\text{model} } } } 274 | \end{equation*} 275 | \end{minipage}\vskip1em 276 | 277 | \pause 278 | How do we do \textbf{masking}? 279 | 280 | In the \textbf{causal} scenario (each token can only attend to \textbf{past} tokens): 281 | \pause 282 | 283 | For a query $q = W_q(s_j)$ computed from the hidden state $s_j$ at position $j$: 284 | \pause 285 | 286 | $$ 287 | \alpha_i = 288 | \begin{cases} 289 | \alpha_i,& \text{if } j\geq i\\ 290 | 0, & \text{otherwise} 291 | \end{cases} 292 | $$ 293 | \pause 294 | \textbf{NB: actually}, we set $\hat{\alpha}_i$ to $-\infty$ (before softmax) 295 | 296 | \end{frame} 297 | 298 | 299 | \begin{frame}{Differences between attention masks} 300 | \begin{figure}[h] 301 | \includegraphics[height=5.5cm]{attention-patterns} 302 | \end{figure} 303 | \end{frame} 304 | 305 | \section{Autoregressive decoder-only Models} 306 | 307 | \begin{frame}{Variants of language modeling} 308 | \begin{figure}[h] 309 | \includegraphics[height=3.5cm]{language-modeling-types} 310 | \end{figure} 311 | 312 | \begin{itemize} 313 | \item (Full) language modeling $\to$ given previous tokens, predict next token, for \textbf{every token} in sequence 314 | \pause 315 | \item Prefix language modeling $\to$ (1) feed a prefix (where mask \textbf{does not have to be causal}), (2) full LM starting after prefix 316 | \pause 317 | \item Masked language modeling $\to$ \textbf{reconstruct masked} tokens/spans 318 | \end{itemize} 319 | 320 | \end{frame} 321 | 322 | \begin{frame}{Autoregressive decoder-only models} 323 | \begin{figure}[h] 324 | \includegraphics[height=3cm]{gpt2-paper} 325 | \end{figure} 326 | 327 | Introduction of \textbf{GPT-2}, an autoregressive Transformer decoder-only model trained on full language modeling. 328 | 329 | \pause 330 | 331 | \textbf{GPT-3} is \textit{``just''} a \textbf{larger} version of GPT-2 332 | 333 | \end{frame} 334 | 335 | \begin{frame}{Autoregressive decoder-only models} 336 | \begin{figure}[h] 337 | \includegraphics[height=3cm]{gpt2-title-anno} 338 | \end{figure} 339 | 340 | Introduction of \textbf{GPT-2}, an autoregressive Transformer decoder-only model trained on full language modeling. 341 | 342 | \hspace{1em} What does ``unsupervised multitask learners'' mean \emoji{thinking}? 343 | \end{frame} 344 | 345 | \subsection{Zero-shot, one-shot and few-shot learning} 346 | 347 | \begin{frame}{Zero-shot, one-shot and few-shot learning} 348 | 349 | \textbf{Recall:} T5 was able to perform \textbf{multiple tasks} at the same time 350 | 351 | ... but it was trained on them \& on keywords which indicate the task.
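% The masking rule above can also be written in matrix form -- a sketch assuming $Q$, $K$, $V$ stack the query, key and value vectors of the whole sequence, and $M$ is an additive mask with entries $0$ for visible positions and $-\infty$ for masked ones:
% \begin{equation*}
% \operatorname{MaskedAttention}(Q, K, V) = \softmax\!\left( \frac{Q K^{T}}{\sqrt{d_{\text{model}}}} + M \right) V
% \end{equation*}
% Setting masked scores to $-\infty$ before the softmax drives their normalized weights to $0$, which is why the mask is applied to the scores $\hat{\alpha}_i$ rather than to the weights $\alpha_i$.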
352 | \vspace{1em} 353 | \pause 354 | 355 | For a model that has \textbf{not been trained on the downstream task}: 356 | \begin{itemize} 357 | \item \textbf{Few-shot} learning: tune pretrained model on a \textbf{small number} of target task instances, \textbf{then perform task (!)} 358 | \pause 359 | \item \textbf{One-shot} learning: tune pretrained model on \textbf{one instance (!)} \textit{per class}, then perform task 360 | \pause 361 | \item \textbf{Zero-shot} learning: \textbf{don't tune the pretrained model} \textbf{(!!!)}, then perform task 362 | \end{itemize} 363 | 364 | \end{frame} 365 | 366 | \begin{frame}{Zero-shot learning} 367 | Zero-shot learning $\approx$ unsupervised learning 368 | \pause 369 | 370 | \vspace{1em} 371 | Why $\approx$? 372 | \pause 373 | 374 | \textbf{Assumption:} when trained on a \textbf{massive} corpus of text, the language model is likely to \textbf{see some tasks naturally} occur (e.g. question answering). 375 | 376 | \pause 377 | \begin{itemize} 378 | \item We want to \textbf{transform} our task into a \textbf{generative one} by providing a \textbf{prompt} to the model that makes the label of the input instance the \textbf{most likely generated sequence}. 379 | \end{itemize} 380 | 381 | \end{frame} 382 | 383 | \begin{frame}%{Zero-shot learning} 384 | \begin{columns}[T] % align columns 385 | \begin{column}{.48\textwidth} 386 | \begin{figure}[h] 387 | \includegraphics[height=7.3cm]{gpt2-demonstrations} 388 | \caption*{Image from \href{https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf}{\underline{GPT2 paper}}} 389 | \end{figure} 390 | \end{column} 391 | 392 | \begin{column}{.48\textwidth} 393 | \vspace{1.5em} 394 | The internet \textbf{does} contain samples of various NLP tasks 395 | \pause 396 | \begin{itemize} 397 | \item ... and a large language model (LLM) \textbf{can} remember them; 398 | \pause 399 | \item ... and when \textbf{prompted} to perform a task, without seeing the prompt before, \textbf{recall it}; 400 | \pause 401 | \item ... and \textbf{perform them accurately}.
402 | \end{itemize} 403 | \end{column} 404 | \end{columns} 405 | 406 | \end{frame} 407 | 408 | \begin{frame}{GPT-2: Zero-shot question answering} 409 | \begin{figure}[h] 410 | \includegraphics[height=3.5cm]{gpt2-zeroshot-qa} 411 | \caption*{Image from \href{https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf}{\underline{GPT2 paper}}} 412 | \end{figure} 413 | \end{frame} 414 | 415 | 416 | \begin{frame}{GPT-2: Prompted one-shot question answering} 417 | \begin{figure}[h] 418 | \includegraphics[height=6.5cm]{gpt2-prompting-qa} 419 | \caption*{Image from \href{https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf}{\underline{GPT2 paper}}} 420 | \end{figure} 421 | \end{frame} 422 | 423 | \begin{frame} 424 | \begin{figure}[h] 425 | \includegraphics[height=8.5cm]{gpt2-prompting-qa} 426 | % \caption*{Image from \href{https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf}{\underline{GPT2 paper}}} 427 | \end{figure} 428 | \end{frame} 429 | 430 | \begin{frame} 431 | \begin{figure}[h] 432 | \includegraphics[height=8.5cm]{gpt2-prompt-anno} 433 | % \caption*{Image from \href{https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf}{\underline{GPT2 paper}}} 434 | \end{figure} 435 | \end{frame} 436 | 437 | \subsection{Prompting} 438 | 439 | \begin{frame}{Prompting} 440 | \textbf{A prompt} is a piece of text inserted in the input examples, so that the original task \textbf{can be formulated as} a (masked) \textbf{language modeling} problem. 441 | 442 | \pause 443 | 444 | \begin{figure}[h] 445 | \includegraphics[height=5cm]{zero-shot-translation-gpt3} 446 | \caption*{Image from \href{https://arxiv.org/pdf/2005.14165.pdf}{\underline{GPT3 paper}}} 447 | \end{figure} 448 | 449 | \end{frame} 450 | 451 | 452 | \begin{frame}{Prompting} 453 | \textbf{A prompt} is a piece of text inserted in the input examples, so that the original task \textbf{can be formulated as} a (masked) \textbf{language modeling} problem. 454 | 455 | \begin{figure}[h] 456 | \includegraphics[height=5cm]{one-shot-translation-gpt3} 457 | \caption*{Image from \href{https://arxiv.org/pdf/2005.14165.pdf}{\underline{GPT3 paper}}} 458 | \end{figure} 459 | 460 | \end{frame} 461 | 462 | 463 | \begin{frame}{Prompting} 464 | \textbf{A prompt} is a piece of text inserted in the input examples, so that the original task \textbf{can be formulated as} a (masked) \textbf{language modeling} problem. 465 | 466 | \begin{figure}[h] 467 | \includegraphics[height=5cm]{few-shot-translation-gpt3} 468 | \caption*{Image from \href{https://arxiv.org/pdf/2005.14165.pdf}{\underline{GPT3 paper}}} 469 | \end{figure} 470 | 471 | \end{frame} 472 | 473 | \begin{frame}{Prompting works well} 474 | \begin{figure}[h] 475 | \includegraphics[height=4.5cm]{gpt3-translation-results} 476 | \caption*{Image from \href{https://arxiv.org/pdf/2005.14165.pdf}{\underline{GPT3 paper}}} 477 | \end{figure} 478 | GPT3 \textbf{without fine-tuning} performs better than \textbf{unsupervised} alternatives, and sometimes even \textbf{better} than supervised state-of-the-art! 479 | \end{frame} 480 | 481 | \begin{frame}{In-context learning} 482 | \textbf{In-context learning} is the paradigm in which a LLM learns to solve a new task at inference time \textbf{without any change to its weights}, based only on examples in the \textbf{prompt}. 
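% An illustrative prompt layout in the spirit of the GPT-3 translation figures above (the concrete strings below are made up for illustration):
%
%   Translate English to German:   <- task description
%   sea otter => Seeotter          <- k demonstrations (k = 0, 1, or a few)
%   cheese =>                      <- query; the model simply continues the text
%
% With k = 0 this is zero-shot, with k = 1 one-shot, and with a handful of demonstrations few-shot prompting; in the in-context setting described above, the model weights stay fixed in all three cases.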
483 | \pause 484 | 485 | \hspace{1em}$\approx$ umbrella term for zero-, one- and few-shot learning with task descriptions also contained in the prompt. 486 | \vspace{1em} 487 | 488 | \pause 489 | \textit{``During \textbf{unsupervised pre-training}, a language model develops a broad set of skills and pattern recognition abilities. It then uses these abilities at inference time to rapidly adapt to or recognize the desired task. We use the term “in-context learning” to describe the inner loop of this process, which occurs \textbf{within the forward-pass} upon each sequence.''} -- from the GPT-3 paper 490 | 491 | \end{frame} 492 | 493 | \begin{frame}%{In-context learning} 494 | \begin{figure}[h] 495 | \includegraphics[height=7.5cm]{in-context-learning} 496 | \end{figure} 497 | 498 | \end{frame} 499 | 500 | \section{Prompt-tuning MLMs} 501 | 502 | \begin{frame}{Prompt-tuning MLMs} 503 | Can we only use prompting with autoregressive models? 504 | \pause 505 | \vspace{1em} 506 | \begin{itemize} 507 | \item No -- we can also use it with bidirectional encoder-only models (e.g. BERT)! 508 | \begin{itemize} 509 | \pause 510 | \item ... but it is \textbf{more difficult} because they have not been trained to generate text 511 | \pause 512 | \item ... because the downstream task is \textbf{less natural} to the model (further from the pretraining task) 513 | \end{itemize} 514 | 515 | \end{itemize} 516 | \pause 517 | How do we overcome this gap between the \textbf{pretraining task} and the \textbf{prompting-transformed downstream task}? 518 | 519 | \end{frame} 520 | 521 | \begin{frame}{Prompt-tuning MLMs} 522 | So far, we have \textbf{fine-tuned} masked language models 523 | \pause 524 | \begin{figure}[h] 525 | \includegraphics[height=4cm]{fine-tuning-mlms} 526 | \caption*{Figure from \href{https://thegradient.pub/prompting/}{\underline{The Gradient}}} 527 | \end{figure} 528 | \pause 529 | Can we frame our downstream task \textbf{as MLM}? 530 | 531 | \end{frame} 532 | 533 | \begin{frame}{Prompt-tuning MLMs} 534 | \begin{figure}[h] 535 | \includegraphics[height=6cm]{prompting_mlms} 536 | \end{figure} 537 | \end{frame} 538 | 539 | 540 | \begin{frame}{Prompt-tuning MLMs} 541 | We transform the target task (e.g. sentiment analysis) into \textbf{masked language modeling}. 542 | \begin{enumerate} 543 | \item Choose the prompt and word/token used for each label 544 | \pause 545 | \begin{itemize} 546 | \item Choice of label token \textbf{important} 547 | \item Template design also \textbf{important} 548 | \end{itemize} 549 | \pause 550 | \item Demonstrate task through a few samples 551 | \pause 552 | \begin{itemize} 553 | \item Usually through \textbf{fine-tuning} 554 | \end{itemize} 555 | \pause 556 | \item \textbf{No new parameters needed} to perform task! 557 | \end{enumerate} 558 | \end{frame} 559 | 560 | 561 | \begin{frame}{Discrete and continuous prompts} 562 | So far, we have shown \textbf{discrete prompts}: actual text that we prepend/append to existing data, which triggers the LLM to perform our task. 563 | \pause 564 | \vspace{1em} 565 | 566 | Can we learn \textbf{continuous prompts}? 567 | \pause 568 | (dense vectors which we prepend, e.g. as a token) 569 | \pause 570 | \begin{figure}[h] 571 | \includegraphics[height=3.5cm]{continuous-prompts} 572 | \caption*{Figure from \href{https://aclanthology.org/2022.naacl-main.266.pdf}{\underline{Prompt Waywardness}}} 573 | \end{figure} 574 | \end{frame} 575 | 576 | 577 | \section{A step back} 578 | 579 | \begin{frame}{Incredible Performance of Large Language Models} 580 | So...
what caused LLMs to be \textbf{so good} all of a sudden? 581 | \pause 582 | \begin{itemize} 583 | \item More available data (more data $\to$ better models) 584 | \pause 585 | \item Training tricks (from experience) 586 | \pause 587 | \item Hardware advancements (faster training of larger models) 588 | \end{itemize} 589 | \begin{figure}[h] 590 | \includegraphics[height=4cm]{lm-scaling} 591 | \end{figure} 592 | \end{frame} 593 | 594 | \begin{frame}{Takeaways} 595 | 596 | \begin{itemize} 597 | \item Three types of Transformer-based architectures for LLM pretraining: 598 | \begin{itemize} 599 | \item \textbf{Encoder-decoder} (T5) 600 | \item \textbf{Bidirectional encoder-only} (BERT) 601 | \item \textbf{Autoregressive decoder-only} (GPT-2) 602 | \end{itemize} 603 | \item The \textbf{attention masks} of these models differ 604 | \item There are three variants of language modeling for pretraining LLMs 605 | \item GPT-2 (and 3) are autoregressive decoder-only transformers 606 | \item We introduced zero-, one- and few-shot learning 607 | \item We introduced prompting and its variants 608 | \begin{itemize} 609 | \item Autoregressive vs MLM prompting 610 | \item Continuous vs discrete prompts 611 | \item In-context learning 612 | \end{itemize} 613 | \end{itemize} 614 | 615 | \end{frame} 616 | 617 | \begin{frame}{Useful resources} 618 | 619 | \begin{itemize} 620 | \item \href{https://thegradient.pub/prompting/}{\underline{The Gradient: Prompting}} by Tianyu Gao 621 | \item \href{https://thegradient.pub/in-context-learning-in-context/}{\underline{The Gradient: In Context Learning}} by Daniel Bashir 622 | \item \href{http://ai.stanford.edu/blog/understanding-incontext/}{\underline{Understanding in-context learning}} by Sang Michael Xie and Sewon Min 623 | \end{itemize} 624 | 625 | \end{frame} 626 | 627 | 628 | 629 | \begin{frame}{License and credits} 630 | 631 | \begin{columns} 632 | \begin{column}{0.7\textwidth} 633 | Licensed under Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) 634 | \end{column} 635 | \begin{column}{0.2\textwidth} 636 | \includegraphics[width=0.9\linewidth]{img/cc-by-sa-icon.pdf} 637 | \end{column} 638 | \end{columns} 639 | 640 | \bigskip 641 | 642 | Credits 643 | 644 | \begin{scriptsize} 645 | 646 | Martin Tutek 647 | 648 | Content from ACL Anthology papers licensed under CC-BY \url{https://www.aclweb.org/anthology} 649 | 650 | 651 | \end{scriptsize} 652 | 653 | \end{frame} 654 | 655 | 656 | 657 | \end{document} 658 | 659 | -------------------------------------------------------------------------------- /latex/lecture11/img/attention-masks-anno.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/attention-masks-anno.png -------------------------------------------------------------------------------- /latex/lecture11/img/attention-patterns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/attention-patterns.png -------------------------------------------------------------------------------- /latex/lecture11/img/attention-types.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/attention-types.png -------------------------------------------------------------------------------- /latex/lecture11/img/autoregressive_trf_decoder.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/autoregressive_trf_decoder.pdf -------------------------------------------------------------------------------- /latex/lecture11/img/bidirectional_trf_encoder.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/bidirectional_trf_encoder.pdf -------------------------------------------------------------------------------- /latex/lecture11/img/cc-by-sa-icon.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/cc-by-sa-icon.pdf -------------------------------------------------------------------------------- /latex/lecture11/img/continuous-prompts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/continuous-prompts.png -------------------------------------------------------------------------------- /latex/lecture11/img/few-shot-translation-gpt3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/few-shot-translation-gpt3.png -------------------------------------------------------------------------------- /latex/lecture11/img/fine-tuning-mlms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/fine-tuning-mlms.png -------------------------------------------------------------------------------- /latex/lecture11/img/gpt2-demonstrations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/gpt2-demonstrations.png -------------------------------------------------------------------------------- /latex/lecture11/img/gpt2-paper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/gpt2-paper.png -------------------------------------------------------------------------------- /latex/lecture11/img/gpt2-prompt-anno.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/gpt2-prompt-anno.png 
-------------------------------------------------------------------------------- /latex/lecture11/img/gpt2-prompting-qa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/gpt2-prompting-qa.png -------------------------------------------------------------------------------- /latex/lecture11/img/gpt2-title-anno.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/gpt2-title-anno.png -------------------------------------------------------------------------------- /latex/lecture11/img/gpt2-zeroshot-qa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/gpt2-zeroshot-qa.png -------------------------------------------------------------------------------- /latex/lecture11/img/gpt3-translation-results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/gpt3-translation-results.png -------------------------------------------------------------------------------- /latex/lecture11/img/in-context-learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/in-context-learning.png -------------------------------------------------------------------------------- /latex/lecture11/img/language-modeling-types.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/language-modeling-types.png -------------------------------------------------------------------------------- /latex/lecture11/img/lm-scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/lm-scaling.png -------------------------------------------------------------------------------- /latex/lecture11/img/one-shot-translation-gpt3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/one-shot-translation-gpt3.png -------------------------------------------------------------------------------- /latex/lecture11/img/prompting_mlms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/prompting_mlms.png -------------------------------------------------------------------------------- /latex/lecture11/img/t5-anno-prompts.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/t5-anno-prompts.png -------------------------------------------------------------------------------- /latex/lecture11/img/t5-objectives.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/t5-objectives.png -------------------------------------------------------------------------------- /latex/lecture11/img/transformer_enc_dec.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/transformer_enc_dec.pdf -------------------------------------------------------------------------------- /latex/lecture11/img/ukp_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/ukp_logo.png -------------------------------------------------------------------------------- /latex/lecture11/img/zero-shot-translation-gpt3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/latex/lecture11/img/zero-shot-translation-gpt3.png -------------------------------------------------------------------------------- /pdf/DL4NLP Lecture 12_ Contemporary LLMs.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/DL4NLP Lecture 12_ Contemporary LLMs.pptx -------------------------------------------------------------------------------- /pdf/dl4nlp2023-lecture01.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture01.pdf -------------------------------------------------------------------------------- /pdf/dl4nlp2023-lecture02.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture02.pdf -------------------------------------------------------------------------------- /pdf/dl4nlp2023-lecture03.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture03.pdf -------------------------------------------------------------------------------- /pdf/dl4nlp2023-lecture04.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture04.pdf -------------------------------------------------------------------------------- /pdf/dl4nlp2023-lecture05.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture05.pdf -------------------------------------------------------------------------------- /pdf/dl4nlp2023-lecture06.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture06.pdf -------------------------------------------------------------------------------- /pdf/dl4nlp2023-lecture07.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture07.pdf -------------------------------------------------------------------------------- /pdf/dl4nlp2023-lecture08.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture08.pdf -------------------------------------------------------------------------------- /pdf/dl4nlp2023-lecture09.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture09.pdf -------------------------------------------------------------------------------- /pdf/dl4nlp2023-lecture10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture10.pdf -------------------------------------------------------------------------------- /pdf/dl4nlp2023-lecture11.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture11.pdf -------------------------------------------------------------------------------- /pdf/dl4nlp2023-lecture12-recap.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture12-recap.pdf -------------------------------------------------------------------------------- /pdf/dl4nlp2023-lecture13.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture13.pdf -------------------------------------------------------------------------------- /pdf/dl4nlp2023-lecture13.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dl4nlp-tuda/deep-learning-for-nlp-lectures/5a60cd2392b29f12094fc1b5db53289da785164f/pdf/dl4nlp2023-lecture13.pptx --------------------------------------------------------------------------------