├── assets
    ├── seq2seq1.png
    ├── seq2seq2.png
    ├── seq2seq3.png
    ├── seq2seq4.png
    ├── seq2seq5.png
    ├── seq2seq6.png
    ├── seq2seq7.png
    ├── seq2seq8.png
    ├── seq2seq9.png
    ├── seq2seq10.png
    ├── convseq2seq1.png
    ├── convseq2seq2.png
    ├── seq2seq5.xml
    ├── seq2seq2.xml
    ├── seq2seq9.xml
    ├── seq2seq3.xml
    ├── seq2seq6.xml
    ├── seq2seq1.xml
    ├── seq2seq8.xml
    ├── seq2seq10.xml
    ├── seq2seq7.xml
    ├── seq2seq4.xml
    ├── convseq2seq1.xml
    └── convseq2seq2.xml
├── LICENSE
├── .gitignore
├── README.md
├── 5 - Convolutional Sequence to Sequence Learning.ipynb
├── 6 - Attention is All You Need.ipynb
├── 2 - Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation.ipynb
└── 3 - Neural Machine Translation by Jointly Learning to Align and Translate.ipynb


/assets/seq2seq1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SethHWeidman/pytorch-seq2seq/HEAD/assets/seq2seq1.png


--------------------------------------------------------------------------------
/assets/seq2seq2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SethHWeidman/pytorch-seq2seq/HEAD/assets/seq2seq2.png


--------------------------------------------------------------------------------
/assets/seq2seq3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SethHWeidman/pytorch-seq2seq/HEAD/assets/seq2seq3.png


--------------------------------------------------------------------------------
/assets/seq2seq4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SethHWeidman/pytorch-seq2seq/HEAD/assets/seq2seq4.png


--------------------------------------------------------------------------------
/assets/seq2seq5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SethHWeidman/pytorch-seq2seq/HEAD/assets/seq2seq5.png


--------------------------------------------------------------------------------
/assets/seq2seq6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SethHWeidman/pytorch-seq2seq/HEAD/assets/seq2seq6.png


--------------------------------------------------------------------------------
/assets/seq2seq7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SethHWeidman/pytorch-seq2seq/HEAD/assets/seq2seq7.png


--------------------------------------------------------------------------------
/assets/seq2seq8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SethHWeidman/pytorch-seq2seq/HEAD/assets/seq2seq8.png


--------------------------------------------------------------------------------
/assets/seq2seq9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SethHWeidman/pytorch-seq2seq/HEAD/assets/seq2seq9.png


--------------------------------------------------------------------------------
/assets/seq2seq10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SethHWeidman/pytorch-seq2seq/HEAD/assets/seq2seq10.png


--------------------------------------------------------------------------------
/assets/convseq2seq1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SethHWeidman/pytorch-seq2seq/HEAD/assets/convseq2seq1.png


--------------------------------------------------------------------------------
/assets/convseq2seq2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SethHWeidman/pytorch-seq2seq/HEAD/assets/convseq2seq2.png


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Ben Trevett
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/assets/seq2seq5.xml:
--------------------------------------------------------------------------------
1 | <mxfile userAgent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36" version="9.2.3" editor="www.draw.io" type="device"><diagram id="d7628e3d-80be-0910-dbf4-95fda96d980f" name="Page-1">7VtLc5swEP41vmYAASbH1k3SQ9vpTA5NjjLIoEYgj5BjO7++khFP0YTaGNwEDo60eqH99lvtApmBRby7Y3AdfacBIjPLCHYz8GVmWXPPFb9SsM8EtuVkgpDhIBOZpeAevyAlNJR0gwOU1jpySgnH67rQp0mCfF6TQcbott5tRUl91TUMkSa49yHRpb9wwKNM6lluKf+KcBjlK5vuddayhP5TyOgmUevNLLA6XFlzDPO51EbTCAZ0WxGBmxlYMEp5Vop3C0SkanO1ZeNu/9Ja3DdDCe8yAKgRKd/ne0eBUIWqJjQRfz4fNoTkEEPUfBpjX5UjHhNRNEUR7TB/qJQfZZcrR9YSzvYPasShUrYVCpBtvxHne2UNcMOpEFHGIxrSBJJvlK7V9Cua8FsYYyLNa0E3DCMmNvEDbVWjmsN0RV1XSb5nMdBXm7aUkUEWItULZCKpjsowpcY7RGMkNiI6MEQgx891y4HKAMOiXwmCKCgc2jFR9/IMyQblVtTAqA7INsIc3a/hYTNbQcg6MDUVhwSmaQuQKWf0CS0ooeywBPCsJXCl/laYkIo8cJAX2P8KAiQ4TETFFyCIbq+g8owYR7tXNZ63uopEyslYtqpvS8rmoqjC1lx2CkZAx+RD8gbovMm7DU8cMBGnE3EKBoxAHNN8G6RjmXQhnHBaODEWJRxN2+GGo0RTubAc3lBmza4VDFWTViLNQKUdYhFKfVINMQ4CuUwr2+q4diJcP4Adf8wA0I0tVh9sMd49W9wLirxcTdsxFbc10eWEw2VIutjuFJVVQ7AqqWxnLFZZHY78KS6TVLHGi8u6ZJ3/+UlzWclKa7biwljaOpGHC6JpIQh5scnpFDqGSkOeQvlSH/4UagntRkuE7OveQVFA5LBcnuL707Ia+pNisWyZGRmN88posCdDXo1qYFXcRjdvqYfmUwzR5vi8EUOIIShW+r3SDT5eWJzhtfi93inZGRXvjTgjneKM3ug2aLKrP9t7GcMnLj3HdgzdJ648H/n+JfhEu3lODegUnQ/8gvW8DrA9JrGNK1C5bM+pQ+9cX7mVa27XF8j8eB8Ri6M/uo2En81c7mYpigdna+QyMV9FPPngDvQuMB3O6Tr6Y5I2VM0J1Z5PVnM+IMjOAIHsiH757HlhMwnpmBe+PVHzRO7RXesJZhuxrYnYpxK7+T51UGbPJ2afwmyr+cnVsczWJjojs/UEuI3ZYGL2qcxuvvodlNnv/PnuuZkNmt+EHctsbaLzMdvtlmLZE7NPZXbzddoZmS2q5efvmVGU/2IAbv4A</diagram></mxfile>


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | #data
107 | *.pt
108 | .data/
109 | data/
110 | models/
111 | .vector_cache/
112 | 


--------------------------------------------------------------------------------
/assets/seq2seq2.xml:
--------------------------------------------------------------------------------
1 | <mxfile modified="2018-12-10T15:41:04.062Z" host="www.draw.io" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0" version="9.5.9" etag="X2I1QPUOD1GhYeLAYC2y" type="device"><diagram id="d7628e3d-80be-0910-dbf4-95fda96d980f" name="Page-1">7Vxbb9s2FP41BraHBCIpydJj6qYdsLUYlgFrnwZFoiW2kujJdOz014+ySOtGN4qjWyIZBkweUqR4Pn4855CEF2gVHT4mzib4RD0cLqDmHRbo/QJCYCOd/6SSx0xiWCAT+AnxRKVccEd+YCHUhHRHPLwtVWSUhoxsykKXxjF2WUnmJAndl6utaVjudeP4uCa4c52wLv2HeCzIpBY0c/lvmPiB7BmYdlZy77jf/YTuYtHfAqL18ZMVR45sS/SwDRyP7jPRcezodoFWCaUsS0WHFQ5T3Uq1ZQr6cKb09N4JjlmTB5B8D/Yox449rgqRjWnMf94dB4TTRzSec2lEXJEOWBTyJOBJfCDsSyH9Na1ybaS5mCWPX8QTx0xedlJAWvYNM/YoZoOzY5SLaMIC6tPYCf+gdCOaX9OYfXAiEqbTa0V3CcEJH8RnvBeFog1g8nxdJXLM/EFXDBqKSeYkPha1UCZK1VF4TKjxI6YR5gPhFRIcOow8lGeOIyagf6qXg8ATAgc1JuJdHpxwh+UsqmBUBmQfEIbvNs5xMHvOyDIwJRX7obPdKoDcsoR+xysa0uTYBbLgPTJT/a1JGBbknoEtT38uCE5I/JhnXA4Cr/YTVB5wwvDhpxoXpVATq4VYZKAu8vucslIUFNgqZS/BCNUxmSRvUJ03slr/xEEzcZoRxxqOOAA8DdKlTBoJJwwFJ4aihFHTtr9jOK6pnM8cVlFmaV4LGIpTWohqEzSdh4S7UjeiICKel3ajZFsZ10aEawewy80MQs3YAttgi/bm2WKOyPMya9qOKH+tmS4vMC590kU3Z6+s6IIVSaUbQ7EKNjD5s1+WUsUczi9rEnW+ckszrmBFGa2YTpTO9TA1LphuTwKfnQY5W6FLqNSnFZJdTd4KKVy7wQIh3W4dFAGEhGV8im9Py+LRPynh3Z5IBvSKvdIq7MmQF09VsDq9RrPVsu6azz6EauEDcEAfog+O5Qtfvg5+HZmjYSkWvtY52RgV6wlHYzs7Gu3xrddwt767x4E8YuqRB54UaOr8+2Nh3IIsKevw5kvV+l9O7y1DN7T6crq2XOy6Y1hO9aqJ63E5NSZ8Ntvt0ql2ZyztGhU+ulVG3rCvzcJnqZfbzwxAG76OYbxtV7VrHGHVBWrolj7dUJXVLWK+nDF/CeaoehfgUsxrDXWI+RsPSTvHvHqMfTHm587D28fcmveGMj0o7Lw12LmffJt5d+GZZ+HA7M8dtuqYTJM6imMMSx+MOvOlq8vOxfukjo1m6hRpUqSODQejjj5T56LDvF6tTvvB+OukzlJhdQY7PLfav+gzqtBJpfm+T/NAdYu6vdM8OaR55Xvm6UKvK5/1tjnW9zZkUz493ZDZ3fbEvCXV6jbkxZjXGuoOc7v9LalpYV4N7C7G/FyE2AHmUGGA5TExlMfE/Vvk13cg3GsYP+FYpBR39OQRV/lodcZHZrP427vf8TL59Pdf7z+HwfrGuLLPX+9I1VaaCOZ/OyoLrrZHhd7wCsDcHPLC8o2QX4J/tYzt/PU0N8v8WrsiknV15o7IOC4GneGwYh6dpfUVqmC9rNMaGB3d41Gjr7rx2Dr6oIg+mNEXoBqDo6+KkFtFHxS5D6bMfdMogW8Pjr3dNfawiD2cMvZAHxn4QAZ+3aGPiuijKaMPK+v+CNBX3URoFX29iL4+ZfSR1hv3eTb/T6csPsj/OAvd/g8=</diagram></mxfile>


--------------------------------------------------------------------------------
/assets/seq2seq9.xml:
--------------------------------------------------------------------------------
1 | <mxfile modified="2018-12-12T14:01:45.146Z" host="www.draw.io" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0" version="9.6.1" etag="9MxNf8pWGP2gW5PkkeP_" type="device"><diagram id="d7628e3d-80be-0910-dbf4-95fda96d980f" name="Page-1">7Vvfd6I4FP5rfGwPEEB8bG3pPOzu2XO6Z3f6tCdCxMwAcUOsdv76DRDkR6KiIjpTfGjJTUjI/e735YboCEyjzQuFy8XvxEfhyND8zQg8jQxjYgD+NzV85AbTGeeGgGI/N+ml4RX/QMKoCesK+yipNWSEhAwv60aPxDHyWM0GKSXrerM5CeujLmGAJMOrB0PZ+g/22SK3OoZd2r8gHCyKkXV7ktfMoPc9oGQVi/FGBphnn7w6gkVfYqLJAvpkXTGB5xGYUkJYfhVtpihMXVu4Lb/P3VG7fW6KYtbmBiDuSNhHMXfkc1eIYkxi/u8xmxBKb9F4ySMR9sT1gkUhv9T5Jdpg9rVy/ZY2ubfSUszox1dxR1Yo67YOSOu+IcY+RDTAFSPcRChbkIDEMPyNkKXofk5i5sIIh2l4TcmKYkT5JP5Aa1Ep+tBtXpZdUsyZ3+iJSRsiyCANkGglQjh1R+U24cYXRCLEJ8IbUBRCht/rkQNFAAbbdiUI/ELgoMZEPMs7DFeoiKIGRnVA1gvM0OsSZpNZc0LWgam5OAhhkiiATBgl39GUhIRmQwDHmAE79d8ch2HF7lvI8c1jQYAhDmJe8DgIvBl/Dgp9zAuVnl3XNabTSt0TppzcmMRZJNKUOrvRfEeUoc1epIpaW5BPiJNhivK6pHphWlRYXtjOwRbIWH5KvgGZb0Wz/gkHBsJdlHBb5lyBcLp+GNxTGXgjXLIUXLoWlSzJ28GKoVhyOY8c1nBmjQ8ChioVhEkK7DQOMU/dHkRFhH0/HUbJ0jqurYjaDWCnL08AtGOL0QVbtF+eLfYNZXq25O2I8Mca6HLG4tInXUx7yOYuk7qJW/8kmA+7xdrUtHtQ+Vj1tMKa3NuVz9is959TXnTZAHn7jO12aC2SiiFjPIPUwLhexthm//2Tr4G3tf1S7r9sGKUcCdNlD5FkawjYdpLD+ngKlfpcH4uhPv36qEg6r7ZFe3r9tnn7103IOpq8f/n77cXygrvxfo0r/fRcWhvAKcASABVwHQDrgvg0vHwSYGq/9SaS8l5hSDm6TDmcK2Yck85lch/zStV8u7G0xFHIpH21tMQ5kJYkQ1rSGd36zEqsT3zmeVlq7dq11zbtpnPctj1XiC627Zb8dnPBGZyTeTXjlxmNtcLG+6uYB3a3YPcW0x7pLO/XVajqA6rnanbz3FzvEWRrr2a32Z8cnTFdXqb3LRI78bruS1YdnPSSVRpF35VvH5D9B0rhR6XZMm2Q7J5Ncxzd1Bpxl/d48poi78tU6mMM6nOu+ujNbxH0KT/jQX5+KfmR1rILyU9znM7lR96kquQHDPJzrvwYzSOpPuVn/wuiQX5+NvkBu74R17H8NMfpWn7sdjtqc5Cfc+UHaNeTH+XJh32+JrV6kdaJ9Ci1Q4FI93Li9CEnpnnOKN29YFMHivI9eiYHKWa1ELL/W5Gi4i7JQH/gDXR7uSkrS/kQL9/jWZL+gxWVyXtuLTMU8cHgLGuQRqbQVd7aehxZT2nnPPQSQemK3oRozhRqw9KwfEy4yuA4+CuL0buDp2M7JKFVmJbh1oBeVgnQp0qMu/6mVOPc0XWnU9c9Vo+lY03Xte2smy4waCi1brRT6i5OEtUYmJdQ6uNOGG/obF/tI8U3oq574L/3MSt0+nEenyQqzBzLtDSZaXPHQ553GkSd8umCJ/O8WP70Ml//yp+3guf/AQ==</diagram></mxfile>


--------------------------------------------------------------------------------
/assets/seq2seq3.xml:
--------------------------------------------------------------------------------
1 | <mxfile modified="2018-12-10T16:25:57.986Z" host="www.draw.io" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0" version="9.5.9" etag="I99zBpU3MFMaG6q5Htob" type="device"><diagram id="d7628e3d-80be-0910-dbf4-95fda96d980f" name="Page-1">7VxLc6M4EP41rto9zBRCgPEx483MHPYxVTls5jSlgIyZAHJAfmR+/QpLMk9nsSMLXCaHGLUEQvr66261ZE/gPN59SdFq+RfxcTQxDX83gX9MTBMAw2UfueSVSxxzygVBGvqiUSF4CH9hITSEdB36OKs0pIRENFxVhR5JEuzRigylKdlWmy1IVO11hQLcEDx4KGpK/w19uuRS13QK+VccBkvZM3BmvOYJec9BStaJ6G9iwsX+j1fHSD5L9JAtkU+2XLQfO7yfwHlKCOVX8W6Oo3xu5bTxCfp8pPbw3ilOaKcbxHtn9FWOHftsKkQxIQn7+LQfEM5vMVjJI3HoiesljSN2Cdgl3oX0MRd/tEXpu2h0GGRe+IkpfRWIozUlTERSuiQBSVD0JyEr8bgFSehnFIdRrkJzsk5DnLIX/RtvRaV4BnBYuTlsOb/sRk8MTOhkPr5SGzEvXzCJMU1fWYMUR4iGm6oqIKFRwaHd4dZvJGS9mobUfksosdB90zCqj6AoDTAVdxXYsIvSaxSiPWLt6FlQOXqgjB1HMmHv8yju2BeKumEhyyeWi2aqwa7A9AYm4l02KFpjaQNqGFUB2S5Dih9WaD+YLbOnVWAqUxxEKMtagMxoSp7xnEQk3XcBHc/FT4t8PsMoKsl9hN2FdyoIKAqDhBU8BgJr9gYqG5xSvHtzxmWtWaOJpM22MLhStCzZWst4P0ay61uwesqJ0G71zDqcF7R61mj1SsiWrR7szezNRrPXyewdeNGD2YPqidM0ewVxCh59bwO1P+JA2E8kCO3Zx5np2lNg8f/WpUykHGGFjg6Kc5JFNEeCZAdBQA8TV9IMps+0RskK24S2lIkmRA3a5OwI2eLqTlTEoe/n3bTagKr6dTIDajTk/NDFMmZM17uw2FTBYmdkMWdTP5GNRhbbDRYHhPgjVc93t5qp6mqg6lDoqGmhAesB1OUWGnAMajuyzO0vqLVAA5ObdIeW2Q//9LlDOcISHWOSJmESjB7xfK7q9YjWVDlbrzJ3Y4kpLydvlO9PdAalGWdOTEZj69fEvheXfTi+J9e2bKPp+Bauh71BOL76Vo9Gvze1bye2dDQt9eoJmzpO6nyZ00Y5/QybOVOInCbDMPBtPB0Cw+q5Flcjw3QkWgbCsKmhh2H1ZMrlGCZHNDLsxBSJToY1ITmXYZfn0VtrwKMwlOO7KdDDMVjzYqAeuB8h2f8u9YBTexC3IErYqn5dMFxbq0sPGrnPi9laaXRGW3va4lunrVV3VuwabK2meAaCmomEZ9raemDU1dbepSl6LTVb5Q2y7gGYXPIUWsSfeLYpUHfG4ArUTNfCFB5B7WQ1q69wL+nSmychx93L9y1+QTd3oSJRO23ufY3J9nevrTQCKEOLo0eA8HgESF0gpxNY9Wmpq9xCcVv8savcH3dGRcdRjyvYhm5FpTdQhrEuHf4BjnqgcYg3NSxMZzq+jHENzDGbzOnvPL98m5E6J0YCOqnjjqcpKm6/4nTM3qij4zu112DQWlDpz54NY8P9CuxZ/XyfzlCgg9MZbjaxEylmbfGx8g2hzhPeFiA7+/SAH27YpUgMyMNgQB4G423Y4yvN9PPp6o6I6aTTcxzAHzvPWm6eHv95eYmAQb99AC2ZPYlnPh8VCJ2XNZEVH7L9TN2xBsBZ7YrKqpr8lv0A/Nggez/D44XfG3rDuzqiOMNIPB1BukUfjoIv9wdklmjaBB/YF8oTtaPfnhZUib5ZRt+8ZfSBNTz4jxt8RfDDMvzwluE3TX3ws2Lxo0F8s674ZSZ4/x8=</diagram></mxfile>


--------------------------------------------------------------------------------
/assets/seq2seq6.xml:
--------------------------------------------------------------------------------
1 | <mxfile userAgent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36" version="9.2.3" editor="www.draw.io" type="device"><diagram id="d7628e3d-80be-0910-dbf4-95fda96d980f" name="Page-1">7VxRl5soFP41ed0TRdE8dmfb7kO7Z8+Zh20fGSXGrUoOkknSX78YMVEwWZIgZqb2YQpXFOG79+PeC2YGnvLdZ4rWq68kxtnMnce7Gfhj5rpBCPnfSrCvBSDwa0FC07gWOSfBc/oTC+FcSDdpjMtOQ0ZIxtJ1VxiRosAR68gQpWTbbbYkWbfXNUqwIniOUKZK/0ljtqqloQtP8j9xmqyanh24qK+8oOhHQsmmEP3NXLA8/Ksv56h5lhhouUIx2bZE4OMMPFFCWF3Kd084q6a2mbb6vk9nrh7fm+KCad0g3rtk+2bsOOZTIaoFKfh/vx8GhKtb5rwWkTyNRHnF8owXHV7Eu5R9q8S/+aL2XTQ6DrKq/IsZ2wvE0YYRLiKUrUhCCpR9IWQtHrckBfuE8jSrVOiJbGiKKX/Rv/BWXBTPcCCvq8MWM1HyGyMxsLAWVeNrtRHz8hmTHDO65w0ozhBLX7uqgIRGJcd2x1v/Jinv1Z0L5ffnAluh+25Tbx7BEE0wE3edsOGF1mucRAfE+tHzgHH0nDZ2NZIFf59v4o5D5XTtsZCtJ7YWLUyD3YHpAibiXV5RtsENB0gYdQHZrlKGn9foMJgtp9MuMJ0pTjJUlj1AloySH/iJZIQeugAwCvHLsprPNMta8hjhcBldCwLK0qTglYiDwJtdQOUVU4Z3F2dcXPVCyUw8Ud+eCLcRrVpc683vx+jIvr8A6xk3hDOsJ8M5IOt5E+u1kG2zHhiN9hYT7WnRng/Hoz3HU0AqZy7MWDWVmxdeTKoiaGT8eS2xgicfMZNA6+AhDLENhRApE1vNX8rd7w/iQp7GcdVNr5Z09UhLUcwYnz7K0Oui7AR6KLsGUAbm6VFd3E70eGLL732mOx49AqDyo/EQQNsr1Iqzog19PWDi6GASXAAluIxKjMrVsaMBIZIm+ibMoG/CYflAKdq3GqwrR6S84M/4Ek93A2leqJ94qxPTjLSzYEKUVwRXEzIpj4KJf+8MLgCwyL9w4t/DPPiqLY/nnjo6oNzJv2+DYo3QqcqXC8njkU3pTPx3LS9DMDAv+wovJ4TEE/neHuJYJd/QAvk+CsFaSu1AOWQdLrUDpjTCbQGmzTSCp7WWvn8Hx3NVB2fEBJxqKVOEqQdaYIQ3r/VkAkcyYWDWk2lG2uLSnNAiLZLJmbmdaG06M15gnGjf5EaH1xNJjpfJ00qvXsezIweSkpezDCMcRX1+0Uvoe/78du4dJvT0LIWeHuhP6psi7NCfNOuxNMsPLGmW3I9vWLO0UpL3aVbLNXTHdg3tqt0jOJO+K2kQNKxBWq7I7Rr07pVk+AM34UDcJK2uwDGsWVoZu0mz7GmWku8bSLN8ObQynMr31FT+T0XXLGQTjyif04vRs4mefGTZYjYRqhvhfYeSnOlQ0t0oh/3xi41UBtQ7euZOKN+Lsrz7ZhPlQCuCfR+7b9DvXbTN+3kSN8uca27zDarr5RjL5QIGAPW4UdiJfRw8xHIpEWlob7UMbBwuehALa4jL9qcLw1lYM6LJwq5cxGxamArJrRY2vB1d2iU/C4OarRrexgJpFdPNpP5vcNokuo4PqhnEiLWa3357XK61pAcyfANybUM6E9det8Ntk2vNfb/8FrjWkj8D5d0kcCPXyo6RLtdenQiU+/HMJgIDc19EvQE1sxWYwjOoXa9msm8w4JKufp0/ne6+M4toMb2kng6eTrTdnyC0B2CjOmc/esPTR28GPzq2B6zjanly0y5uT+jlOGCQ9Vn5HaKhDgjIn/vMzXpvYn5+kX1cxdx79OcR9nF59fQrZTWqp1+CAx//Aw==</diagram></mxfile>


--------------------------------------------------------------------------------
/assets/seq2seq1.xml:
--------------------------------------------------------------------------------
1 | <mxfile userAgent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36" version="9.1.8" editor="www.draw.io" type="device"><diagram id="d7628e3d-80be-0910-dbf4-95fda96d980f" name="Page-1">7V1Lc6M4EP41vqYMAoGPs9l5HGa3tiqHnTkSkDE7GLlAjpP59SuMwCDJMTFC4EQ+zEAjXvr66251S2QB7rfPX/Ngt/kLRyhd2MvoeQH+XNg2tB36byl4qQTAsitBnCdRJbJOgofkN2LCJZPukwgVnYYE45Qku64wxFmGQtKRBXmOD91ma5x277oLYiQIHsIgFaX/JhHZVFLfhif5N5TEm/rOFlxVRx6D8Fec433G7rewwfr4qw5vg/pa7EWLTRDhQ0sEPi/AfY4xqba2z/coLbu27rbqvC9njjbPnaOM9DkBsDMK8lK/O4poV7DdDGf0vz+OL4TKU5Z0L8TbJGTbG7JN6aZFN9FzQn60tn+WTe7cci8j+csPdsZx53Ss6YDy2H+IkBemDcGeYCrCOdngGGdB+h3jHbv8GmfkS7BN0lK97vE+T1BOX+JvdGAH2TUsSPfFLqnfmZ4Yspdm6kmCPEasFahEZXe0TmPd+BXhLaIvQhvkKA1I8tTVnIApYNy0O4FANxgOckzYszwF6R7VWsRh1AXksEkIetgFx5c5UEJ2gel0cZwGRSEBsiA5/oXucYrz4y2Abz8CWPbfOknTljxykR85bwUhSJM4ozshBYE2ewWVJ5QT9Pxqj9dHISMRMzK2w/YPJ8rWok2LrbVsCEZAxORD8gaIvKmb6ScOMMTpRZyGARMQx7Iug3Qtk2bCCVfCiako4Qq9He8JyoQup5pDuM7s6DWDoa3STCQoaKmHCQ2lPrED2ySKyttI2dbFtRfh1AB2vZsBoB9bbBVsWb57tsAZRV5Q6O0tpo9l6DLAuWily0p5VMYisToumw9vfNUkYaf+gxN61wZOd8nFCksOp4q47CwOquYxeqHnAOXo3WRM7YsWcTWVRfQv+x8NITUMffS4loTUAfLX4RxCasefLqRuckkfwOopJ8IZq8fDOaLVc4zVayHbiQMnM3srY/Z6mT13whSc5QggFQsbpmVgXuwf6WZcboJaRq/XEgt4mpD+LMqQC+ktT19I70BjHttZ1bZ9dNyp7GOjQybV+jp1gD1hXGhfBunGk0fzqj9ICxAw2Ja6fnRBCBeNwHihgVTSmVgC6odYt+mFJNlaeyq6OR8o21d3/NgDX0tfuk+SbTcxhMzwTZla0kGxk907mcGfM4szJDlZ9ZTsjYosK9uOMwoTZyijm9YwQ30u8CbpBiRhvfJ6V29QLkX1hm0KCyda6aY+t3SbdJPMWJos9Q4kU5Ywjgynrs/Ka+WUr4FTc+GNpmok5Kss4w3K6lcyla831kR0jsoc9Ys2btJvObbot6arGddP0509mCVZbHzX9bzS6bscTzmvbjLL60jiwcmGX44YD/6ewiU9+q7jLkWXtPZDFM7CJTl87lajS3I/8DrCcZOC8pDQWd6B1s/x3S707uoOtn6e071BxXYVAaMrrlDYSGbhLM0snKvp3WCqzxO6YiwjQ9UyqA412lz6S+fUKtdVbrNnNVQfvVZ65SThyxfiPbJCcy0WXWXEtg2xhxKbXzaoldnqBzIfitn2lQm3yxcakdliUVjGbDMdejCz+RWOWpn9zuc8jc1swKdtr2W2cKHxmA37DbEcw+yhzOanmOpkNhSnGciWs5gh1/A82YRjLthv0ZKJvwejzBfBdaLsvfORdef7HcoXIckdLv8hAt7fqquBQ7HiIMCpoeCwgh4IJBOTkRW5yHsrljoWvfv9GKai3uDpmLo1E4bVhkv3ovfxGOaJEa1hWB8nppNhIiTzXSz42mSVszC063aepvGnx3kxiw83+o4/+TlhTbpR/fjTe+fZxI6t1aQHPHwj2tp5LNWev63lZx7ptLXqvnx1C7ZWUzwDPc5EgittLR8Y9bW1n/I8eGk125UNiv4BmMV066RF1RWvNgXqlhfdgJrpGpjCM6i9Xc342GBEly4Wi8wii4FZRI3pJXGSvplpPDxBqA/AWnXMh0K0fK5qNGDp7ulPdVRm+PTnUMDn/wE=</diagram></mxfile>


--------------------------------------------------------------------------------
/assets/seq2seq8.xml:
--------------------------------------------------------------------------------
1 | <mxfile modified="2018-12-11T16:22:17.236Z" host="www.draw.io" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0" version="9.6.0" etag="v1S240oHrk1YLLYpY3lh" type="device"><diagram id="d7628e3d-80be-0910-dbf4-95fda96d980f" name="Page-1">7Vxdc5s4FP01fmwGJMDwuHXSdmd2dzqbh22fOgQUoMUoBTmO8+tXgMAWCAdsC5GEPCRw+RI69557dCWygKv10+fUfQj/xj6KF0DznxbwegGAbpgO/ZNbdqUFmnZpCNLIZyftDbfRM2JGjVk3kY8y7kSCcUyiB97o4SRBHuFsbpriLX/aPY75pz64AWoZbj03blv/i3wSllYbWHv7FxQFYfVk3WIvfOd6v4IUbxL2vAWA98VPeXjtVvdiT8hC18fb0lS8O7xZwFWKMSm31k8rFOd9W3Vb2UGfOo7W7U5RQvpckEUW/PPn129Jtvv39+/rL+btzv4AbPaeGdlVPYJ82kFsF6ckxAFO3Phmb/1YvDXK76vRvZCsY7qp0030FJFvufnKZHvf2Uk/ESE7hr67IZia9vf+C+MHdgffzcLizvlO2bS8PXxP4k3qoaNvtWz3jl73OfVlhNeIpDt6Sopil0SP/BNc5lRBfV596Vcc0TsCrQoAi/kxc3/d1PhbEDcNEGFX7eGhGwfN2JsK0AYACHUoBcBD+EowE9rQb9Vp+c7+GAdaX6Q7wG1BNgTvsrOPdZcGznQDDsChaNUNfXTjDap4owEfD842jAi6fXCLDthSDuYBq1klPzeI3Sxj2x5eRx7bzkiKf6EVjnFaPAL61p1lWvTIfRTHB3bKXsDzcjtOyCd3HcW5W69o/0copa38B23ZQQawnt/EjaMgoTseBY6eVkH7iFKCno6D2waiugA24mrJ9rd7kq5CLzzgZ0uTBp3zFpnS0cZiSqCcKY1xmFI5G1aYvsiGjI6UsWHV0JkNX2BDMDk2dPSXoRsaXYcgdYHaN54GQdaHSvsG0yhUqvPuAPUGziVHSKNS4MAZfWWJFCpH/02OGZ3RwhcqV0LmOEpo0mNGp++YkYW2OpU0jxn7qSQ4PZVkz3lSGc0aivMk1PSzwZ5enmQVtDEAVJ4mzw/WV1EwqCHtUT5VWjCoGzqnwpdS4cQyIdTOn6boSHy1yDzQnN9fV1IckVN11UlRlzN4nNLYo68IkqB4nCtbc0wAy98WBzXQtCvNWFq6U/0eWQ6JxjG0DcbzwrxZmKsY3ZNiRn+x/LhYXpfHWs5COZE0OJyj5wQnqMHMzNTi2ZxhI8+N/2AH1pHvF+4mShq8C/bKG4OI5AKk/8FssL7TZn1DwPpAHuuLxj9WnAPoR490MyDFq+dIhz+Mph8wDygvoC3grpn9oq9fGA23AMrdwhniFnB2CyluoU/OLyot0soQ4Q/Q4QQz3L3nEKcHt2gOsYRbm+E+dwSoEG54vPDFIOg3umMCv9oWCHytLfDlj/s6xf9htUVQVoHCYYSyuQVgH0Vq9BKlsHf69bP0WXaJ9cdplLVscActQVnLN5HtGxcqawmgbPFct2SxeUiAgNRk1bXg8WrwuyE12Ca16rTJsNryLbAalMJqzdUjElmtxyKhmdXy8FHIakINfiGamwhhmQLCUsVXZqu3gw1BSavL3/bA5jwN4PDRUq/6GGFgIyxQvK1osdrRomzMYrV6e41ps+ZwGZJcFIaLcXyu7zKSeSJx03+yV7cvHU8ds4HW0dlA8/hsoOTFNXVwv26NPtLSNZmlhx4CcFbpOXEqVOl9CkSvXHdMq64gHLta7jr39WLaEeGsNrzHmcfzQkmhJqke9d7LeCKhr2xY3CERJvbF9YkSwZIiEZoL+WRKhPY4bJYIIl5TqRCOx8optNYOFLVLbftFmy2gNUuZjBCufzuQEdksI04PN4UqQpywXtAWJyasYfJCbjKrkkw7vMYonouqr53LBtkSojQHXrCseF49eGLwVcnNGC/aTNHwtxN4vQX8DPll+FbnFeeIK8pMUSLllge34Z6BPXWOYExkLRGpc98DzMhertIygUW/XV8BzYt+xdgO+tIDNOBW/gWQPuALoOc5cUtyi2WjMDGidutwi0FfAIHZMeQ4RnMtjcT0QHf3/+u4LELu/6E0vPkf</diagram></mxfile>


--------------------------------------------------------------------------------
/assets/seq2seq10.xml:
--------------------------------------------------------------------------------
1 | <mxfile modified="2018-12-12T15:12:32.426Z" host="www.draw.io" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0" version="9.6.1" etag="bEdqW5090JaddLoKv5o3" type="device"><diagram id="d7628e3d-80be-0910-dbf4-95fda96d980f" name="Page-1">7VxNk6M2EP01PmYKEGA4bjzj2UOSStVUJTunlAwyZheQC/DYs78+wggMksCYT88OvhgagUCv3+tuJHsBVv7pOYT73Z/YRt5CkezTAjwuFEWWJYN8JZb31KIDKTU4oWvTRhfDi/sTUWPW7ODaKCo1jDH2YndfNlo4CJAVl2wwDPGx3GyLvXKve+ggzvBiQY+3/uva8S61Gop+sX9FrrPLepZ1Mz2ygdYPJ8SHgPa3UMD2/EkP+zC7Fn3QaAdtfCyYwNMCrEKM43TLP62Ql4xtNmzpeeuKo/l9hyiIm5wA6BlR/J49O7LJUNDdAAfk6/fzA6HkFInsWdh3Lbq9i32PbMpkE53c+Fth+zVp8qAle0Ecvn+jZ5x3LsfyAUiOfUdx/E69AR5iTEw4jHfYwQH0/sB4Ty+/xUG8hr7rJe61wofQRSF5iL/QkR6k15B1ss8PSfbM5ESLPrRCnQyGDqKtQGpKhqNwGh3GZ4R9RB6ENAiRB2P3rew5kDqgk7e7gEA2KA5iTOi9vEHvgDIvYjAqA3LcuTF62cPzwxwJI8vAlIbY8WAUCYCM4hD/QCvs4fDcBTCUDdCT8du6nlew2xoybPVWEKDnOgHZsQgIpBm5jxDaLtkpXHm9XiurVeHYoxsScrs4OHtimFCnGs03FMboVItUdlSn5KPipKh0/3ihembaFVie2bpgC3gsPyXfAM+3rNn4hAMz4QYlXM6cCQgny9fBbcvAO+GSJuDSVFTSuNF2DjEKuCEnnhMzg1niA4WhSAVq4hw78UOXpG5f6AHfte2kGyFLy7g2Imo/gLUPTwA0Y4vSB1ukX54t+h1lejo32j4mtzXTpUNwGZMuqj5nc8OkbvTUv7FLus2xViXpARQ+Wjmt0MwHvfBZquXrp5Snl2RAzu+xWYXWIKmYM8YOpAbKdBljk/r7g8fA+yq/hPWXDv2EI14S9hCOcoMT5w85x8c2VBozPmZdffr4KEg6JyvRHl++n17/W0f46JtvX/95fdYs57dlfSZzGaeni5UBTgAWBSiD6wpYA+LDjHIrwMTjpo0lknytMKccfaYcxoQZh9m7TNYx76Kar3eWlhgCmdQnS0uMK2lJNKclvdFtzKxE+8RznsNSq6pqLxXtqnFb2Z4qRB9lu8a/3dwRBqdkPmzI5pnGUmYj1yuYZ3Y3YHeO6Yh05ut1EaryjGpXzWbnzeURQdZqNbtJfXJzxjS8TNcFiUq8pn3JKoNWL1m5XuSqfPuK7H8JQ/heaLZPGkTVT8P2I6sS43fpFVvHFL4uE6mPMqtPV/WR2VUEY8rPcpafX0p+uFg2kPyw/fQuP3yRKpIfMMtPV/lR2CmpMeWn/gXRLD8fTX5A1Yq4nuWH7adv+dGbVdTqLD9d5QdI08mPeOaje0rU6EXaHc0KiccBDKNDxgBLXW6VD5XNkvR+5aNmQEua8pNztdumuthZrI2hqZrEKEnyUx3DQpbVzpk4KgvcqZLdKsvuEWefhCAY4AOz2zqEb+fbSM60YbTLd1i8t0g/4815iL00NxIzbaZw82Z9yohgFbe4oTGI3misBxqMa/UlKVptRnK1PZCvtDdr2w8jWWabWFjy0hbUqfXMktcPQrLbqTQFWwy1I1u65Uj8zyGOPYeyfHSr8Jg6lMnKxKHMrJ/u7YOcfPV891nrSFFEahZFessdRWs2euWbbhlos+X5ZkNkbCfh24QLl8SSW7/oc4ClhPyCprunnzlWttcJSlnil4UKsVnWgLOcIDnhaCudP6ywT4Fo54ykVdqtsZW8MkYlbwrUeF5BN5Dwj7mETuzZfKztXfjlGuH/CHmXIViHN+3C8trbLDAX3ik3p65vRlzpVhGj618+0CEvxb1eS5jqWDsZyXoJsVeLGzaDHri2MVRBNO21tjH1JYCCdztItjW0vIMQZ0z9KqFNhGv4KuF+q5mRUtqrfMv/GKP/leFitPnJ7ejDLyyemsLycvJ4yb8jcjC2Z7zEeA0GF9m9/M9hytnLn0mCp/8B</diagram></mxfile>


--------------------------------------------------------------------------------
/assets/seq2seq7.xml:
--------------------------------------------------------------------------------
1 | <mxfile userAgent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36" version="9.2.3" editor="www.draw.io" type="device"><diagram id="d7628e3d-80be-0910-dbf4-95fda96d980f" name="Page-1">7V3Lkps4FP0ab7sQ4uVlpifJLGampiqLSZY0yJgJRl0Y9yNfP8IIDJJsyyCE7VYvEhBvnXuO7r16eAEfN29fi/B5/ReOUbawrfhtAX9f2LYfeOTfquC9LoC+WxckRRrXReBQ8C39hWihRUt3aYy2vRNLjLMyfe4XRjjPUVT2ysKiwK/901Y46z/1OUwQV/AtCjO+9N80Ltd1aWB7h/I/UJqsmycDb1kfeQqjn0mBdzl93sKGq/1ffXgTNveiH7pdhzF+7RTBzwv4WGBc1lubt0eUVVXbVFt93ZcjR9v3LlBeSl1A33tbvjffjmJSFXQ3xzn577f9B6HqEovsRXiTRnR7XW4ysgnIJnpLy+9V8YNL937Qk9qPrHb+Q2X5ThEPdyUmRbgo1zjBeZj9ifEzvd0K5+WXcJNmlQk94l2RooK86N/olR6k9wAe2ec/m9bEllwY0Q8L6qLq+zrn0Hr5ivAGlcU7OaFAWVimL31TCKlFJe157aX/4JQ81bao8bsWxZbavt3sN7cowyJBJb3qgA3Z6LzGoWiPmBg9BypHD3Sxq5HMyft8p1fsdw7HrgvZumLroqVqsHswncCEvstLmO1QowEMRn1AXtdpib49h/uPeSVy2gemV8VJFm63AiC3ZYF/okec4WL/COhFAXpaVfWZZlmnPA5RsIouBSHM0iQnOxEBgZx2ApUXVJTo7WSN06NOwNDEofuvB8FtitYdrXWs8Ri16vsBVE85EY6oHgvnhKrnGNXrINtVPTib7C2N7EnJnuvNJ3vA4UDaLmwvK6uq3D2RzaTahE0ZuV+nmMOTfHHJgNbDgxKxCwUt4iq2qr+UuN+f6IFNGsfVY4RW0rcjKUNRQz55lD2njzLw5VC2FaAM1csj37gd5PGglj9E1J1PHiHk9VF5CCDtFUrFWdGueNljAmQw8U+A4p9GJQ636/ZBE0LEVPQgzDxXhcPyqSjC984Jz5Ujsj3hz7iMTvcDabJR33GoE9N8aa/B9MJNJXC1IONtW2D0d2RwAaFG/fWM/u7rweW5PJ97CmRAGam/tyGxSuSU18sl4/GwVDoS/12qyx6cWJddTpcTjGMjvsNDHK3iG2gQ32sRWE2pHY8NWadL7UCTRhgWYOpMIzhSben9OziOzTs4MybgeKaYCFMONF+Jbl7qyfiAoTBU68k0X9rR0g0u8jRPjDMzXGh1OjOOr1xob7KjwxFEkvNl8qTSq5fp7MyBJOPlrIIIRZHIL3oKXMe1hmvvNKGnoyn0dKA4qa9KsAPXWNZ1WZbra7Is9jmuYsuSSkmOs6yOa2jP7RrqNbtrcCZdm7EgT7EFSbkiwy3o7o1k+gE3wUTaxLSuECi2LKmMnbEsfZbF5fsmsiyXDa0Up/IdPpX/i7M1DdnEFuVjdjF7NtFhhyxrzCZ6fEe4aFASMIOSRqMciOMXHakMT27omW1QHosy2/umE2VfKoK9j943zxU22ur9PEabWc1V1/nm8e3lHM3l0vNhKHCjEIhd5F9Fc8kIaaCvtfR1DC66EoY1wqV76sJ0DGu+yDDswkZMJ8N4SIYybHoeneolPwoDn62anmM+04rJZlLPBqdNoqu9Ua0gStiqvvvterVWkx2w8E2otY3oGK29rIdbp9aqm798C1qryZ/x2N4kOFBrWcdIVmsvTgSyz3HUJgJ9dTOibsDMdAWm3hHULjcz1jeYsEnnZ+eb0d0js4ga00v86GAzom18glAfgI3pHJ30hsykN4WTjvUBu1SflrrOkYpcF21vQQafb42XyyPIjmiPpVHxBXTTHwIF9hP0BCEQCYCC2FEUAnE0kary5oIZF2VYqp+xdIvUESxmAix3Ru5cx0yk6+eOPeNMJGCJHIqbCfUkeAEsQYTX6PocvGhfqBtC7UqUfzBPTWl7o3NSCbDsu+eMYBWROf2w9oX6gWtiSDOmodFLmo+y5twZZgkGvAMA56TWdQxauX4/rRldO4+f5t19myOI/WcNYNo3Msm2KVojlk56WyOTM9g3PJbAz5s1NpJa5e9GRozI1b3KehZ3DwJ966m3X2W8iTPyN+Pi3QB8lPVnzjDQFngcU9BSHhhRZsGsaanI45hxSUtgSw3UMpM0hTyFkzSL3M+MTDX/l81VWmoHZ7Xpgw8xTXOUAsw5TROAD5sBm7yBPUI86wF2/pxmeZgGfHf54HX+mkGO6gfNAcDn2NaCyZuWmbw5mOEtqBobdcAn5USwmpnXql03nVNyAbinWSznnS/16YeBP2x0/kZss6xSsflxziJq24baY6nNduXr5fZd5xun57Y9cGXr8zeakNuN5ZzhtvkVp9HcZkccaOW2rSPLecfchuy4xKHc5m40Jbf5DKqI247h9lhus/23E3Kb7B5+0Lm2isOPZsPP/wM=</diagram></mxfile>


--------------------------------------------------------------------------------
/assets/seq2seq4.xml:
--------------------------------------------------------------------------------
1 | <mxfile modified="2018-12-10T16:27:25.304Z" host="www.draw.io" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0" version="9.5.9" etag="BBcjZVduOTF8nYcN0Qsv" type="device"><diagram id="d7628e3d-80be-0910-dbf4-95fda96d980f" name="Page-1">7V3Pc5s4FP5rPLN7SAYQYDi22bZ72O52podtTx0CCmaLLQfjxOlfv8JINgjhKLYQsi0fEhDih/S9772np4eYgLv55lMRLWefUQLziWMlmwn4Y+I4dghc/K8qealLfGdaF6RFlpBK+4Kv2S9ICi1Sus4SuGpVLBHKy2zZLozRYgHjslUWFQV6bld7QHn7rssohZ2Cr3GUd0v/zZJyVpcGjr8v/xNm6Yze2fbD+sh9FP9MC7RekPtNHPCw/dWH5xG9FrnDahYl6Lku2rYdfJiAuwKhst6ab+5gXvUt7ba6gz72HN09dwEXpdAJ5LlX5QttO0xwV5DdBVrgf++3DYLVKRbei9E8i8n2rJzneNPGm3CTld+q4luP7H0nlXaNrHb+g2X5QhCP1iXCRagoZyhFiyj/C6EludwDWpQfo3mWVyJ0h9ZFBgv8oH/DZ3KQXMP28X632bR/8YkxaVhQF1Xta9Qh/fIJojksixdcoYB5VGZPbVGIiESlu3q7U7+gDN/Vsaj0u0SIiew7ltW+RBkVKSzJWXts8EbjMfZFW8T46LlAOnp2E7sayQV+nm/kjO3O/pheyNYdWxeFssFuwXQAE/IsT1G+hlQHMBi1AXmeZSX8uoy2jXnG+rQNTKuL0zxarThArsoC/YR3KEfF9hbAjwN4/1D1Z5bnjfIkgsFD/FYQojxLF3gnxiDgagdQeYJFCTcHe5wedRiaUNo87xUuLZo1dK1rnY4RvfU1aD3pROBrPYeFc0Ct5xqt10C2qfXAaGovNGpPSO3teDGC2gPyidNVe3vi7Hn0nQfqeMQBYBxPEHjhbegE3tR267/uUCqStrBFRz+aVyTLywoJtNoVpOWu4xqSgeW5ZCjZYhuRlibRSFGHNhU7Mjy4ekcOzLMkqW7D1QFt8RNSA3Ik5HjXxbVCLOsiLHZksNg3LK7ZNI5no5DFXofFKUKJoerx5lYxVQMFVNWFjooGGoB1oIYbaADj1AqyLBjPqXXtDiZXaQ5dZxz+qTOHtIUNOs5RscgWqbGIx3NVrUV0p9LZepaxG5d0eTN4I31+QhiUrp85cTCN3V8T7wPZHMPw3Qee61ldw/cQxDDWwvCxUz0K7d7Uux7f0lc01GMDNixO8myZz6OceoaF/hREfpdh0E48ONWBYWysJVDIMBWBFk0YNrXUMIwNpgzHMNoiw7A3hkhUMqwLybEMG55Hh8aAvTA0/buprYZjgLFiNuu495Ds1aGe7TMXqjWIFLbKHxfoq2tVyUEn9jmYrqVKx+jatw2+Vepaebli56BrFfkzwGZUJDhS17KOkaiufVcU0Uuj2rKqsBJ3wOiQZy9F9RWPVgXycgzOQMxUDUxBD2pvFjN2hDukSe9mQprZy9MGv7aYuZARqJ12575MsP3ksZVCAKlr0ZsCBE0KkDxHTiWw8sNSZzmFEnDscSDdHgujoiLV4wymobmojAaKHuNS/RM4WEdj528qGJiGKl7GOAfmOF3mjJfPT5/GUOeNnoBK6gQmm6Jl9ltGxxmNOireqT0HhcZBZTx9pseE+xnoMza/T6UrIGB09I0mCpEi5PnH0ieEhDuc5yD72/BAkj3hTRIYoMlgNk0Gq+vgy7eqqefT2aWIqaTTz3kKfmxid/Z0/+2fx8fctsovN/YA7w/p6TQwdGrzsKdvpl129tQMe2RDAWsPP/vIFi5w7oHPmYRNPBgkriRGdugnBAQ54aZj4mzRFOgBWSk/fnQ5rOS86c6vSTOZNWKlHu8QnQErOzEoDVjp8iZRzswhPYJwLsdJPWx09CGc23Vp03UJFx3cLnt2S66FVPuSUA+wFzA4PIaL4HxdUre7DMUc4WYYMp5iGLUg47WswXQMYwdwQvnZXGxsYbgM38MtNc7taxxmkdLBt/Wv054Kh3j0G0y6vBCPyemSZWg7b39qYGdNWKi3bzxL2DPWb5R6Sesun4Dd8D7SjeO3ae1Md7RW5Sd5AmEk4ydtwWKzaTRwlLxrWWHoGB47ojp4CMafiCsvoGQWyZQWQxx1oaEeyK82bDEOQXtMcqXV/cbPa+t8EN7ajaP0lUL5r6j1dMQlrazTEQQF8NrscoDCHtfr12o4BMrk4ZLWgRlFHtglO0+Rh/7cDWXycElrVYwhD+ynIk4Qh/6cAWXSYCI0/X0jnLjjDyB9J7qKJnHnyGQBB4w/ZqcxH8PKfq4JsBLoxkr67IaVb80a0IKVXbAMKwfkmlDWgMMuATN0NNznfZzGcPj1CUktKHy1ETQBCvuihpU6lxoZ1osOeoljN8KEpB0qn5D0BXJ/jAqecGYxtNDBJgAlN0AtzkCBYDdQHoLyL+kzUzoEqE+Sh/5oiDJ5uOjkIvUB6lPEoX8Urkoa6OLM1+exCwS41Hl97BKujqdeEhyO11ev5FD1aEtG/Mc1ogduVtu+focr2P5ysz/YXiDit9kPq/5gEH4+K653fu+sGFHfqmfJCD0SamQ4ji5g8HZ5juMu60JN/suU+11g2RJgNyXANhKwW/pDBwlwh5YAu6kD7KvWAQC0M5uwZ6aBBPBe2pIqAU5TApyrlgDH01ECeNEfqRIAmhIArloC7KmloQRwXySTKQFuUwLcq5aAUEcVwMtukSgAK+MG7ASASW/eKYTx0Od/G0Am+sYF2H9xRT/4+1d9lAS/sf/77y2pgx/vFqgCZB81wq2afUYJrGr8Dw==</diagram></mxfile>


--------------------------------------------------------------------------------
/assets/convseq2seq1.xml:
--------------------------------------------------------------------------------
1 | <mxfile modified="2019-04-15T15:36:16.247Z" host="www.draw.io" agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36" etag="9Xa_4EXjIbYysUdtaXcB" version="10.6.2" type="device"><diagram id="TZEOtu3ZNOnLiaiyQRdz" name="Page-1">7V1Zk6M4Ev419VgVCCGOx+6q7p2H6YmO7d3YnXmZwEa2mcFQi3Ed/etX4rBBwgYbIYky/VBthLiU+WUqD6Xu4OP27R+p/7z5lgQ4ujON4O0OPt2ZJoCmQ/6jLe9Fi2O6RcM6DYOy07HhR/gTl41G2boPA7xrdMySJMrC52bjMoljvMwabX6aJq/Nbqskaj712V9jruHH0o/41v+EQbYpWt3qs2j7Lzhcb6onA9srzmz9qnP5JbuNHySvtSb45Q4+pkmSFb+2b484ooNXjUtx3dcTZw8vluI463NB4v7x5/tTin99j/6J/5183/3x7es9QuXLZe/VF+OADEB5mKTZJlknsR99ObZ+TpN9HGB6W4McHfv8miTPpBGQxr9wlr2X1PT3WUKaNtk2Ks/itzD7L738AZVHv5c3o7+f3uoH79VBnKXvtYvo4e/V/ejB8bL8qLpulcTZV38bRrThFxy94Cxc+uRE8eH0a08OaNm0S/bpEp8ZxYox/XSNszP9LPdAdwIYnGwxeVNyYYojPwtfmi/il5y7PvQ7Epf8KOl7Aa3L+7740b58Ekf7JmVfN2GGfzz7+be/Enw3qbgKo+gxiZI0vxZiECDskPZdliZ/49oZz3agbx+G/AWnGX47P+j8GFUXuCWcSnliWeXx6xGdVdOmBsyqTfyoIucKCC336Us+zoDD0zLZhsvy91/77XN1o3U+/nlLgSv7SvBdCaSD/KIHBat/x2lIRhGnlEnCeN0GuUcCnhCnZIB+w6/lyfLdABQLQ9gThtBoZzE5MIQcDH++4pDjIQKRrEm4FO/Cn/4i70CH+TkJ4yx/O/T5Dj2RFkrsXTm29DAK1zH5vSQjSSgAP1PkEeEXfSpPZJRlPu8K4v0r55976xICWmJADY0mqJHZD9TmWKBG9lTU4kn1di3MRapFV5ZeLC/9ThFxZKsDf5RsBV2GX0opVlzFsMzhNQaoBndcjbtarczlsk3jBvbCRoI0LjuKGmjca9A5a9wRNK7ZW+VClSrX5HUu4FjophQu6la4pkyFaxscPTRVuDrbodDoCUdkKp0BG+PqRTmWKDtp1UAvXuPMmfXiCHoRmn31oq0UiCYHxC2Od8sNjm9aPVoG6lSPjlT1yE9XNFWPOtujsO9sdbB6bLdHocVoDEOuPQr5abBQvSvHHuVGUb3etWa9q4neRX31rqdU7yIOiDwSb0rhOrrZozw9NFW4Wtujdl+Fq9Q9VL3mtO1RSzc/LeTd3yu83GQ3blwg2G1cIKmyDk5F1mltXHiyZF27cWHZTfhDT7Jx4Y0rROUYF+woKheiFp+1w8P1psSnp9tU8Rrrb54qsnze12NaGNuqporVa057qoh0c6FYPTxTEkA1mNGFa3XE6qOeKSydN0KsPCwQOtr0wJ5MorHOk0yrr39rsJQ8wUVM9q3sjCqLd5xNcJLJjqJ68asm3VHddKQ7IxGNgx/Pe3Dh8d8JT4AkmezMhr8Imdw3y9UZKcsVMDKZFRNjy2ThWa6niM3ZqJdL73XqByG5vHbuC3hCX5zauacwxcssTOijXvEuEyP0TSZdCNqKhb6rZIr90exW1NdudZRGHKvXnEF6FqSQXTGgHKSTUdFag7Rv4o+rdGUYEp6f8yFBalm6gXT2AIsAqdUXpENdbcOyvYBSkQzqtD5QXhK1A3+3OeSsqSA9AOZA2ueXfkpT/73WoYyOnTSxbFbiuEyphY7+wDjfn3UIMf3Jj+KNhdpt1aDP2uastmEd58q1jc1nK+EIb8nnvIY7TOm333KEFBghjvCK3uqq+HCdK+qxYbs6Lt8YCCAcaDrXoMPTDYAWwo0WKHZ4Gywj7B7TZ8UB/TYy3Dn/mnZEh3iRNoho/29Pa8XkA3VfEOYT6QDs57fjSfJrTf/H2wUOApqROyIrDEsWkMoM3R530JZ0Nd6c0Z3gnNFA3iXziPzokCEucSrp9HXJuo7KqaSjj+fUXeJ2DbxwkYWOHnOhLlDTVaxKvekvqxKJmqEeyvbJLbtehyPnyPEJR3hiot4oY32Y6lE2mUCDDJQNdjG2o4xNXJWNsorJbgVlrBNSPcomEymQgrKR0vFcxSgTXsNRb5SxzhflKHPbEtAKaz0IX1rNdWpv35fDSe31w4je0XK0vCmf2/CMNV89g1jUcW+fQNuNojDGPnUrRP47zdMvbkuGorhz82mkOf+oZuskvhNvF+R0EG7J33vS6G8p45fPMDZhUJ29aABmpxroxiNoW30BRwOko41EXGH7hBfb8RaGIInI1r892NQ1Ctiu1IkHH0f4LX/fvCj3y8Md5YcKn4soWf69q+N1xhTjqD5gTJmj2ptz9QWkEHvjuFNYEQyuXRFC+OcBOKYFoONBYDC5yESuPBi269kI5Z2s5kNGzkz2JlN/V+ck9jKMPIZHT1/OuaYa+8w5HOeMYz9btsacM8WInIacY47COcjVmHO0iTAECLuB1WaDuOYC2qIqgjNxvIPFp6wCm6GN91kOBdgYjwYU0MYzKYcCrP9fAwqYt0UB1jesAQXa8vcn5DS9Fedw7v49+3GDH3FwOceL3XM+Lu2e6JqfevZEX+Y160a/XE80MGa3mRADQpLfrGfx5Qv9ZoTnFFoiwJgdZyJ4EKhwnKlmndlzJoR1VHjOVLPO7DoTwjoqXGeqWUcf35mP3VVr/N5eunixEmO1sr4z1aWfDpHlWyEA6zpTTwB9PGdSCMB6ztQTQB/HmRQCcDUllRNAaf0EfVbUd+71Ut8Ypv/EQ1x5FEGT20uX3yO2dFbH8nu2f9fye3bthJTl9wAoNdVlcH2DuU9C4OxGQo0thwSyfO+ifYLcUZeyvMlWHehgebZ/F8uzmzpLYnmlHlJ9WF6k7DZ7y24hPq1LGRmeqBh8ijHZ/l2MzK4ulcTIfNY5DbgEe59eu0ziuKxEwrL7DQZqGEmD+Nlma+WF0bKbAWhbdn9V+NIstFJrBPNk+HLR+oS+tyhbaGY9kTO1Hkayz573WS1guFAXRByvMIgAhvTYSBBsYck2C2hElmzzgk2OJbeLMJ6Z8kqmNLrFpC2VJ6s7z079IfMz0N/9D4dO0AbSW7gfUsXWNrZuG3iBahinvrdNL2YfZ287m90wQ/K2I8DkM/se9FVVEnZhs3XbsBeYkynCq/NmFsDsu8PQcHXVjnXnhLNXHtY/xB5D7DBqoAdvbZOhHmBz+oLNHLof0UDa8f4uXtzekv5zgHb6b87BEgHJyhfUQ/+ZaiHpqaS3PkFzRcS3RUx+Lg2mOBdGBdn+XcEUdtIgJ5hSjbr6/ButC9GzRrDyQvQAzuVzhSqEfqCUXXOw/C4t8Cml6CALNOVFBwGcTG1PrfPr4TibIhOAaluaovxmHcArZ1U4C171q8LhZJaF6g1eITPuC8GreHEM5Lf6UQVeKZnpXJxHOXbnNZFCZsa9s20tpRuyHF502jFwh4WRehz1WOX3UWLg1jhxMZcN6MiOi1ltbpqynEmeyYWTHVvf5JYjBE5zKtEWIZCb92pN312jQ4TcMqUpsxOSgN2kXLokEO4QUhEhZ4dRvZa01Lh5dJ45Wr13BraGel8H0o73s/B2/y3pPxc24aU+Qm7NjhghkOydtGINXTo5kN4ffoGt/Ah5f+IjIanAl0bI3QuXG7L9uyLk7KRBToTc0miTG40j5KyJrD5Cbk0mKUuO6hintCMLSukRckubIlZyIuQs0NRHyCtPiv5A0zrIhsYpXuhqvNVQ+c06gFdOhJwFr/oIOeIHfAbvFeAdZ/HdefAqjpCjtpL7asArJULORYGUY/eaGe5yn74czNMmeeolzjrroUn0djYKUZ0tWdXb0QYFz67RSIXTDevBMyzHBsVft8mAsoMPqLVURi0MuZvDkPVtEtxuN6zcMKR9zVR9lhhjSAx7nOm+aWklMWx+gs+z4C3JBBN2pyYICs2QwzShxYCOFCVTrs23JMC0x/8B</diagram></mxfile>


--------------------------------------------------------------------------------
/assets/convseq2seq2.xml:
--------------------------------------------------------------------------------
1 | <mxfile modified="2019-04-15T15:31:14.100Z" host="www.draw.io" agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36" etag="LeimyCaan_w7baEtX3sB" version="10.6.2" type="device"><diagram id="8t-f0pdZJ5VZ50Wk4v7P" name="Page-1">7V1bl5s2EP41+7h7kLj6cTdpkrZpT85Je5o89WAb27QYXIz3kl9fYYNtJIxlY5gBtC9rhIwF880nzUXDnf5u+foxdleL36KpF9xRbfp6p7+/o1TX9BH7l7a87VqoYRm7lnnsT3dt5NDw1f/hZY1a1rrxp9660DGJoiDxV8XGSRSG3iQptLlxHL0Uu82ioPirK3fuCQ1fJ24gtv7lT5PFrtWh9qH9k+fPF/kvEyu746Wbd87uZL1wp9HLUZP+053+Lo6iZPdp+frOC9Knlz+X3fc+nDi7H1jshYnMFx614Ocn/fdv/4W/fHr9/PYrGf8I7qmzu8yzG2yyO85Gm7zlj+DZixOfPZHP7tgLvkRrP/GjkJ0aR0kSLe/0p7zDY+DP0xNJtGKti2QZsAPCPrJbX6UXW77OU5g8jN21P3lwJ5vE+zuJfTecp7/1tIWN9mCyj3G0Cafe9PD93aPT2MEkWvqT7PM6iaN/vXdREMXbserWxPHGM3Zm5gfBUfvU9ZwZ+9ZTtHInfpKi0UwvkN09uwHv9eRzJXtpMZx70dJL4jfWJfuCaWUCziBumtklXg54oVrWZ3GEFTNrczOIzveXPkiRfcgEeYlQbSXUmkI1HHRCtZRQ6wpVQydUUwm1plB1ik6ouhJqTaEyVcUmVKqEWlOoBN1CydQFGXpTtvzPDqM4WUTzKHSDnw6tR088fS6HPp+jVJpbMfzjJclbZsu4myQqCtl79ZNvR5+/74WZHr1/za68PXjLD0J2v98OHdPD7/k10oPD17ZH+fem7nqxR8fuZtM7rJYgeyDRJp54FU/uhKRjL3AT/7l4/TKxZV/9Evnslw9qb44eHP3wZxbwwqbvB+vozzaK10/ceO4l2SUPyHiMY/ftqNsq7bA+PQidWyVY2X18kOxvnOlPrcr+7MNuxAdY7x/w9Ugn59mriOuXhZ94XxkHpGdfGBkVMTyLwuSDu/SD9CY+ecGzlxIbO+Fm1DZhkPJikV9mzsSbTMoYaeyYRlPzh6WJVGOUMI3RGNMYimmuYxraCNPoOgamcS5kGucyptENAKaRWCf1iWl48wOeaUzFNNcxjd4I0xgEAdMY1mVMw/c/xzQGBWAaCTO7T0zDe6/gmcZSTHMd0xjNMM0IAdOY/Mr7DHPw/c8xjVltbTXDNMbAmIZfl4Izja2Y5jqmMRthGtNGwDQWv/I+wxx8/7NMU21tNcM0EvGgPjENHzuHZxpHMc11TGM1wjQWBo+wfaFHmO9/jmksCI+wRDpBn5jGwuYRFhOvvjy+xyuCeexOffb1/FwYhSn1FWKMx8HHAs3cPpZoY3O7jbolzjZlxcd9wWVlEEEwLU7yh4n9+/G8ftNJ/obzOsnmhd1cWtGRjmquAGrJNB8m/Hy2z6A4nXNxA6USMmQotFJJCKAFLbsl8nVJ5OtauahaQj4aP3g7yBcS/sCRLyGAjiHfkEV+3TygeshH45dtB/lC/jI48iUE0DHkm7LIp6DIR+MnbAn5/HYMcORLCKBjyLdkkV83T6Ie8tH4rdpBvrC7DBz5EgLoGPJtWeTXjdvXQ77Err4+IZ/32MIjX0IAiJB/81gMn7hOHfJgFi+y0zUh4HL+Uga7lHb448Z2Io5zs+T4kUR6PCLJymRNjyQ5jdig03k+TgSkZnrO1CgjNYeOdctqxm1nGNCkJlEAABH0b05qfI58DVITLgVMarRTkpVJ0NakSQ10pZaPcyikxntk4UlNYlZBBP2bkxqfjl+D1IRLAZNa73ztOpEmtbrJnPVIDc0W0nZIjXe2g5OaKTGrIIL+zUmNz/yvQWrCpYBJrXdhFJ1Kk1rdvNF6pCaxRu4VqfE57eCk1i3Py+1JzbkdqfGXAia13kXIdNl8IGLboKSGJyGoFVLjQ2TwpNYtz8vNSY3fz1CD1IRLAZNa74KfumyqF7EdUFLDk+vVCqnx0U9wUiNUYqmmKuBdNFORfKYAq4BHciEqqd7MqMIgVVWC9Nb+PwxSlZgDlVQrpcqHqjBIVdWAritVPqsCg1RVufa6UuX3jWKQqipEc6WdSWhd8/FEJmFlgQjClYegnBuioZLBNFN22ZLBZ7oL2ZKF7s2UhyAyb4ZBs529gTmEaiLbtGxwjxTZXLtzt+6O3BN6W1k1uC2y4eM858hG6F/dndeDNsgGT8YcTNVgeLLJ03sU2VxONrQRsqkuHNwS2fCFgM+xh9i/ujuvB62QDR0W2fCeLARko967cjXZQNQObols+FrA59hD7F/dvdLqaops0MQtYWoHIyAb9UKEq8mmmaKe1eWDWyIbvhzwOfYQ+1eTTaXV1RTZoCmOAFM+GAHZKAfx1WTTjIO4uoJwS2TDVwQ+xx5i/2pugnAQ6wNzEPMZWQjIpmNFZ5HXEIYXqCG6QVELFLKKMAJpgbpMu1ZGOK8Pez7R2IQtsyRTyLZPdZaE5BkKrli92+RN8kqZEugH3RBJZEp69gn9QkIgPPp7t3GOGLK1JYkJusmEyBQ37BP6hSRnePR3q8ieFPodWfRbsIXkZWqB9Qr9/MYNePR3q3CRVEFt6aJdFgVFv0x9lT6hX9iMBo7+jhWDkEK/tNVrwVq9MnvW+4R+3puLAP39s3pre3LKwx1CPvyDdlwSYB+Ea6sqgInHZoYpYguuOv0zmWu7gcpVR8juhlYdPAY3TKlUcNXpn71d24dUrjpCrjK06uCx1mEKcoKrTv+M9doOqBPJcHw6C7DqWHhMfZiyj9CqY/XP0q/tvTqhOnweKbTq4PETwBQXBFed/rkJaru+JLMioVVnYG4CoYQduOqgeWsmTEmDe5KbMHAiQGPvw2z0xiACNO8RhNn+ikEEaEx3mE2BGEQgYbz3SQT8OhaBCGw0VjjMBhIMIiiz5qyA/e7T1H8uiML6bxOl7ez+k/vsgT6yHtkz3Z9nn+bZ/+111is3zNsC9y3tmlbt3p1koz4+f9S8/fliaxMjmkThcxRstoX3LhsTB9R0EEU0xt7a/+GOtx1SBGV7xFhv8+nOfJ/CkllV652BRY5QGniz9FLlxf/W6caKcP7H1ky7N6pQn57I7Ddi5cfZiMkN8Mwtbcrez0vKyvM1BmealxFSOzKkXnst+z4LSmBz0m00L7SYzWa0fKKYWmPLbCi+rlta/v4EqJmCwtaH6pxqyW73oAT0VTH7gQ5FtXh7HIVqgThwO6taprRqwe4lsfF4G1tRLd7PgkK1LKVaF6iW7DYtSkawqoXHi9yOajkIVQu00k3nVMuWVS0KuwfMxhMdaEW1eL8oCtVylGpdoFqy2ytPQqEt1cIT9WlFtXh/NwLVIiOJmEPHsmfskST+s/wHOPzjCbmBuPHu83pdYOh3+pexnK+MJdAP68R2JJinT+gXMl/g0S8x/3YN/UQa/bB+5nygQ0G/kHQEj/7+bfRyqDT6YV3B+UAHg34+3wse/b17cTtxZMP3Wa4bHPoHFr4XUu3g0Q/6CoSuOZQc6VqNNqyv1hlY8F5IoARXrFHZompI+ZMfP/+ZPtgJQ7urkiivwDQpvo/AyItKHydR0nYxXWYmDwnTKeamGzd9UpMoDL2JAvYVwL4vAlu3SoBtthpW00ZqFSS9CqKatHkBuQbaDxPBGghkT6flgAfVKFFp95colnRuMKxi4TEuQHbqolAslXR/iWLJZgbrsIqFJy8YZP81CsUC8f52VrFk84JBSxjvhzkYxXIQKhbom707p1iyWcGgiVv7YQ5FsfgADgrFMpRiXaBYsjnBoDlh+2EORbH4AA4KxRrYqoF3IFGHwMtgYBMM72tAIYOBcRFvlqKQAZ4MeRALBoMMctUcigz4xS4KGeDJFoZ5tTwGGdASGQwpDWDlTlXQ/1JCLwb9LU0M+lvtorgsPNwKirVok6w2aTnq9Bao5S5TWYXj9aqIIFiQ5wNOn5EfbserIH8R5IlA3QLkid4u5ssit61ACAmoVVHHGu6BIoMT+KKOtCxeOiQ4qxzb2xYqBU+xZYuggUNapdjWwnXF/oeWc2opzAYIPEhWVmJtK5HCW4kwWx7okZW46zWOS9CjQHXFwvWeaCXkSEpgRZuDVZk7vx3ng7Llb4Ahq8yWLzN+roAQO4yjVKT7cx9jd7X4LZp6aY//AQ==</diagram></mxfile>


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Original repo [here](https://github.com/bentrevett/pytorch-seq2seq).
 2 | 
 3 | # PyTorch Seq2Seq
 4 | 
 5 | This repo contains tutorials covering understanding and implementing sequence-to-sequence (seq2seq) models using [PyTorch](https://github.com/pytorch/pytorch) 1.0 and [TorchText](https://github.com/pytorch/text) 0.3 using Python 3.7.
 6 | 
 7 | **If you find any mistakes or disagree with any of the explanations, please do not hesitate to [submit an issue](https://github.com/bentrevett/pytorch-seq2seq/issues/new). I welcome any feedback, positive or negative!**
 8 | 
 9 | ## Getting Started
10 | 
11 | To install PyTorch, see installation instructions on the [PyTorch website](pytorch.org).
12 | 
13 | To install TorchText:
14 | 
15 | ``` bash
16 | pip install torchtext
17 | ```
18 | 
19 | We'll also make use of spaCy to tokenize our data. To install spaCy, follow the instructions [here](https://spacy.io/usage/) making sure to install both the English and German models with:
20 | 
21 | ``` bash
22 | python -m spacy download en
23 | python -m spacy download de
24 | ```
25 | 
26 | ## Tutorials
27 | 
28 | * 1 - [Sequence to Sequence Learning with Neural Networks](https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb)
29 | 
30 |     This first tutorial covers the workflow of a PyTorch with TorchText seq2seq project. We'll cover the basics of seq2seq networks using encoder-decoder models, how to implement these models in PyTorch, and how to use TorchText to do all of the heavy lifting with regards to text processing. The model itself will be based off an implementation of [Sequence to Sequence Learning with Neural Networks](https://arxiv.org/abs/1409.3215), which uses multi-layer LSTMs.
31 | 
32 | * 2 - [Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation](https://github.com/bentrevett/pytorch-seq2seq/blob/master/2%20-%20Learning%20Phrase%20Representations%20using%20RNN%20Encoder-Decoder%20for%20Statistical%20Machine%20Translation.ipynb)
33 | 
34 |     Now we have the basic workflow covered, this tutorial will focus on improving our results. Building on our knowledge of PyTorch and TorchText gained from the previous tutorial, we'll cover a second second model, which helps with the information compression problem faced by encoder-decoder models. This model will be based off an implementation of [Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation](https://arxiv.org/abs/1406.1078), which uses GRUs.
35 | 
36 | * 3 - [Neural Machine Translation by Jointly Learning to Align and Translate](https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb)
37 | 
38 |     Next, we learn about attention by implementing [Neural Machine Translation by Jointly Learning to Align and Translate](https://arxiv.org/abs/1409.0473). This further allievates the information compression problem by allowing the decoder to "look back" at the input sentence by creating context vectors that are weighted sums of the encoder hidden states. The weights for this weighted sum are calculated via an attention mechanism, where the decoder learns to pay attention to the most relevant words in the input sentence.
39 | 
40 | * 4 - [Packed Padded Sequences, Masking and Inference](https://github.com/bentrevett/pytorch-seq2seq/blob/master/4%20-%20Packed%20Padded%20Sequences%2C%20Masking%20and%20Inference.ipynb)
41 | 
42 |     In this notebook, we will improve the previous model architecture by adding *packed padded sequences* and *masking*. These are two methods commonly used in NLP. Packed padded sequences allow us to only process the non-padded elements of our input sentence with our RNN. Masking is used to force the model to ignore certain elements we do not want it to look at, such as attention over padded elements. Together, these give us a small performance boost. We also cover a very basic way of using the model for inference, allowing us to get translations for any sentence we want to give to the model. Finally, we show how we can view the attention values for translations.
43 | 
44 | * 5 - [Convolutional Sequence to Sequence Learning](https://github.com/bentrevett/pytorch-seq2seq/blob/master/5%20-%20Convolutional%20Sequence%20to%20Sequence%20Learning.ipynb) 
45 | 
46 |     **(WRITE-UP IN PROGRESS)** We finally move away from RNN based models and implement a fully convolutional model. One of the downsides of RNNs is that they are sequential. That is, before a word is processed by the RNN, all previous words must also be processed. Convolutional models can be fully parallelized, which allow them to be trained much quicker. We will be implementing the [Convolutional Sequence to Sequence](https://arxiv.org/abs/1705.03122) model, which uses multiple convolutional layers in both the encoder and decoder, with an attention mechanism between them.  
47 | 
48 | * 6 - [Attention Is All You Need](https://github.com/bentrevett/pytorch-seq2seq/blob/master/6%20-%20Attention%20is%20All%20You%20Need.ipynb) 
49 | 
50 |     **(TWEAKING IN PROGRESS)** Continuing with the non-RNN based models, we implement the Transformer model from [Attention Is All You Need](https://arxiv.org/abs/1706.03762). This model is based soley on attention mechanisms and introduces Multi-Head Attention. The encoder and decoder are made of multiple layers, with each layer consisting of Multi-Head Attention and Positionwise Feedforward sublayers. This model is currently used in many state-of-the-art sequence-to-sequence and transfer learning tasks.
51 | 
52 | ## References
53 | 
54 | Here are some things I looked at while making these tutorials. Some of it may be out of date.
55 | 
56 | - https://github.com/spro/practical-pytorch
57 | - https://github.com/keon/seq2seq
58 | - https://github.com/pengshuang/CNN-Seq2Seq
59 | - https://github.com/pytorch/fairseq
60 | - https://github.com/jadore801120/attention-is-all-you-need-pytorch
61 | - http://nlp.seas.harvard.edu/2018/04/03/attention.html


--------------------------------------------------------------------------------
/5 - Convolutional Sequence to Sequence Learning.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# 5 - Convolutional Sequence to Sequence Learning\n",
  8 |     "\n",
  9 |     "In this notebook we'll be implementing the [Convolutional Sequence to Sequence Learning](https://arxiv.org/abs/1705.03122) model. \n",
 10 |     "\n",
 11 |     "## Introduction\n",
 12 |     "\n",
 13 |     "## Preparing the Data\n",
 14 |     "\n",
 15 |     "First, let's import all the required modules and set the random seeds for reproducability."
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": 1,
 21 |    "metadata": {},
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "import torch\n",
 25 |     "import torch.nn as nn\n",
 26 |     "import torch.optim as optim\n",
 27 |     "import torch.nn.functional as F\n",
 28 |     "\n",
 29 |     "from torchtext.datasets import TranslationDataset, Multi30k\n",
 30 |     "from torchtext.data import Field, BucketIterator\n",
 31 |     "\n",
 32 |     "import spacy\n",
 33 |     "\n",
 34 |     "import random\n",
 35 |     "import math\n",
 36 |     "import time"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 2,
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "SEED = 1234\n",
 46 |     "\n",
 47 |     "random.seed(SEED)\n",
 48 |     "torch.manual_seed(SEED)\n",
 49 |     "torch.backends.cudnn.deterministic = True"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "metadata": {},
 55 |    "source": [
 56 |     "Next, we'll load the spaCy models and define the tokenizers for the source and target languages."
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": 3,
 62 |    "metadata": {},
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "spacy_de = spacy.load('de')\n",
 66 |     "spacy_en = spacy.load('en')"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": 4,
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "def tokenize_de(text):\n",
 76 |     "    \"\"\"\n",
 77 |     "    Tokenizes German text from a string into a list of strings\n",
 78 |     "    \"\"\"\n",
 79 |     "    return [tok.text for tok in spacy_de.tokenizer(text)]\n",
 80 |     "\n",
 81 |     "def tokenize_en(text):\n",
 82 |     "    \"\"\"\n",
 83 |     "    Tokenizes English text from a string into a list of strings\n",
 84 |     "    \"\"\"\n",
 85 |     "    return [tok.text for tok in spacy_en.tokenizer(text)]"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "Next, we'll set up the `Field`s which decide how the data will be processed. By default RNN models in PyTorch require the sequence to be a tensor of shape **[sequence length, batch size]** so TorchText will, by default, return batches of tensors in the same shape. However in this notebook we are using CNNs which expect the batch dimension to be first. We tell TorchText to have batches be **[batch size, sequence length]** by setting `batch_first = True`.  "
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": 5,
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": [
101 |     "SRC = Field(tokenize=tokenize_de, \n",
102 |     "            init_token = '<sos>', \n",
103 |     "            eos_token = '<eos>', \n",
104 |     "            lower = True, \n",
105 |     "            batch_first = True)\n",
106 |     "\n",
107 |     "TRG = Field(tokenize=tokenize_en, \n",
108 |     "            init_token = '<sos>', \n",
109 |     "            eos_token = '<eos>', \n",
110 |     "            lower = True, \n",
111 |     "            batch_first = True)"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "metadata": {},
117 |    "source": [
118 |     "Then, we load our dataset."
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": 6,
124 |    "metadata": {},
125 |    "outputs": [],
126 |    "source": [
127 |     "train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), \n",
128 |     "                                                    fields=(SRC, TRG))"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "markdown",
133 |    "metadata": {},
134 |    "source": [
135 |     "We build our vocabulary as before, by converting any tokens that appear less than 2 times into `<unk>` tokens."
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": 7,
141 |    "metadata": {},
142 |    "outputs": [],
143 |    "source": [
144 |     "SRC.build_vocab(train_data, min_freq = 2)\n",
145 |     "TRG.build_vocab(train_data, min_freq = 2)"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "markdown",
150 |    "metadata": {},
151 |    "source": [
152 |     "The final bit of data preparation is defining the device and then building the iterator."
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "code",
157 |    "execution_count": 8,
158 |    "metadata": {},
159 |    "outputs": [],
160 |    "source": [
161 |     "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": 9,
167 |    "metadata": {},
168 |    "outputs": [],
169 |    "source": [
170 |     "BATCH_SIZE = 128\n",
171 |     "\n",
172 |     "train_iterator, valid_iterator, test_iterator = BucketIterator.splits(\n",
173 |     "    (train_data, valid_data, test_data), \n",
174 |     "     batch_size = BATCH_SIZE,\n",
175 |     "     device = device)"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "markdown",
180 |    "metadata": {},
181 |    "source": [
182 |     "## Building the Model\n",
183 |     "\n",
184 |     "### Encoder\n",
185 |     "\n",
186 |     "![](assets/convseq2seq1.png)\n",
187 |     "\n",
188 |     "![](assets/convseq2seq2.png)"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": null,
194 |    "metadata": {},
195 |    "outputs": [],
196 |    "source": [
197 |     "class Encoder(nn.Module):\n",
198 |     "    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, device):\n",
199 |     "        super().__init__()\n",
200 |     "        \n",
201 |     "        assert kernel_size % 2 == 1, \"Kernel size must be odd!\"\n",
202 |     "        \n",
203 |     "        self.input_dim = input_dim\n",
204 |     "        self.emb_dim = emb_dim\n",
205 |     "        self.hid_dim = hid_dim\n",
206 |     "        self.kernel_size = kernel_size\n",
207 |     "        self.dropout = dropout\n",
208 |     "        self.device = device\n",
209 |     "        \n",
210 |     "        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)\n",
211 |     "        \n",
212 |     "        self.tok_embedding = nn.Embedding(input_dim, emb_dim)\n",
213 |     "        self.pos_embedding = nn.Embedding(100, emb_dim)\n",
214 |     "        \n",
215 |     "        self.emb2hid = nn.Linear(emb_dim, hid_dim)\n",
216 |     "        self.hid2emb = nn.Linear(hid_dim, emb_dim)\n",
217 |     "        \n",
218 |     "        self.convs = nn.ModuleList([nn.Conv1d(in_channels = hid_dim, \n",
219 |     "                                              out_channels = 2 * hid_dim, \n",
220 |     "                                              kernel_size = kernel_size, \n",
221 |     "                                              padding = (kernel_size - 1) // 2)\n",
222 |     "                                    for _ in range(n_layers)])\n",
223 |     "        \n",
224 |     "        self.dropout = nn.Dropout(dropout)\n",
225 |     "        \n",
226 |     "    def forward(self, src):\n",
227 |     "        \n",
228 |     "        #src = [batch size, src sent len]\n",
229 |     "        \n",
230 |     "        #create position tensor\n",
231 |     "        pos = torch.arange(0, src.shape[1]).unsqueeze(0).repeat(src.shape[0], 1).to(self.device)\n",
232 |     "        \n",
233 |     "        #pos = [batch size, src sent len]\n",
234 |     "        \n",
235 |     "        #embed tokens and positions\n",
236 |     "        tok_embedded = self.tok_embedding(src)\n",
237 |     "        pos_embedded = self.pos_embedding(pos)\n",
238 |     "        \n",
239 |     "        #tok_embedded = pos_embedded = [batch size, src sent len, emb dim]\n",
240 |     "        \n",
241 |     "        #combine embeddings by elementwise summing\n",
242 |     "        embedded = self.dropout(tok_embedded + pos_embedded)\n",
243 |     "        \n",
244 |     "        #embedded = [batch size, src sent len, emb dim]\n",
245 |     "        \n",
246 |     "        #pass embedded through linear layer to go through emb dim -> hid dim\n",
247 |     "        conv_input = self.emb2hid(embedded)\n",
248 |     "        \n",
249 |     "        #conv_input = [batch size, src sent len, hid dim]\n",
250 |     "        \n",
251 |     "        #permute for convolutional layer\n",
252 |     "        conv_input = conv_input.permute(0, 2, 1) \n",
253 |     "        \n",
254 |     "        #conv_input = [batch size, hid dim, src sent len]\n",
255 |     "        \n",
256 |     "        for i, conv in enumerate(self.convs):\n",
257 |     "        \n",
258 |     "            #pass through convolutional layer\n",
259 |     "            conved = conv(self.dropout(conv_input))\n",
260 |     "\n",
261 |     "            #conved = [batch size, 2*hid dim, src sent len]\n",
262 |     "\n",
263 |     "            #pass through GLU activation function\n",
264 |     "            conved = F.glu(conved, dim = 1)\n",
265 |     "\n",
266 |     "            #conved = [batch size, hid dim, src sent len]\n",
267 |     "            \n",
268 |     "            #apply residual connection\n",
269 |     "            conved = (conved + conv_input) * self.scale\n",
270 |     "\n",
271 |     "            #conved = [batch size, hid dim, src sent len]\n",
272 |     "            \n",
273 |     "            #set conv_input to conved for next loop iteration\n",
274 |     "            conv_input = conved\n",
275 |     "        \n",
276 |     "        #permute and convert back to emb dim\n",
277 |     "        conved = self.hid2emb(conved.permute(0, 2, 1))\n",
278 |     "        \n",
279 |     "        #conved = [batch size, src sent len, emb dim]\n",
280 |     "        \n",
281 |     "        #elementwise sum output (conved) and input (embedded) to be used for attention\n",
282 |     "        combined = (conved + embedded) * self.scale\n",
283 |     "        \n",
284 |     "        #combined = [batch size, src sent len, emb dim]\n",
285 |     "        \n",
286 |     "        return conved, combined"
287 |    ]
288 |   },
289 |   {
290 |    "cell_type": "code",
291 |    "execution_count": null,
292 |    "metadata": {},
293 |    "outputs": [],
294 |    "source": [
295 |     "class Decoder(nn.Module):\n",
296 |     "    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, pad_idx, device):\n",
297 |     "        super().__init__()\n",
298 |     "        \n",
299 |     "        self.output_dim = output_dim\n",
300 |     "        self.emb_dim = emb_dim\n",
301 |     "        self.hid_dim = hid_dim\n",
302 |     "        self.kernel_size = kernel_size\n",
303 |     "        self.dropout = dropout\n",
304 |     "        self.pad_idx = pad_idx\n",
305 |     "        self.device = device\n",
306 |     "        \n",
307 |     "        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)\n",
308 |     "        \n",
309 |     "        self.tok_embedding = nn.Embedding(output_dim, emb_dim)\n",
310 |     "        self.pos_embedding = nn.Embedding(100, emb_dim)\n",
311 |     "        \n",
312 |     "        self.emb2hid = nn.Linear(emb_dim, hid_dim)\n",
313 |     "        self.hid2emb = nn.Linear(hid_dim, emb_dim)\n",
314 |     "        \n",
315 |     "        self.attn_hid2emb = nn.Linear(hid_dim, emb_dim)\n",
316 |     "        self.attn_emb2hid = nn.Linear(emb_dim, hid_dim)\n",
317 |     "        \n",
318 |     "        self.out = nn.Linear(emb_dim, output_dim)\n",
319 |     "        \n",
320 |     "        self.convs = nn.ModuleList([nn.Conv1d(hid_dim, 2*hid_dim, kernel_size)\n",
321 |     "                                    for _ in range(n_layers)])\n",
322 |     "        \n",
323 |     "        self.dropout = nn.Dropout(dropout)\n",
324 |     "      \n",
325 |     "    def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined):\n",
326 |     "        \n",
327 |     "        #embedded = [batch size, trg sent len, emb dim]\n",
328 |     "        #conved = [batch size, hid dim, trg sent len]\n",
329 |     "        #encoder_conved = encoder_combined = [batch size, src sent len, emb dim]\n",
330 |     "        \n",
331 |     "        #permute and convert back to emb dim\n",
332 |     "        conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1))\n",
333 |     "        \n",
334 |     "        #conved_emb = [batch size, trg sent len, emb dim]\n",
335 |     "        \n",
336 |     "        combined = (embedded + conved_emb) * self.scale\n",
337 |     "        \n",
338 |     "        #combined = [batch size, trg sent len, emb dim]\n",
339 |     "                \n",
340 |     "        energy = torch.matmul(combined, encoder_conved.permute(0, 2, 1))\n",
341 |     "        \n",
342 |     "        #energy = [batch size, trg sent len, src sent len]\n",
343 |     "        \n",
344 |     "        attention = F.softmax(energy, dim=2)\n",
345 |     "        \n",
346 |     "        #attention = [batch size, trg sent len, src sent len]\n",
347 |     "            \n",
348 |     "        attended_encoding = torch.matmul(attention, (encoder_conved + encoder_combined))\n",
349 |     "        \n",
350 |     "        #attended_encoding = [batch size, trg sent len, emd dim]\n",
351 |     "        \n",
352 |     "        #convert from emb dim -> hid dim\n",
353 |     "        attended_encoding = self.attn_emb2hid(attended_encoding)\n",
354 |     "        \n",
355 |     "        #attended_encoding = [batch size, trg sent len, hid dim]\n",
356 |     "        \n",
357 |     "        attended_combined = (conved + attended_encoding.permute(0, 2, 1)) * self.scale\n",
358 |     "        \n",
359 |     "        #attended_combined = [batch size, hid dim, trg sent len]\n",
360 |     "        \n",
361 |     "        return attention, attended_combined\n",
362 |     "        \n",
363 |     "    def forward(self, trg, encoder_conved, encoder_combined):\n",
364 |     "        \n",
365 |     "        #trg = [batch size, trg sent len]\n",
366 |     "        #encoder_conved = encoder_combined = [batch size, src sent len, emb dim]\n",
367 |     "                \n",
368 |     "        #create position tensor\n",
369 |     "        pos = torch.arange(0, trg.shape[1]).unsqueeze(0).repeat(trg.shape[0], 1).to(device)\n",
370 |     "        \n",
371 |     "        #pos = [batch size, trg sent len]\n",
372 |     "        \n",
373 |     "        #embed tokens and positions\n",
374 |     "        tok_embedded = self.tok_embedding(trg)\n",
375 |     "        pos_embedded = self.pos_embedding(pos)\n",
376 |     "        \n",
377 |     "        #tok_embedded = [batch size, trg sent len, emb dim]\n",
378 |     "        #pos_embedded = [batch size, trg sent len, emb dim]\n",
379 |     "        \n",
380 |     "        #combine embeddings by elementwise summing\n",
381 |     "        embedded = self.dropout(tok_embedded + pos_embedded)\n",
382 |     "        \n",
383 |     "        #embedded = [batch size, trg sent len, emb dim]\n",
384 |     "        \n",
385 |     "        #pass embedded through linear layer to go through emb dim -> hid dim\n",
386 |     "        conv_input = self.emb2hid(embedded)\n",
387 |     "        \n",
388 |     "        #conv_input = [batch size, trg sent len, hid dim]\n",
389 |     "        \n",
390 |     "        #permute for convolutional layer\n",
391 |     "        conv_input = conv_input.permute(0, 2, 1) \n",
392 |     "        \n",
393 |     "        #conv_input = [batch size, hid dim, trg sent len]\n",
394 |     "        \n",
395 |     "        for i, conv in enumerate(self.convs):\n",
396 |     "        \n",
397 |     "            #apply dropout\n",
398 |     "            conv_input = self.dropout(conv_input)\n",
399 |     "        \n",
400 |     "            #need to pad so decoder can't \"cheat\"\n",
401 |     "            padding = torch.zeros(conv_input.shape[0], conv_input.shape[1], self.kernel_size-1).fill_(self.pad_idx).to(device)\n",
402 |     "            padded_conv_input = torch.cat((padding, conv_input), dim=2)\n",
403 |     "        \n",
404 |     "            #padded_conv_input = [batch size, hid dim, trg sent len + kernel size - 1]\n",
405 |     "        \n",
406 |     "            #pass through convolutional layer\n",
407 |     "            conved = conv(padded_conv_input)\n",
408 |     "\n",
409 |     "            #conved = [batch size, 2*hid dim, trg sent len]\n",
410 |     "            \n",
411 |     "            #pass through GLU activation function\n",
412 |     "            conved = F.glu(conved, dim=1)\n",
413 |     "\n",
414 |     "            #conved = [batch size, hid dim, trg sent len]\n",
415 |     "            \n",
416 |     "            attention, conved = self.calculate_attention(embedded, conved, encoder_conved, encoder_combined)\n",
417 |     "            \n",
418 |     "            #attention = [batch size, trg sent len, src sent len]\n",
419 |     "            #conved = [batch size, hid dim, trg sent len]\n",
420 |     "            \n",
421 |     "            #apply residual connection\n",
422 |     "            conved = (conved + conv_input) * self.scale\n",
423 |     "            \n",
424 |     "            #conved = [batch size, hid dim, trg sent len]\n",
425 |     "            \n",
426 |     "            #set conv_input to conved for next loop iteration\n",
427 |     "            conv_input = conved\n",
428 |     "            \n",
429 |     "        conved = self.hid2emb(conved.permute(0, 2, 1))\n",
430 |     "         \n",
431 |     "        #conved = [batch size, trg sent len, hid dim]\n",
432 |     "            \n",
433 |     "        output = self.out(self.dropout(conved))\n",
434 |     "        \n",
435 |     "        #output = [batch size, trg sent len, output dim]\n",
436 |     "            \n",
437 |     "        return output, attention"
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "code",
442 |    "execution_count": null,
443 |    "metadata": {},
444 |    "outputs": [],
445 |    "source": [
446 |     "class Seq2Seq(nn.Module):\n",
447 |     "    def __init__(self, encoder, decoder, device):\n",
448 |     "        super().__init__()\n",
449 |     "        \n",
450 |     "        self.encoder = encoder\n",
451 |     "        self.decoder = decoder\n",
452 |     "        self.device = device\n",
453 |     "        \n",
454 |     "    def forward(self, src, trg):\n",
455 |     "        \n",
456 |     "        #src = [batch size, src sent len]\n",
457 |     "        #trg = [batch size, trg sent len]\n",
458 |     "           \n",
459 |     "        #calculate z^u (encoder_conved) and e (encoder_combined)\n",
460 |     "        #encoder_conved is output from final encoder conv. block\n",
461 |     "        #encoder_combined is encoder_conved plus (elementwise) src embedding plus positional embeddings \n",
462 |     "        encoder_conved, encoder_combined = self.encoder(src)\n",
463 |     "            \n",
464 |     "        #encoder_conved = [batch size, src sent len, emb dim]\n",
465 |     "        #encoder_combined = [batch size, src sent len, emb dim]\n",
466 |     "        \n",
467 |     "        #calculate predictions of next words\n",
468 |     "        #output is a batch of predictions for each word in the trg sentence\n",
469 |     "        #attention a batch of attention scores across the src sentence for each word in the trg sentence\n",
470 |     "        output, attention = self.decoder(trg, encoder_conved, encoder_combined)\n",
471 |     "        \n",
472 |     "        #output = [batch size, trg sent len, output dim]\n",
473 |     "        #attention = [batch size, trg sent len, src sent len]\n",
474 |     "        \n",
475 |     "        return output, attention"
476 |    ]
477 |   },
478 |   {
479 |    "cell_type": "code",
480 |    "execution_count": null,
481 |    "metadata": {},
482 |    "outputs": [],
483 |    "source": [
484 |     "INPUT_DIM = len(SRC.vocab)\n",
485 |     "OUTPUT_DIM = len(TRG.vocab)\n",
486 |     "EMB_DIM = 256\n",
487 |     "HID_DIM = 512\n",
488 |     "ENC_LAYERS = 10\n",
489 |     "DEC_LAYERS = 10\n",
490 |     "ENC_KERNEL_SIZE = 3\n",
491 |     "DEC_KERNEL_SIZE = 3\n",
492 |     "ENC_DROPOUT = 0.25\n",
493 |     "DEC_DROPOUT = 0.25\n",
494 |     "PAD_IDX = TRG.vocab.stoi['<pad>']\n",
495 |     "    \n",
496 |     "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
497 |     "    \n",
498 |     "enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, ENC_LAYERS, ENC_KERNEL_SIZE, ENC_DROPOUT, device)\n",
499 |     "dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, DEC_LAYERS, DEC_KERNEL_SIZE, DEC_DROPOUT, PAD_IDX, device)\n",
500 |     "\n",
501 |     "model = Seq2Seq(enc, dec, device).to(device)"
502 |    ]
503 |   },
504 |   {
505 |    "cell_type": "code",
506 |    "execution_count": null,
507 |    "metadata": {},
508 |    "outputs": [],
509 |    "source": [
510 |     "def count_parameters(model):\n",
511 |     "    return sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
512 |     "\n",
513 |     "print(f'The model has {count_parameters(model):,} trainable parameters')"
514 |    ]
515 |   },
516 |   {
517 |    "cell_type": "code",
518 |    "execution_count": null,
519 |    "metadata": {},
520 |    "outputs": [],
521 |    "source": [
522 |     "optimizer = optim.Adam(model.parameters())"
523 |    ]
524 |   },
525 |   {
526 |    "cell_type": "code",
527 |    "execution_count": null,
528 |    "metadata": {},
529 |    "outputs": [],
530 |    "source": [
531 |     "criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)"
532 |    ]
533 |   },
534 |   {
535 |    "cell_type": "code",
536 |    "execution_count": null,
537 |    "metadata": {},
538 |    "outputs": [],
539 |    "source": [
540 |     "def train(model, iterator, optimizer, criterion, clip):\n",
541 |     "    \n",
542 |     "    model.train()\n",
543 |     "    \n",
544 |     "    epoch_loss = 0\n",
545 |     "    \n",
546 |     "    for i, batch in enumerate(iterator):\n",
547 |     "        \n",
548 |     "        src = batch.src\n",
549 |     "        trg = batch.trg\n",
550 |     "        \n",
551 |     "        optimizer.zero_grad()\n",
552 |     "        \n",
553 |     "        output, _ = model(src, trg[:,:-1])\n",
554 |     "        \n",
555 |     "        #output = [batch size, trg sent len - 1, output dim]\n",
556 |     "        #trg = [batch size, trg sent len]\n",
557 |     "        \n",
558 |     "        output = output.contiguous().view(-1, output.shape[-1])\n",
559 |     "        trg = trg[:,1:].contiguous().view(-1)\n",
560 |     "        \n",
561 |     "        #output = [batch size * trg sent len - 1, output dim]\n",
562 |     "        #trg = [batch size * trg sent len - 1]\n",
563 |     "        \n",
564 |     "        loss = criterion(output, trg)\n",
565 |     "        \n",
566 |     "        loss.backward()\n",
567 |     "        \n",
568 |     "        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n",
569 |     "        \n",
570 |     "        optimizer.step()\n",
571 |     "        \n",
572 |     "        epoch_loss += loss.item()\n",
573 |     "        \n",
574 |     "    return epoch_loss / len(iterator)"
575 |    ]
576 |   },
577 |   {
578 |    "cell_type": "code",
579 |    "execution_count": null,
580 |    "metadata": {},
581 |    "outputs": [],
582 |    "source": [
583 |     "def evaluate(model, iterator, criterion):\n",
584 |     "    \n",
585 |     "    model.eval()\n",
586 |     "    \n",
587 |     "    epoch_loss = 0\n",
588 |     "    \n",
589 |     "    with torch.no_grad():\n",
590 |     "    \n",
591 |     "        for i, batch in enumerate(iterator):\n",
592 |     "\n",
593 |     "            src = batch.src\n",
594 |     "            trg = batch.trg\n",
595 |     "\n",
596 |     "            output, _ = model(src, trg[:,:-1])\n",
597 |     "        \n",
598 |     "            #output = [batch size, trg sent len - 1, output dim]\n",
599 |     "            #trg = [batch size, trg sent len]\n",
600 |     "\n",
601 |     "            output = output.contiguous().view(-1, output.shape[-1])\n",
602 |     "            trg = trg[:,1:].contiguous().view(-1)\n",
603 |     "\n",
604 |     "            #output = [batch size * trg sent len - 1, output dim]\n",
605 |     "            #trg = [batch size * trg sent len - 1]\n",
606 |     "            \n",
607 |     "            loss = criterion(output, trg)\n",
608 |     "\n",
609 |     "            epoch_loss += loss.item()\n",
610 |     "        \n",
611 |     "    return epoch_loss / len(iterator)"
612 |    ]
613 |   },
614 |   {
615 |    "cell_type": "code",
616 |    "execution_count": null,
617 |    "metadata": {},
618 |    "outputs": [],
619 |    "source": [
620 |     "def epoch_time(start_time, end_time):\n",
621 |     "    elapsed_time = end_time - start_time\n",
622 |     "    elapsed_mins = int(elapsed_time / 60)\n",
623 |     "    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n",
624 |     "    return elapsed_mins, elapsed_secs"
625 |    ]
626 |   },
627 |   {
628 |    "cell_type": "code",
629 |    "execution_count": null,
630 |    "metadata": {},
631 |    "outputs": [],
632 |    "source": [
633 |     "N_EPOCHS = 10\n",
634 |     "CLIP = 1\n",
635 |     "\n",
636 |     "best_valid_loss = float('inf')\n",
637 |     "\n",
638 |     "for epoch in range(N_EPOCHS):\n",
639 |     "    \n",
640 |     "    start_time = time.time()\n",
641 |     "    \n",
642 |     "    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)\n",
643 |     "    valid_loss = evaluate(model, valid_iterator, criterion)\n",
644 |     "    \n",
645 |     "    end_time = time.time()\n",
646 |     "    \n",
647 |     "    epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n",
648 |     "    \n",
649 |     "    if valid_loss < best_valid_loss:\n",
650 |     "        best_valid_loss = valid_loss\n",
651 |     "        torch.save(model.state_dict(), 'tut5-model.pt')\n",
652 |     "    \n",
653 |     "    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')\n",
654 |     "    print(f'\\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')\n",
655 |     "    print(f'\\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')"
656 |    ]
657 |   },
658 |   {
659 |    "cell_type": "code",
660 |    "execution_count": null,
661 |    "metadata": {},
662 |    "outputs": [],
663 |    "source": [
664 |     "model.load_state_dict(torch.load('tut5-model.pt'))\n",
665 |     "\n",
666 |     "test_loss = evaluate(model, test_iterator, criterion)\n",
667 |     "\n",
668 |     "print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')"
669 |    ]
670 |   }
671 |  ],
672 |  "metadata": {
673 |   "kernelspec": {
674 |    "display_name": "Python 3",
675 |    "language": "python",
676 |    "name": "python3"
677 |   },
678 |   "language_info": {
679 |    "codemirror_mode": {
680 |     "name": "ipython",
681 |     "version": 3
682 |    },
683 |    "file_extension": ".py",
684 |    "mimetype": "text/x-python",
685 |    "name": "python",
686 |    "nbconvert_exporter": "python",
687 |    "pygments_lexer": "ipython3",
688 |    "version": "3.7.0"
689 |   }
690 |  },
691 |  "nbformat": 4,
692 |  "nbformat_minor": 2
693 | }
694 | 


--------------------------------------------------------------------------------
/6 - Attention is All You Need.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "http://nlp.seas.harvard.edu/2018/04/03/attention.html\n",
  8 |     "    "
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": 43,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "import torch\n",
 18 |     "import torch.nn as nn\n",
 19 |     "import torch.optim as optim\n",
 20 |     "import torch.nn.functional as F\n",
 21 |     "from torch import Tensor\n",
 22 |     "from torch.optim import Optimizer\n",
 23 |     "\n",
 24 |     "import torchtext\n",
 25 |     "from torchtext.datasets import TranslationDataset, Multi30k\n",
 26 |     "from torchtext.data import Field, BucketIterator\n",
 27 |     "\n",
 28 |     "import spacy\n",
 29 |     "\n",
 30 |     "import random\n",
 31 |     "import math\n",
 32 |     "import os\n",
 33 |     "import time"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 44,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "SEED = 1\n",
 43 |     "\n",
 44 |     "random.seed(SEED)\n",
 45 |     "torch.manual_seed(SEED)\n",
 46 |     "torch.backends.cudnn.deterministic = True"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 45,
 52 |    "metadata": {},
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "spacy_de = spacy.load('de')\n",
 56 |     "spacy_en = spacy.load('en')"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": 46,
 62 |    "metadata": {},
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "def tokenize_de(text):\n",
 66 |     "    \"\"\"\n",
 67 |     "    Tokenizes German text from a string into a list of strings\n",
 68 |     "    \"\"\"\n",
 69 |     "    return [tok.text for tok in spacy_de.tokenizer(text)]\n",
 70 |     "\n",
 71 |     "def tokenize_en(text):\n",
 72 |     "    \"\"\"\n",
 73 |     "    Tokenizes English text from a string into a list of strings\n",
 74 |     "    \"\"\"\n",
 75 |     "    return [tok.text for tok in spacy_en.tokenizer(text)]"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 47,
 81 |    "metadata": {},
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True)\n",
 85 |     "TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True)"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": 48,
 91 |    "metadata": {},
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": 49,
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": [
103 |     "SRC.build_vocab(train_data, min_freq=2)\n",
104 |     "TRG.build_vocab(train_data, min_freq=2)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": 50,
110 |    "metadata": {},
111 |    "outputs": [],
112 |    "source": [
113 |     "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": 51,
119 |    "metadata": {},
120 |    "outputs": [],
121 |    "source": [
122 |     "BATCH_SIZE = 128\n",
123 |     "\n",
124 |     "train_iterator, valid_iterator, test_iterator = BucketIterator.splits(\n",
125 |     "    (train_data, valid_data, test_data), \n",
126 |     "     batch_size=BATCH_SIZE,\n",
127 |     "     device=device)"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 52,
133 |    "metadata": {},
134 |    "outputs": [],
135 |    "source": [
136 |     "class SelfAttention(nn.Module):\n",
137 |     "    def __init__(self, \n",
138 |     "                 hid_dim: int, \n",
139 |     "                 n_heads: int, \n",
140 |     "                 dropout: float, \n",
141 |     "                 device: torch.device):\n",
142 |     "        super().__init__()\n",
143 |     "        \n",
144 |     "        self.hid_dim = hid_dim\n",
145 |     "        self.n_heads = n_heads\n",
146 |     "        \n",
147 |     "        assert hid_dim % n_heads == 0\n",
148 |     "        \n",
149 |     "        self.w_q = nn.Linear(hid_dim, hid_dim)\n",
150 |     "        self.w_k = nn.Linear(hid_dim, hid_dim)\n",
151 |     "        self.w_v = nn.Linear(hid_dim, hid_dim)\n",
152 |     "        \n",
153 |     "        self.fc = nn.Linear(hid_dim, hid_dim)\n",
154 |     "        \n",
155 |     "        self.dropout = nn.Dropout(dropout)\n",
156 |     "        \n",
157 |     "        self.scale = torch.sqrt(torch.FloatTensor([hid_dim // n_heads])).to(device)\n",
158 |     "        \n",
159 |     "    def forward(self, \n",
160 |     "                query: Tensor, \n",
161 |     "                key: Tensor, \n",
162 |     "                value: Tensor, \n",
163 |     "                mask: Tensor = None):\n",
164 |     "        \n",
165 |     "        bsz = query.shape[0]\n",
166 |     "        \n",
167 |     "        #query = key = value [batch size, sent len, hid dim]\n",
168 |     "                \n",
169 |     "        Q = self.w_q(query)\n",
170 |     "        K = self.w_k(key)\n",
171 |     "        V = self.w_v(value)\n",
172 |     "        \n",
173 |     "        #Q, K, V = [batch size, sent len, hid dim]\n",
174 |     "        \n",
175 |     "        Q = Q.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)\n",
176 |     "        K = K.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)\n",
177 |     "        V = V.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)\n",
178 |     "        \n",
179 |     "        #Q, K, V = [batch size, n heads, sent len, hid dim // n heads]\n",
180 |     "        \n",
181 |     "        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale\n",
182 |     "        \n",
183 |     "        #energy = [batch size, n heads, sent len, sent len]\n",
184 |     "        \n",
185 |     "        if mask is not None:\n",
186 |     "            energy = energy.masked_fill(mask == 0, -1e10)\n",
187 |     "        \n",
188 |     "        attention = self.dropout(F.softmax(energy, dim=-1))\n",
189 |     "        \n",
190 |     "        #attention = [batch size, n heads, sent len, sent len]\n",
191 |     "        \n",
192 |     "        x = torch.matmul(attention, V)\n",
193 |     "        \n",
194 |     "        #x = [batch size, n heads, sent len, hid dim // n heads]\n",
195 |     "        \n",
196 |     "        x = x.permute(0, 2, 1, 3).contiguous()\n",
197 |     "        \n",
198 |     "        #x = [batch size, sent len, n heads, hid dim // n heads]\n",
199 |     "        \n",
200 |     "        x = x.view(bsz, -1, self.n_heads * (self.hid_dim // self.n_heads))\n",
201 |     "        \n",
202 |     "        #x = [batch size, src sent len, hid dim]\n",
203 |     "        \n",
204 |     "        x = self.fc(x)\n",
205 |     "        \n",
206 |     "        #x = [batch size, sent len, hid dim]\n",
207 |     "        \n",
208 |     "        return x"
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "code",
213 |    "execution_count": 53,
214 |    "metadata": {},
215 |    "outputs": [],
216 |    "source": [
217 |     "class PositionwiseFeedforward(nn.Module):\n",
218 |     "    def __init__(self, \n",
219 |     "                 hid_dim: int, \n",
220 |     "                 pf_dim: int, \n",
221 |     "                 dropout: float):\n",
222 |     "        super().__init__()\n",
223 |     "        \n",
224 |     "        self.hid_dim = hid_dim\n",
225 |     "        self.pf_dim = pf_dim\n",
226 |     "        \n",
227 |     "        self.fc_1 = nn.Conv1d(hid_dim, pf_dim, 1)\n",
228 |     "        self.fc_2 = nn.Conv1d(pf_dim, hid_dim, 1)\n",
229 |     "        \n",
230 |     "        self.do = nn.Dropout(dropout)\n",
231 |     "        \n",
232 |     "    def forward(self, \n",
233 |     "                x: Tensor):\n",
234 |     "        \n",
235 |     "        #x = [batch size, sent len, hid dim]\n",
236 |     "        \n",
237 |     "        x = x.permute(0, 2, 1)\n",
238 |     "        \n",
239 |     "        #x = [batch size, hid dim, sent len]\n",
240 |     "        \n",
241 |     "        x = self.dropout(F.relu(self.fc_1(x)))\n",
242 |     "        \n",
243 |     "        #x = [batch size, ff dim, sent len]\n",
244 |     "        \n",
245 |     "        x = self.fc_2(x)\n",
246 |     "        \n",
247 |     "        #x = [batch size, hid dim, sent len]\n",
248 |     "        \n",
249 |     "        x = x.permute(0, 2, 1)\n",
250 |     "        \n",
251 |     "        #x = [batch size, sent len, hid dim]\n",
252 |     "        \n",
253 |     "        return x"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "code",
258 |    "execution_count": 54,
259 |    "metadata": {},
260 |    "outputs": [],
261 |    "source": [
262 |     "class EncoderLayer(nn.Module):\n",
263 |     "    def __init__(self, \n",
264 |     "                 hid_dim: int, \n",
265 |     "                 n_heads: int, \n",
266 |     "                 pf_dim: int, \n",
267 |     "                 self_attention: SelfAttention, \n",
268 |     "                 positionwise_feedforward: PositionwiseFeedforward, \n",
269 |     "                 dropout: float, \n",
270 |     "                 device: torch.device):\n",
271 |     "        super().__init__()      \n",
272 |     "        self.layer_norm = nn.LayerNorm(hid_dim)\n",
273 |     "        self.self_attention = self_attention(hid_dim, n_heads, dropout, device)\n",
274 |     "        self.positionwise_feedforward = positionwise_feedforward(hid_dim, pf_dim, dropout)\n",
275 |     "        self.dropout = nn.Dropout(dropout)\n",
276 |     "        \n",
277 |     "    def forward(self, \n",
278 |     "                src: Tensor, \n",
279 |     "                src_mask: Tensor):\n",
280 |     "        \n",
281 |     "        #src = [batch size, src sent len, hid dim]\n",
282 |     "        #src_mask = [batch size, 1, 1, src sent len]\n",
283 |     " \n",
284 |     "        #src = [batch size, src sent len, hid dim]\n",
285 |     "        src = self.layer_norm(\n",
286 |     "            src + self.dropout(self.self_attention(\n",
287 |     "                src, src, src, src_mask)))\n",
288 |     "        \n",
289 |     "        #src = [batch size, src sent len, hid dim]        \n",
290 |     "        src = self.layer_norm(\n",
291 |     "            src + self.dropout(\n",
292 |     "                self.positionwise_feedforward(src)))\n",
293 |     "        \n",
294 |     "        return src"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "code",
299 |    "execution_count": 55,
300 |    "metadata": {},
301 |    "outputs": [],
302 |    "source": [
303 |     "class Encoder(nn.Module):\n",
304 |     "    def __init__(self, \n",
305 |     "                 input_dim: int, \n",
306 |     "                 hid_dim: int, \n",
307 |     "                 n_layers: int, \n",
308 |     "                 n_heads: int, \n",
309 |     "                 pf_dim: int, \n",
310 |     "                 encoder_layer: EncoderLayer, \n",
311 |     "                 self_attention: SelfAttention, \n",
312 |     "                 positionwise_feedforward: PositionwiseFeedforward, \n",
313 |     "                 dropout: float, \n",
314 |     "                 device: torch.device):\n",
315 |     "        super().__init__()\n",
316 |     "\n",
317 |     "        self.input_dim = input_dim\n",
318 |     "        self.hid_dim = hid_dim\n",
319 |     "        self.n_layers = n_layers\n",
320 |     "        self.n_heads = n_heads\n",
321 |     "        self.pf_dim = pf_dim\n",
322 |     "        self.encoder_layer = encoder_layer\n",
323 |     "        self.self_attention = self_attention\n",
324 |     "        self.positionwise_feedforward = positionwise_feedforward\n",
325 |     "        self.dropout = dropout\n",
326 |     "        self.device = device\n",
327 |     "        \n",
328 |     "        self.tok_embedding = nn.Embedding(input_dim, hid_dim)\n",
329 |     "        self.pos_embedding = nn.Embedding(1000, hid_dim)\n",
330 |     "        \n",
331 |     "        self.layers = nn.ModuleList([encoder_layer(hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device) \n",
332 |     "                                     for _ in range(n_layers)])\n",
333 |     "        \n",
334 |     "        self.dropout = nn.Dropout(dropout)\n",
335 |     "        \n",
336 |     "        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)\n",
337 |     "        \n",
338 |     "    def forward(self, \n",
339 |     "                src: Tensor, \n",
340 |     "                src_mask: Tensor):      \n",
341 |     "        #src = [batch size, src sent len]\n",
342 |     "        #src_mask = [batch size, 1, 1, src sent len]\n",
343 |     "        \n",
344 |     "        pos = torch.arange(0, src.shape[1]).unsqueeze(0).repeat(src.shape[0], 1).to(self.device)\n",
345 |     "        #pos = [batch size, src sent len]\n",
346 |     "        \n",
347 |     "        src_embedded = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))\n",
348 |     "        \n",
349 |     "        #src = [batch size, src sent len, hid dim]\n",
350 |     "        \n",
351 |     "        # each layer is an \"EncoderLayer\"\n",
352 |     "        for layer in self.layers:\n",
353 |     "            src_embedded = layer(src_embedded, src_mask)\n",
354 |     "            \n",
355 |     "        return src_embedded"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": 56,
361 |    "metadata": {},
362 |    "outputs": [],
363 |    "source": [
364 |     "class DecoderLayer(nn.Module):\n",
365 |     "    def __init__(self, \n",
366 |     "                 hid_dim: int, \n",
367 |     "                 n_heads: int, \n",
368 |     "                 pf_dim: int, \n",
369 |     "                 self_attention: SelfAttention, \n",
370 |     "                 positionwise_feedforward: PositionwiseFeedforward, \n",
371 |     "                 dropout: float, \n",
372 |     "                 device: torch.device):\n",
373 |     "        super().__init__()\n",
374 |     "        \n",
375 |     "        self.layer_nore = nn.LayerNorm(hid_dim)\n",
376 |     "        self.self_attention = self_attention(hid_dim, n_heads, dropout, device)\n",
377 |     "        self.encoder_attention = self_attention(hid_dim, n_heads, dropout, device)\n",
378 |     "        self.positionwise_feedforward = positionwise_feedforward(hid_dim, pf_dim, dropout)\n",
379 |     "        self.dropout = nn.Dropout(dropout)\n",
380 |     "        \n",
381 |     "    def forward(self, \n",
382 |     "                trg: Tensor, \n",
383 |     "                src: Tensor, \n",
384 |     "                trg_mask: Tensor, \n",
385 |     "                src_mask: Tensor):\n",
386 |     "        \n",
387 |     "        #trg = [batch size, trg sent len, hid dim]\n",
388 |     "        #src = [batch size, src sent len, hid dim]\n",
389 |     "        #trg_mask = [batch size, trg sent len]\n",
390 |     "        #src_mask = [batch size, src sent len]\n",
391 |     "                \n",
392 |     "        trg = self.layer_norm(\n",
393 |     "            trg + self.dropout(\n",
394 |     "                self.self_attention(trg, trg, trg, trg_mask)))\n",
395 |     "                \n",
396 |     "        trg = self.layer_norm(\n",
397 |     "            trg + self.do(\n",
398 |     "                self.encoder_attention(trg, src, src, src_mask)))\n",
399 |     "        \n",
400 |     "        trg = self.layer_norm(\n",
401 |     "            trg + self.dropout(\n",
402 |     "                self.positionwise_feedforward(trg)))\n",
403 |     "        \n",
404 |     "        return trg"
405 |    ]
406 |   },
407 |   {
408 |    "cell_type": "code",
409 |    "execution_count": 57,
410 |    "metadata": {},
411 |    "outputs": [],
412 |    "source": [
413 |     "class Decoder(nn.Module):\n",
414 |     "    def __init__(self, \n",
415 |     "                 output_dim: int, \n",
416 |     "                 hid_dim: int, \n",
417 |     "                 n_layers: int, \n",
418 |     "                 n_heads: int, \n",
419 |     "                 pf_dim: int, \n",
420 |     "                 decoder_layer: DecoderLayer, \n",
421 |     "                 self_attention: SelfAttention, \n",
422 |     "                 positionwise_feedforward: PositionwiseFeedforward, \n",
423 |     "                 dropout: float, \n",
424 |     "                 device: torch.device):\n",
425 |     "        super().__init__()\n",
426 |     "        \n",
427 |     "        self.output_dim = output_dim\n",
428 |     "        self.hid_dim = hid_dim\n",
429 |     "        self.n_layers = n_layers\n",
430 |     "        self.n_heads = n_heads\n",
431 |     "        self.pf_dim = pf_dim\n",
432 |     "        self.decoder_layer = decoder_layer\n",
433 |     "        self.self_attention = self_attention\n",
434 |     "        self.positionwise_feedforward = positionwise_feedforward\n",
435 |     "        self.dropout = dropout\n",
436 |     "        self.device = device\n",
437 |     "        \n",
438 |     "        self.tok_embedding = nn.Embedding(output_dim, hid_dim)\n",
439 |     "        self.pos_embedding = nn.Embedding(1000, hid_dim)\n",
440 |     "        \n",
441 |     "        self.layers = nn.ModuleList([decoder_layer(hid_dim, n_heads, pf_dim, \n",
442 |     "                                                   self_attention, \n",
443 |     "                                                   positionwise_feedforward, \n",
444 |     "                                                   dropout, device)\n",
445 |     "                                     for _ in range(n_layers)])\n",
446 |     "        \n",
447 |     "        self.fc = nn.Linear(hid_dim, output_dim)\n",
448 |     "        \n",
449 |     "        self.dropout = nn.Dropout(dropout)\n",
450 |     "        \n",
451 |     "        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)\n",
452 |     "        \n",
453 |     "    def forward(self, \n",
454 |     "                trg: Tensor, \n",
455 |     "                src: Tensor, \n",
456 |     "                trg_mask: Tensor, \n",
457 |     "                src_mask: Tensor):            \n",
458 |     "        #trg = [batch_size, trg sent len]\n",
459 |     "        #src = [batch_size, src sent len]\n",
460 |     "        #trg_mask = [batch size, trg sent len]\n",
461 |     "        #src_mask = [batch size, src sent len]\n",
462 |     "        \n",
463 |     "        #pos = [batch_size, trg sent len]\n",
464 |     "        pos = torch.arange(0, trg.shape[1]).unsqueeze(0).repeat(trg.shape[0], 1).to(self.device)\n",
465 |     "\n",
466 |     "        #trg = [batch_size, trg sent len]        \n",
467 |     "        trg_embedded = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))\n",
468 |     "        \n",
469 |     "        #trg = [batch size, trg sent len, hid dim]\n",
470 |     "        \n",
471 |     "        #trg_mask = [batch size, 1, trg sent len, trg sent len]      \n",
472 |     "        \n",
473 |     "        for layer in self.layers:\n",
474 |     "            trg_embedded = layer(trg_embedded, src, trg_mask, src_mask)\n",
475 |     "\n",
476 |     "        #trg = [batch size, trg sent len, hid dim]            \n",
477 |     "\n",
478 |     "        return self.fc(trg_embedded)"
479 |    ]
480 |   },
481 |   {
482 |    "cell_type": "code",
483 |    "execution_count": 58,
484 |    "metadata": {},
485 |    "outputs": [],
486 |    "source": [
487 |     "class Seq2Seq(nn.Module):\n",
488 |     "    def __init__(self, \n",
489 |     "                 encoder: Encoder, \n",
490 |     "                 decoder: Decoder, \n",
491 |     "                 pad_idx: int, \n",
492 |     "                 device: torch.device):\n",
493 |     "        super().__init__()\n",
494 |     "        \n",
495 |     "        self.encoder = encoder\n",
496 |     "        self.decoder = decoder\n",
497 |     "        self.pad_idx = pad_idx\n",
498 |     "        self.device = device\n",
499 |     "        \n",
500 |     "    def make_masks(self, \n",
501 |     "                   src: Tensor, \n",
502 |     "                   trg: Tensor):\n",
503 |     "        \n",
504 |     "        #src = [batch size, src sent len]\n",
505 |     "        #trg = [batch size, trg sent len]\n",
506 |     "        \n",
507 |     "        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)\n",
508 |     "        \n",
509 |     "        trg_pad_mask = (trg != self.pad_idx).unsqueeze(1).unsqueeze(3)\n",
510 |     "\n",
511 |     "        trg_len = trg.shape[1]\n",
512 |     "        \n",
513 |     "        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), dtype=torch.uint8, device=self.device))\n",
514 |     "        \n",
515 |     "        trg_mask = trg_pad_mask & trg_sub_mask\n",
516 |     "        \n",
517 |     "        return src_mask, trg_mask\n",
518 |     "    \n",
519 |     "    def forward(self, \n",
520 |     "                src: Tensor, \n",
521 |     "                trg: Tensor):\n",
522 |     "        \n",
523 |     "        #src = [batch size, src sent len]\n",
524 |     "        #trg = [batch size, trg sent len]\n",
525 |     "                \n",
526 |     "        src_mask, trg_mask = self.make_masks(src, trg)\n",
527 |     "        \n",
528 |     "        enc_src = self.encoder(src, src_mask)\n",
529 |     "        \n",
530 |     "        #enc_src = [batch size, src sent len, hid dim]\n",
531 |     "                \n",
532 |     "        out = self.decoder(trg, enc_src, trg_mask, src_mask)\n",
533 |     "        \n",
534 |     "        #out = [batch size, trg sent len, output dim]\n",
535 |     "        \n",
536 |     "        return out"
537 |    ]
538 |   },
539 |   {
540 |    "cell_type": "code",
541 |    "execution_count": 59,
542 |    "metadata": {},
543 |    "outputs": [],
544 |    "source": [
545 |     "input_dim = len(SRC.vocab)\n",
546 |     "hid_dim = 512\n",
547 |     "# n_layers = 2\n",
548 |     "n_layers = 6\n",
549 |     "n_heads = 8\n",
550 |     "pf_dim = 2048\n",
551 |     "dropout = 0.1\n",
552 |     "\n",
553 |     "enc = Encoder(input_dim, hid_dim, n_layers, n_heads, pf_dim, EncoderLayer, SelfAttention, PositionwiseFeedforward, dropout, device)"
554 |    ]
555 |   },
556 |   {
557 |    "cell_type": "code",
558 |    "execution_count": 60,
559 |    "metadata": {},
560 |    "outputs": [],
561 |    "source": [
562 |     "output_dim = len(TRG.vocab)\n",
563 |     "hid_dim = 512\n",
564 |     "n_layers = 6\n",
565 |     "# n_layers = 2\n",
566 |     "n_heads = 8\n",
567 |     "pf_dim = 2048\n",
568 |     "dropout = 0.1\n",
569 |     "\n",
570 |     "dec = Decoder(output_dim, hid_dim, n_layers, n_heads, pf_dim, DecoderLayer, SelfAttention, PositionwiseFeedforward, dropout, device)"
571 |    ]
572 |   },
573 |   {
574 |    "cell_type": "code",
575 |    "execution_count": 61,
576 |    "metadata": {},
577 |    "outputs": [],
578 |    "source": [
579 |     "pad_idx = SRC.vocab.stoi['<pad>']\n",
580 |     "\n",
581 |     "model = Seq2Seq(enc, dec, pad_idx, device).to(device)"
582 |    ]
583 |   },
584 |   {
585 |    "cell_type": "code",
586 |    "execution_count": 62,
587 |    "metadata": {},
588 |    "outputs": [
589 |     {
590 |      "name": "stdout",
591 |      "output_type": "stream",
592 |      "text": [
593 |       "The model has 55,206,149 trainable parameters\n"
594 |      ]
595 |     }
596 |    ],
597 |    "source": [
598 |     "def count_parameters(model: nn.Module):\n",
599 |     "    return sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
600 |     "\n",
601 |     "print(f'The model has {count_parameters(model):,} trainable parameters')"
602 |    ]
603 |   },
604 |   {
605 |    "cell_type": "code",
606 |    "execution_count": 63,
607 |    "metadata": {},
608 |    "outputs": [],
609 |    "source": [
610 |     "for p in model.parameters():\n",
611 |     "    if p.dim() > 1:\n",
612 |     "        nn.init.xavier_uniform_(p)"
613 |    ]
614 |   },
615 |   {
616 |    "cell_type": "code",
617 |    "execution_count": 66,
618 |    "metadata": {},
619 |    "outputs": [],
620 |    "source": [
621 |     "class NoamOpt:\n",
622 |     "    \"Optim wrapper that implements rate.\"\n",
623 |     "    def __init__(self, \n",
624 |     "                 model_size: int, \n",
625 |     "                 factor: int, \n",
626 |     "                 warmup: int, \n",
627 |     "                 optimizer: Optimizer):\n",
628 |     "        self.optimizer = optimizer\n",
629 |     "        self._step = 0\n",
630 |     "        self.warmup = warmup\n",
631 |     "        self.factor = factor\n",
632 |     "        self.model_size = model_size\n",
633 |     "        self._rate = 0\n",
634 |     "        \n",
635 |     "    def step(self):\n",
636 |     "        \"Update parameters and rate\"\n",
637 |     "        self._step += 1\n",
638 |     "        rate = self.rate()\n",
639 |     "        for p in self.optimizer.param_groups:\n",
640 |     "            p['lr'] = rate\n",
641 |     "        self._rate = rate\n",
642 |     "        self.optimizer.step()\n",
643 |     "        \n",
644 |     "    def rate(self, step = None):\n",
645 |     "        \"Implement `lrate` above\"\n",
646 |     "        if step is None:\n",
647 |     "            step = self._step\n",
648 |     "        return self.factor * \\\n",
649 |     "            (self.model_size ** (-0.5) *\n",
650 |     "            min(step ** (-0.5), step * self.warmup ** (-1.5)))"
651 |    ]
652 |   },
653 |   {
654 |    "cell_type": "code",
655 |    "execution_count": 67,
656 |    "metadata": {},
657 |    "outputs": [],
658 |    "source": [
659 |     "optimizer = NoamOpt(hid_dim, 1, 2000,\n",
660 |     "            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))"
661 |    ]
662 |   },
663 |   {
664 |    "cell_type": "code",
665 |    "execution_count": 68,
666 |    "metadata": {},
667 |    "outputs": [],
668 |    "source": [
669 |     "criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)"
670 |    ]
671 |   },
672 |   {
673 |    "cell_type": "code",
674 |    "execution_count": null,
675 |    "metadata": {},
676 |    "outputs": [],
677 |    "source": [
678 |     "def train(model: nn.Module, \n",
679 |     "          iterator: BucketIterator, \n",
680 |     "          optimizer: optim.Adam, \n",
681 |     "          criterion: nn.modules.loss.CrossEntropyLoss, \n",
682 |     "          clip: float):\n",
683 |     "\n",
684 |     "    model.train()\n",
685 |     "    \n",
686 |     "    epoch_loss = 0\n",
687 |     "    \n",
688 |     "    for i, batch in enumerate(iterator):\n",
689 |     "        \n",
690 |     "        src = batch.src\n",
691 |     "        trg = batch.trg\n",
692 |     "        \n",
693 |     "        optimizer.optimizer.zero_grad()\n",
694 |     "        \n",
695 |     "        output = model(src, trg[:,:-1])\n",
696 |     "                \n",
697 |     "        #output = [batch size, trg sent len - 1, output dim]\n",
698 |     "        #trg = [batch size, trg sent len]\n",
699 |     "            \n",
700 |     "        output = output.contiguous().view(-1, output.shape[-1])\n",
701 |     "        trg = trg[:,1:].contiguous().view(-1)\n",
702 |     "                \n",
703 |     "        #output = [batch size * trg sent len - 1, output dim]\n",
704 |     "        #trg = [batch size * trg sent len - 1]\n",
705 |     "            \n",
706 |     "        loss = criterion(output, trg)\n",
707 |     "        \n",
708 |     "        loss.backward()\n",
709 |     "        \n",
710 |     "        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n",
711 |     "        \n",
712 |     "        optimizer.step()\n",
713 |     "        \n",
714 |     "        epoch_loss += loss.item()\n",
715 |     "        \n",
716 |     "    return epoch_loss / len(iterator)"
717 |    ]
718 |   },
719 |   {
720 |    "cell_type": "code",
721 |    "execution_count": null,
722 |    "metadata": {},
723 |    "outputs": [],
724 |    "source": [
725 |     "def evaluate(model: nn.Module, \n",
726 |     "             iterator: BucketIterator, \n",
727 |     "             criterion: nn.modules.loss.CrossEntropyLoss):\n",
728 |     "    \n",
729 |     "    model.eval()\n",
730 |     "    \n",
731 |     "    epoch_loss = 0\n",
732 |     "    \n",
733 |     "    with torch.no_grad():\n",
734 |     "    \n",
735 |     "        for i, batch in enumerate(iterator):\n",
736 |     "\n",
737 |     "            src = batch.src\n",
738 |     "            trg = batch.trg\n",
739 |     "\n",
740 |     "            output = model(src, trg[:,:-1])\n",
741 |     "            \n",
742 |     "            #output = [batch size, trg sent len - 1, output dim]\n",
743 |     "            #trg = [batch size, trg sent len]\n",
744 |     "            \n",
745 |     "            output = output.contiguous().view(-1, output.shape[-1])\n",
746 |     "            trg = trg[:,1:].contiguous().view(-1)\n",
747 |     "            \n",
748 |     "            #output = [batch size * trg sent len - 1, output dim]\n",
749 |     "            #trg = [batch size * trg sent len - 1]\n",
750 |     "            \n",
751 |     "            loss = criterion(output, trg)\n",
752 |     "\n",
753 |     "            epoch_loss += loss.item()\n",
754 |     "        \n",
755 |     "    return epoch_loss / len(iterator)"
756 |    ]
757 |   },
758 |   {
759 |    "cell_type": "code",
760 |    "execution_count": null,
761 |    "metadata": {},
762 |    "outputs": [],
763 |    "source": [
764 |     "def epoch_time(start_time: int, \n",
765 |     "               end_time: int):\n",
766 |     "    elapsed_time = end_time - start_time\n",
767 |     "    elapsed_mins = int(elapsed_time / 60)\n",
768 |     "    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n",
769 |     "    return elapsed_mins, elapsed_secs"
770 |    ]
771 |   },
772 |   {
773 |    "cell_type": "code",
774 |    "execution_count": null,
775 |    "metadata": {},
776 |    "outputs": [],
777 |    "source": [
778 |     "N_EPOCHS = 1\n",
779 |     "CLIP = 1\n",
780 |     "SAVE_DIR = 'models'\n",
781 |     "MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'transformer-seq2seq.pt')\n",
782 |     "\n",
783 |     "best_valid_loss = float('inf')\n",
784 |     "\n",
785 |     "if not os.path.isdir(f'{SAVE_DIR}'):\n",
786 |     "    os.makedirs(f'{SAVE_DIR}')\n",
787 |     "\n",
788 |     "for epoch in range(N_EPOCHS):\n",
789 |     "    \n",
790 |     "    start_time = time.time()\n",
791 |     "    \n",
792 |     "    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)\n",
793 |     "    valid_loss = evaluate(model, valid_iterator, criterion)\n",
794 |     "    \n",
795 |     "    end_time = time.time()\n",
796 |     "    \n",
797 |     "    epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n",
798 |     "    \n",
799 |     "    if valid_loss < best_valid_loss:\n",
800 |     "        best_valid_loss = valid_loss\n",
801 |     "        torch.save(model.state_dict(), MODEL_SAVE_PATH)\n",
802 |     "    \n",
803 |     "    print(f'| Epoch: {epoch+1:03} | Time: {epoch_mins}m {epoch_secs}s| Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f} |')"
804 |    ]
805 |   },
806 |   {
807 |    "cell_type": "code",
808 |    "execution_count": null,
809 |    "metadata": {},
810 |    "outputs": [],
811 |    "source": [
812 |     "model.load_state_dict(torch.load(MODEL_SAVE_PATH))\n",
813 |     "\n",
814 |     "test_loss = evaluate(model, test_iterator, criterion)\n",
815 |     "\n",
816 |     "print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')"
817 |    ]
818 |   }
819 |  ],
820 |  "metadata": {
821 |   "kernelspec": {
822 |    "display_name": "Environment (conda_pytorch_p36)",
823 |    "language": "python",
824 |    "name": "conda_pytorch_p36"
825 |   },
826 |   "language_info": {
827 |    "codemirror_mode": {
828 |     "name": "ipython",
829 |     "version": 3
830 |    },
831 |    "file_extension": ".py",
832 |    "mimetype": "text/x-python",
833 |    "name": "python",
834 |    "nbconvert_exporter": "python",
835 |    "pygments_lexer": "ipython3",
836 |    "version": "3.7.3"
837 |   }
838 |  },
839 |  "nbformat": 4,
840 |  "nbformat_minor": 2
841 | }
842 | 


--------------------------------------------------------------------------------
/2 - Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# 2 - Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation\n",
  8 |     "\n",
  9 |     "In this second notebook on sequence-to-sequence models using PyTorch and TorchText, we'll be implementing the model from [Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation](https://arxiv.org/abs/1406.1078). This model will achieve improved test perplexity whilst only using a single layer RNN in both the encoder and the decoder.\n",
 10 |     "\n",
 11 |     "## Introduction\n",
 12 |     "\n",
 13 |     "Let's remind ourselves of the general encoder-decoder model.\n",
 14 |     "\n",
 15 |     "![](assets/seq2seq1.png)\n",
 16 |     "\n",
 17 |     "We use our encoder (green) over the source sequence to create a context vector (red). We then use that context vector with the decoder (blue) and a linear layer (purple) to generate the target sentence.\n",
 18 |     "\n",
 19 |     "In the previous model, we used an multi-layered LSTM as the encoder and decoder.\n",
 20 |     "\n",
 21 |     "![](assets/seq2seq4.png)\n",
 22 |     "\n",
 23 |     "One downside of the previous model is that the decoder is trying to cram lots of information into the hidden states. Whilst decoding, the hidden state will need to contain information about the whole of the source sequence, as well as all of the tokens have been decoded so far. By alleviating some of this information compression, we can create a better model!\n",
 24 |     "\n",
 25 |     "We'll also be using a GRU (Gated Recurrent Unit) instead of an LSTM (Long Short-Term Memory). Why? Mainly because that's what they did in the paper (this paper also introduced GRUs) and also because we used LSTMs last time. If you want to understand how GRUs (and LSTMs) differ from standard RNNS, check out [this](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) link. Is a GRU better than an LSTM? [Research](https://arxiv.org/abs/1412.3555) has shown they're pretty much the same, and both are better than standard RNNs. \n",
 26 |     "\n",
 27 |     "## Preparing Data\n",
 28 |     "\n",
 29 |     "All of the data preparation will be (almost) the same as last time, so I'll very briefly detail what each code block does. See the previous notebook if you've forgotten.\n",
 30 |     "\n",
 31 |     "We'll import PyTorch, TorchText, spaCy and a few standard modules."
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 13,
 37 |    "metadata": {},
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "import torch\n",
 41 |     "import torch.nn as nn\n",
 42 |     "import torch.optim as optim\n",
 43 |     "from torch import Tensor\n",
 44 |     "\n",
 45 |     "from torchtext.datasets import TranslationDataset, Multi30k\n",
 46 |     "from torchtext.data import Field, BucketIterator\n",
 47 |     "\n",
 48 |     "import spacy\n",
 49 |     "\n",
 50 |     "import random\n",
 51 |     "import math\n",
 52 |     "import time"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "markdown",
 57 |    "metadata": {},
 58 |    "source": [
 59 |     "Then set a random seed for deterministic results/reproducability."
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": 2,
 65 |    "metadata": {},
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "SEED = 1234\n",
 69 |     "\n",
 70 |     "random.seed(SEED)\n",
 71 |     "torch.manual_seed(SEED)\n",
 72 |     "torch.backends.cudnn.deterministic = True"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "Instantiate our German and English spaCy models."
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": 3,
 85 |    "metadata": {},
 86 |    "outputs": [],
 87 |    "source": [
 88 |     "spacy_de = spacy.load('de')\n",
 89 |     "spacy_en = spacy.load('en')"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "markdown",
 94 |    "metadata": {},
 95 |    "source": [
 96 |     "Previously we reversed the source (German) sentence, however in the paper we are implementing they don't do this, so neither will we."
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": 4,
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "def tokenize_de(text):\n",
106 |     "    \"\"\"\n",
107 |     "    Tokenizes German text from a string into a list of strings\n",
108 |     "    \"\"\"\n",
109 |     "    return [tok.text for tok in spacy_de.tokenizer(text)]\n",
110 |     "\n",
111 |     "def tokenize_en(text):\n",
112 |     "    \"\"\"\n",
113 |     "    Tokenizes English text from a string into a list of strings\n",
114 |     "    \"\"\"\n",
115 |     "    return [tok.text for tok in spacy_en.tokenizer(text)]"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "markdown",
120 |    "metadata": {},
121 |    "source": [
122 |     "Create our fields to process our data. This will append the \"start of sentence\" and \"end of sentence\" tokens as well as converting all words to lowercase."
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": 5,
128 |    "metadata": {},
129 |    "outputs": [],
130 |    "source": [
131 |     "SRC = Field(tokenize=tokenize_de, \n",
132 |     "            init_token='<sos>', \n",
133 |     "            eos_token='<eos>', \n",
134 |     "            lower=True)\n",
135 |     "\n",
136 |     "TRG = Field(tokenize = tokenize_en, \n",
137 |     "            init_token='<sos>', \n",
138 |     "            eos_token='<eos>', \n",
139 |     "            lower=True)"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "markdown",
144 |    "metadata": {},
145 |    "source": [
146 |     "Load our data."
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": 6,
152 |    "metadata": {},
153 |    "outputs": [],
154 |    "source": [
155 |     "train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), \n",
156 |     "                                                    fields = (SRC, TRG))"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "markdown",
161 |    "metadata": {},
162 |    "source": [
163 |     "We'll also print out an example just to double check they're not reversed."
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": 7,
169 |    "metadata": {},
170 |    "outputs": [
171 |     {
172 |      "name": "stdout",
173 |      "output_type": "stream",
174 |      "text": [
175 |       "{'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}\n"
176 |      ]
177 |     }
178 |    ],
179 |    "source": [
180 |     "print(vars(train_data.examples[0]))"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "markdown",
185 |    "metadata": {},
186 |    "source": [
187 |     "Then create our vocabulary, converting all tokens appearing less than twice into `<unk>` tokens."
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": 8,
193 |    "metadata": {},
194 |    "outputs": [],
195 |    "source": [
196 |     "SRC.build_vocab(train_data, min_freq = 2)\n",
197 |     "TRG.build_vocab(train_data, min_freq = 2)"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "markdown",
202 |    "metadata": {},
203 |    "source": [
204 |     "Finally, define the `device` and create our iterators."
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": 9,
210 |    "metadata": {},
211 |    "outputs": [],
212 |    "source": [
213 |     "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": 10,
219 |    "metadata": {},
220 |    "outputs": [],
221 |    "source": [
222 |     "BATCH_SIZE = 128\n",
223 |     "\n",
224 |     "train_iterator, valid_iterator, test_iterator = BucketIterator.splits(\n",
225 |     "    (train_data, valid_data, test_data), \n",
226 |     "    batch_size = BATCH_SIZE, \n",
227 |     "    device = device)"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "markdown",
232 |    "metadata": {},
233 |    "source": [
234 |     "## Building the Seq2Seq Model\n",
235 |     "\n",
236 |     "### Encoder\n",
237 |     "\n",
238 |     "The encoder is similar to the previous one, with the multi-layer LSTM swapped for a single-layer GRU. We also don't pass the dropout as an argument to the GRU as that dropout is used between each layer of a multi-layered RNN. As we only have a single layer, PyTorch will display a warning if we try and use pass a dropout value to it.\n",
239 |     "\n",
240 |     "Another thing to note about the GRU is that it only requires and returns a hidden state, there is no cell state like in the LSTM.\n",
241 |     "\n",
242 |     "$$\\begin{align*}\n",
243 |     "h_t &= \\text{GRU}(x_t, h_{t-1})\\\\\n",
244 |     "(h_t, c_t) &= \\text{LSTM}(x_t, (h_{t-1}, c_{t-1}))\\\\\n",
245 |     "h_t &= \\text{RNN}(x_t, h_{t-1})\n",
246 |     "\\end{align*}$$\n",
247 |     "\n",
248 |     "From the equations above, it looks like the RNN and the GRU are identical. Inside the GRU, however, is a number of *gating mechanisms* that control the information flow in to and out of the hidden state (similar to an LSTM). Again, for more info, check out [this](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) excellent post. \n",
249 |     "\n",
250 |     "The rest of the encoder should be very familar from the last tutorial, it takes in a sequence, $X = \\{x_1, x_2, ... , x_T\\}$, recurrently calculates hidden states, $H = \\{h_1, h_2, ..., h_T\\}$, and returns a context vector (the final hidden state), $z=h_T$.\n",
251 |     "\n",
252 |     "$$h_t = \\text{EncoderGRU}(x_t, h_{t-1})$$\n",
253 |     "\n",
254 |     "This is identical to the encoder of the general seq2seq model, with all the \"magic\" happening inside the GRU (green squares).\n",
255 |     "\n",
256 |     "![](assets/seq2seq5.png)"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": 11,
262 |    "metadata": {},
263 |    "outputs": [],
264 |    "source": [
265 |     "class Encoder(nn.Module):\n",
266 |     "    def __init__(self, \n",
267 |     "                 input_dim: int, \n",
268 |     "                 emb_dim: int, \n",
269 |     "                 hid_dim: int, \n",
270 |     "                 dropout: float):\n",
271 |     "        super().__init__()\n",
272 |     "        \n",
273 |     "        self.input_dim = input_dim\n",
274 |     "        self.emb_dim = emb_dim\n",
275 |     "        self.hid_dim = hid_dim\n",
276 |     "        self.dropout = dropout\n",
277 |     "        \n",
278 |     "        self.embedding = nn.Embedding(input_dim, emb_dim) #no dropout as only one layer!\n",
279 |     "        \n",
280 |     "        self.rnn = nn.GRU(emb_dim, hid_dim)\n",
281 |     "        \n",
282 |     "        self.dropout = nn.Dropout(dropout)\n",
283 |     "        \n",
284 |     "    def forward(self, src):\n",
285 |     "        \n",
286 |     "        #src = [src sent len, batch size]\n",
287 |     "        \n",
288 |     "        embedded = self.dropout(self.embedding(src))\n",
289 |     "        \n",
290 |     "        #embedded = [src sent len, batch size, emb dim]\n",
291 |     "        \n",
292 |     "        outputs, hidden = self.rnn(embedded) #no cell state!\n",
293 |     "        \n",
294 |     "        #outputs = [src sent len, batch size, hid dim * n directions]\n",
295 |     "        #hidden = [n layers * n directions, batch size, hid dim]\n",
296 |     "        \n",
297 |     "        #outputs are always from the top hidden layer\n",
298 |     "        \n",
299 |     "        return hidden"
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "markdown",
304 |    "metadata": {},
305 |    "source": [
306 |     "## Decoder\n",
307 |     "\n",
308 |     "The decoder is where the implementation differs significantly from the previous model and we alleviate some of the information compression.\n",
309 |     "\n",
310 |     "Instead of the GRU in the decoder taking just the target token, $y_t$ and the previous hidden state $s_{t-1}$ as inputs, it also takes the context vector $z$. \n",
311 |     "\n",
312 |     "$$s_t = \\text{DecoderGRU}(y_t, s_{t-1}, z)$$\n",
313 |     "\n",
314 |     "Note how this context vector, $z$, does not have a $t$ subscript, meaning we re-use the same context vector returned by the encoder for every time-step in the decoder. \n",
315 |     "\n",
316 |     "Before, we predicted the next token, $\\hat{y}_{t+1}$, with the linear layer, $f$, only using the top-layer decoder hidden state at that time-step, $s_t$, as $\\hat{y}_{t+1}=f(s_t^L)$. Now, we also pass the current token, $\\hat{y}_t$ and the context vector, $z$ to the linear layer.\n",
317 |     "\n",
318 |     "$$\\hat{y}_{t+1} = f(y_t, s_t, z)$$\n",
319 |     "\n",
320 |     "Thus, our decoder now looks something like this:\n",
321 |     "\n",
322 |     "![](assets/seq2seq6.png)\n",
323 |     "\n",
324 |     "Note, the initial hidden state, $s_0$, is still the context vector, $z$, so when generating the first token we are actually inputting two identical context vectors into the GRU.\n",
325 |     "\n",
326 |     "How do these two changes reduce the information compression? Well, hypothetically the decoder hidden states, $s_t$, no longer need to contain information about the source sequence as it is always available as an input. Thus, it only needs to contain information about what tokens it has generated so far. The addition of $y_t$ to the linear layer also means this layer can directly see what the token is, without having to get this information from the hidden state. \n",
327 |     "\n",
328 |     "However, this hypothesis is just a hypothesis, it is impossible to determine how the model actually uses the information provided to it (don't listen to anyone that tells you differently). Nevertheless, it is a solid intuition and the results seem to indicate that this modifications are a good idea!\n",
329 |     "\n",
330 |     "Within the implementation, we will pass $y_t$ and $z$ to the GRU by concatenating them together, so the input dimensions to the GRU are now `emb_dim + hid_dim` (as context vector will be of size `hid_dim`). The linear layer will take $y_t, s_t$ and $z$ also by concatenating them together, hence the input dimensions are now `emb_dim + hid_dim*2`. We also don't pass a value of dropout to the GRU as it only uses a single layer.\n",
331 |     "\n",
332 |     "`forward` now takes a `context` argument. Inside of `forward`, we concatenate $y_t$ and $z$ as `emb_con` before feeding to the GRU, and we concatenate $y_t$, $s_t$ and $z$ together as `output` before feeding it through the linear layer to receive our predictions, $\\hat{y}_{t+1}$."
333 |    ]
334 |   },
335 |   {
336 |    "cell_type": "code",
337 |    "execution_count": 14,
338 |    "metadata": {},
339 |    "outputs": [],
340 |    "source": [
341 |     "class Decoder(nn.Module):\n",
342 |     "    def __init__(self, \n",
343 |     "                 output_dim: int, \n",
344 |     "                 emb_dim: int, \n",
345 |     "                 hid_dim: int, \n",
346 |     "                 dropout: float):\n",
347 |     "        super().__init__()\n",
348 |     "\n",
349 |     "        self.emb_dim = emb_dim\n",
350 |     "        self.hid_dim = hid_dim\n",
351 |     "        self.output_dim = output_dim\n",
352 |     "        self.dropout = dropout\n",
353 |     "        \n",
354 |     "        self.embedding = nn.Embedding(output_dim, emb_dim)\n",
355 |     "        \n",
356 |     "        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)\n",
357 |     "        \n",
358 |     "        self.out = nn.Linear(emb_dim + hid_dim * 2, output_dim)\n",
359 |     "        \n",
360 |     "        self.dropout = nn.Dropout(dropout)\n",
361 |     "        \n",
362 |     "    def forward(self, \n",
363 |     "                input: Tensor, \n",
364 |     "                hidden: Tensor, \n",
365 |     "                context: Tensor):\n",
366 |     "        \n",
367 |     "        #input = [batch size]\n",
368 |     "        #hidden = [n layers * n directions, batch size, hid dim]\n",
369 |     "        #context = [n layers * n directions, batch size, hid dim]\n",
370 |     "        \n",
371 |     "        #n layers and n directions in the decoder will both always be 1, therefore:\n",
372 |     "        #hidden = [1, batch size, hid dim]\n",
373 |     "        #context = [1, batch size, hid dim]\n",
374 |     "        \n",
375 |     "        input = input.unsqueeze(0)\n",
376 |     "        \n",
377 |     "        #input = [1, batch size]\n",
378 |     "        \n",
379 |     "        embedded = self.dropout(self.embedding(input))\n",
380 |     "        \n",
381 |     "        #embedded = [1, batch size, emb dim]\n",
382 |     "                \n",
383 |     "        emb_con = torch.cat((embedded, context), dim = 2)\n",
384 |     "            \n",
385 |     "        #emb_con = [1, batch size, emb dim + hid dim]\n",
386 |     "            \n",
387 |     "        output, hidden = self.rnn(emb_con, hidden)\n",
388 |     "        \n",
389 |     "        #output = [sent len, batch size, hid dim * n directions]\n",
390 |     "        #hidden = [n layers * n directions, batch size, hid dim]\n",
391 |     "        \n",
392 |     "        #sent len, n layers and n directions will always be 1 in the decoder, therefore:\n",
393 |     "        #output = [1, batch size, hid dim]\n",
394 |     "        #hidden = [1, batch size, hid dim]\n",
395 |     "        \n",
396 |     "        output = torch.cat((embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)), \n",
397 |     "                           dim = 1)\n",
398 |     "        \n",
399 |     "        #output = [batch size, emb dim + hid dim * 2]\n",
400 |     "        \n",
401 |     "        prediction = self.out(output)\n",
402 |     "        \n",
403 |     "        #prediction = [batch size, output dim]\n",
404 |     "        \n",
405 |     "        return prediction, hidden"
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "markdown",
410 |    "metadata": {},
411 |    "source": [
412 |     "## Seq2Seq Model\n",
413 |     "\n",
414 |     "Putting the encoder and decoder together, we get:\n",
415 |     "\n",
416 |     "![](assets/seq2seq7.png)\n",
417 |     "\n",
418 |     "Again, in this implementation we need to ensure the hidden dimensions in both the encoder and the decoder are the same.\n",
419 |     "\n",
420 |     "Briefly going over all of the steps:\n",
421 |     "- the `outputs` tensor is created to hold all predictions, $\\hat{Y}$\n",
422 |     "- the source sequence, $X$, is fed into the encoder to create a `context` vector\n",
423 |     "- the initial decoder hidden state is set to be the `context` vector, $s_0 = z = h_T$\n",
424 |     "- we use a batch of `<sos>` tokens as the first `input`, $y_1$\n",
425 |     "- we then decode within a loop:\n",
426 |     "  - inserting the input token $y_t$, previous hidden state, $s_{t-1}$, and the context vector, $z$, into the decoder\n",
427 |     "  - receiving a prediction, $\\hat{y}_{t+1}$, and a new hidden state, $s_t$\n",
428 |     "  - we then decide if we are going to teacher force or not, setting the next input as appropriate"
429 |    ]
430 |   },
431 |   {
432 |    "cell_type": "code",
433 |    "execution_count": 16,
434 |    "metadata": {},
435 |    "outputs": [],
436 |    "source": [
437 |     "class Seq2Seq(nn.Module):\n",
438 |     "    def __init__(self, \n",
439 |     "                 encoder: nn.Module, \n",
440 |     "                 decoder: nn.Module, \n",
441 |     "                 device: torch.device):\n",
442 |     "        super().__init__()\n",
443 |     "        \n",
444 |     "        self.encoder = encoder\n",
445 |     "        self.decoder = decoder\n",
446 |     "        self.device = device\n",
447 |     "        \n",
448 |     "        assert encoder.hid_dim == decoder.hid_dim, \\\n",
449 |     "            \"Hidden dimensions of encoder and decoder must be equal!\"\n",
450 |     "        \n",
451 |     "    def forward(self, \n",
452 |     "                src: Tensor, \n",
453 |     "                trg: Tensor, \n",
454 |     "                teacher_forcing_ratio = 0.5):\n",
455 |     "        \n",
456 |     "        #src = [src sent len, batch size]\n",
457 |     "        #trg = [trg sent len, batch size]\n",
458 |     "        #teacher_forcing_ratio is probability to use teacher forcing\n",
459 |     "        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time\n",
460 |     "        \n",
461 |     "        batch_size = trg.shape[1]\n",
462 |     "        max_len = trg.shape[0]\n",
463 |     "        trg_vocab_size = self.decoder.output_dim\n",
464 |     "        \n",
465 |     "        #tensor to store decoder outputs\n",
466 |     "        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)\n",
467 |     "        \n",
468 |     "        #last hidden state of the encoder is the context\n",
469 |     "        context = self.encoder(src)\n",
470 |     "        \n",
471 |     "        #context also used as the initial hidden state of the decoder\n",
472 |     "        hidden = context\n",
473 |     "        \n",
474 |     "        #first input to the decoder is the <sos> tokens\n",
475 |     "        input = trg[0,:]\n",
476 |     "        \n",
477 |     "        for t in range(1, max_len):\n",
478 |     "            \n",
479 |     "            output, hidden = self.decoder(input, hidden, context)\n",
480 |     "            outputs[t] = output\n",
481 |     "            teacher_force = random.random() < teacher_forcing_ratio\n",
482 |     "            top1 = output.max(1)[1]\n",
483 |     "            input = (trg[t] if teacher_force else top1)\n",
484 |     "\n",
485 |     "        return outputs"
486 |    ]
487 |   },
488 |   {
489 |    "cell_type": "markdown",
490 |    "metadata": {},
491 |    "source": [
492 |     "# Training the Seq2Seq Model\n",
493 |     "\n",
494 |     "The rest of this tutorial is very similar to the previous one. \n",
495 |     "\n",
496 |     "We initialise our encoder, decoder and seq2seq model (placing it on the GPU if we have one). As before, the embedding dimensions and the amount of dropout used can be different between the encoder and the decoder, but the hidden dimensions must remain the same."
497 |    ]
498 |   },
499 |   {
500 |    "cell_type": "code",
501 |    "execution_count": 17,
502 |    "metadata": {},
503 |    "outputs": [],
504 |    "source": [
505 |     "INPUT_DIM = len(SRC.vocab)\n",
506 |     "OUTPUT_DIM = len(TRG.vocab)\n",
507 |     "ENC_EMB_DIM = 256\n",
508 |     "DEC_EMB_DIM = 256\n",
509 |     "HID_DIM = 512\n",
510 |     "ENC_DROPOUT = 0.5\n",
511 |     "DEC_DROPOUT = 0.5\n",
512 |     "\n",
513 |     "enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)\n",
514 |     "dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)\n",
515 |     "\n",
516 |     "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
517 |     "\n",
518 |     "model = Seq2Seq(enc, dec, device).to(device)"
519 |    ]
520 |   },
521 |   {
522 |    "cell_type": "markdown",
523 |    "metadata": {},
524 |    "source": [
525 |     "Next, we initialize our parameters. The paper states the parameters are initialized from a normal distribution with a mean of 0 and a standard deviation of 0.01, i.e. $\\mathcal{N}(0, 0.01)$. \n",
526 |     "\n",
527 |     "It also states we should initialize the recurrent parameters to a special initialization, however to keep things simple we'll also initialize them to $\\mathcal{N}(0, 0.01)$."
528 |    ]
529 |   },
530 |   {
531 |    "cell_type": "code",
532 |    "execution_count": 18,
533 |    "metadata": {},
534 |    "outputs": [
535 |     {
536 |      "data": {
537 |       "text/plain": [
538 |        "Seq2Seq(\n",
539 |        "  (encoder): Encoder(\n",
540 |        "    (embedding): Embedding(7855, 256)\n",
541 |        "    (rnn): GRU(256, 512)\n",
542 |        "    (dropout): Dropout(p=0.5)\n",
543 |        "  )\n",
544 |        "  (decoder): Decoder(\n",
545 |        "    (embedding): Embedding(5893, 256)\n",
546 |        "    (rnn): GRU(768, 512)\n",
547 |        "    (out): Linear(in_features=1280, out_features=5893, bias=True)\n",
548 |        "    (dropout): Dropout(p=0.5)\n",
549 |        "  )\n",
550 |        ")"
551 |       ]
552 |      },
553 |      "execution_count": 18,
554 |      "metadata": {},
555 |      "output_type": "execute_result"
556 |     }
557 |    ],
558 |    "source": [
559 |     "def init_weights(m):\n",
560 |     "    for name, param in m.named_parameters():\n",
561 |     "        nn.init.normal_(param.data, mean=0, std=0.01)\n",
562 |     "        \n",
563 |     "model.apply(init_weights)"
564 |    ]
565 |   },
566 |   {
567 |    "cell_type": "markdown",
568 |    "metadata": {},
569 |    "source": [
570 |     "We print out the number of parameters.\n",
571 |     "\n",
572 |     "Even though we only have a single layer RNN for our encoder and decoder we actually have **more** parameters  than the last model. This is due to the increased size of the inputs to the GRU and the linear layer. However, it is not a significant amount of parameters and causes a minimal amount of increase in training time (~3 seconds per epoch extra)."
573 |    ]
574 |   },
575 |   {
576 |    "cell_type": "code",
577 |    "execution_count": 19,
578 |    "metadata": {},
579 |    "outputs": [
580 |     {
581 |      "name": "stdout",
582 |      "output_type": "stream",
583 |      "text": [
584 |       "The model has 14,220,293 trainable parameters\n"
585 |      ]
586 |     }
587 |    ],
588 |    "source": [
589 |     "def count_parameters(model):\n",
590 |     "    return sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
591 |     "\n",
592 |     "print(f'The model has {count_parameters(model):,} trainable parameters')"
593 |    ]
594 |   },
595 |   {
596 |    "cell_type": "markdown",
597 |    "metadata": {},
598 |    "source": [
599 |     "We initiaize our optimizer."
600 |    ]
601 |   },
602 |   {
603 |    "cell_type": "code",
604 |    "execution_count": 20,
605 |    "metadata": {},
606 |    "outputs": [],
607 |    "source": [
608 |     "optimizer = optim.Adam(model.parameters())"
609 |    ]
610 |   },
611 |   {
612 |    "cell_type": "markdown",
613 |    "metadata": {},
614 |    "source": [
615 |     "We also initialize the loss function, making sure to ignore the loss on `<pad>` tokens."
616 |    ]
617 |   },
618 |   {
619 |    "cell_type": "code",
620 |    "execution_count": 21,
621 |    "metadata": {},
622 |    "outputs": [],
623 |    "source": [
624 |     "PAD_IDX = TRG.vocab.stoi['<pad>']\n",
625 |     "\n",
626 |     "criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)"
627 |    ]
628 |   },
629 |   {
630 |    "cell_type": "markdown",
631 |    "metadata": {},
632 |    "source": [
633 |     "We then create the training loop..."
634 |    ]
635 |   },
636 |   {
637 |    "cell_type": "code",
638 |    "execution_count": 22,
639 |    "metadata": {},
640 |    "outputs": [],
641 |    "source": [
642 |     "def train(model: nn.Module, \n",
643 |     "          iterator: BucketIterator, \n",
644 |     "          optimizer: optim.Adam, \n",
645 |     "          criterion: nn.modules.loss.CrossEntropyLoss, \n",
646 |     "          clip: float):\n",
647 |     "    \n",
648 |     "    model.train()\n",
649 |     "    \n",
650 |     "    epoch_loss = 0\n",
651 |     "    \n",
652 |     "    for i, batch in enumerate(iterator):\n",
653 |     "        \n",
654 |     "        src = batch.src\n",
655 |     "        trg = batch.trg\n",
656 |     "        \n",
657 |     "        optimizer.zero_grad()\n",
658 |     "        \n",
659 |     "        output = model(src, trg)\n",
660 |     "        \n",
661 |     "        #trg = [trg sent len, batch size]\n",
662 |     "        #output = [trg sent len, batch size, output dim]\n",
663 |     "        \n",
664 |     "        output = output[1:].view(-1, output.shape[-1])\n",
665 |     "        trg = trg[1:].view(-1)\n",
666 |     "        \n",
667 |     "        #trg = [(trg sent len - 1) * batch size]\n",
668 |     "        #output = [(trg sent len - 1) * batch size, output dim]\n",
669 |     "        \n",
670 |     "        loss = criterion(output, trg)\n",
671 |     "        \n",
672 |     "        loss.backward()\n",
673 |     "        \n",
674 |     "        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n",
675 |     "        \n",
676 |     "        optimizer.step()\n",
677 |     "        \n",
678 |     "        epoch_loss += loss.item()\n",
679 |     "        \n",
680 |     "    return epoch_loss / len(iterator)"
681 |    ]
682 |   },
683 |   {
684 |    "cell_type": "markdown",
685 |    "metadata": {},
686 |    "source": [
687 |     "...and the evaluation loop, remembering to set the model to `eval` mode and turn off teaching forcing."
688 |    ]
689 |   },
690 |   {
691 |    "cell_type": "code",
692 |    "execution_count": 23,
693 |    "metadata": {},
694 |    "outputs": [],
695 |    "source": [
696 |     "def evaluate(model: nn.Module, \n",
697 |     "             iterator: BucketIterator, \n",
698 |     "             criterion: nn.modules.loss.CrossEntropyLoss):\n",
699 |     "    \n",
700 |     "    model.eval()\n",
701 |     "    \n",
702 |     "    epoch_loss = 0\n",
703 |     "    \n",
704 |     "    with torch.no_grad():\n",
705 |     "    \n",
706 |     "        for i, batch in enumerate(iterator):\n",
707 |     "\n",
708 |     "            src = batch.src\n",
709 |     "            trg = batch.trg\n",
710 |     "\n",
711 |     "            output = model(src, trg, 0) #turn off teacher forcing\n",
712 |     "\n",
713 |     "            #trg = [trg sent len, batch size]\n",
714 |     "            #output = [trg sent len, batch size, output dim]\n",
715 |     "\n",
716 |     "            output = output[1:].view(-1, output.shape[-1])\n",
717 |     "            trg = trg[1:].view(-1)\n",
718 |     "\n",
719 |     "            #trg = [(trg sent len - 1) * batch size]\n",
720 |     "            #output = [(trg sent len - 1) * batch size, output dim]\n",
721 |     "\n",
722 |     "            loss = criterion(output, trg)\n",
723 |     "\n",
724 |     "            epoch_loss += loss.item()\n",
725 |     "        \n",
726 |     "    return epoch_loss / len(iterator)"
727 |    ]
728 |   },
729 |   {
730 |    "cell_type": "markdown",
731 |    "metadata": {},
732 |    "source": [
733 |     "We'll also define the function that calculates how long an epoch takes."
734 |    ]
735 |   },
736 |   {
737 |    "cell_type": "code",
738 |    "execution_count": 24,
739 |    "metadata": {},
740 |    "outputs": [],
741 |    "source": [
742 |     "def epoch_time(start_time: int, \n",
743 |     "               end_time: int):\n",
744 |     "    elapsed_time = end_time - start_time\n",
745 |     "    elapsed_mins = int(elapsed_time / 60)\n",
746 |     "    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n",
747 |     "    return elapsed_mins, elapsed_secs"
748 |    ]
749 |   },
750 |   {
751 |    "cell_type": "markdown",
752 |    "metadata": {},
753 |    "source": [
754 |     "Then, we train our model, saving the parameters that give us the best validation loss."
755 |    ]
756 |   },
757 |   {
758 |    "cell_type": "code",
759 |    "execution_count": 25,
760 |    "metadata": {},
761 |    "outputs": [
762 |     {
763 |      "name": "stdout",
764 |      "output_type": "stream",
765 |      "text": [
766 |       "Epoch: 01 | Time: 0m 55s\n",
767 |       "\tTrain Loss: 5.085 | Train PPL: 161.608\n",
768 |       "\t Val. Loss: 5.394 |  Val. PPL: 220.102\n",
769 |       "Epoch: 02 | Time: 0m 54s\n",
770 |       "\tTrain Loss: 4.445 | Train PPL:  85.161\n",
771 |       "\t Val. Loss: 5.224 |  Val. PPL: 185.738\n",
772 |       "Epoch: 03 | Time: 0m 53s\n",
773 |       "\tTrain Loss: 4.109 | Train PPL:  60.884\n",
774 |       "\t Val. Loss: 4.717 |  Val. PPL: 111.816\n",
775 |       "Epoch: 04 | Time: 0m 55s\n",
776 |       "\tTrain Loss: 3.751 | Train PPL:  42.551\n",
777 |       "\t Val. Loss: 4.276 |  Val. PPL:  71.962\n",
778 |       "Epoch: 05 | Time: 0m 54s\n",
779 |       "\tTrain Loss: 3.377 | Train PPL:  29.288\n",
780 |       "\t Val. Loss: 4.009 |  Val. PPL:  55.076\n",
781 |       "Epoch: 06 | Time: 0m 55s\n",
782 |       "\tTrain Loss: 3.054 | Train PPL:  21.191\n",
783 |       "\t Val. Loss: 3.803 |  Val. PPL:  44.821\n",
784 |       "Epoch: 07 | Time: 0m 55s\n",
785 |       "\tTrain Loss: 2.780 | Train PPL:  16.123\n",
786 |       "\t Val. Loss: 3.630 |  Val. PPL:  37.711\n",
787 |       "Epoch: 08 | Time: 0m 55s\n",
788 |       "\tTrain Loss: 2.522 | Train PPL:  12.456\n",
789 |       "\t Val. Loss: 3.578 |  Val. PPL:  35.807\n",
790 |       "Epoch: 09 | Time: 0m 56s\n",
791 |       "\tTrain Loss: 2.293 | Train PPL:   9.902\n",
792 |       "\t Val. Loss: 3.517 |  Val. PPL:  33.697\n",
793 |       "Epoch: 10 | Time: 0m 55s\n",
794 |       "\tTrain Loss: 2.127 | Train PPL:   8.390\n",
795 |       "\t Val. Loss: 3.549 |  Val. PPL:  34.762\n"
796 |      ]
797 |     }
798 |    ],
799 |    "source": [
800 |     "N_EPOCHS = 10\n",
801 |     "CLIP = 1\n",
802 |     "\n",
803 |     "best_valid_loss = float('inf')\n",
804 |     "\n",
805 |     "for epoch in range(N_EPOCHS):\n",
806 |     "    \n",
807 |     "    start_time = time.time()\n",
808 |     "    \n",
809 |     "    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)\n",
810 |     "    valid_loss = evaluate(model, valid_iterator, criterion)\n",
811 |     "    \n",
812 |     "    end_time = time.time()\n",
813 |     "    \n",
814 |     "    epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n",
815 |     "    \n",
816 |     "    if valid_loss < best_valid_loss:\n",
817 |     "        best_valid_loss = valid_loss\n",
818 |     "        torch.save(model.state_dict(), 'tut2-model.pt')\n",
819 |     "    \n",
820 |     "    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')\n",
821 |     "    print(f'\\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')\n",
822 |     "    print(f'\\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')"
823 |    ]
824 |   },
825 |   {
826 |    "cell_type": "markdown",
827 |    "metadata": {},
828 |    "source": [
829 |     "Finally, we test the model on the test set using these \"best\" parameters."
830 |    ]
831 |   },
832 |   {
833 |    "cell_type": "code",
834 |    "execution_count": 26,
835 |    "metadata": {},
836 |    "outputs": [
837 |     {
838 |      "name": "stdout",
839 |      "output_type": "stream",
840 |      "text": [
841 |       "| Test Loss: 3.514 | Test PPL:  33.597 |\n"
842 |      ]
843 |     }
844 |    ],
845 |    "source": [
846 |     "model.load_state_dict(torch.load('tut2-model.pt'))\n",
847 |     "\n",
848 |     "test_loss = evaluate(model, test_iterator, criterion)\n",
849 |     "\n",
850 |     "print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')"
851 |    ]
852 |   },
853 |   {
854 |    "cell_type": "markdown",
855 |    "metadata": {},
856 |    "source": [
857 |     "Just looking at the test loss, we get better performance. This is a pretty good sign that this model architecture is doing something right! Relieving the information compression seems like the way forard, and in the next tutorial we'll expand on this even further with *attention*."
858 |    ]
859 |   }
860 |  ],
861 |  "metadata": {
862 |   "kernelspec": {
863 |    "display_name": "Python 3",
864 |    "language": "python",
865 |    "name": "python3"
866 |   },
867 |   "language_info": {
868 |    "codemirror_mode": {
869 |     "name": "ipython",
870 |     "version": 3
871 |    },
872 |    "file_extension": ".py",
873 |    "mimetype": "text/x-python",
874 |    "name": "python",
875 |    "nbconvert_exporter": "python",
876 |    "pygments_lexer": "ipython3",
877 |    "version": "3.7.3"
878 |   }
879 |  },
880 |  "nbformat": 4,
881 |  "nbformat_minor": 2
882 | }
883 | 


--------------------------------------------------------------------------------
/3 - Neural Machine Translation by Jointly Learning to Align and Translate.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# 3 - Neural Machine Translation by Jointly Learning to Align and Translate\n",
   8 |     "\n",
   9 |     "In this third notebook on sequence-to-sequence models using PyTorch and TorchText, we'll be implementing the model from [Neural Machine Translation by Jointly Learning to Align and Translate](https://arxiv.org/abs/1409.0473). This model achives our best perplexity yet, ~27 compared to ~34 for the previous model.\n",
  10 |     "\n",
  11 |     "## Introduction\n",
  12 |     "\n",
  13 |     "As a reminder, here is the general encoder-decoder model:\n",
  14 |     "\n",
  15 |     "![](assets/seq2seq1.png)\n",
  16 |     "\n",
  17 |     "In the previous model, our architecture was set-up in a way to reduce \"information compression\" by explicitly passing the context vector, $z$, to the decoder at every time-step and by passing both the context vector and input word, $y_t$, along with the hidden state, $s_t$, to the linear layer, $f$, to make a prediction.\n",
  18 |     "\n",
  19 |     "![](assets/seq2seq7.png)\n",
  20 |     "\n",
  21 |     "Even though we have reduced some of this compression, our context vector still needs to contain all of the information about the source sentence. The model implemented in this notebook avoids this compression by allowing the decoder to look at the entire source sentence (via its hidden states) at each decoding step! How does it do this? It uses *attention*. \n",
  22 |     "\n",
  23 |     "Attention works by first, calculating an attention vector, $a$, that is the length of the source sentence. The attention vector has the property that each element is between 0 and 1, and the entire vector sums to 1. We then calculate a weighted sum of our source sentence hidden states, $H$, to get a weighted source vector, $w$. \n",
  24 |     "\n",
  25 |     "$$w = \\sum_{i}a_ih_i$$\n",
  26 |     "\n",
  27 |     "We calculate a new weighted source vector every time-step when decoding, using it as input to our decoder RNN as well as the linear layer to make a prediction. We'll explain how to do all of this during the tutorial.\n",
  28 |     "\n",
  29 |     "## Preparing Data\n",
  30 |     "\n",
  31 |     "Again, the preparation is similar to last time.\n",
  32 |     "\n",
  33 |     "First we import all the required modules."
  34 |    ]
  35 |   },
  36 |   {
  37 |    "cell_type": "code",
  38 |    "execution_count": 29,
  39 |    "metadata": {},
  40 |    "outputs": [],
  41 |    "source": [
  42 |     "import torch\n",
  43 |     "import torch.nn as nn\n",
  44 |     "import torch.optim as optim\n",
  45 |     "import torch.nn.functional as F\n",
  46 |     "from torch import Tensor\n",
  47 |     "\n",
  48 |     "from torchtext.datasets import TranslationDataset, Multi30k\n",
  49 |     "from torchtext.data import Field, BucketIterator"
  50 |    ]
  51 |   },
  52 |   {
  53 |    "cell_type": "code",
  54 |    "execution_count": 30,
  55 |    "metadata": {},
  56 |    "outputs": [],
  57 |    "source": [
  58 |     "import spacy\n",
  59 |     "\n",
  60 |     "from typing import Tuple\n",
  61 |     "import random\n",
  62 |     "import math\n",
  63 |     "import time"
  64 |    ]
  65 |   },
  66 |   {
  67 |    "cell_type": "markdown",
  68 |    "metadata": {},
  69 |    "source": [
  70 |     "Note: on top of the standard libraries included in the `pytorch_p36` kernel, I installed the other packages using `pip`."
  71 |    ]
  72 |   },
  73 |   {
  74 |    "cell_type": "markdown",
  75 |    "metadata": {},
  76 |    "source": [
  77 |     "Set the random seeds for reproducability."
  78 |    ]
  79 |   },
  80 |   {
  81 |    "cell_type": "code",
  82 |    "execution_count": 31,
  83 |    "metadata": {},
  84 |    "outputs": [],
  85 |    "source": [
  86 |     "SEED = 1234\n",
  87 |     "\n",
  88 |     "random.seed(SEED)\n",
  89 |     "torch.manual_seed(SEED)\n",
  90 |     "torch.backends.cudnn.deterministic = True"
  91 |    ]
  92 |   },
  93 |   {
  94 |    "cell_type": "markdown",
  95 |    "metadata": {},
  96 |    "source": [
  97 |     "Load the German and English spaCy models."
  98 |    ]
  99 |   },
 100 |   {
 101 |    "cell_type": "code",
 102 |    "execution_count": 32,
 103 |    "metadata": {},
 104 |    "outputs": [],
 105 |    "source": [
 106 |     "spacy_de = spacy.load('de')\n",
 107 |     "spacy_en = spacy.load('en')"
 108 |    ]
 109 |   },
 110 |   {
 111 |    "cell_type": "markdown",
 112 |    "metadata": {},
 113 |    "source": [
 114 |     "We create the tokenizers."
 115 |    ]
 116 |   },
 117 |   {
 118 |    "cell_type": "code",
 119 |    "execution_count": 33,
 120 |    "metadata": {},
 121 |    "outputs": [],
 122 |    "source": [
 123 |     "def tokenize_de(text):\n",
 124 |     "    \"\"\"\n",
 125 |     "    Tokenizes German text from a string into a list of strings\n",
 126 |     "    \"\"\"\n",
 127 |     "    return [tok.text for tok in spacy_de.tokenizer(text)]\n",
 128 |     "\n",
 129 |     "def tokenize_en(text):\n",
 130 |     "    \"\"\"\n",
 131 |     "    Tokenizes English text from a string into a list of strings\n",
 132 |     "    \"\"\"\n",
 133 |     "    return [tok.text for tok in spacy_en.tokenizer(text)]"
 134 |    ]
 135 |   },
 136 |   {
 137 |    "cell_type": "markdown",
 138 |    "metadata": {},
 139 |    "source": [
 140 |     "The fields remain the same as before."
 141 |    ]
 142 |   },
 143 |   {
 144 |    "cell_type": "code",
 145 |    "execution_count": 34,
 146 |    "metadata": {},
 147 |    "outputs": [],
 148 |    "source": [
 149 |     "SRC = Field(tokenize = tokenize_de, \n",
 150 |     "            init_token = '<sos>', \n",
 151 |     "            eos_token = '<eos>', \n",
 152 |     "            lower = True)\n",
 153 |     "\n",
 154 |     "TRG = Field(tokenize = tokenize_en, \n",
 155 |     "            init_token = '<sos>', \n",
 156 |     "            eos_token = '<eos>', \n",
 157 |     "            lower = True)"
 158 |    ]
 159 |   },
 160 |   {
 161 |    "cell_type": "markdown",
 162 |    "metadata": {},
 163 |    "source": [
 164 |     "Load the data."
 165 |    ]
 166 |   },
 167 |   {
 168 |    "cell_type": "code",
 169 |    "execution_count": 35,
 170 |    "metadata": {},
 171 |    "outputs": [],
 172 |    "source": [
 173 |     "train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), \n",
 174 |     "                                                    fields = (SRC, TRG))"
 175 |    ]
 176 |   },
 177 |   {
 178 |    "cell_type": "markdown",
 179 |    "metadata": {},
 180 |    "source": [
 181 |     "Build the vocabulary."
 182 |    ]
 183 |   },
 184 |   {
 185 |    "cell_type": "code",
 186 |    "execution_count": 36,
 187 |    "metadata": {},
 188 |    "outputs": [],
 189 |    "source": [
 190 |     "SRC.build_vocab(train_data, min_freq = 2)\n",
 191 |     "TRG.build_vocab(train_data, min_freq = 2)"
 192 |    ]
 193 |   },
 194 |   {
 195 |    "cell_type": "markdown",
 196 |    "metadata": {},
 197 |    "source": [
 198 |     "Define the device."
 199 |    ]
 200 |   },
 201 |   {
 202 |    "cell_type": "code",
 203 |    "execution_count": 37,
 204 |    "metadata": {},
 205 |    "outputs": [],
 206 |    "source": [
 207 |     "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
 208 |    ]
 209 |   },
 210 |   {
 211 |    "cell_type": "markdown",
 212 |    "metadata": {},
 213 |    "source": [
 214 |     "Create the iterators."
 215 |    ]
 216 |   },
 217 |   {
 218 |    "cell_type": "code",
 219 |    "execution_count": 38,
 220 |    "metadata": {},
 221 |    "outputs": [],
 222 |    "source": [
 223 |     "BATCH_SIZE = 128\n",
 224 |     "\n",
 225 |     "train_iterator, valid_iterator, test_iterator = BucketIterator.splits(\n",
 226 |     "    (train_data, valid_data, test_data), \n",
 227 |     "    batch_size = BATCH_SIZE,\n",
 228 |     "    device = device)"
 229 |    ]
 230 |   },
 231 |   {
 232 |    "cell_type": "markdown",
 233 |    "metadata": {},
 234 |    "source": [
 235 |     "## Building the Seq2Seq Model\n",
 236 |     "\n",
 237 |     "### Encoder\n",
 238 |     "\n",
 239 |     "First, we'll build the encoder. Similar to the previous model, we only use a single layer GRU, however we now use a *bidirectional RNN*. With a bidirectional RNN, we have two RNNs in each layer. A *forward RNN* going over the sentence from left to right (shown below in green), and a *backward RNN* going over the sentence from right to left (yellow). All we need to do in code is set `bidirectional = True` and then pass the embedded sentence to the RNN as before. \n",
 240 |     "\n",
 241 |     "![](assets/seq2seq8.png)\n",
 242 |     "\n",
 243 |     "We now have:\n",
 244 |     "\n",
 245 |     "$$\\begin{align*}\n",
 246 |     "h_t^\\rightarrow &= \\text{EncoderGRU}^\\rightarrow(x_t^\\rightarrow,h_t^\\rightarrow)\\\\\n",
 247 |     "h_t^\\leftarrow &= \\text{EncoderGRU}^\\leftarrow(x_t^\\leftarrow,h_t^\\leftarrow)\n",
 248 |     "\\end{align*}$$\n",
 249 |     "\n",
 250 |     "Where $x_0^\\rightarrow = \\text{<sos>}, x_1^\\rightarrow = \\text{guten}$ and $x_0^\\leftarrow = \\text{<eos>}, x_1^\\leftarrow = \\text{morgen}$.\n",
 251 |     "\n",
 252 |     "As before, we only pass an input (`embedded`) to the RNN, which tells PyTorch to initialize both the forward and backward initial hidden states ($h_0^\\rightarrow$ and $h_0^\\leftarrow$, respectively) to a tensor of all zeros. We'll also get two context vectors, one from the forward RNN after it has seen the final word in the sentence, $z^\\rightarrow=h_T^\\rightarrow$, and one from the backward RNN after it has seen the first word in the sentence, $z^\\leftarrow=h_T^\\leftarrow$.\n",
 253 |     "\n",
 254 |     "The RNN returns `outputs` and `hidden`. \n",
 255 |     "\n",
 256 |     "`outputs` is of size **[src sent len, batch size, hid dim * num directions]** where the first `hid_dim` elements in the third axis are the hidden states from the top layer forward RNN, and the last `hid_dim` elements are hidden states from the top layer backward RNN. You can think of the third axis as being the forward and backward hidden states stacked on top of each other, i.e. $h_1 = [h_1^\\rightarrow; h_{T}^\\leftarrow]$, $h_2 = [h_2^\\rightarrow; h_{T-1}^\\leftarrow]$ and we can denote all stacked encoder hidden states as $H=\\{ h_1, h_2, ..., h_T\\}$.\n",
 257 |     "\n",
 258 |     "`hidden` is of size **[n layers * num directions, batch size, hid dim]**, where **[-2, :, :]** gives the top layer forward RNN hidden state after the final time-step (i.e. after it has seen the last word in the sentence) and **[-1, :, :]** gives the top layer backward RNN hidden state after the final time-step (i.e. after it has seen the first word in the sentence).\n",
 259 |     "\n",
 260 |     "As the decoder is not bidirectional, it only needs a single context vector, $z$, to use as its initial hidden state, $s_0$, and we currently have two, a forward and a backward one ($z^\\rightarrow=h_T^\\rightarrow$ and $z^\\leftarrow=h_T^\\leftarrow$, respectively). We solve this by concatenating the two context vectors together, passing them through a linear layer, $g$, and applying the $\\tanh$ activation function. \n",
 261 |     "\n",
 262 |     "$$z=\\tanh(g(h_T^\\rightarrow, h_T^\\leftarrow)) = \\tanh(g(z^\\rightarrow, z^\\leftarrow)) = s_0$$\n",
 263 |     "\n",
 264 |     "**Note**: this is actually a deviation from the paper. Instead, they feed only the first backward RNN hidden state through a linear layer to get the context vector/decoder initial hidden state. This doesn't seem to make sense to me, so I've changed it.\n",
 265 |     "\n",
 266 |     "As we want our model to look back over the whole of the source sentence we return `outputs`, the stacked forward and backward hidden states for every token in the source sentence. We also return `hidden`, which acts as our initial hidden state in the decoder."
 267 |    ]
 268 |   },
 269 |   {
 270 |    "cell_type": "code",
 271 |    "execution_count": 39,
 272 |    "metadata": {},
 273 |    "outputs": [],
 274 |    "source": [
 275 |     "class Encoder(nn.Module):\n",
 276 |     "    def __init__(self, \n",
 277 |     "                 input_dim: int, \n",
 278 |     "                 emb_dim: int, \n",
 279 |     "                 enc_hid_dim: int, \n",
 280 |     "                 dec_hid_dim: int, \n",
 281 |     "                 dropout: float):\n",
 282 |     "        super().__init__()\n",
 283 |     "        \n",
 284 |     "        self.input_dim = input_dim\n",
 285 |     "        self.emb_dim = emb_dim\n",
 286 |     "        self.enc_hid_dim = enc_hid_dim\n",
 287 |     "        self.dec_hid_dim = dec_hid_dim\n",
 288 |     "        self.dropout = dropout\n",
 289 |     "        \n",
 290 |     "        self.embedding = nn.Embedding(input_dim, emb_dim)\n",
 291 |     "        \n",
 292 |     "        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)\n",
 293 |     "        \n",
 294 |     "        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)\n",
 295 |     "        \n",
 296 |     "        self.dropout = nn.Dropout(dropout)\n",
 297 |     "        \n",
 298 |     "    def forward(self, \n",
 299 |     "                src: Tensor) -> Tuple[Tensor]:\n",
 300 |     "        \n",
 301 |     "        #src = [src sent len, batch size]\n",
 302 |     "        \n",
 303 |     "        embedded = self.dropout(self.embedding(src))\n",
 304 |     "        \n",
 305 |     "        #embedded = [src sent len, batch size, emb dim]\n",
 306 |     "        \n",
 307 |     "        outputs, hidden = self.rnn(embedded)\n",
 308 |     "                \n",
 309 |     "        #outputs = [src sent len, batch size, hid dim * num directions]\n",
 310 |     "        #hidden = [n layers * num directions, batch size, hid dim]\n",
 311 |     "        \n",
 312 |     "        #hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]\n",
 313 |     "        #outputs are always from the last layer\n",
 314 |     "        \n",
 315 |     "        #hidden [-2, :, : ] is the last of the forwards RNN \n",
 316 |     "        #hidden [-1, :, : ] is the last of the backwards RNN\n",
 317 |     "        \n",
 318 |     "        #initial decoder hidden is final hidden state of the forwards and backwards \n",
 319 |     "        #  encoder RNNs fed through a linear layer\n",
 320 |     "        # Note: torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)\n",
 321 |     "        # is of shape [batch_size, enc_hid_dim * 2]\n",
 322 |     "        \n",
 323 |     "        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))\n",
 324 |     "        \n",
 325 |     "        #outputs = [src sent len, batch size, enc hid dim * 2]\n",
 326 |     "        #hidden = [batch size, dec hid dim]\n",
 327 |     "        \n",
 328 |     "        return outputs, hidden"
 329 |    ]
 330 |   },
 331 |   {
 332 |    "cell_type": "markdown",
 333 |    "metadata": {},
 334 |    "source": [
 335 |     "### Attention\n",
 336 |     "\n",
 337 |     "Next up is the attention layer. This will take in the previous hidden state of the decoder, $s_{t-1}$, and all of the stacked forward and backward hidden states from the encoder, $H$. The layer will output an attention vector, $a_t$, that is the length of the source sentence, each element is between 0 and 1 and the entire vector sums to 1.\n",
 338 |     "\n",
 339 |     "Intuitively, this layer takes what we have decoded so far, $s_{t-1}$, and all of what we have encoded, $H$, to produce a vector, $a_t$, that represents which words in the source sentence we should pay the most attention to in order to correctly predict the next word to decode, $\\hat{y}_{t+1}$. \n",
 340 |     "\n",
 341 |     "First, we calculate the *energy* between the previous decoder hidden state and the encoder hidden states. As our encoder hidden states are a sequence of $T$ tensors, and our previous decoder hidden state is a single tensor, the first thing we do is `repeat` the previous decoder hidden state $T$ times. We then calculate the energy, $E_t$, between them by concatenating them together and passing them through a linear layer (`attn`) and a $\\tanh$ activation function. \n",
 342 |     "\n",
 343 |     "$$E_t = \\tanh(\\text{attn}(s_{t-1}, H))$$ \n",
 344 |     "\n",
 345 |     "This can be thought of as calculating how well each encoder hidden state \"matches\" the previous decoder hidden state.\n",
 346 |     "\n",
 347 |     "We currently have a **[dec hid dim, src sent len]** tensor for each example in the batch. We want this to be **[src sent len]** for each example in the batch as the attention should be over the length of the source sentence. This is achieved by multiplying the `energy` by a **[1, attn dim]** tensor, $v$.\n",
 348 |     "\n",
 349 |     "$$\\hat{a}_t = v E_t$$\n",
 350 |     "\n",
 351 |     "We can think of this as calculating a weighted sum of the \"match\" over all `dec_hid_dem` elements for each encoder hidden state, where the weights are learned (as we learn the parameters of $v$).\n",
 352 |     "\n",
 353 |     "Finally, we ensure the attention vector fits the constraints of having all elements between 0 and 1 and the vector summing to 1 by passing it through a $\\text{softmax}$ layer.\n",
 354 |     "\n",
 355 |     "$$a_t = \\text{softmax}(\\hat{a_t})$$\n",
 356 |     "\n",
 357 |     "This gives us the attention over the source sentence!\n",
 358 |     "\n",
 359 |     "Graphically, this looks something like below. This is for calculating the very first attention vector, where $s_{t-1} = s_0 = z$. The green/yellow blocks represent the hidden states from both the forward and backward RNNs, and the attention computation is all done within the pink block.\n",
 360 |     "\n",
 361 |     "![](assets/seq2seq9.png)"
 362 |    ]
 363 |   },
 364 |   {
 365 |    "cell_type": "code",
 366 |    "execution_count": 40,
 367 |    "metadata": {},
 368 |    "outputs": [],
 369 |    "source": [
 370 |     "class Attention(nn.Module):\n",
 371 |     "    def __init__(self, \n",
 372 |     "                 enc_hid_dim: int, \n",
 373 |     "                 dec_hid_dim: int,\n",
 374 |     "                 attn_dim: int):\n",
 375 |     "        super().__init__()\n",
 376 |     "        \n",
 377 |     "        self.enc_hid_dim = enc_hid_dim\n",
 378 |     "        self.dec_hid_dim = dec_hid_dim\n",
 379 |     "        \n",
 380 |     "        self.attn_in = (enc_hid_dim * 2) + dec_hid_dim\n",
 381 |     "        \n",
 382 |     "        self.attn = nn.Linear(self.attn_in, attn_dim)\n",
 383 |     "        self.v = nn.Parameter(torch.rand(attn_dim))\n",
 384 |     "        \n",
 385 |     "    def forward(self, \n",
 386 |     "                decoder_hidden: Tensor, \n",
 387 |     "                encoder_outputs: Tensor) -> Tensor:\n",
 388 |     "        \n",
 389 |     "        #hidden = [batch size, dec hid dim]\n",
 390 |     "        #encoder_outputs = [src sent len, batch size, enc hid dim * 2]\n",
 391 |     "        \n",
 392 |     "        batch_size = encoder_outputs.shape[1]\n",
 393 |     "        src_len = encoder_outputs.shape[0]\n",
 394 |     "        \n",
 395 |     "        #repeat decoder hidden state src_len times\n",
 396 |     "        repeated_decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)\n",
 397 |     "        \n",
 398 |     "        encoder_outputs = encoder_outputs.permute(1, 0, 2)\n",
 399 |     "        \n",
 400 |     "        #decoder_hidden = [batch size, src sent len, dec hid dim]\n",
 401 |     "        #encoder_outputs = [batch size, src sent len, enc hid dim * 2]\n",
 402 |     "        \n",
 403 |     "        # Step 1: to enable feeding through \"self.attn\" pink box above, concatenate \n",
 404 |     "        # `repeated_decoder_hidden` and `encoder_outputs`:\n",
 405 |     "        # torch.cat((hidden, encoder_outputs), dim = 2) has shape \n",
 406 |     "        # [batch_size, seq_len, enc_hid_dim * 2 + dec_hid_dim]\n",
 407 |     "        \n",
 408 |     "        # Step 2: feed through self.attn to end up with:\n",
 409 |     "        # [batch_size, seq_len, attn_dim]\n",
 410 |     "        \n",
 411 |     "        # Step 3: feed through tanh       \n",
 412 |     "        \n",
 413 |     "        energy = torch.tanh(self.attn(torch.cat((\n",
 414 |     "            repeated_decoder_hidden, \n",
 415 |     "            encoder_outputs), \n",
 416 |     "            dim = 2))) \n",
 417 |     "        \n",
 418 |     "        #energy = [batch size, src sent len, attn_dim]\n",
 419 |     "        \n",
 420 |     "        energy = energy.permute(0, 2, 1)\n",
 421 |     "        \n",
 422 |     "        #energy = [batch size, attn_dim, src sent len]\n",
 423 |     "        \n",
 424 |     "        #v = [attn_dim]\n",
 425 |     "        \n",
 426 |     "        v = self.v.repeat(batch_size, 1).unsqueeze(1)\n",
 427 |     "        \n",
 428 |     "        #v = [batch size, 1, attn_dim]\n",
 429 |     "        \n",
 430 |     "        # High level: energy a function of both encoder element outputs and most recent decoder hidden state,\n",
 431 |     "        # of shape attn_dim x enc_seq_len for each observation\n",
 432 |     "        # v, being 1 x attn_dim, transforms this into a vector of shape 1 x enc_seq_len for each observation\n",
 433 |     "        # Then, we take the softmax over these to get the output of the attention function\n",
 434 |     "\n",
 435 |     "        attention = torch.bmm(v, energy).squeeze(1)\n",
 436 |     "        \n",
 437 |     "        #attention= [batch size, src len]\n",
 438 |     "        \n",
 439 |     "        return F.softmax(attention, dim=1)"
 440 |    ]
 441 |   },
 442 |   {
 443 |    "cell_type": "markdown",
 444 |    "metadata": {},
 445 |    "source": [
 446 |     "#### Trying `Attention` with summing the values across the `attn_dim` dimension\n",
 447 |     "\n",
 448 |     "Instead of learning a `self.v` parameter"
 449 |    ]
 450 |   },
 451 |   {
 452 |    "cell_type": "code",
 453 |    "execution_count": 41,
 454 |    "metadata": {},
 455 |    "outputs": [],
 456 |    "source": [
 457 |     "class ModifiedAttention(nn.Module):\n",
 458 |     "    def __init__(self, \n",
 459 |     "                 enc_hid_dim: int, \n",
 460 |     "                 dec_hid_dim: int,\n",
 461 |     "                 attn_dim: int):\n",
 462 |     "        super().__init__()\n",
 463 |     "        \n",
 464 |     "        self.enc_hid_dim = enc_hid_dim\n",
 465 |     "        self.dec_hid_dim = dec_hid_dim\n",
 466 |     "        \n",
 467 |     "        self.attn_in = (enc_hid_dim * 2) + dec_hid_dim\n",
 468 |     "        \n",
 469 |     "        self.attn = nn.Linear(self.attn_in, attn_dim)\n",
 470 |     "        \n",
 471 |     "    def forward(self, \n",
 472 |     "                decoder_hidden: Tensor, \n",
 473 |     "                encoder_outputs: Tensor) -> Tensor:\n",
 474 |     "        \n",
 475 |     "        #hidden = [batch size, dec hid dim]\n",
 476 |     "        #encoder_outputs = [src sent len, batch size, enc hid dim * 2]\n",
 477 |     "        \n",
 478 |     "        batch_size = encoder_outputs.shape[1]\n",
 479 |     "        src_len = encoder_outputs.shape[0]\n",
 480 |     "        \n",
 481 |     "        #repeat decoder hidden state src_len times\n",
 482 |     "        repeated_decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)\n",
 483 |     "        \n",
 484 |     "        encoder_outputs = encoder_outputs.permute(1, 0, 2)\n",
 485 |     "        \n",
 486 |     "        #decoder_hidden = [batch size, src sent len, dec hid dim]\n",
 487 |     "        #encoder_outputs = [batch size, src sent len, enc hid dim * 2]\n",
 488 |     "        \n",
 489 |     "        # Step 1: to enable feeding through \"self.attn\" pink box above, concatenate \n",
 490 |     "        # `repeated_decoder_hidden` and `encoder_outputs`:\n",
 491 |     "        # torch.cat((hidden, encoder_outputs), dim = 2) has shape \n",
 492 |     "        # [batch_size, seq_len, enc_hid_dim * 2 + dec_hid_dim]\n",
 493 |     "        \n",
 494 |     "        # Step 2: feed through self.attn to end up with:\n",
 495 |     "        # [batch_size, seq_len, attn_dim]\n",
 496 |     "        \n",
 497 |     "        # Step 3: feed through tanh       \n",
 498 |     "        \n",
 499 |     "        energy = torch.tanh(self.attn(torch.cat((\n",
 500 |     "            repeated_decoder_hidden, \n",
 501 |     "            encoder_outputs), \n",
 502 |     "            dim = 2))) \n",
 503 |     "        \n",
 504 |     "        #energy = [batch size, src sent len, attn_dim]\n",
 505 |     "\n",
 506 |     "        attention = torch.sum(energy, dim=2)\n",
 507 |     "        #attention= [batch size, src len]\n",
 508 |     "        \n",
 509 |     "        return F.softmax(attention, dim=1)"
 510 |    ]
 511 |   },
 512 |   {
 513 |    "cell_type": "markdown",
 514 |    "metadata": {},
 515 |    "source": [
 516 |     "### Decoder\n",
 517 |     "\n",
 518 |     "Next up is the decoder. \n",
 519 |     "\n",
 520 |     "The decoder contains the attention layer, `attention`, which takes the previous hidden state, $s_{t-1}$, all of the encoder hidden states, $H$, and returns the attention vector, $a_t$.\n",
 521 |     "\n",
 522 |     "We then use this attention vector to create a weighted source vector, $w_t$, denoted by `weighted`, which is a weighted sum of the encoder hidden states, $H$, using $a_t$ as the weights.\n",
 523 |     "\n",
 524 |     "$$w_t = a_t H$$\n",
 525 |     "\n",
 526 |     "The input word (that has been embedded), $y_t$, the weighted source vector, $w_t$, and the previous decoder hidden state, $s_{t-1}$, are then all passed into the decoder RNN, with $y_t$ and $w_t$ being concatenated together.\n",
 527 |     "\n",
 528 |     "$$s_t = \\text{DecoderGRU}(y_t, w_t, s_{t-1})$$\n",
 529 |     "\n",
 530 |     "We then pass $y_t$, $w_t$ and $s_t$ through the linear layer, $f$, to make a prediction of the next word in the target sentence, $\\hat{y}_{t+1}$. This is done by concatenating them all together.\n",
 531 |     "\n",
 532 |     "$$\\hat{y}_{t+1} = f(y_t, w_t, s_t)$$\n",
 533 |     "\n",
 534 |     "The image below shows decoding the first word in an example translation.\n",
 535 |     "\n",
 536 |     "![](assets/seq2seq10.png)\n",
 537 |     "\n",
 538 |     "The green/yellow blocks show the forward/backward encoder RNNs which output $H$, the red block shows the context vector, $z = h_T = \\tanh(g(h^\\rightarrow_T,h^\\leftarrow_T)) = \\tanh(g(z^\\rightarrow, z^\\leftarrow)) = s_0$, the blue block shows the decoder RNN which outputs $s_t$, the purple block shows the linear layer, $f$, which outputs $\\hat{y}_{t+1}$ and the orange block shows the calculation of the weighted sum over $H$ by $a_t$ and outputs $w_t$. Not shown is the calculation of $a_t$."
 539 |    ]
 540 |   },
 541 |   {
 542 |    "cell_type": "code",
 543 |    "execution_count": 42,
 544 |    "metadata": {},
 545 |    "outputs": [],
 546 |    "source": [
 547 |     "class Decoder(nn.Module):\n",
 548 |     "    def __init__(self, \n",
 549 |     "                 output_dim: int, \n",
 550 |     "                 emb_dim: int, \n",
 551 |     "                 enc_hid_dim: int, \n",
 552 |     "                 dec_hid_dim: int, \n",
 553 |     "                 dropout: int, \n",
 554 |     "                 attention: nn.Module):\n",
 555 |     "        super().__init__()\n",
 556 |     "\n",
 557 |     "        self.emb_dim = emb_dim\n",
 558 |     "        self.enc_hid_dim = enc_hid_dim\n",
 559 |     "        self.dec_hid_dim = dec_hid_dim\n",
 560 |     "        self.output_dim = output_dim\n",
 561 |     "        self.dropout = dropout\n",
 562 |     "        self.attention = attention\n",
 563 |     "        # Note: from Attention: self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, attn_dim)\n",
 564 |     "        \n",
 565 |     "        # Note: `output_dim` same as `vocab_size`\n",
 566 |     "        self.embedding = nn.Embedding(output_dim, emb_dim)\n",
 567 |     "        \n",
 568 |     "        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)\n",
 569 |     "        \n",
 570 |     "        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)\n",
 571 |     "        \n",
 572 |     "        self.dropout = nn.Dropout(dropout)\n",
 573 |     "        \n",
 574 |     "        \n",
 575 |     "    def _weighted_encoder_rep(self, \n",
 576 |     "                              decoder_hidden: Tensor,\n",
 577 |     "                              encoder_outputs: Tensor) -> Tensor:\n",
 578 |     "        \n",
 579 |     "        # Attention, at a high level, takes in:\n",
 580 |     "        # The decoder hidden state\n",
 581 |     "        # All the \"seq_len\" encoder outputs\n",
 582 |     "        # Outputs a vector summing to 1 of length seq_len for each observation\n",
 583 |     "        a = self.attention(decoder_hidden, encoder_outputs)\n",
 584 |     "\n",
 585 |     "        #a = [batch size, src len]\n",
 586 |     "\n",
 587 |     "        a = a.unsqueeze(1)\n",
 588 |     "\n",
 589 |     "        #a = [batch size, 1, src len]\n",
 590 |     "\n",
 591 |     "        encoder_outputs = encoder_outputs.permute(1, 0, 2)\n",
 592 |     "\n",
 593 |     "        #encoder_outputs = [batch size, src sent len, enc hid dim * 2]\n",
 594 |     "\n",
 595 |     "        weighted_encoder_rep = torch.bmm(a, encoder_outputs)\n",
 596 |     "\n",
 597 |     "        #weighted_encoder_rep = [batch size, 1, enc hid dim * 2]\n",
 598 |     "\n",
 599 |     "        weighted_encoder_rep = weighted_encoder_rep.permute(1, 0, 2)\n",
 600 |     "\n",
 601 |     "        #weighted_encoder_rep = [1, batch size, enc hid dim * 2]\n",
 602 |     "        \n",
 603 |     "        return weighted_encoder_rep\n",
 604 |     "        \n",
 605 |     "        \n",
 606 |     "    def forward(self, \n",
 607 |     "                input: Tensor, \n",
 608 |     "                decoder_hidden: Tensor, \n",
 609 |     "                encoder_outputs: Tensor) -> Tuple[Tensor]:\n",
 610 |     "             \n",
 611 |     "        #input = [batch size] Note: \"one character at a time\"\n",
 612 |     "        #hidden = [batch size, dec hid dim]\n",
 613 |     "        #encoder_outputs = [src sent len, batch size, enc hid dim * 2]\n",
 614 |     "        \n",
 615 |     "        input = input.unsqueeze(0)\n",
 616 |     "        \n",
 617 |     "        #input = [1, batch size]\n",
 618 |     "        \n",
 619 |     "        embedded = self.dropout(self.embedding(input))\n",
 620 |     "        \n",
 621 |     "        #embedded = [1, batch size, emb dim]\n",
 622 |     "        \n",
 623 |     "        weighted_encoder_rep = self._weighted_encoder_rep(decoder_hidden, \n",
 624 |     "                                                          encoder_outputs)\n",
 625 |     "        \n",
 626 |     "        # Then, the input to the decoder _for this character_ is a concatenation of:\n",
 627 |     "        # This weighted attention\n",
 628 |     "        # The embedding itself\n",
 629 |     "        \n",
 630 |     "        rnn_input = torch.cat((embedded, weighted_encoder_rep), dim = 2)\n",
 631 |     "        \n",
 632 |     "        #rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]\n",
 633 |     "            \n",
 634 |     "        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))\n",
 635 |     "        \n",
 636 |     "        #output = [sent len, batch size, dec hid dim * n directions]\n",
 637 |     "        #decoder_hidden = [n layers * n directions, batch size, dec hid dim]\n",
 638 |     "        \n",
 639 |     "        #sent len, n layers and n directions will always be 1 in this decoder, therefore:\n",
 640 |     "        #output = [1, batch size, dec hid dim]\n",
 641 |     "        #hidden = [1, batch size, dec hid dim]\n",
 642 |     "        #this also means that output == hidden\n",
 643 |     "        assert (output == decoder_hidden).all()\n",
 644 |     "        \n",
 645 |     "        embedded = embedded.squeeze(0)\n",
 646 |     "        output = output.squeeze(0)\n",
 647 |     "        weighted_encoder_rep = weighted_encoder_rep.squeeze(0)\n",
 648 |     "        \n",
 649 |     "        output = self.out(torch.cat((output, \n",
 650 |     "                                     weighted_encoder_rep, \n",
 651 |     "                                     embedded), dim = 1))\n",
 652 |     "        \n",
 653 |     "        #output = [bsz, output dim]\n",
 654 |     "        \n",
 655 |     "        return output, decoder_hidden.squeeze(0)"
 656 |    ]
 657 |   },
 658 |   {
 659 |    "cell_type": "markdown",
 660 |    "metadata": {},
 661 |    "source": [
 662 |     "### Seq2Seq\n",
 663 |     "\n",
 664 |     "This is the first model where we don't have to have the encoder RNN and decoder RNN have the same hidden dimensions, however the encoder has to be bidirectional. This requirement can be removed by changing all occurences of `enc_dim * 2` to `enc_dim * 2 if encoder_is_bidirectional else enc_dim`. \n",
 665 |     "\n",
 666 |     "This seq2seq encapsulator is similar to the last two. The only difference is that the `encoder` returns both the final hidden state (which is the final hidden state from both the forward and backward encoder RNNs passed through a linear layer) to be used as the initial hidden state for the encoder, as well as every hidden state (which are the forward and backward hidden states stacked on top of each other). We also need to ensure that `hidden` and `encoder_outputs` are passed to the decoder. \n",
 667 |     "\n",
 668 |     "Briefly going over all of the decoding steps:\n",
 669 |     "\n",
 670 |     "Briefly going over all of the steps:\n",
 671 |     "- the `outputs` tensor is created to hold all predictions, $\\hat{Y}$\n",
 672 |     "- the source sequence, $X$, is fed into the encoder to receive $z$ and $H$\n",
 673 |     "- the initial decoder hidden state is set to be the `context` vector, $s_0 = z = h_T$\n",
 674 |     "- we use a batch of `<sos>` tokens as the first `input`, $y_1$\n",
 675 |     "- we then decode within a loop:\n",
 676 |     "  - inserting the input token $y_t$, previous hidden state, $s_{t-1}$, and all encoder outputs, $H$, into the decoder\n",
 677 |     "  - receiving a prediction, $\\hat{y}_{t+1}$, and a new hidden state, $s_t$\n",
 678 |     "  - we then decide if we are going to teacher force or not, setting the next input as appropriate"
 679 |    ]
 680 |   },
 681 |   {
 682 |    "cell_type": "code",
 683 |    "execution_count": 43,
 684 |    "metadata": {},
 685 |    "outputs": [],
 686 |    "source": [
 687 |     "class Seq2Seq(nn.Module):\n",
 688 |     "    def __init__(self, \n",
 689 |     "                 encoder: nn.Module, \n",
 690 |     "                 decoder: nn.Module, \n",
 691 |     "                 device: torch.device):\n",
 692 |     "        super().__init__()\n",
 693 |     "        \n",
 694 |     "        self.encoder = encoder\n",
 695 |     "        self.decoder = decoder\n",
 696 |     "        self.device = device\n",
 697 |     "        \n",
 698 |     "    def forward(self, \n",
 699 |     "                src: Tensor, \n",
 700 |     "                trg: Tensor, \n",
 701 |     "                teacher_forcing_ratio: float = 0.5) -> Tensor:\n",
 702 |     "        \n",
 703 |     "        #src = [src sent len, batch size]\n",
 704 |     "        #trg = [trg sent len, batch size]\n",
 705 |     "        #teacher_forcing_ratio is probability to use teacher forcing\n",
 706 |     "        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time\n",
 707 |     "        \n",
 708 |     "        batch_size = src.shape[1]\n",
 709 |     "        max_len = trg.shape[0]\n",
 710 |     "        trg_vocab_size = self.decoder.output_dim\n",
 711 |     "        \n",
 712 |     "        #tensor to store decoder outputs\n",
 713 |     "        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)\n",
 714 |     "        \n",
 715 |     "        #encoder_outputs is all hidden states of the input sequence, back and forwards\n",
 716 |     "        #hidden is the final forward and backward hidden states, passed through a linear layer\n",
 717 |     "        encoder_outputs, hidden = self.encoder(src)\n",
 718 |     "                \n",
 719 |     "        #first input to the decoder is the <sos> tokens\n",
 720 |     "        output = trg[0,:]\n",
 721 |     "        \n",
 722 |     "        for t in range(1, max_len):\n",
 723 |     "            output, hidden = self.decoder(output, hidden, encoder_outputs)\n",
 724 |     "            outputs[t] = output\n",
 725 |     "            teacher_force = random.random() < teacher_forcing_ratio\n",
 726 |     "            top1 = output.max(1)[1]\n",
 727 |     "            output = (trg[t] if teacher_force else top1)\n",
 728 |     "\n",
 729 |     "        return outputs"
 730 |    ]
 731 |   },
 732 |   {
 733 |    "cell_type": "markdown",
 734 |    "metadata": {},
 735 |    "source": [
 736 |     "## Training the Seq2Seq Model\n",
 737 |     "\n",
 738 |     "The rest of this tutorial is very similar to the previous one.\n",
 739 |     "\n",
 740 |     "We initialise our parameters, encoder, decoder and seq2seq model (placing it on the GPU if we have one). "
 741 |    ]
 742 |   },
 743 |   {
 744 |    "cell_type": "code",
 745 |    "execution_count": 44,
 746 |    "metadata": {},
 747 |    "outputs": [],
 748 |    "source": [
 749 |     "INPUT_DIM = len(SRC.vocab)\n",
 750 |     "OUTPUT_DIM = len(TRG.vocab)\n",
 751 |     "ENC_EMB_DIM = 256\n",
 752 |     "DEC_EMB_DIM = 256\n",
 753 |     "ENC_HID_DIM = 512\n",
 754 |     "DEC_HID_DIM = 512\n",
 755 |     "ATTN_DIM = 64\n",
 756 |     "ENC_DROPOUT = 0.5\n",
 757 |     "DEC_DROPOUT = 0.5\n",
 758 |     "\n",
 759 |     "attn = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)\n",
 760 |     "enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)\n",
 761 |     "dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)\n",
 762 |     "\n",
 763 |     "model = Seq2Seq(enc, dec, device).to(device)"
 764 |    ]
 765 |   },
 766 |   {
 767 |    "cell_type": "markdown",
 768 |    "metadata": {},
 769 |    "source": [
 770 |     "#### Model with `ModifiedAttention`"
 771 |    ]
 772 |   },
 773 |   {
 774 |    "cell_type": "code",
 775 |    "execution_count": 45,
 776 |    "metadata": {},
 777 |    "outputs": [],
 778 |    "source": [
 779 |     "mod_attn = ModifiedAttention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)\n",
 780 |     "\n",
 781 |     "dec_mod_attn = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, mod_attn)\n",
 782 |     "\n",
 783 |     "model_mod_attn = Seq2Seq(enc, dec_mod_attn, device).to(device)"
 784 |    ]
 785 |   },
 786 |   {
 787 |    "cell_type": "markdown",
 788 |    "metadata": {},
 789 |    "source": [
 790 |     "We use a simplified version of the weight initialization scheme used in the paper. Here, we will initialize all biases to zero and all weights from $\\mathcal{N}(0, 0.01)$."
 791 |    ]
 792 |   },
 793 |   {
 794 |    "cell_type": "code",
 795 |    "execution_count": 46,
 796 |    "metadata": {},
 797 |    "outputs": [
 798 |     {
 799 |      "data": {
 800 |       "text/plain": [
 801 |        "Seq2Seq(\n",
 802 |        "  (encoder): Encoder(\n",
 803 |        "    (embedding): Embedding(7855, 256)\n",
 804 |        "    (rnn): GRU(256, 512, bidirectional=True)\n",
 805 |        "    (fc): Linear(in_features=1024, out_features=512, bias=True)\n",
 806 |        "    (dropout): Dropout(p=0.5)\n",
 807 |        "  )\n",
 808 |        "  (decoder): Decoder(\n",
 809 |        "    (attention): ModifiedAttention(\n",
 810 |        "      (attn): Linear(in_features=1536, out_features=64, bias=True)\n",
 811 |        "    )\n",
 812 |        "    (embedding): Embedding(5893, 256)\n",
 813 |        "    (rnn): GRU(1280, 512)\n",
 814 |        "    (out): Linear(in_features=1792, out_features=5893, bias=True)\n",
 815 |        "    (dropout): Dropout(p=0.5)\n",
 816 |        "  )\n",
 817 |        ")"
 818 |       ]
 819 |      },
 820 |      "execution_count": 46,
 821 |      "metadata": {},
 822 |      "output_type": "execute_result"
 823 |     }
 824 |    ],
 825 |    "source": [
 826 |     "def init_weights(m: nn.Module):\n",
 827 |     "    for name, param in m.named_parameters():\n",
 828 |     "        if 'weight' in name:\n",
 829 |     "            nn.init.normal_(param.data, mean=0, std=0.01)\n",
 830 |     "        else:\n",
 831 |     "            nn.init.constant_(param.data, 0)\n",
 832 |     "            \n",
 833 |     "model.apply(init_weights)\n",
 834 |     "model_mod_attn.apply(init_weights)"
 835 |    ]
 836 |   },
 837 |   {
 838 |    "cell_type": "markdown",
 839 |    "metadata": {},
 840 |    "source": [
 841 |     "Calculate the number of parameters. We get an increase of almost 50% in the amount of parameters from the last model. \n",
 842 |     "\n",
 843 |     "Note also that the model with modified attention has slightly fewer parameters."
 844 |    ]
 845 |   },
 846 |   {
 847 |    "cell_type": "code",
 848 |    "execution_count": 47,
 849 |    "metadata": {},
 850 |    "outputs": [
 851 |     {
 852 |      "name": "stdout",
 853 |      "output_type": "stream",
 854 |      "text": [
 855 |       "The model has 19,829,893 trainable parameters\n",
 856 |       "The model has 19,829,829 trainable parameters\n"
 857 |      ]
 858 |     }
 859 |    ],
 860 |    "source": [
 861 |     "def count_parameters(model: nn.Module):\n",
 862 |     "    return sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
 863 |     "\n",
 864 |     "print(f'The model has {count_parameters(model):,} trainable parameters')\n",
 865 |     "print(f'The model has {count_parameters(model_mod_attn):,} trainable parameters')"
 866 |    ]
 867 |   },
 868 |   {
 869 |    "cell_type": "markdown",
 870 |    "metadata": {},
 871 |    "source": [
 872 |     "We create an optimizer."
 873 |    ]
 874 |   },
 875 |   {
 876 |    "cell_type": "code",
 877 |    "execution_count": 48,
 878 |    "metadata": {},
 879 |    "outputs": [],
 880 |    "source": [
 881 |     "optimizer = optim.Adam(model.parameters())\n",
 882 |     "\n",
 883 |     "optimizer_mod_attn = optim.Adam(model_mod_attn.parameters())"
 884 |    ]
 885 |   },
 886 |   {
 887 |    "cell_type": "markdown",
 888 |    "metadata": {},
 889 |    "source": [
 890 |     "We initialize the loss function."
 891 |    ]
 892 |   },
 893 |   {
 894 |    "cell_type": "code",
 895 |    "execution_count": 49,
 896 |    "metadata": {},
 897 |    "outputs": [],
 898 |    "source": [
 899 |     "PAD_IDX = TRG.vocab.stoi['<pad>']\n",
 900 |     "\n",
 901 |     "criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)"
 902 |    ]
 903 |   },
 904 |   {
 905 |    "cell_type": "markdown",
 906 |    "metadata": {},
 907 |    "source": [
 908 |     "We then create the training loop..."
 909 |    ]
 910 |   },
 911 |   {
 912 |    "cell_type": "code",
 913 |    "execution_count": 50,
 914 |    "metadata": {},
 915 |    "outputs": [],
 916 |    "source": [
 917 |     "def train(model: nn.Module, \n",
 918 |     "          iterator: BucketIterator, \n",
 919 |     "          optimizer: optim.Adam, \n",
 920 |     "          criterion: nn.modules.loss.CrossEntropyLoss, \n",
 921 |     "          clip: float):\n",
 922 |     "    \n",
 923 |     "    model.train()\n",
 924 |     "    \n",
 925 |     "    epoch_loss = 0\n",
 926 |     "    \n",
 927 |     "    for i, batch in enumerate(iterator):\n",
 928 |     "        \n",
 929 |     "        src = batch.src\n",
 930 |     "        trg = batch.trg\n",
 931 |     "        \n",
 932 |     "        optimizer.zero_grad()\n",
 933 |     "        \n",
 934 |     "        output = model(src, trg)\n",
 935 |     "        \n",
 936 |     "        #trg = [trg sent len, batch size]\n",
 937 |     "        #output = [trg sent len, batch size, output dim]\n",
 938 |     "        \n",
 939 |     "        output = output[1:].view(-1, output.shape[-1])\n",
 940 |     "        trg = trg[1:].view(-1)\n",
 941 |     "        \n",
 942 |     "        #trg = [(trg sent len - 1) * batch size]\n",
 943 |     "        #output = [(trg sent len - 1) * batch size, output dim]\n",
 944 |     "        \n",
 945 |     "        loss = criterion(output, trg)\n",
 946 |     "        \n",
 947 |     "        loss.backward()\n",
 948 |     "        \n",
 949 |     "        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n",
 950 |     "        \n",
 951 |     "        optimizer.step()\n",
 952 |     "        \n",
 953 |     "        epoch_loss += loss.item()\n",
 954 |     "        \n",
 955 |     "    return epoch_loss / len(iterator)"
 956 |    ]
 957 |   },
 958 |   {
 959 |    "cell_type": "markdown",
 960 |    "metadata": {},
 961 |    "source": [
 962 |     "...and the evaluation loop, remembering to set the model to `eval` mode and turn off teaching forcing."
 963 |    ]
 964 |   },
 965 |   {
 966 |    "cell_type": "code",
 967 |    "execution_count": 51,
 968 |    "metadata": {},
 969 |    "outputs": [],
 970 |    "source": [
 971 |     "def evaluate(model: nn.Module, \n",
 972 |     "             iterator: BucketIterator, \n",
 973 |     "             criterion: nn.modules.loss.CrossEntropyLoss):\n",
 974 |     "    \n",
 975 |     "    model.eval()\n",
 976 |     "    \n",
 977 |     "    epoch_loss = 0\n",
 978 |     "    \n",
 979 |     "    with torch.no_grad():\n",
 980 |     "    \n",
 981 |     "        for i, batch in enumerate(iterator):\n",
 982 |     "\n",
 983 |     "            src = batch.src\n",
 984 |     "            trg = batch.trg\n",
 985 |     "\n",
 986 |     "            output = model(src, trg, 0) #turn off teacher forcing\n",
 987 |     "\n",
 988 |     "            #trg = [trg sent len, batch size]\n",
 989 |     "            #output = [trg sent len, batch size, output dim]\n",
 990 |     "\n",
 991 |     "            output = output[1:].view(-1, output.shape[-1])\n",
 992 |     "            trg = trg[1:].view(-1)\n",
 993 |     "\n",
 994 |     "            #trg = [(trg sent len - 1) * batch size]\n",
 995 |     "            #output = [(trg sent len - 1) * batch size, output dim]\n",
 996 |     "\n",
 997 |     "            loss = criterion(output, trg)\n",
 998 |     "\n",
 999 |     "            epoch_loss += loss.item()\n",
1000 |     "        \n",
1001 |     "    return epoch_loss / len(iterator)"
1002 |    ]
1003 |   },
1004 |   {
1005 |    "cell_type": "markdown",
1006 |    "metadata": {},
1007 |    "source": [
1008 |     "Finally, define a timing function."
1009 |    ]
1010 |   },
1011 |   {
1012 |    "cell_type": "code",
1013 |    "execution_count": 52,
1014 |    "metadata": {},
1015 |    "outputs": [],
1016 |    "source": [
1017 |     "def epoch_time(start_time: int, \n",
1018 |     "               end_time: int):\n",
1019 |     "    elapsed_time = end_time - start_time\n",
1020 |     "    elapsed_mins = int(elapsed_time / 60)\n",
1021 |     "    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n",
1022 |     "    return elapsed_mins, elapsed_secs"
1023 |    ]
1024 |   },
1025 |   {
1026 |    "cell_type": "markdown",
1027 |    "metadata": {},
1028 |    "source": [
1029 |     "Then, we train our model, saving the parameters that give us the best validation loss."
1030 |    ]
1031 |   },
1032 |   {
1033 |    "cell_type": "code",
1034 |    "execution_count": 54,
1035 |    "metadata": {},
1036 |    "outputs": [],
1037 |    "source": [
1038 |     "MODEL_PATH = 'tut3-model.pt'\n",
1039 |     "MODEL_PATH_MOD_ATTN = 'tut3-model-modified_attn.pt'"
1040 |    ]
1041 |   },
1042 |   {
1043 |    "cell_type": "markdown",
1044 |    "metadata": {},
1045 |    "source": [
1046 |     "First, train the model with `ModifiedAttention`:"
1047 |    ]
1048 |   },
1049 |   {
1050 |    "cell_type": "code",
1051 |    "execution_count": 55,
1052 |    "metadata": {},
1053 |    "outputs": [
1054 |     {
1055 |      "name": "stdout",
1056 |      "output_type": "stream",
1057 |      "text": [
1058 |       "Epoch: 01 | Time: 1m 29s\n",
1059 |       "\tTrain Loss: 5.062 | Train PPL: 157.902\n",
1060 |       "\t Val. Loss: 4.861 |  Val. PPL: 129.204\n",
1061 |       "Epoch: 02 | Time: 1m 29s\n",
1062 |       "\tTrain Loss: 4.223 | Train PPL:  68.266\n",
1063 |       "\t Val. Loss: 4.593 |  Val. PPL:  98.753\n",
1064 |       "Epoch: 03 | Time: 1m 29s\n",
1065 |       "\tTrain Loss: 3.589 | Train PPL:  36.213\n",
1066 |       "\t Val. Loss: 3.810 |  Val. PPL:  45.172\n",
1067 |       "Epoch: 04 | Time: 1m 29s\n",
1068 |       "\tTrain Loss: 3.005 | Train PPL:  20.189\n",
1069 |       "\t Val. Loss: 3.426 |  Val. PPL:  30.759\n",
1070 |       "Epoch: 05 | Time: 1m 29s\n",
1071 |       "\tTrain Loss: 2.587 | Train PPL:  13.295\n",
1072 |       "\t Val. Loss: 3.280 |  Val. PPL:  26.563\n",
1073 |       "Epoch: 06 | Time: 1m 29s\n",
1074 |       "\tTrain Loss: 2.289 | Train PPL:   9.866\n",
1075 |       "\t Val. Loss: 3.243 |  Val. PPL:  25.614\n",
1076 |       "Epoch: 07 | Time: 1m 29s\n",
1077 |       "\tTrain Loss: 2.037 | Train PPL:   7.669\n",
1078 |       "\t Val. Loss: 3.190 |  Val. PPL:  24.299\n",
1079 |       "Epoch: 08 | Time: 1m 29s\n",
1080 |       "\tTrain Loss: 1.821 | Train PPL:   6.177\n",
1081 |       "\t Val. Loss: 3.229 |  Val. PPL:  25.255\n",
1082 |       "Epoch: 09 | Time: 1m 30s\n",
1083 |       "\tTrain Loss: 1.650 | Train PPL:   5.207\n",
1084 |       "\t Val. Loss: 3.240 |  Val. PPL:  25.521\n",
1085 |       "Epoch: 10 | Time: 1m 28s\n",
1086 |       "\tTrain Loss: 1.537 | Train PPL:   4.651\n",
1087 |       "\t Val. Loss: 3.330 |  Val. PPL:  27.936\n"
1088 |      ]
1089 |     }
1090 |    ],
1091 |    "source": [
1092 |     "N_EPOCHS = 10\n",
1093 |     "CLIP = 1\n",
1094 |     "\n",
1095 |     "best_valid_loss = float('inf')\n",
1096 |     "\n",
1097 |     "for epoch in range(N_EPOCHS):\n",
1098 |     "    \n",
1099 |     "    start_time = time.time()\n",
1100 |     "    \n",
1101 |     "    train_loss = train(model_mod_attn, train_iterator, optimizer_mod_attn, criterion, CLIP)\n",
1102 |     "    valid_loss = evaluate(model_mod_attn, valid_iterator, criterion)\n",
1103 |     "    \n",
1104 |     "    end_time = time.time()\n",
1105 |     "    \n",
1106 |     "    epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n",
1107 |     "    \n",
1108 |     "    if valid_loss < best_valid_loss:\n",
1109 |     "        best_valid_loss = valid_loss\n",
1110 |     "        torch.save(model_mod_attn.state_dict(), MODEL_PATH_MOD_ATTN)\n",
1111 |     "    \n",
1112 |     "    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')\n",
1113 |     "    print(f'\\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')\n",
1114 |     "    print(f'\\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')"
1115 |    ]
1116 |   },
1117 |   {
1118 |    "cell_type": "markdown",
1119 |    "metadata": {},
1120 |    "source": [
1121 |     "Finally, we test the model on the test set using these \"best\" parameters."
1122 |    ]
1123 |   },
1124 |   {
1125 |    "cell_type": "code",
1126 |    "execution_count": 56,
1127 |    "metadata": {},
1128 |    "outputs": [
1129 |     {
1130 |      "name": "stdout",
1131 |      "output_type": "stream",
1132 |      "text": [
1133 |       "| Test Loss: 3.159 | Test PPL:  23.540 |\n"
1134 |      ]
1135 |     }
1136 |    ],
1137 |    "source": [
1138 |     "model_mod_attn.load_state_dict(torch.load(MODEL_PATH_MOD_ATTN))\n",
1139 |     "\n",
1140 |     "test_loss = evaluate(model_mod_attn, test_iterator, criterion)\n",
1141 |     "\n",
1142 |     "print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')"
1143 |    ]
1144 |   },
1145 |   {
1146 |    "cell_type": "markdown",
1147 |    "metadata": {},
1148 |    "source": [
1149 |     "Then, train the regular model:"
1150 |    ]
1151 |   },
1152 |   {
1153 |    "cell_type": "code",
1154 |    "execution_count": null,
1155 |    "metadata": {},
1156 |    "outputs": [],
1157 |    "source": [
1158 |     "N_EPOCHS = 10\n",
1159 |     "CLIP = 1\n",
1160 |     "\n",
1161 |     "best_valid_loss = float('inf')\n",
1162 |     "\n",
1163 |     "for epoch in range(N_EPOCHS):\n",
1164 |     "    \n",
1165 |     "    start_time = time.time()\n",
1166 |     "    \n",
1167 |     "    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)\n",
1168 |     "    valid_loss = evaluate(model, valid_iterator, criterion)\n",
1169 |     "    \n",
1170 |     "    end_time = time.time()\n",
1171 |     "    \n",
1172 |     "    epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n",
1173 |     "    \n",
1174 |     "    if valid_loss < best_valid_loss:\n",
1175 |     "        best_valid_loss = valid_loss\n",
1176 |     "        torch.save(model.state_dict(), MODEL_PATH)\n",
1177 |     "    \n",
1178 |     "    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')\n",
1179 |     "    print(f'\\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')\n",
1180 |     "    print(f'\\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')"
1181 |    ]
1182 |   },
1183 |   {
1184 |    "cell_type": "markdown",
1185 |    "metadata": {},
1186 |    "source": [
1187 |     "And evaluate it using its best parameters:"
1188 |    ]
1189 |   },
1190 |   {
1191 |    "cell_type": "code",
1192 |    "execution_count": null,
1193 |    "metadata": {},
1194 |    "outputs": [],
1195 |    "source": [
1196 |     "model.load_state_dict(torch.load(MODEL_PATH))\n",
1197 |     "\n",
1198 |     "test_loss = evaluate(model, test_iterator, criterion)\n",
1199 |     "\n",
1200 |     "print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')"
1201 |    ]
1202 |   },
1203 |   {
1204 |    "cell_type": "markdown",
1205 |    "metadata": {},
1206 |    "source": [
1207 |     "We've improved on the previous model, but this came at the cost of doubling the training time.\n",
1208 |     "\n",
1209 |     "In the next notebook, we'll be using the same architecture but using a few tricks that are applicable to all RNN architectures - packed padded sequences and masking. We'll also implement code which will allow us to look at what words in the input the RNN is paying attention to when decoding the output."
1210 |    ]
1211 |   }
1212 |  ],
1213 |  "metadata": {
1214 |   "kernelspec": {
1215 |    "display_name": "Environment (conda_pytorch_p36)",
1216 |    "language": "python",
1217 |    "name": "conda_pytorch_p36"
1218 |   },
1219 |   "language_info": {
1220 |    "codemirror_mode": {
1221 |     "name": "ipython",
1222 |     "version": 3
1223 |    },
1224 |    "file_extension": ".py",
1225 |    "mimetype": "text/x-python",
1226 |    "name": "python",
1227 |    "nbconvert_exporter": "python",
1228 |    "pygments_lexer": "ipython3",
1229 |    "version": "3.7.3"
1230 |   }
1231 |  },
1232 |  "nbformat": 4,
1233 |  "nbformat_minor": 2
1234 | }
1235 | 


--------------------------------------------------------------------------------