├── .gitignore ├── README.md ├── lectures ├── Alexander Rush. Conditional Text Generation and Pretraining.pdf ├── Joao Sedoc. Evaluating Conversational Agents.pdf └── README.md ├── project_ideas.md ├── remote_jupyter.md └── tutorials ├── README.md ├── deeppavlov_track ├── README.md ├── Tutorial_1_Introduction_to_Tensorflow.ipynb ├── Tutorial_2_Sentence_classification_with_word_embeddings.ipynb ├── Tutorial_Day_2_seq2seq.ipynb ├── Tutorial_Day_3_Fine_Tuning_BERT.ipynb ├── Tutorial_Day_4_Transformer_BERT_text_generation.ipynb ├── Tutorial_Day_5_Serving_with_DeepPavlov.ipynb └── img │ ├── beam_search_vs_human.png │ ├── bert_ner_diagram.png │ ├── decoding.png │ └── seq2seq_training.png └── pytorch_track ├── tutorial1_intro_pytorch.ipynb ├── tutorial2_sentnece_classification.ipynb ├── tutorial3_seq2seq_dialog.ipynb ├── tutorial4_finetuning_bert.ipynb ├── tutorial5_serving_models.py ├── tutorial5_telegram.ipynb └── tutorial_6 ├── config.json ├── dataset.py ├── requirements.txt ├── transformer ├── Beam.py ├── Constants.py ├── Layers.py ├── Models.py ├── Modules.py ├── Optim.py ├── SubLayers.py └── Translator.py └── transformer_tutorial.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Linux template 3 | *~ 4 | 5 | # temporary files which can be created if a process still has a handle open of a deleted file 6 | .fuse_hidden* 7 | 8 | # KDE directory preferences 9 | .directory 10 | 11 | # Linux trash folder which might appear on any partition or disk 12 | .Trash-* 13 | 14 | # .nfs files are created when an open file is removed but is still being accessed 15 | .nfs* 16 | ### macOS template 17 | # General 18 | .DS_Store 19 | .AppleDouble 20 | .LSOverride 21 | 22 | # Icon must end with two \r 23 | Icon 24 | 25 | # Thumbnails 26 | ._* 27 | 28 | # Files that might appear in the root of a volume 29 | .DocumentRevisions-V100 30 | .fseventsd 31 | .Spotlight-V100 32 | .TemporaryItems 33 | .Trashes 34 | .VolumeIcon.icns 35 | .com.apple.timemachine.donotpresent 36 | 37 | # Directories potentially created on remote AFP share 38 | .AppleDB 39 | .AppleDesktop 40 | Network Trash Folder 41 | Temporary Items 42 | .apdisk 43 | ### JetBrains template 44 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 45 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 46 | 47 | # User-specific stuff 48 | .idea/**/workspace.xml 49 | .idea/**/tasks.xml 50 | .idea/**/usage.statistics.xml 51 | .idea/**/dictionaries 52 | .idea/**/shelf 53 | 54 | # Sensitive or high-churn files 55 | .idea/**/dataSources/ 56 | .idea/**/dataSources.ids 57 | .idea/**/dataSources.local.xml 58 | .idea/**/sqlDataSources.xml 59 | .idea/**/dynamic.xml 60 | .idea/**/uiDesigner.xml 61 | .idea/**/dbnavigator.xml 62 | 63 | # Gradle 64 | .idea/**/gradle.xml 65 | .idea/**/libraries 66 | 67 | # Gradle and Maven with auto-import 68 | # When using Gradle or Maven with auto-import, you should exclude module files, 69 | # since they will be recreated, and may cause churn. Uncomment if using 70 | # auto-import. 
71 | # .idea/modules.xml 72 | # .idea/*.iml 73 | # .idea/modules 74 | 75 | # CMake 76 | cmake-build-*/ 77 | 78 | # Mongo Explorer plugin 79 | .idea/**/mongoSettings.xml 80 | 81 | # File-based project format 82 | *.iws 83 | 84 | # IntelliJ 85 | out/ 86 | 87 | # mpeltonen/sbt-idea plugin 88 | .idea_modules/ 89 | 90 | # JIRA plugin 91 | atlassian-ide-plugin.xml 92 | 93 | # Cursive Clojure plugin 94 | .idea/replstate.xml 95 | 96 | # Crashlytics plugin (for Android Studio and IntelliJ) 97 | com_crashlytics_export_strings.xml 98 | crashlytics.properties 99 | crashlytics-build.properties 100 | fabric.properties 101 | 102 | # Editor-based Rest Client 103 | .idea/httpRequests 104 | ### Python template 105 | # Byte-compiled / optimized / DLL files 106 | __pycache__/ 107 | *.py[cod] 108 | *$py.class 109 | 110 | # C extensions 111 | *.so 112 | 113 | # Distribution / packaging 114 | .Python 115 | build/ 116 | develop-eggs/ 117 | dist/ 118 | downloads/ 119 | eggs/ 120 | .eggs/ 121 | lib/ 122 | lib64/ 123 | parts/ 124 | sdist/ 125 | var/ 126 | wheels/ 127 | *.egg-info/ 128 | .installed.cfg 129 | *.egg 130 | MANIFEST 131 | 132 | # PyInstaller 133 | # Usually these files are written by a python script from a template 134 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 135 | *.manifest 136 | *.spec 137 | 138 | # Installer logs 139 | pip-log.txt 140 | pip-delete-this-directory.txt 141 | 142 | # Unit test / coverage reports 143 | htmlcov/ 144 | .tox/ 145 | .coverage 146 | .coverage.* 147 | .cache 148 | nosetests.xml 149 | coverage.xml 150 | *.cover 151 | .hypothesis/ 152 | .pytest_cache/ 153 | 154 | # Translations 155 | *.mo 156 | *.pot 157 | 158 | # Django stuff: 159 | *.log 160 | local_settings.py 161 | db.sqlite3 162 | 163 | # Flask stuff: 164 | instance/ 165 | .webassets-cache 166 | 167 | # Scrapy stuff: 168 | .scrapy 169 | 170 | # Sphinx documentation 171 | docs/_build/ 172 | 173 | # PyBuilder 174 | target/ 175 | 176 | # Jupyter Notebook 177 | .ipynb_checkpoints 178 | 179 | # pyenv 180 | .python-version 181 | 182 | # celery beat schedule file 183 | celerybeat-schedule 184 | 185 | # SageMath parsed files 186 | *.sage.py 187 | 188 | # Environments 189 | .env 190 | .venv 191 | env/ 192 | venv/ 193 | ENV/ 194 | env.bak/ 195 | venv.bak/ 196 | 197 | # Spyder project settings 198 | .spyderproject 199 | .spyproject 200 | 201 | # Rope project settings 202 | .ropeproject 203 | 204 | # mkdocs documentation 205 | /site 206 | 207 | # mypy 208 | .mypy_cache/ 209 | 210 | 211 | data/ 212 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | CISS 2019 Materials 2 | =================== 3 | 4 | Lectures: 5 | * [lectures/](lectures/) 6 | 7 | Tutorials: 8 | * [tutorials/](tutorials/) 9 | 10 | Materials: 11 | * [Project Ideas](project_ideas.md) 12 | 13 | 14 | -------------------------------------------------------------------------------- /lectures/Alexander Rush. Conditional Text Generation and Pretraining.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/text-machine-lab/ciss2_materials/0f2423eac4c4cbdb1c3e19452203b7d6f207b1dc/lectures/Alexander Rush. Conditional Text Generation and Pretraining.pdf -------------------------------------------------------------------------------- /lectures/Joao Sedoc. 
Evaluating Conversational Agents.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/text-machine-lab/ciss2_materials/0f2423eac4c4cbdb1c3e19452203b7d6f207b1dc/lectures/Joao Sedoc. Evaluating Conversational Agents.pdf -------------------------------------------------------------------------------- /lectures/README.md: -------------------------------------------------------------------------------- 1 | ## Day 1: 2 | * Lecture 1: Neural Networks ([slides](https://docs.google.com/presentation/d/1NlNsgNPN2MiVWW8HOIk2CWi_IkXPnHHTV8MGmQEmMss/edit?usp=sharing), [video](https://echo360.org/media/98fb71cf-b104-444c-a114-43870e6a90c2/public)) 3 | 4 | * Lecture 2: Representing words ([slides](https://docs.google.com/presentation/d/12MFFqeaMw8uaME_eqVjx9Ua29V8HoG1GhOSpJ_Y0nP4/edit?usp=sharing), [video](https://echo360.org/media/3cd73a3c-6c74-4a36-897d-e37e80f472b2/public)) 5 | 6 | * Invited talk: Alexander Rush, Conditional Text Generation and Pretraining. ([slides](Alexander%20Rush.%20Conditional%20Text%20Generation%20and%20Pretraining.pdf), [video](https://uml.mediasite.com/Mediasite/Play/9b77f879b01a4679a5122f109957506d1d) (participants will receive the password via email)) 7 | 8 | ## Day 2 9 | * Lecture 3: Convolutional Neural Networks ([slides](https://docs.google.com/presentation/d/1G60Wv4eEpcouO2848A-8KxDAz4C7-mS1BpsEg2msKsk/edit?usp=sharing), [video](https://echo360.org/media/237e4c2e-e402-4ba3-9e4f-c13b511da39e/public)) 10 | 11 | * Lecture 4: Recurrent Neural Networks ([slides](https://docs.google.com/presentation/d/1FRWbtzmaSj-adKm_QLRYZntI6BJYrgcnTYxhP-O2hJY/edit?usp=sharing), [video](https://echo360.org/media/951e7be3-a5d2-464c-ae25-f2456ce55442/public)) 12 | 13 | * Invited talk: João Sedoc, Evaluating Conversational Agents ([slides](Joao%20Sedoc.%20Evaluating%20Conversational%20Agents.pdf), [video](https://echo360.org/media/2cdc409e-3075-4d6e-8fa0-bd287363b587/public)) 14 | 15 | ## Day 3 16 | 17 | * Lecture 4.75: Attention: A Quick Recap ([slides](https://docs.google.com/presentation/d/1_PLMA-c_hSs_0tS10yVU1kX4N6gLMXmy6vZCE8chWEk/edit?usp=sharing), [video](https://echo360.org/media/6212dbbc-21c8-418e-b0ae-85ea95dc41e2/public)) 18 | 19 | * Lecture 5: Transformers ([slides](https://docs.google.com/presentation/d/1cg18KSHtgtkewC5srMuRTGuFL8k3rmaweuwlINSLXbs/edit?usp=sharing), [video](https://echo360.org/media/9aa918f6-99b0-4f31-b264-19c6111c8759/public)) 20 | 21 | * Lecture 6: Contextualized embeddings: ELMo, GPT, BERT ([slides](https://docs.google.com/presentation/d/14dsuG-btGgvQ6IUF2ZNRjRZ9qma1oQCUrVuFcn5vVAw/edit?usp=sharing), [video](https://echo360.org/media/c986c7eb-f8da-4720-8e63-d4913c2fdb12/public)) 22 | 23 | * Invited talk: Kate Saenko, Grounding Language in Pixels ([slides](https://drive.google.com/file/d/1G6DXv5JHtrvpuJtxbDLUJuKBrm6vJT8a/view?usp=drive_web), [video](https://echo360.org/media/436cb96f-5585-4c20-82ff-89cd8d093490/public)) 24 | 25 | ## Day 4 26 | 27 | * Lecture 7: Memory-based models. 
External knowledge integration ([slides](https://docs.google.com/presentation/d/10ENMJINp50US2VLTRbv_gIv3N3vogNbwFmKQBq_C9B4/edit?usp=sharing), [video](https://echo360.org/media/3a8e9c57-11bd-4441-9301-c577cd0676d9/public))
28 | * Lecture 8: Deep Question Answering ([slides](https://docs.google.com/presentation/d/1Gy-SWO18fJo3mFEta6lq9ovEjEhcH05ZN876qwF1W48/edit?usp=sharing), [video](https://echo360.org/media/c33d748f-66b4-465b-abe4-0aeda24d5049/public))
29 |
30 | ## Day 5
31 |
32 | * Lecture 10: Multi-skill conversational agents ([slides](https://docs.google.com/presentation/d/1vp_-V_Qe9HmA0j_yHw11X0Fg0vC0BVvSCiUcEqETk8Y/edit?usp=sharing), [video](https://echo360.org/media/17cf25aa-a82d-4159-b4a4-76eadf1f7715/public))
33 |
34 | * Lecture 11: Hierarchy in neural dialogue models ([slides](https://docs.google.com/presentation/d/1K9IAnExUJD5FdL3cHVMIFrtaFO8tt1NJIxkqVPh9kn4/edit?usp=sharing), [video](https://echo360.org/media/f15053bb-1ce6-4762-b72e-833c3e120ab3/public))
35 |
36 | * Lecture 12: Dialogue diversity ([slides](https://docs.google.com/presentation/d/1qQT3ihVJtHyczyJnKZSVxHl8CK-lq2TKF6wuqPR-NDM/edit?usp=sharing), [video](https://echo360.org/media/e08a929d-e75c-4212-a635-f9b109fa825b/public))
37 |
38 | * Invited talk: Jason Weston, Putting Together the Threads of Conversational AI? ([video](https://echo360.org/media/55668c92-97cc-44a5-acac-354dc383c840/public))
39 |
--------------------------------------------------------------------------------
/project_ideas.md:
--------------------------------------------------------------------------------
1 | # Project Ideas
2 |
3 | This page collects several possible project ideas to help you figure out what you might want to work on.
4 |
5 |
6 | ## Ideas
7 |
8 | - Encoder/decoder transformer-based chatbot
9 |   - This [huggingface/pytorch-pretrained-BERT](https://github.com/huggingface/pytorch-pretrained-BERT) repository contains several Transformer-based models that you can use.
10 | - HRED-based sequence-to-sequence architecture
11 |   - There is a PyTorch implementation of [HRED](https://github.com/hsgodhia/hred)
12 | - A chatbot with a persona
13 |   - [How to build a State-of-the-Art Conversational AI with Transfer Learning](https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313)
14 | - Emotions and emojis 🙈
15 |   - [Understanding emotions — from Keras to pyTorch](https://medium.com/huggingface/understanding-emotions-from-keras-to-pytorch-3ccb61d5a983)
16 | - Dialog State Tracking Challenge
17 |   - [Web-page](https://www.microsoft.com/en-us/research/event/dialog-state-tracking-challenge/)
18 | - Visual Dialog Challenge
19 |   - [Web-page](https://visualdialog.org/challenge/2018)
20 | - Visual Question Answering
21 |   - [Web-page](https://visualqa.org/)
22 | - Generate discussions on current affairs (like two or more agents posting on a forum)
23 |   - You can start with this [Web-page](https://www.kaggle.com/aashita/training-a-bot-to-comment-on-current-affairs)
24 |
25 |
26 | ## Be creative!
27 |
28 | Choose a project you have always wanted to work on but never had the time for! Remember, anything* can be framed as a question answering task, and, furthermore, as a dialog!
29 |
30 | \*According to Socher
31 |
32 | ## Datasets
33 |
34 | Below is a list of datasets that you can use in your projects:
35 |
36 | https://docs.google.com/document/d/1QVVX0YV5_ebH5M9XUtT7VveD0v2wVQy69DeKkWqhcz4/edit?usp=drivesdk
37 |
38 |
39 |
40 | ## Tips
41 | The tutorial sessions will show you how to do basic things, such as loading data, constructing a model, training and testing.
42 | Unless you are already an advanced learner, it may be a good idea to expand the code from the tutorials for your project.
43 |
44 | The dataset can be huge, and our time is limited. In early development, it may be wise to use a small subset of your training data. Do not make each development cycle hours long.
45 |
46 | An innovative project may have an innovative task, or an innovative model, or both. Something is better than nothing. After you make things work, you will have more ideas, and you can always add things on top of it.
47 |
--------------------------------------------------------------------------------
/remote_jupyter.md:
--------------------------------------------------------------------------------
1 | You can set up Jupyter in such a way that you can connect to it remotely
2 | from any computer (e.g., from home). This is very convenient,
3 | as you don't need to be in the lab to work on your project.
4 |
5 | Since the lab machines are not accessible from the outside network,
6 | you need to first connect to the cs server (cs.uml.edu) and then connect
7 | to a lab machine (e.g. dan417-01.uml.edu). Moreover, you need to
8 | forward the Jupyter port (8888) to your local machine.
9 |
10 | Below are instructions on how to achieve this on a Mac or Linux system.
11 | Windows users can do the same using [putty](https://putty.org/).
12 |
13 | Essentially, we need to edit the ssh client config file,
14 | located in your home dir: `~/.ssh/config`.
15 | This file contains configuration options that are going to be used
16 | while connecting to a specific server. Note that this is a file on your local machine (your laptop).
17 |
18 | Open this file in your favourite editor (or create it if it does not exist) and type the following:
19 | ```
20 | Host cs
21 |     HostName cs.uml.edu
22 |     User your_username
23 | ```
24 | where `your_username` is your actual user name on the cs server. Save the changes and close the file.
25 | Now, if you type `ssh cs` in the terminal, the ssh client understands that the hostname should be `cs.uml.edu`,
26 | and the username should be `your_username`. Similarly, you can specify other connection options
27 | under the `Host cs` directive.
28 |
29 | Next, we will specify the port forwarding option, as well as the proxy connection using the cs server.
30 | Open the `~/.ssh/config` file again and insert the code below at the end of the file:
31 | ```
32 | Host dan417-01
33 |     Hostname dan417-01.uml.edu
34 |     User your_username
35 |     LocalForward 8888 127.0.0.1:8888
36 |     ProxyJump cs
37 | ```
38 |
39 | The `LocalForward` option specifies that connections to port `8888` on your local machine should be
40 | forwarded to the address `127.0.0.1:8888` on the remote machine. Since Jupyter listens on this address by default,
41 | if you open the browser on your local machine and go to http://127.0.0.1:8888, you will connect to the Jupyter
42 | running on the remote server.
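
With this setup, connecting takes a single command. Below is a minimal sketch (it assumes Jupyter is already installed on the lab machine; pick a different port if 8888 is taken):
```
# on your laptop: connect to the lab machine through the cs server
ssh dan417-01

# then, on the lab machine: start Jupyter without opening a browser
jupyter notebook --no-browser --port 8888
```
Jupyter will print a URL containing an access token; open that URL in the browser on your laptop.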
43 | -------------------------------------------------------------------------------- /tutorials/README.md: -------------------------------------------------------------------------------- 1 | # Videos: 2 | 3 | ## PyTorch track 4 | 5 | * [Intro to PyTorch](https://uml.mediasite.com/Mediasite/Play/66b4d7b9a4f44b2187809abcdc63c4bf1d) 6 | * [Sentence classification with word embeddings](https://uml.mediasite.com/Mediasite/Play/43c9bf608cb7480883626e8d97541a8a1d) 7 | * [Seq2seq](https://uml.mediasite.com/Mediasite/Play/d39705e0241f43cd868e2d1b152cd31e1d) 8 | * [BERT](https://uml.mediasite.com/Mediasite/Play/209e6b07904d429dbce1d0fdb92d59781d) 9 | * [Transformer-based chit-chat](https://uml.mediasite.com/Mediasite/Play/049fe8d3bc7b4e75b7dc145eead7568d1d) 10 | 11 | ## TensorFlow track 12 | * [Intro to TensorFlow](https://uml.mediasite.com/Mediasite/Play/42e45dc1b01245d2a2c23b4de984618d1d) 13 | * [Sentence classification with word embeddings](https://uml.mediasite.com/Mediasite/Play/4eb5149c7021491bbc4119dd456f9d2a1d) 14 | * [Seq2seq](https://uml.mediasite.com/Mediasite/Play/cd131dff73dd404ab8dfaef21aeaa0301d) 15 | * [BERT](https://uml.mediasite.com/Mediasite/Play/c0dfd827d65449388360468e10eaf8f61d) 16 | * [Transformer-based chit-chat](https://uml.mediasite.com/Mediasite/Play/bc26c5b8c10e4e429cbb38c7ad03dd501d) 17 | -------------------------------------------------------------------------------- /tutorials/deeppavlov_track/README.md: -------------------------------------------------------------------------------- 1 | # Tutorials links 2 | 3 | 1. Tutorial 1. Introduction to TensorFlow: https://colab.research.google.com/drive/10i1tovcAXjIRoPI8IP5flrhoGuZLUSRe 4 | 5 | 2. Tutorial 2. Sentence classification with word embeddings: https://colab.research.google.com/drive/1Dnr3wC3FBf4KS0GOVNlEbp5fg74f0FM1 6 | 7 | 3. Tutorial 3. Sequence to sequence: https://colab.research.google.com/drive/135BsS9VWUgIHwfTviKgFWRuBvMDHJAjD 8 | 9 | 4. Tutorial 4. [Fine Tuning BERT](https://colab.research.google.com/github/text-machine-lab/ciss2_materials/blob/master/tutorials/deeppavlov_track/Tutorial_Day_3_Fine_Tuning_BERT.ipynb) 10 | 11 | 5. Tutorial 5. [Transformer BERT for text generation](https://colab.research.google.com/github/text-machine-lab/ciss2_materials/blob/master/tutorials/deeppavlov_track/Tutorial_Day_4_Transformer_BERT_text_generation.ipynb) 12 | 13 | 6. Tutorial 6. 
[Serving with DeepPavlov](https://colab.research.google.com/github/text-machine-lab/ciss2_materials/blob/master/tutorials/deeppavlov_track/Tutorial_Day_5_Serving_with_DeepPavlov.ipynb)
14 |
--------------------------------------------------------------------------------
/tutorials/deeppavlov_track/Tutorial_Day_5_Serving_with_DeepPavlov.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "5dGp8dHi_1BU"
8 | },
9 | "source": [
10 | "# Models serving with DeepPavlov\n",
11 | "\n",
12 | "DeepPavlov supports out-of-the-box serving for both pre-trained and custom models.\n",
13 | "Serving can be done with:\n",
14 | "* [REST API](http://docs.deeppavlov.ai/en/master/intro/features.html#examples-of-some-components)\n",
15 | "* [Telegram](http://docs.deeppavlov.ai/en/master/intro/features.html#examples-of-some-components)\n",
16 | "* [Amazon Alexa](http://docs.deeppavlov.ai/en/master/devguides/amazon_alexa.html)\n",
17 | "* [Microsoft Bot Framework](http://docs.deeppavlov.ai/en/master/devguides/ms_bot_integration.html)\n",
18 | "  * Bing, Cortana, Email, Facebook Messenger, Slack, GroupMe, Microsoft Teams, Skype, Telegram, Twilio, Web Chat\n",
19 | "* [Yandex Alice](http://docs.deeppavlov.ai/en/master/devguides/yandex_alice.html)\n"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {
25 | "colab_type": "text",
26 | "id": "xDMqRTTORX3k"
27 | },
28 | "source": [
29 | "## Serving DeepPavlov pre-trained models\n",
30 | "\n",
31 | "\n",
32 | "DeepPavlov has one-line commands to serve models.\n",
33 | "\n",
34 | "Run a model in the CLI:\n",
35 | "```\n",
36 | "python -m deeppavlov interact model_config\n",
37 | "```\n",
38 | "\n",
39 | "Serve a model with a REST API:\n",
40 | "```\n",
41 | "python -m deeppavlov riseapi model_config\n",
42 | "```\n",
43 | "\n",
44 | "Serve a model with Telegram:\n",
45 | "```\n",
46 | "python -m deeppavlov interactbot model_config -t \n",
47 | "```\n",
48 | "\n",
49 | "\n",
50 | "Let's try some of them for the Goal Oriented bot trained on the DSTC 2 dataset. This bot is trained to suggest restaurants in the Cambridge area.\n"
51 | ]
52 | },
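{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sketch of the REST option (not part of the original tutorial; the exact route, port, and payload depend on your DeepPavlov version, so check the `riseapi` startup log), once `python -m deeppavlov riseapi gobot_dstc2` is running you could query the model from Python with the `requests` package:\n",
"```python\n",
"import requests\n",
"\n",
"# 'x' is the input variable name used in DeepPavlov chainer configs\n",
"resp = requests.post('http://127.0.0.1:5000/model', json={'x': ['i want some cheap food']})\n",
"print(resp.json())\n",
"```"
]
},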
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {
56 | "colab_type": "text",
57 | "id": "7ggNsOzzVqHu"
58 | },
59 | "source": [
60 | "Install the DeepPavlov library:"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {
67 | "colab": {},
68 | "colab_type": "code",
69 | "id": "WzLfe9wBUjYU"
70 | },
71 | "outputs": [],
72 | "source": [
73 | "! pip install deeppavlov"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {
80 | "colab": {},
81 | "colab_type": "code",
82 | "id": "d9iIw6wnVlxH"
83 | },
84 | "outputs": [],
85 | "source": [
86 | "import deeppavlov"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {
92 | "colab_type": "text",
93 | "id": "bbEEbgbJVwLU"
94 | },
95 | "source": [
96 | "Install the requirements for the Goal Oriented bot:"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {
103 | "colab": {},
104 | "colab_type": "code",
105 | "id": "wxBXKBMWVjBt"
106 | },
107 | "outputs": [],
108 | "source": [
109 | "! python -m deeppavlov install gobot_dstc2"
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "metadata": {
115 | "colab_type": "text",
116 | "id": "OGTYPJzeWV0T"
117 | },
118 | "source": [
119 | "Download the pre-trained model:"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {
126 | "colab": {},
127 | "colab_type": "code",
128 | "id": "z9glfej-WgBw"
129 | },
130 | "outputs": [],
131 | "source": [
132 | "! python -m deeppavlov download gobot_dstc2"
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {
138 | "colab_type": "text",
139 | "id": "futyYStQWmNi"
140 | },
141 | "source": [
142 | "Run it with the CLI:"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {
149 | "colab": {},
150 | "colab_type": "code",
151 | "id": "HUoI6tKjW_vI"
152 | },
153 | "outputs": [],
154 | "source": [
155 | "! python -m deeppavlov interact gobot_dstc2"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {
161 | "colab_type": "text",
162 | "id": "XfywF_PNY4Jm"
163 | },
164 | "source": [
165 | "Serving with Telegram:\n",
166 | "```\n",
167 | "python -m deeppavlov interactbot gobot_dstc2 -t \n",
168 | "```\n",
169 | "\n",
170 | "A Telegram token can be created with the @BotFather bot; see the details at this [link](https://core.telegram.org/bots#3-how-do-i-create-a-bot).\n",
171 | "\n",
172 | "Once you have a Telegram token, you can run the Goal Oriented bot."
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {
179 | "colab": {},
180 | "colab_type": "code",
181 | "id": "KHNZB6CUZvqu"
182 | },
183 | "outputs": [],
184 | "source": [
185 | "! python -m deeppavlov interactbot gobot_dstc2 -t "
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {
191 | "colab_type": "text",
192 | "id": "VfkgYpbfYsNh"
193 | },
194 | "source": [
195 | "## Serving custom models\n",
196 | "\n",
197 | "We have already discussed how to serve pre-trained DeepPavlov models. But how do we use DeepPavlov to serve custom ones?"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {
203 | "colab_type": "text",
204 | "id": "BrIbVM-ye5ig"
205 | },
206 | "source": [
207 | "### Say Hi Example\n",
208 | "\n",
209 | "Let's consider a simple example:"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {
216 | "colab": {},
217 | "colab_type": "code",
218 | "id": "qkDKZndxaNaw"
219 | },
220 | "outputs": [],
221 | "source": [
222 | "class SayHiModel:\n",
223 | "    def __init__(self, *args, **kwargs):\n",
224 | "        pass\n",
225 | "    \n",
226 | "    def __call__(self, input_texts):\n",
227 | "        '''\n",
228 | "        __call__ method should return responses for each utterance in input_texts\n",
229 | "        '''\n",
230 | "        output_text = []\n",
231 | "        for text in input_texts:\n",
232 | "            output_text.append('Hi!')\n",
233 | "        return output_text"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {
239 | "colab_type": "text",
240 | "id": "V9RV6HgIcpPZ"
241 | },
242 | "source": [
243 | "Here we define a utility function that generates a minimal model configuration; the DeepPavlov library needs a configuration of this kind in order to serve a model.\n",
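"\n",
"For example, for `SayHiModel` the function below produces a dict equivalent to the following sketch (`__main__:SayHiModel` tells DeepPavlov to look the class up in the current module):\n",
"```python\n",
"{'chainer': {'in': ['x'],\n",
"             'out': ['y'],\n",
"             'pipe': [{'class_name': '__main__:SayHiModel',\n",
"                       'in': ['x'],\n",
"                       'out': ['y']}]}}\n",
"```"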
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {
250 | "colab": {},
251 | "colab_type": "code",
252 | "id": "7efSG1yhbVXi"
253 | },
254 | "outputs": [],
255 | "source": [
256 | "def generate_config(class_name):\n",
257 | "    \"\"\"generate the minimal required DeepPavlov model configuration\"\"\"\n",
258 | "    \n",
259 | "    config = {\n",
260 | "        'chainer': {\n",
261 | "            'in': ['x'],\n",
262 | "            'out': ['y'],\n",
263 | "            'pipe': [\n",
264 | "                {\n",
265 | "                    'class_name': f'__main__:{class_name}',\n",
266 | "                    'in': ['x'],\n",
267 | "                    'out': ['y']\n",
268 | "                }\n",
269 | "            ]\n",
270 | "        }\n",
271 | "    }\n",
272 | "    return config"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {
278 | "colab_type": "text",
279 | "id": "P1ITXwmQeOey"
280 | },
281 | "source": [
282 | "Serving with the Python API:"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": null,
288 | "metadata": {
289 | "colab": {},
290 | "colab_type": "code",
291 | "id": "eGvf2K8WcB0A"
292 | },
293 | "outputs": [],
294 | "source": [
295 | "# to interact with the CLI\n",
296 | "from deeppavlov.core.commands.infer import interact_model\n",
297 | "# to interact with Telegram\n",
298 | "from deeppavlov.utils.telegram.telegram_ui import interact_model_by_telegram\n"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {
305 | "colab": {},
306 | "colab_type": "code",
307 | "id": "_Op0z9yGdqfy"
308 | },
309 | "outputs": [],
310 | "source": [
311 | "interact_model(generate_config('SayHiModel'))"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "metadata": {
318 | "colab": {},
319 | "colab_type": "code",
320 | "id": "APiZBIdheU7o"
321 | },
322 | "outputs": [],
323 | "source": [
324 | "interact_model_by_telegram(generate_config('SayHiModel'), token='YOUR_TOKEN')"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {
330 | "colab_type": "text",
331 | "id": "0zFMQtPRfyrD"
332 | },
333 | "source": [
334 | "### Serving the BERT Generator from the Day 4 Tutorial\n",
335 | "\n",
336 | "Install the requirements and download the model:"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "metadata": {
343 | "colab": {},
344 | "colab_type": "code",
345 | "id": "02JpuqCLithf"
346 | },
347 | "outputs": [],
348 | "source": [
349 | "! pip install git+https://github.com/deepmipt/bert.git@feat/multi_gpu\n",
350 | "! wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip\n",
351 | "! unzip uncased_L-12_H-768_A-12.zip"
352 | ]
353 | },
354 | {
355 | "cell_type": "markdown",
356 | "metadata": {
357 | "colab_type": "text",
358 | "id": "ZJPCe0xFi8LI"
359 | },
360 | "source": [
361 | "Define all the required code from the Day 4 tutorial in a single cell:"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": null,
367 | "metadata": {
368 | "colab": {},
369 | "colab_type": "code",
370 | "id": "n0DTAAp6i58M"
371 | },
372 | "outputs": [],
373 | "source": [
374 | "import deeppavlov\n",
375 | "from deeppavlov.models.preprocessors.bert_preprocessor import BertPreprocessor\n",
376 | "\n",
377 | "from bert_dp import modeling\n",
378 | "\n",
379 | "\n",
380 | "BERT_MODEL_PATH = './uncased_L-12_H-768_A-12/'\n",
381 | "\n",
382 | "bert_config = modeling.BertConfig.from_json_file(BERT_MODEL_PATH + 'bert_config.json')\n",
383 | "\n",
384 | "import tensorflow as tf\n",
385 | "\n",
386 | "# we should define placeholders for the BERT model\n",
387 | "input_ids_ph = tf.placeholder(shape=(None, None), dtype=tf.int32)\n",
388 | "input_masks_ph = tf.placeholder(shape=(None, None), dtype=tf.int32)\n",
389 | "token_types_ph = tf.placeholder(shape=(None, None), dtype=tf.int32)\n",
390 | "is_train_ph = tf.placeholder_with_default(False, shape=[])\n",
391 | "\n",
392 | "# this will build the TensorFlow graph for the BERT model\n",
393 | "bert_model = modeling.BertModel(config=bert_config,\n",
394 | "                                is_training=is_train_ph,\n",
395 | "                                input_ids=input_ids_ph,\n",
396 | "                                input_mask=input_masks_ph,\n",
397 | "                                token_type_ids=token_types_ph,\n",
398 | "                                use_one_hot_embeddings=False)\n",
399 | "\n",
400 | "def gather_indexes(sequence_tensor, positions):\n",
401 | "    \"\"\"Gathers the vectors at the specific positions over a minibatch.\"\"\"\n",
402 | "    sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3)\n",
403 | "    batch_size = sequence_shape[0]\n",
404 | "    seq_length = sequence_shape[1]\n",
405 | "    width = sequence_shape[2]\n",
406 | "\n",
407 | "    flat_offsets = tf.reshape(\n",
408 | "        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])\n",
409 | "    flat_positions = tf.reshape(positions + flat_offsets, [-1])\n",
410 | "    flat_sequence_tensor = tf.reshape(sequence_tensor,\n",
411 | "                                      [batch_size * seq_length, width])\n",
412 | "    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)\n",
413 | "    return output_tensor\n",
414 | "\n",
415 | "def get_masked_lm_output(bert_config, input_tensor, output_weights, positions):\n",
416 | "    \"\"\"Get probabilities for the masked LM.\n",
417 | "    \n",
418 | "    bert_config - instance of BertConfig\n",
419 | "    input_tensor - output of bert_model.get_sequence_output()\n",
420 | "    output_weights - projection matrix, here we use the embeddings matrix and then transpose it\n",
421 | "    positions - positions of MASKED tokens, i.e. at which positions we want to make predictions\n",
422 | "    \"\"\"\n",
423 | "    input_tensor = gather_indexes(input_tensor, positions)\n",
424 | "\n",
425 | "    with tf.variable_scope(\"cls/predictions\"):\n",
426 | "        # We apply one more non-linear transformation before the output layer.\n",
427 | "        with tf.variable_scope(\"transform\"):\n",
428 | "            input_tensor = tf.layers.dense(\n",
429 | "                input_tensor,\n",
430 | "                units=bert_config.hidden_size,\n",
431 | "                activation=modeling.get_activation(bert_config.hidden_act),\n",
432 | "                kernel_initializer=modeling.create_initializer(\n",
433 | "                    bert_config.initializer_range))\n",
434 | "            input_tensor = modeling.layer_norm(input_tensor)\n",
435 | "\n",
436 | "        # The output weights are the same as the input embeddings, but there is\n",
437 | "        # an output-only bias for each token.\n",
438 | "        output_bias = tf.get_variable(\n",
439 | "            \"output_bias\",\n",
440 | "            shape=[bert_config.vocab_size],\n",
441 | "            initializer=tf.zeros_initializer())\n",
442 | "        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)\n",
443 | "        logits = tf.nn.bias_add(logits, output_bias)\n",
444 | "        probs = tf.nn.softmax(logits, axis=-1)\n",
445 | "\n",
446 | "    return probs\n",
447 | "    \n",
448 | "# define a placeholder for MASKED token positions\n",
449 | "masked_lm_positions_ph = tf.placeholder(shape=(None, None), dtype=tf.int32)\n",
450 | "\n",
451 | "# define predictions for MASKED tokens \n",
452 | "masked_lm_probs = get_masked_lm_output(bert_config, \n",
453 | "                                       bert_model.get_sequence_output(),\n",
454 | "                                       bert_model.get_embedding_table(),\n",
455 | "                                       masked_lm_positions_ph)\n",
456 | "\n",
457 | "# define the TensorFlow session\n",
458 | "sess_config = tf.ConfigProto(allow_soft_placement=True)\n",
459 | "sess_config.gpu_options.allow_growth = True\n",
460 | "sess = tf.Session(config=sess_config)\n",
461 | "\n",
462 | "init_checkpoint = BERT_MODEL_PATH + 'bert_model.ckpt'\n",
463 | "\n",
464 | "# load from the checkpoint\n",
465 | "tvars = tf.trainable_variables()\n",
466 | "assignment_map, initialized_variable_names = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)\n",
467 | "tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
468 | "\n",
469 | "sess.run(tf.global_variables_initializer())\n",
470 | "\n",
471 | "from bert_dp import tokenization\n",
472 | "\n",
473 | "tokenizer = tokenization.FullTokenizer(\n",
474 | "    vocab_file=BERT_MODEL_PATH + 'vocab.txt',\n",
475 | "    do_lower_case=True,\n",
476 | ")\n",
477 | "\n",
478 | "MASK_TOKEN = '[MASK]'\n",
479 | "MASK_ID = tokenizer.convert_tokens_to_ids([MASK_TOKEN])[0]\n",
480 | "\n",
481 | "from copy import deepcopy\n",
482 | "import numpy as np\n",
483 | "\n",
484 | "def append_tokens(input_example, token=MASK_TOKEN, token_id=MASK_ID, n=3):\n",
485 | "    \"\"\"\n",
486 | "    This function appends `token` to `input_example` `n` times.\n",
487 | "    Also, it maintains correct values for `input_mask`, `input_ids`, `input_type_ids`.\n",
488 | "    Don't forget that the [SEP] token is always the last token.\n",
489 | "    \n",
490 | "    input_example - result of BertPreprocessor with tokens, input_ids, ...\n",
491 | "    token - token to append\n",
492 | "    token_id - token id to append\n",
493 | "    n - how many times to append token to input_example\n",
494 | "    \"\"\"\n",
495 | "    input_example = deepcopy(input_example)\n",
496 | "    max_seq_len = len(input_example.input_mask)\n",
497 | "    input_len = sum(input_example.input_mask)\n",
498 | "    \n",
499 | "    # new_tokens = YOUR CODE HERE\n",
500 | "    new_tokens = (input_example.tokens[:input_len - 1] + [token] * n + input_example.tokens[input_len-1:])[:max_seq_len]\n",
501 | "    input_example.tokens = new_tokens\n",
502 | "    assert len(new_tokens) <= max_seq_len\n",
503 | "    \n",
504 | "    # new_input_mask = YOUR CODE HERE\n",
505 | "    new_input_mask = (input_example.input_mask[:input_len - 1] + [1] * n + input_example.input_mask[input_len-1:])[:max_seq_len]\n",
506 | "    input_example.input_mask = new_input_mask\n",
507 | "    assert len(new_input_mask) <= max_seq_len\n",
508 | "    \n",
509 | "    # new_input_ids = YOUR CODE HERE\n",
510 | "    new_input_ids = (input_example.input_ids[:input_len - 1] + [token_id] * n + input_example.input_ids[input_len-1:])[:max_seq_len]\n",
511 | "    input_example.input_ids = new_input_ids\n",
512 | "    assert len(new_input_ids) <= max_seq_len\n",
513 | "    \n",
514 | "    # new_input_type_ids = YOUR CODE HERE\n",
515 | "    new_input_type_ids = (input_example.input_type_ids[:input_len - 1] + [1] * n + input_example.input_type_ids[input_len-1:])[:max_seq_len]\n",
516 | "    input_example.input_type_ids = new_input_type_ids\n",
517 | "    assert len(new_input_type_ids) <= max_seq_len\n",
518 | "    \n",
519 | "    return input_example, [i for i in range(len(input_example.tokens)) if input_example.tokens[i] == MASK_TOKEN]\n",
520 | "    \n",
521 | "\n",
522 | "def generate_text(input_example, sampling_method='greedy', mask_tokens_n=3, max_generated_tokens=15):\n",
523 | "    \"\"\"\n",
524 | "    This function generates text using input_example as the initial text.\n",
525 | "    \n",
526 | "    Text generation stops when one of the ['.', '?', '!'] symbols is predicted or\n",
527 | "    the number of `max_generated_tokens` is reached.\n",
528 | "    \"\"\"\n",
529 | "    generated_example = deepcopy(input_example)\n",
530 | "    for i in range(max_generated_tokens):\n",
531 | "        # Firstly, we append [MASK] tokens to the end of the text.\n",
532 | "        # If mask_tokens_n is too small (e.g., 1) then the model will predict \".\" and generation will stop.\n",
533 | "        # This happens because BERT has learned that the last token in a sentence is usually \".\".\n",
534 | "        masked_input_example, masked_lm_positions = append_tokens(generated_example, n=mask_tokens_n)\n",
535 | "        \n",
536 | "        # get the distribution over the vocabulary for the first masked token\n",
537 | "        probs = sess.run(masked_lm_probs, feed_dict={\n",
538 | "            input_ids_ph: [masked_input_example.input_ids],\n",
539 | "            input_masks_ph: [masked_input_example.input_mask],\n",
540 | "            token_types_ph: [masked_input_example.input_type_ids],\n",
541 | "            masked_lm_positions_ph: [masked_lm_positions],\n",
542 | "        })[0]\n",
543 | "        \n",
544 | "        # sample a token from the vocabulary using probs\n",
545 | "        if sampling_method == 'greedy':\n",
546 | "            next_token_id = np.argmax(probs)\n",
547 | "        else:\n",
548 | "            next_token_id = sampling_method(probs)\n",
549 | "        \n",
550 | "        # append the generated token to the text\n",
551 | "        next_token = tokenizer.convert_ids_to_tokens([next_token_id])[0] \n",
552 | "        generated_example, _ = append_tokens(generated_example, token=next_token, token_id=next_token_id, n=1)\n",
553 | "        \n",
554 | "        if generated_example.tokens[-2] in ['.', '?', '!']:\n",
555 | "            break\n",
556 | "\n",
557 | "    return generated_example\n",
558 | "    \n",
559 | "\n",
560 | "def top_k_sampling(probs, k=10):\n",
561 | "    \"\"\"\n",
562 | "    Sample from the k tokens with the highest probabilities.\n",
563 | "    Don't forget to re-normalize the top k probs.\n",
564 | "    \"\"\"\n",
565 | "    #### YOUR CODE HERE START ####\n",
566 | "    # get the top k indices from probs\n",
567 | "    top_k_tokens_ids = np.argsort(probs)[::-1][:k]\n",
568 | "    # get the top k probabilities using top_k_tokens_ids\n",
569 | "    top_k_probs = probs[top_k_tokens_ids]\n",
570 | "    # make sure that the sum of top_k_probs == 1\n",
571 | "    top_k_probs = top_k_probs / sum(top_k_probs)\n",
572 | "    #### YOUR CODE HERE END ####\n",
573 | "    return top_k_tokens_ids[np.argmax(np.random.multinomial(n=1, pvals=top_k_probs))]\n"
574 | ]
575 | },
576 | {
577 | "cell_type": "markdown",
578 | "metadata": {
579 | "colab_type": "text",
580 | "id": "o8_tHTBWoiaW"
581 | },
582 | "source": [
583 | "Define the BERT generator model:"
584 | ]
585 | },
586 | {
587 | "cell_type": "code",
588 | "execution_count": null,
589 | "metadata": {
590 | "colab": {},
591 | "colab_type": "code",
592 | "id": "IDhsH35-rd9o"
593 | },
594 | "outputs": [],
595 | "source": [
596 | "class BertGenerator:\n",
597 | "    def __init__(self, *args, **kwargs):\n",
598 | "        self.bp = BertPreprocessor(vocab_file=BERT_MODEL_PATH + 'vocab.txt', do_lower_case=True, max_seq_length=32)\n",
599 | "    \n",
600 | "    def __call__(self, input_texts):\n",
601 | "        '''\n",
602 | "        __call__ method should return responses for each utterance in input_texts\n",
603 | "        '''\n",
604 | "        output_text = []\n",
605 | "        for text in input_texts:\n",
606 | "            input_example = self.bp(texts_a = [f'- {text}'], texts_b = ['- '])[0]\n",
607 | "\n",
608 | "            top_k_10_sampling = lambda x: top_k_sampling(x, 10)\n",
609 | "            generated_example = generate_text(input_example, sampling_method=top_k_10_sampling)\n",
610 | "            sep_index = generated_example.tokens.index('[SEP]')\n",
611 | "            response = ' '.join(generated_example.tokens[sep_index + 2:-1]).replace(' ##', '').replace('##', '')\n",
612 | "            output_text.append(response)\n",
613 | "        return output_text"
614 | ]
615 | },
616 | {
617 | "cell_type": "markdown",
618 | "metadata": {
619 | "colab_type": "text",
620 | "id": "i2yIy9bkoq4J"
621 | },
622 | "source": [
623 | "Interact with the CLI:"
624 | ]
625 | },
626 | {
627 | "cell_type": "code",
628 | "execution_count": null,
629 | "metadata": {
630 | "colab": {},
631 | "colab_type": "code",
632 | "id": "p5_il86NkmTR"
633 | },
634 | "outputs": [],
635 | "source": [
636 | "interact_model(generate_config('BertGenerator'))"
637 | ]
638 | },
639 | {
640 | "cell_type": "markdown",
641 | "metadata": {
642 | "colab_type": "text",
643 | "id": "NZiaw9d1ottZ"
644 | },
645 | "source": [
646 | "Interact with Telegram:"
647 | ]
648 | },
649 | {
650 | "cell_type": "code",
651 | "execution_count": null,
652 | "metadata": {
653 | "colab": {},
654 | "colab_type": "code",
655 | "id": "FMuIALP-tERC"
656 | },
657 | "outputs": [],
658 | "source": [
659 | "interact_model_by_telegram(generate_config('BertGenerator'), token='YOUR_TOKEN')"
660 | ]
661 | },
662 | {
663 | "cell_type": "code",
664 | "execution_count": null,
665 | "metadata": {},
666 | "outputs": [],
667 | "source": []
668 | }
669 | ],
670 | "metadata": {
671 | "accelerator": "GPU",
672 | "colab": {
673 | "name": "Copy of Tutorial_Day_4_Transformer_BERT_text_generation.ipynb",
674 | "provenance": [],
675 | "toc_visible": true,
676 | "version": "0.3.2"
677 | },
678 | "kernelspec": {
679 | "display_name": "Python 3",
680 | "language": "python",
681 | "name": "python3"
682 | },
683 | "language_info": {
684 | "codemirror_mode": {
685 | "name": "ipython",
686 | "version": 3
687 | },
688 | "file_extension": ".py",
689 | "mimetype": "text/x-python",
690 | "name": "python",
691 | "nbconvert_exporter": "python",
692 | "pygments_lexer": "ipython3",
693 | "version": "3.6.7"
694 | }
695 | },
"nbformat": 4, 697 | "nbformat_minor": 2 698 | } 699 | -------------------------------------------------------------------------------- /tutorials/deeppavlov_track/img/beam_search_vs_human.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/text-machine-lab/ciss2_materials/0f2423eac4c4cbdb1c3e19452203b7d6f207b1dc/tutorials/deeppavlov_track/img/beam_search_vs_human.png -------------------------------------------------------------------------------- /tutorials/deeppavlov_track/img/bert_ner_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/text-machine-lab/ciss2_materials/0f2423eac4c4cbdb1c3e19452203b7d6f207b1dc/tutorials/deeppavlov_track/img/bert_ner_diagram.png -------------------------------------------------------------------------------- /tutorials/deeppavlov_track/img/decoding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/text-machine-lab/ciss2_materials/0f2423eac4c4cbdb1c3e19452203b7d6f207b1dc/tutorials/deeppavlov_track/img/decoding.png -------------------------------------------------------------------------------- /tutorials/deeppavlov_track/img/seq2seq_training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/text-machine-lab/ciss2_materials/0f2423eac4c4cbdb1c3e19452203b7d6f207b1dc/tutorials/deeppavlov_track/img/seq2seq_training.png -------------------------------------------------------------------------------- /tutorials/pytorch_track/tutorial1_intro_pytorch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import torch\n", 11 | "import torch.utils.data\n", 12 | "import sklearn.datasets\n", 13 | "from sklearn.metrics import accuracy_score" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Introduction" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "As NumPy, PyTorch provides basic functions for creating tensors and common operations on them." 
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "a.device"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "a = a.to('cuda')"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "a.device"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {},
106 | "outputs": [],
107 | "source": [
108 | "# a + b"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "b = torch.full_like(a, 5)"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "a + b"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 | "## Neural Networks"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "Since PyTorch allows automatic differentiation, building neural networks with PyTorch is very easy.\n",
141 | "\n",
142 | "All the models implemented in PyTorch should subclass the [`torch.nn.Module` class](https://pytorch.org/docs/stable/nn.html?highlight=module#torch.nn.Module). The main method of this class (which is used by a lot of other PyTorch classes) is `forward()`. This is the core method that defines how your model is going to run and what outputs it should produce given the inputs. \n",
143 | "In the constructor of your model (the `__init__` method) you should initialize all the layers you are going to use. PyTorch provides a large number of commonly used layers that are very easy to use. Please refer to the [documentation of PyTorch](https://pytorch.org/docs/stable/nn.html) for a complete list of layers.\n",
144 | "\n",
145 | "Below we are going to declare a simple neural network with two layers and a ReLU activation function between them."
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "class Net(torch.nn.Module):\n",
155 | "    \"\"\"A basic neural network model with one hidden layer\"\"\"\n",
156 | "    def __init__(self, nb_features, hidden_size, nb_classes):\n",
157 | "        \"\"\"\n",
158 | "        Initialize the model class\n",
159 | "\n",
160 | "        :param nb_features: Number of input features\n",
161 | "        :param hidden_size: The size of the hidden layer\n",
162 | "        :param nb_classes: Number of classes for classification\n",
163 | "\n",
164 | "        \"\"\"\n",
165 | "\n",
166 | "        super().__init__()\n",
167 | "\n",
168 | "        self.fc1 = torch.nn.Linear(nb_features, hidden_size)\n",
169 | "        self.fc1_activ = torch.nn.ReLU()\n",
170 | "\n",
171 | "        self.fc_logits = torch.nn.Linear(hidden_size, nb_classes)\n",
172 | "\n",
173 | "    def forward(self, inputs):\n",
174 | "        \"\"\"\n",
175 | "        Perform the forward pass on the input data\n",
176 | "\n",
177 | "        :param inputs: input data\n",
178 | "\n",
179 | "        \"\"\"\n",
180 | "        z1 = self.fc1(inputs)\n",
181 | "        z1_active = self.fc1_activ(z1)\n",
182 | "\n",
183 | "        logits = self.fc_logits(z1_active)\n",
184 | "\n",
185 | "        return logits"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "metadata": {},
192 | "outputs": [],
193 | "source": [
194 | "model = Net(nb_features=4, hidden_size=8, nb_classes=3)"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "model"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "Let's test the model on a random input. Notice how the size of the input data corresponds to the size of the first layer and the size of the output corresponds to the size of the last layer."
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {},
217 | "outputs": [],
218 | "source": [
219 | "inputs = torch.rand(1, 4)"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {},
226 | "outputs": [],
227 | "source": [
228 | "inputs"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {},
235 | "outputs": [],
236 | "source": [
237 | "outputs = model(inputs)"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "outputs"
247 | ]
248 | },
249 | {
250 | "cell_type": "markdown",
251 | "metadata": {},
252 | "source": [
253 | "### Loss calculation"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {},
259 | "source": [
260 | "PyTorch has quite a few pre-defined loss functions that we can use. The most common loss functions are enumerated below:\n",
261 | " - [Mean Squared Error loss](https://pytorch.org/docs/stable/nn.html#torch.nn.MSELoss)\n",
262 | " - [Cross Entropy loss](https://pytorch.org/docs/stable/nn.html#torch.nn.CrossEntropyLoss)\n",
263 | " - [Binary Cross Entropy loss](https://pytorch.org/docs/stable/nn.html#torch.nn.BCELoss)\n",
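"\n",
"Note that `CrossEntropyLoss` expects raw logits (it applies the softmax internally) together with integer class indices rather than one-hot vectors. A minimal sketch:\n",
"```python\n",
"criterion = torch.nn.CrossEntropyLoss()\n",
"logits = torch.randn(4, 3)           # a batch of 4 examples with 3 classes, raw scores\n",
"labels = torch.tensor([0, 2, 1, 0])  # integer class indices\n",
"loss = criterion(logits, labels)\n",
"```"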
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": null,
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "targets = torch.rand_like(outputs)"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": null,
278 | "metadata": {},
279 | "outputs": [],
280 | "source": [
281 | "targets"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": null,
287 | "metadata": {},
288 | "outputs": [],
289 | "source": [
290 | "criterion = torch.nn.MSELoss()\n",
291 | "loss = criterion(outputs, targets)"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": null,
297 | "metadata": {},
298 | "outputs": [],
299 | "source": [
300 | "loss"
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {},
306 | "source": [
307 | "### Gradients"
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "metadata": {},
313 | "source": [
314 | "After calling `loss.backward()`, PyTorch performs the backward pass of the network and stores the gradients of the weights."
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": null,
320 | "metadata": {},
321 | "outputs": [],
322 | "source": [
323 | "model.zero_grad()"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": null,
329 | "metadata": {},
330 | "outputs": [],
331 | "source": [
332 | "print('fc1.bias before backward')\n",
333 | "print(model.fc1.bias.grad)"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {},
340 | "outputs": [],
341 | "source": [
342 | "loss.backward()"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": null,
348 | "metadata": {},
349 | "outputs": [],
350 | "source": [
351 | "print('fc1.bias after backward')\n",
352 | "print(model.fc1.bias.grad)"
353 | ]
354 | },
355 | {
356 | "cell_type": "markdown",
357 | "metadata": {},
358 | "source": [
359 | "### Parameter update"
360 | ]
361 | },
362 | {
363 | "cell_type": "markdown",
364 | "metadata": {},
365 | "source": [
366 | "Alongside the loss functions, PyTorch provides several different optimizers, ranging from the classical [Stochastic Gradient Descent](https://pytorch.org/docs/stable/optim.html#torch.optim.SGD) to [RMSprop](https://pytorch.org/docs/stable/optim.html#torch.optim.RMSprop) and [Adam](https://pytorch.org/docs/stable/optim.html#torch.optim.Adam)"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": null,
372 | "metadata": {},
373 | "outputs": [],
374 | "source": [
375 | "optimizer = torch.optim.Adam(model.parameters(), lr=0.001)"
376 | ]
377 | },
378 | {
379 | "cell_type": "markdown",
380 | "metadata": {},
381 | "source": [
382 | "In general, a training loop consists of the following parts:\n",
383 | "1. Clearing the gradients\n",
384 | "2. Obtaining inputs and targets, and, possibly, moving them to the GPU\n",
385 | "3. Performing the forward pass of the model\n",
386 | "4. Calculating the loss\n",
387 | "5. Performing the backward pass\n",
388 | "6. Updating the weights of the network"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": null,
394 | "metadata": {},
395 | "outputs": [],
396 | "source": [
397 | "optimizer.zero_grad()\n",
398 | "\n",
399 | "inputs = torch.rand(1, 4)\n",
400 | "targets = torch.rand(1, 3)\n",
401 | "\n",
402 | "outputs = model(inputs)\n",
403 | "\n",
404 | "loss = criterion(outputs, targets)\n",
405 | "\n",
406 | "loss.backward()\n",
407 | "optimizer.step()"
408 | ]
409 | },
410 | {
411 | "cell_type": "markdown",
412 | "metadata": {},
413 | "source": [
414 | "## Data loading"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {},
421 | "outputs": [],
422 | "source": [
423 | "iris_data = sklearn.datasets.load_iris()"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "metadata": {},
430 | "outputs": [],
431 | "source": [
432 | "iris_data.feature_names"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": null,
438 | "metadata": {},
439 | "outputs": [],
440 | "source": [
441 | "iris_data.data[:10,:]"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": null,
447 | "metadata": {},
448 | "outputs": [],
449 | "source": [
450 | "iris_data.target_names"
451 | ]
452 | },
453 | {
454 | "cell_type": "code",
455 | "execution_count": null,
456 | "metadata": {
457 | "scrolled": true
458 | },
459 | "outputs": [],
460 | "source": [
461 | "iris_data.target"
462 | ]
463 | },
464 | {
465 | "cell_type": "markdown",
466 | "metadata": {},
467 | "source": [
468 | "The [Dataset](https://pytorch.org/docs/stable/data.html?highlight=dataloader#torch.utils.data.Dataset) class provided by PyTorch is an abstract class representing any dataset used as input to a model. It is conveniently designed so that any class subclassing it only has to override the `__len__` and `__getitem__` methods. The goal of the `__getitem__` method is, given an index, to return the corresponding input data.\n",
469 | "\n",
470 | "You might find it useful to have a look at the official [Data Loading and Processing Tutorial](https://pytorch.org/tutorials/beginner/data_loading_tutorial.html) on the PyTorch website."
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": null,
476 | "metadata": {},
477 | "outputs": [],
478 | "source": [
479 | "class IrisDataset(torch.utils.data.Dataset):\n",
480 | "    \"\"\"A PyTorch dataset for the Scikit-learn Iris data\"\"\"\n",
481 | "    def __init__(self, data):\n",
482 | "        \"\"\"\n",
483 | "        Initialize the dataset class\n",
484 | "\n",
485 | "        :param data: Scikit-learn Iris data\n",
486 | "\n",
487 | "        \"\"\"\n",
488 | "        self.features_names = data.feature_names\n",
489 | "        self.target_names = data.target_names\n",
490 | "        self.X = data.data.astype(np.float32)\n",
491 | "        self.y = data.target\n",
492 | "\n",
493 | "    def __getitem__(self, index):\n",
494 | "        \"\"\"\n",
495 | "        Return the item by its index\n",
496 | "\n",
497 | "        :param index: index of the item\n",
498 | "\n",
499 | "        \"\"\"\n",
500 | "        X = self.X[index]\n",
501 | "        y = self.y[index]\n",
502 | "\n",
503 | "        return X, y\n",
504 | "\n",
505 | "    def __len__(self):\n",
506 | "        \"\"\" Return the length of the dataset \"\"\"\n",
507 | "        return len(self.y)"
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": null,
513 | "metadata": {},
514 | "outputs": [],
515 | "source": [
516 | "dataset = IrisDataset(iris_data)"
517 | ]
518 | },
519 | {
520 | "cell_type": "code",
521 | "execution_count": null,
522 | "metadata": {},
523 | "outputs": [],
524 | "source": [
525 | "len(dataset)"
526 | ]
527 | },
528 | {
529 | "cell_type": "code",
530 | "execution_count": null,
531 | "metadata": {},
532 | "outputs": [],
533 | "source": [
534 | "dataset[0]"
535 | ]
536 | },
537 | {
538 | "cell_type": "markdown",
539 | "metadata": {},
540 | "source": [
541 | "[DataLoader](https://pytorch.org/docs/stable/data.html?highlight=dataloader#torch.utils.data.DataLoader) is another useful PyTorch class that combines a dataset and a sampler, and provides single- or multi-process iterators over the dataset. The goal of the data loader is to create batches of training examples for the network by sampling the dataset and collating the sampled items together.\n",
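"\n",
"After the loader is created below, a quick way to peek at a single batch is shown in this sketch (the shapes assume the `IrisDataset` above with `batch_size=10`):\n",
"```python\n",
"X_batch, y_batch = next(iter(dataloader))\n",
"X_batch.shape, y_batch.shape  # (torch.Size([10, 4]), torch.Size([10]))\n",
"```"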
542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [ 550 | "dataloader = torch.utils.data.DataLoader(dataset, batch_size=10, shuffle=True)" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "len(dataloader)" 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": {}, 565 | "source": [ 566 | "## Training loop" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": null, 572 | "metadata": {}, 573 | "outputs": [], 574 | "source": [ 575 | "nb_features = dataset.X.shape[1]\n", 576 | "hidden_size = 32\n", 577 | "nb_classes = len(set(dataset.y))\n", 578 | "\n", 579 | "model = Net(nb_features, hidden_size, nb_classes)\n", 580 | "model = model.to('cuda')" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": {}, 587 | "outputs": [], 588 | "source": [ 589 | "model" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": null, 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [ 598 | "criterion = torch.nn.CrossEntropyLoss()\n", 599 | "optimizer = torch.optim.Adam(model.parameters(), lr=0.001)" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "metadata": {}, 606 | "outputs": [], 607 | "source": [ 608 | "nb_epochs = 9\n", 609 | "\n", 610 | "for i in range(nb_epochs):\n", 611 | " epoch_losses = []\n", 612 | " for X_batch, y_batch in dataloader:\n", 613 | " model.train()\n", 614 | " optimizer.zero_grad()\n", 615 | " \n", 616 | " X_batch = X_batch.to('cuda')\n", 617 | " y_batch = y_batch.to('cuda')\n", 618 | " \n", 619 | " logits = model(X_batch)\n", 620 | " loss = criterion(logits, y_batch)\n", 621 | " \n", 622 | " loss.backward()\n", 623 | " optimizer.step()\n", 624 | " \n", 625 | " epoch_losses.append(loss.item())\n", 626 | " \n", 627 | " epoch_loss = np.mean(epoch_losses)\n", 628 | " print(f'Epoch: {i+1}, loss: {epoch_loss:.3f}')" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": null, 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": null, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [] 644 | } 645 | ], 646 | "metadata": { 647 | "kernelspec": { 648 | "display_name": "Python 3", 649 | "language": "python", 650 | "name": "python3" 651 | }, 652 | "language_info": { 653 | "codemirror_mode": { 654 | "name": "ipython", 655 | "version": 3 656 | }, 657 | "file_extension": ".py", 658 | "mimetype": "text/x-python", 659 | "name": "python", 660 | "nbconvert_exporter": "python", 661 | "pygments_lexer": "ipython3", 662 | "version": "3.7.3" 663 | } 664 | }, 665 | "nbformat": 4, 666 | "nbformat_minor": 2 667 | } 668 | -------------------------------------------------------------------------------- /tutorials/pytorch_track/tutorial2_sentnece_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import csv\n", 11 | "import shutil\n", 12 | "import zipfile\n", 13 | "import pickle\n", 14 | "import itertools\n", 15 | "import urllib.parse\n", 16 | "import urllib.request\n", 17 | "from collections import Counter\n", 18 | "\n", 19 | "import numpy as np\n", 
20 | "import torch\n", 21 | "import torch.utils.data\n", 22 | "import sklearn.datasets\n", 23 | "from sklearn.metrics import accuracy_score" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Introduction" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "In this turorial, we will build a simple neural network for sentence classification using word embeddings. The model simply sums up the embeddings of the tokens in the sentence and pass it through several fully connected layers." 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Dataset" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "We will use the [Stanford Sentiment Treebank](https://nlp.stanford.edu/sentiment/index.html) dataset, converted into a two-way classification problem, where the goal is given an input sentence to determine is it positive or negative." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "def maybe_download_and_unzip_file(file_url, file_name=None):\n", 61 | " \"\"\"\n", 62 | " Download and unzip a remote archive if it does not exists yet\n", 63 | "\n", 64 | " :param file_url: Url of the archive\n", 65 | " :param file_name: (Default value = None) The filename to save the content\n", 66 | "\n", 67 | " \"\"\"\n", 68 | " if file_name is None:\n", 69 | " file_name = os.path.basename(file_url)\n", 70 | " \n", 71 | " if not os.path.exists(file_name):\n", 72 | " print(f'Downloading: {file_name}')\n", 73 | " \n", 74 | " with urllib.request.urlopen(file_url) as response, open(file_name, 'wb') as target_file:\n", 75 | " shutil.copyfileobj(response, target_file)\n", 76 | "\n", 77 | " print(f'Downloaded: {file_name}')\n", 78 | " \n", 79 | " if os.path.splitext(file_name)[1] == '.zip':\n", 80 | " print(f'Extracting: {file_name}')\n", 81 | " with zipfile.ZipFile(file_name, 'r') as zip_file:\n", 82 | " zip_file.extractall('.')\n", 83 | " \n", 84 | " else:\n", 85 | " print(f'Exists: {file_name}')" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "dataset_url = 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8'\n", 95 | "dataset_filename = 'SST-2.zip'" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "train_filename = 'SST-2/train.tsv'\n", 105 | "val_filename = 'SST-2/dev.tsv'" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "maybe_download_and_unzip_file(dataset_url, dataset_filename)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "### Vocabulary\n", 122 | "\n", 123 | "Before the data gets loaded into the model, it has to be converted from raw text to a numeric representation. One way to achieve this is to introduce a token-to-id mapping. More specifically, we will use a vocabulary class that maintains the mapping between tokens and their IDs, and that is able to flexibly add tokens and prune the vocabulary based on the token counts. 
When the input dataset is very large, vocabulary pruning is widely used in practice for more efficient memory usage." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "class Vocab(object):\n", 133 | " \"\"\" Vocabulary class to provide token-to-id correspondence \"\"\"\n", 134 | " END_TOKEN = '</s>'\n", 135 | " START_TOKEN = '<s>'\n", 136 | " PAD_TOKEN = '<pad>'\n", 137 | " UNK_TOKEN = '<unk>'\n", 138 | "\n", 139 | " def __init__(self, special_tokens=None):\n", 140 | " \"\"\"\n", 141 | " Initialize the vocabulary class\n", 142 | "\n", 143 | " :param special_tokens: (Default value = None) A list of special tokens. The PAD token should be the first in the list, if used.\n", 144 | "\n", 145 | " \"\"\"\n", 146 | " super().__init__()\n", 147 | "\n", 148 | " self.special_tokens = special_tokens\n", 149 | "\n", 150 | " self.token2id = {}\n", 151 | " self.id2token = {}\n", 152 | "\n", 153 | " self.token_counts = Counter()\n", 154 | "\n", 155 | " if self.special_tokens is not None:\n", 156 | " self.add_document(self.special_tokens)\n", 157 | "\n", 158 | " def add_document(self, document, rebuild=True):\n", 159 | " \"\"\"\n", 160 | " Process the document and add its tokens to the vocabulary\n", 161 | "\n", 162 | " :param document: A list of tokens in the document\n", 163 | " :param rebuild: (Default value = True) Whether to rebuild the token2id correspondence or not\n", 164 | "\n", 165 | " \"\"\"\n", 166 | " for token in document:\n", 167 | " self.token_counts[token] += 1\n", 168 | "\n", 169 | " if token not in self.token2id:\n", 170 | " self.token2id[token] = len(self.token2id)\n", 171 | "\n", 172 | " if rebuild:\n", 173 | " self._rebuild_id2token()\n", 174 | "\n", 175 | " def add_documents(self, documents):\n", 176 | " \"\"\"\n", 177 | " Process a list of documents and add their tokens to the vocabulary\n", 178 | "\n", 179 | " :param documents: A list of documents, where each document is a list of tokens\n", 180 | "\n", 181 | " \"\"\"\n", 182 | " for doc in documents:\n", 183 | " self.add_document(doc, rebuild=False)\n", 184 | "\n", 185 | " self._rebuild_id2token()\n", 186 | "\n", 187 | " def _rebuild_id2token(self):\n", 188 | " \"\"\" Rebuild the id-to-token correspondence \"\"\"\n", 189 | " self.id2token = {i: t for t, i in self.token2id.items()}\n", 190 | "\n", 191 | " def get(self, item, default=None):\n", 192 | " \"\"\"\n", 193 | " Given a token, return the corresponding id\n", 194 | "\n", 195 | " :param item: A token\n", 196 | " :param default: (Default value = None) Default value to return if the token is not present in the vocabulary\n", 197 | "\n", 198 | " \"\"\"\n", 199 | " return self.token2id.get(item, default)\n", 200 | "\n", 201 | " def __getitem__(self, item):\n", 202 | " \"\"\"\n", 203 | " Given a token, return the corresponding id\n", 204 | "\n", 205 | " :param item: A token\n", 206 | "\n", 207 | " \"\"\"\n", 208 | " return self.token2id[item]\n", 209 | "\n", 210 | " def __contains__(self, item):\n", 211 | " \"\"\"\n", 212 | " Check if a token is present in the vocabulary\n", 213 | "\n", 214 | " :param item: A token\n", 215 | "\n", 216 | " \"\"\"\n", 217 | " return item in self.token2id\n", 218 | "\n", 219 | " def __len__(self):\n", 220 | " \"\"\" Return the length of the vocabulary \"\"\"\n", 221 | " return len(self.token2id)\n", 222 | "\n", 223 | " def __str__(self):\n", 224 | " \"\"\" Get a string representation of the vocabulary \"\"\"\n", 225 | " return f'{len(self)} tokens'" 226 | ]
227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "Now, let's create a dataset class. Notice how the vocabulary can be shared between the train and the test datasets." 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "class SSTDataset(torch.utils.data.Dataset):\n", 242 | " \"\"\" A PyTorch dataset for the SST-2 sentence classification task \"\"\"\n", 243 | " def __init__(self, filename, vocab=None, max_len=None):\n", 244 | " \"\"\"\n", 245 | " Initialize the Stanford Sentiment Treebank Dataset\n", 246 | "\n", 247 | " :param filename: Path to the dataset from the GLUE benchmark\n", 248 | " :param vocab: (Default value = None) Vocabulary to use, will be created if None\n", 249 | " :param max_len: (Default value = None) Maximum length of the sentences; longer sentences will be truncated\n", 250 | "\n", 251 | " \"\"\"\n", 252 | " super().__init__()\n", 253 | " \n", 254 | " data = self._load_file(filename)\n", 255 | " \n", 256 | " self.sentences = [sent.split(' ') for sent, label in data]\n", 257 | " self.labels = [int(label) for sent, label in data]\n", 258 | " \n", 259 | " print(f'Sentences: {len(self.sentences)}')\n", 260 | " print(f'Labels: {len(self.labels)}')\n", 261 | " \n", 262 | " if vocab is None: \n", 263 | " vocab = Vocab(special_tokens=[Vocab.PAD_TOKEN, Vocab.UNK_TOKEN])\n", 264 | " vocab.add_documents(self.sentences)\n", 265 | " print(f'Creating vocab: {vocab}')\n", 266 | " \n", 267 | " if max_len is None:\n", 268 | " max_len = max(len(s) for s in self.sentences)\n", 269 | " print(f'Calculating max len: {max_len}')\n", 270 | " \n", 271 | " self.max_len = max_len\n", 272 | " self.vocab = vocab\n", 273 | " \n", 274 | " def _load_file(self, filename):\n", 275 | " \"\"\"\n", 276 | " Read the dataset from the file\n", 277 | "\n", 278 | " :param filename: Path to the dataset\n", 279 | "\n", 280 | " \"\"\"\n", 281 | " with open(filename, 'r') as csv_file:\n", 282 | " reader = csv.DictReader(csv_file, delimiter='\\t')\n", 283 | " data = [(r['sentence'].strip(), r['label']) for r in reader]\n", 284 | " \n", 285 | " return data\n", 286 | " \n", 287 | " def _pad_sentence(self, sent):\n", 288 | " \"\"\"\n", 289 | " Cut the sentence if needed and pad it to the maximum length\n", 290 | "\n", 291 | " :param sent: The input sentence\n", 292 | "\n", 293 | " \"\"\"\n", 294 | " sent = sent[:self.max_len]\n", 295 | " \n", 296 | " nb_pad = self.max_len - len(sent)\n", 297 | " sent = sent + [Vocab.PAD_TOKEN,] * nb_pad\n", 298 | " \n", 299 | " return sent\n", 300 | " \n", 301 | " def __getitem__(self, index):\n", 302 | " \"\"\"\n", 303 | " Return a processed, ready-to-be-batched item from the dataset by its index\n", 304 | "\n", 305 | " :param index: The index of the sentence in the dataset\n", 306 | "\n", 307 | " \"\"\"\n", 308 | " sent = self.sentences[index]\n", 309 | " label = self.labels[index]\n", 310 | " \n", 311 | " sent = self._pad_sentence(sent)\n", 312 | " sent = [self.vocab[t] if t in self.vocab else self.vocab[Vocab.UNK_TOKEN] for t in sent]\n", 313 | " sent = np.array(sent, dtype=np.long)\n", 314 | " \n", 315 | " return sent, label\n", 316 | " \n", 317 | " def __len__(self):\n", 318 | " \"\"\" Return the length of the dataset \"\"\"\n", 319 | " return len(self.labels)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "dataset_train = SSTDataset(train_filename)" 329 | ] 330 | },
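{ "cell_type": "markdown", "metadata": {}, "source": [ "As a quick sanity check, we can query the freshly built vocabulary directly (illustrative: the exact ids depend on the order in which tokens were added, but the PAD token gets id 0 because it is the first special token, and `get` falls back to the UNK id if a token is missing):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(dataset_train.vocab)\n", "print(dataset_train.vocab[Vocab.PAD_TOKEN])\n", "print(dataset_train.vocab.get('movie', dataset_train.vocab[Vocab.UNK_TOKEN]))" ] },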
331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "dataset_val = SSTDataset(val_filename, vocab=dataset_train.vocab, max_len=dataset_train.max_len)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "dataset_train[0]" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "## Word embeddings" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "We'll use the [fastText](https://fasttext.cc/) embeddings, trained on Common Crawl. We've converted them into a dictionary and pickled them using the standard `pickle` module." 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "embeddings_url = 'https://mednli.blob.core.windows.net/shared/word_embeddings/crawl-300d-2M.pickled'\n", 370 | "embeddings_filename = 'crawl-300d-2M.pickled'" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "scrolled": true 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "maybe_download_and_unzip_file(embeddings_url, embeddings_filename)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "with open(embeddings_filename, 'rb') as pkl_file:\n", 391 | " word_embeddings = pickle.load(pkl_file)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "print(f'Word embeddings: {len(word_embeddings)} tokens, shape {word_embeddings[list(word_embeddings.keys())[0]].shape}')" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "list(word_embeddings.keys())[:10]" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "word_embeddings['cat'].shape" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "word_embeddings['cat'][:20]" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "### Embedding matrix" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "Since we do not need all the embeddings, let's create a matrix, where each row will correspond to a token in the vocabulary and will contain the corresponding embedding."
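, "\n", "\n", "Concretely, row `i` of the matrix will hold the embedding of `vocab.id2token[i]`: the PAD row is all zeros, tokens missing from fastText get a small random vector, and every other row is copied straight from the dictionary. For example (a sketch, assuming `'cat'` occurs in the training sentences):\n", "\n", "```python\n", "i = dataset_train.vocab['cat']\n", "W_emb[i]  # == word_embeddings['cat'], a 300-dimensional float32 vector\n", "```"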
442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "def create_embeddings_matrix(word_embeddings, vocab):\n", 451 | " \"\"\"\n", 452 | " Given a word embeddings dictionary and the vocabulary, construct the embeddings matrix, where each row corresponds to a token and contains the embedding of this token\n", 453 | "\n", 454 | " :param word_embeddings: Word embeddings dictionary, token -> numpy array\n", 455 | " :param vocab: Vocabulary\n", 456 | "\n", 457 | " \"\"\"\n", 458 | " embedding_size = word_embeddings[list(word_embeddings.keys())[0]].shape[0]\n", 459 | "\n", 460 | " W_emb = np.zeros((len(vocab), embedding_size), dtype=np.float32)\n", 461 | " \n", 462 | " special_tokens = {\n", 463 | " t: np.random.uniform(-0.3, 0.3, (embedding_size,))\n", 464 | " for t in (Vocab.UNK_TOKEN, )\n", 465 | " }\n", 466 | " special_tokens[Vocab.PAD_TOKEN] = np.zeros((embedding_size,))\n", 467 | "\n", 468 | " nb_unk = 0\n", 469 | " for i, t in vocab.id2token.items():\n", 470 | " if t in special_tokens:\n", 471 | " W_emb[i] = special_tokens[t]\n", 472 | " else:\n", 473 | " if t in word_embeddings:\n", 474 | " W_emb[i] = word_embeddings[t]\n", 475 | " else:\n", 476 | " W_emb[i] = np.random.uniform(-0.3, 0.3, embedding_size)\n", 477 | " nb_unk += 1\n", 478 | "\n", 479 | " print(f'Nb unk: {nb_unk}')\n", 480 | "\n", 481 | " return W_emb" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [ 490 | "len(dataset_train.vocab)" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "W_emb = create_embeddings_matrix(word_embeddings, dataset_train.vocab)" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "## Model" 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": {}, 512 | "source": [ 513 | "Finally, let's declare a simple model. Notice how we put fully connected layers inside a `torch.nn.Sequential` container."
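, "\n", "\n", "The \"bag-of-words\" part of the model is the pooling step in `forward`: the token embeddings are summed and divided by the number of real (non-PAD) tokens, i.e. averaged, before going into the classifier. Since the PAD rows of the embedding matrix are zeros, they do not affect the sum. A sketch of that step, with the same names as in the `forward` method below:\n", "\n", "```python\n", "embedded = self.embedding(inputs)  # (batch, seq_len, emb_size)\n", "inputs_lengths = torch.sum(inputs != self.pad_index, dim=1).long()\n", "z = torch.sum(embedded, dim=1) / inputs_lengths.unsqueeze(-1).float()  # (batch, emb_size)\n", "```"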
514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [ 522 | "class BOWModel(torch.nn.Module):\n", 523 | " \"\"\" A simple bag-of-words sentence classifier \"\"\"\n", 524 | " def __init__(self, vocab_size, embedding_size, hidden_size, dropout, trainable_embeddings, nb_classes, pad_index, W_emb=None):\n", 525 | " \"\"\"\n", 526 | " Initialize a simple feedforward Bag-of-words model with several hidden layers\n", 527 | "\n", 528 | " :param vocab_size: Vocabulary size\n", 529 | " :param embedding_size: Dimension of the embeddings\n", 530 | " :param hidden_size: The size of the hidden layers\n", 531 | " :param dropout: Dropout probability\n", 532 | " :param trainable_embeddings: Whether the embedding layer will be trainable or frozen\n", 533 | " :param nb_classes: Number of the classes to classify the input to\n", 534 | " :param pad_index: Index of the PAD token\n", 535 | " :param W_emb: (Default value = None) Initial values of the embedding layer, a numpy array\n", 536 | "\n", 537 | " \"\"\"\n", 538 | " super().__init__()\n", 539 | "\n", 540 | " self.pad_index = pad_index\n", 541 | " \n", 542 | " self.embedding = torch.nn.Embedding(vocab_size, embedding_size, padding_idx=pad_index)\n", 543 | " if W_emb is not None:\n", 544 | " self.embedding.weight.data.copy_(torch.from_numpy(W_emb))\n", 545 | " if not trainable_embeddings:\n", 546 | " self.embedding.weight.requires_grad = False\n", 547 | "\n", 548 | " self.classifier = torch.nn.Sequential(\n", 549 | " torch.nn.Linear(embedding_size, hidden_size),\n", 550 | " torch.nn.ReLU(),\n", 551 | " torch.nn.Dropout(dropout),\n", 552 | " torch.nn.Linear(hidden_size, hidden_size),\n", 553 | " torch.nn.ReLU(),\n", 554 | " torch.nn.Dropout(dropout),\n", 555 | " torch.nn.Linear(hidden_size, nb_classes),\n", 556 | " )\n", 557 | "\n", 558 | " \n", 559 | " def forward(self, inputs):\n", 560 | " \"\"\"\n", 561 | " Perform the forward pass of the model\n", 562 | "\n", 563 | " :param inputs: Input sentences, as a batch of token-id sequences\n", 564 | "\n", 565 | " \"\"\"\n", 566 | " embedded = self.embedding(inputs)\n", 567 | " inputs_lengths = torch.sum(inputs != self.pad_index, dim=1).long()\n", 568 | " \n", 569 | " z = torch.sum(embedded, dim=1) / inputs_lengths.unsqueeze(-1).float()\n", 570 | " \n", 571 | " logits = self.classifier(z)\n", 572 | " \n", 573 | " return logits" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "hidden_size = 128\n", 583 | "dropout = 0.3\n", 584 | "trainable_embeddings = False" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "model = BOWModel(\n", 594 | " vocab_size=len(dataset_train.vocab), \n", 595 | " embedding_size=W_emb.shape[1], \n", 596 | " hidden_size=hidden_size, \n", 597 | " dropout=dropout, \n", 598 | " trainable_embeddings=trainable_embeddings, \n", 599 | " nb_classes=len(set(dataset_train.labels)), \n", 600 | " pad_index=dataset_train.vocab[Vocab.PAD_TOKEN], \n", 601 | " W_emb=W_emb\n", 602 | ")" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [ 611 | "model = model.to('cuda')" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": null, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [ 620 | "model" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata":
{}, 626 | "source": [ 627 | "## Training" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": null, 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "batch_size=256\n", 637 | "nb_epochs = 5\n", 638 | "learning_rate=0.001\n", 639 | "weight_decay = 0.00001" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [ 648 | "dataloader_train = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, shuffle=True)\n", 649 | "dataloader_val = torch.utils.data.DataLoader(dataset_val, batch_size=batch_size, shuffle=False)" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": null, 655 | "metadata": {}, 656 | "outputs": [], 657 | "source": [ 658 | "criterion = torch.nn.CrossEntropyLoss()\n", 659 | "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)" 660 | ] 661 | }, 662 | { 663 | "cell_type": "markdown", 664 | "metadata": {}, 665 | "source": [ 666 | "Run the training!" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": null, 672 | "metadata": {}, 673 | "outputs": [], 674 | "source": [ 675 | "for i in range(nb_epochs):\n", 676 | " epoch_losses_train = []\n", 677 | " epoch_losses_val = []\n", 678 | " epoch_predictions = []\n", 679 | " epoch_targets = []\n", 680 | " \n", 681 | " for inputs, targets in dataloader_train:\n", 682 | " model.train()\n", 683 | " optimizer.zero_grad()\n", 684 | " \n", 685 | " inputs = inputs.to('cuda')\n", 686 | " targets = targets.to('cuda')\n", 687 | " \n", 688 | " logits = model(inputs)\n", 689 | " loss = criterion(logits, targets)\n", 690 | " \n", 691 | " loss.backward()\n", 692 | " optimizer.step()\n", 693 | " \n", 694 | " epoch_losses_train.append(loss.item())\n", 695 | "\n", 696 | " # calc accuracy on the dev set\n", 697 | " for inputs, targets in dataloader_val:\n", 698 | " model.eval()\n", 699 | " \n", 700 | " with torch.no_grad():\n", 701 | " inputs = inputs.to('cuda')\n", 702 | " targets = targets.to('cuda')\n", 703 | "\n", 704 | " logits = model(inputs)\n", 705 | " loss = criterion(logits, targets)\n", 706 | " pred = torch.argmax(logits, dim=1)\n", 707 | "\n", 708 | " epoch_losses_val.append(loss.item())\n", 709 | " epoch_predictions.append(pred.cpu().numpy())\n", 710 | " epoch_targets.append(targets.cpu().numpy())\n", 711 | " \n", 712 | " epoch_predictions = np.concatenate(epoch_predictions, axis=0)\n", 713 | " epoch_targets = np.concatenate(epoch_targets, axis=0)\n", 714 | " epoch_accuracy = accuracy_score(epoch_targets, epoch_predictions)\n", 715 | " epoch_loss_train = np.mean(epoch_losses_train)\n", 716 | " epoch_loss_val = np.mean(epoch_losses_val) \n", 717 | " \n", 718 | " print(f'Epoch: {i+1}, train loss: {epoch_loss_train:.3f}, val loss: {epoch_loss_val:.3f}, accuracy: {epoch_accuracy:.3f}')" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": null, 724 | "metadata": {}, 725 | "outputs": [], 726 | "source": [] 727 | } 728 | ], 729 | "metadata": { 730 | "kernelspec": { 731 | "display_name": "Python 3", 732 | "language": "python", 733 | "name": "python3" 734 | }, 735 | "language_info": { 736 | "codemirror_mode": { 737 | "name": "ipython", 738 | "version": 3 739 | }, 740 | "file_extension": ".py", 741 | "mimetype": "text/x-python", 742 | "name": "python", 743 | "nbconvert_exporter": "python", 744 | "pygments_lexer": "ipython3", 745 | "version": "3.7.3" 746 | } 747 | }, 748 | "nbformat": 4, 749 | 
"nbformat_minor": 2 750 | } 751 | -------------------------------------------------------------------------------- /tutorials/pytorch_track/tutorial4_finetuning_bert.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import csv\n", 10 | "import logging\n", 11 | "import os\n", 12 | "import random\n", 13 | "import sys\n", 14 | "\n", 15 | "import numpy as np\n", 16 | "import torch\n", 17 | "from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset\n", 18 | "from torch.utils.data.distributed import DistributedSampler\n", 19 | "from tqdm import tqdm, trange, tqdm_notebook\n", 20 | "\n", 21 | "from torch.nn import CrossEntropyLoss\n", 22 | "from sklearn.metrics import f1_score\n", 23 | "\n", 24 | "from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME\n", 25 | "from pytorch_pretrained_bert.modeling import BertModel, BertForMaskedLM, BertForSequenceClassification, BertConfig\n", 26 | "from pytorch_pretrained_bert.tokenization import BertTokenizer\n", 27 | "from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Introduction" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "In this turorial, we will fine-tune the famous BERT model to solve specific tasks, such as classification, question answering, and NER\n", 42 | "\n", 43 | "\n", 44 | "This turorial uses a BERT implementation by [Hugging Face](https://huggingface.co/) and is based on an example from [the GitHub repository](https://github.com/huggingface/pytorch-pretrained-BERT) which can be installed by running `pip install https://github.com/huggingface/pytorch-pretrained-BERT/releases/download/v0.6.2/pytorch_pretrained_bert-0.6.2-py3-none-any.whl`" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## BERT model" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "Let's see the BERT in action on a task of masked language modelling." 
59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "First, we need to choose which model we are going to use:\n", 66 | "\n", 67 | "- `bert-base-uncased`: 12-layer, 768-hidden, 12-heads, 110M parameters\n", 68 | "- `bert-large-uncased`: 24-layer, 1024-hidden, 16-heads, 340M parameters\n", 69 | "- `bert-base-cased`: 12-layer, 768-hidden, 12-heads, 110M parameters\n", 70 | "- `bert-large-cased`: 24-layer, 1024-hidden, 16-heads, 340M parameters\n", 71 | "- `bert-base-multilingual-uncased`: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters\n", 72 | "- `bert-base-multilingual-cased`: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "bert_model = 'bert-base-uncased'" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# Load pre-trained model tokenizer (vocabulary)\n", 91 | "tokenizer = BertTokenizer.from_pretrained(bert_model)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "text = '[CLS] This summer school is great ! [SEP] I love it ! [SEP]'" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "tokenized_text = tokenizer.tokenize(text)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "Let's inspect the tokenized text. Note that WordPiece tokenization can split rare words into multiple subword tokens." 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "tokenized_text" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# Mask a token that we will try to predict back with `BertForMaskedLM`\n", 135 | "masked_index = 5\n", 136 | "tokenized_text[masked_index] = '[MASK]'" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "tokenized_text" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "# Convert tokens to vocabulary indices\n", 155 | "indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n", 156 | "# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)\n", 157 | "segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "indexed_tokens" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# Convert inputs to PyTorch tensors\n", 176 | "tokens_tensor = torch.tensor([indexed_tokens])\n", 177 | "segments_tensors = torch.tensor([segments_ids])" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "tokens_tensor" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192
| "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "segments_tensors" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "# Load pre-trained model (weights)\n", 205 | "model = BertForMaskedLM.from_pretrained(bert_model)\n", 206 | "model.eval();" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# If you have a GPU, put everything on cuda\n", 216 | "tokens_tensor = tokens_tensor.to('cuda')\n", 217 | "segments_tensors = segments_tensors.to('cuda')\n", 218 | "model = model.to('cuda')" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "# Predict all tokens\n", 228 | "with torch.no_grad():\n", 229 | " predictions = model(tokens_tensor, segments_tensors)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "predictions.shape" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "Get the prediction - it should be 'henson'" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "# confirm we were able to predict 'henson'\n", 255 | "predicted_index = torch.argmax(predictions[0, masked_index]).item()\n", 256 | "predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "predicted_token" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "## Fine-tuning BERT" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "Now, let's fine-tune BERT on a classification task. We are going to use the [Microsoft Research Paraphrase Corpus](https://www.microsoft.com/en-us/download/details.aspx?id=52398), reformatted for the GLUE benchmark, where the goal is given two sentneces, predict whether they are a paraphrase or not. [This script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) helps with the formatting of the MRPC dataset." 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "class InputExample(object):\n", 289 | " \"\"\"A single training/test example for simple sequence classification.\"\"\"\n", 290 | " def __init__(self, guid, text_a, text_b=None, label=None):\n", 291 | " \"\"\"Constructs a InputExample.\n", 292 | " Args:\n", 293 | " guid: Unique id for the example.\n", 294 | " text_a: string. The untokenized text of the first sequence. For single sequence tasks, only this sequence must be specified.\n", 295 | " text_b: (Optional) string. The untokenized text of the second sequence. Only must be specified for sequence pair tasks.\n", 296 | " label: (Optional) string. The label of the example. 
This should be specified for train and dev examples, but not for test examples.\n", 297 | " \"\"\" \n", 298 | " self.guid = guid\n", 299 | " self.text_a = text_a\n", 300 | " self.text_b = text_b\n", 301 | " self.label = label" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "class InputFeatures(object):\n", 311 | " \"\"\"A single set of features of data, ready to be used by the model\"\"\"\n", 312 | " def __init__(self, input_ids, input_mask, segment_ids, label_id):\n", 313 | " self.input_ids = input_ids\n", 314 | " self.input_mask = input_mask\n", 315 | " self.segment_ids = segment_ids\n", 316 | " self.label_id = label_id" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "class DataProcessor(object):\n", 326 | " \"\"\"Base class for data converters for sequence classification data sets.\"\"\"\n", 327 | "\n", 328 | " def get_train_examples(self, data_dir):\n", 329 | " \"\"\"Gets a collection of `InputExample`s for the train set.\"\"\"\n", 330 | " raise NotImplementedError()\n", 331 | "\n", 332 | " def get_dev_examples(self, data_dir):\n", 333 | " \"\"\"Gets a collection of `InputExample`s for the dev set.\"\"\"\n", 334 | " raise NotImplementedError()\n", 335 | "\n", 336 | " def get_labels(self):\n", 337 | " \"\"\"Gets the list of labels for this data set.\"\"\"\n", 338 | " raise NotImplementedError()\n", 339 | "\n", 340 | " @classmethod\n", 341 | " def _read_tsv(cls, input_file, quotechar=None):\n", 342 | " \"\"\"Reads a tab separated value file.\"\"\"\n", 343 | " with open(input_file, \"r\", encoding=\"utf-8\") as f:\n", 344 | " reader = csv.reader(f, delimiter=\"\\t\", quotechar=quotechar)\n", 345 | " lines = []\n", 346 | " for line in reader:\n", 347 | " if sys.version_info[0] == 2:\n", 348 | " line = list(unicode(cell, 'utf-8') for cell in line)\n", 349 | " lines.append(line)\n", 350 | " \n", 351 | " return lines" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "class MrpcProcessor(DataProcessor):\n", 361 | " \"\"\"Processor for the MRPC data set (GLUE version).\"\"\"\n", 362 | "\n", 363 | " def get_train_examples(self, data_dir):\n", 364 | " \"\"\"See base class.\"\"\"\n", 365 | " print(\"LOOKING AT {}\".format(os.path.join(data_dir, \"train.tsv\")))\n", 366 | " return self._create_examples(self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n", 367 | "\n", 368 | " def get_dev_examples(self, data_dir):\n", 369 | " \"\"\"See base class.\"\"\"\n", 370 | " return self._create_examples(self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n", 371 | "\n", 372 | " def get_labels(self):\n", 373 | " \"\"\"See base class.\"\"\"\n", 374 | " return [\"0\", \"1\"]\n", 375 | "\n", 376 | " def _create_examples(self, lines, set_type):\n", 377 | " \"\"\"Creates examples for the training and dev sets.\"\"\"\n", 378 | " examples = []\n", 379 | " for (i, line) in enumerate(lines):\n", 380 | " if i == 0:\n", 381 | " continue\n", 382 | " guid = \"%s-%s\" % (set_type, i)\n", 383 | " text_a = line[3]\n", 384 | " text_b = line[4]\n", 385 | " label = line[0]\n", 386 | " examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n", 387 | " \n", 388 | " return examples" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | 
"metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):\n", 398 | " \"\"\"Loads a data file into a list of `InputBatch`s.\"\"\"\n", 399 | "\n", 400 | " label_map = {label : i for i, label in enumerate(label_list)}\n", 401 | "\n", 402 | " features = []\n", 403 | " for (ex_index, example) in enumerate(examples):\n", 404 | " if ex_index % 10000 == 0:\n", 405 | " print(\"Creating example %d of %d\" % (ex_index, len(examples)))\n", 406 | "\n", 407 | " tokens_a = tokenizer.tokenize(example.text_a)\n", 408 | "\n", 409 | " tokens_b = None\n", 410 | " if example.text_b:\n", 411 | " tokens_b = tokenizer.tokenize(example.text_b)\n", 412 | " # Modifies `tokens_a` and `tokens_b` in place so that the total\n", 413 | " # length is less than the specified length.\n", 414 | " # Account for [CLS], [SEP], [SEP] with \"- 3\"\n", 415 | " _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)\n", 416 | " else:\n", 417 | " # Account for [CLS] and [SEP] with \"- 2\"\n", 418 | " if len(tokens_a) > max_seq_length - 2:\n", 419 | " tokens_a = tokens_a[:(max_seq_length - 2)]\n", 420 | "\n", 421 | " # The convention in BERT is:\n", 422 | " # (a) For sequence pairs:\n", 423 | " # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n", 424 | " # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1\n", 425 | " # (b) For single sequences:\n", 426 | " # tokens: [CLS] the dog is hairy . [SEP]\n", 427 | " # type_ids: 0 0 0 0 0 0 0\n", 428 | " #\n", 429 | " # Where \"type_ids\" are used to indicate whether this is the first\n", 430 | " # sequence or the second sequence. The embedding vectors for `type=0` and\n", 431 | " # `type=1` were learned during pre-training and are added to the wordpiece\n", 432 | " # embedding vector (and position vector). This is not *strictly* necessary\n", 433 | " # since the [SEP] token unambiguously separates the sequences, but it makes\n", 434 | " # it easier for the model to learn the concept of sequences.\n", 435 | " #\n", 436 | " # For classification tasks, the first vector (corresponding to [CLS]) is\n", 437 | " # used as as the \"sentence vector\". Note that this only makes sense because\n", 438 | " # the entire model is fine-tuned.\n", 439 | " tokens = [\"[CLS]\"] + tokens_a + [\"[SEP]\"]\n", 440 | " segment_ids = [0] * len(tokens)\n", 441 | "\n", 442 | " if tokens_b:\n", 443 | " tokens += tokens_b + [\"[SEP]\"]\n", 444 | " segment_ids += [1] * (len(tokens_b) + 1)\n", 445 | "\n", 446 | " input_ids = tokenizer.convert_tokens_to_ids(tokens)\n", 447 | "\n", 448 | " # The mask has 1 for real tokens and 0 for padding tokens. 
Only real\n", 449 | " # tokens are attended to.\n", 450 | " input_mask = [1] * len(input_ids)\n", 451 | "\n", 452 | " # Zero-pad up to the sequence length.\n", 453 | " padding = [0] * (max_seq_length - len(input_ids))\n", 454 | " input_ids += padding\n", 455 | " input_mask += padding\n", 456 | " segment_ids += padding\n", 457 | "\n", 458 | " assert len(input_ids) == max_seq_length\n", 459 | " assert len(input_mask) == max_seq_length\n", 460 | " assert len(segment_ids) == max_seq_length\n", 461 | "\n", 462 | " label_id = label_map[example.label]\n", 463 | "\n", 464 | " if ex_index < 1:\n", 465 | " print(\"*** Example ***\")\n", 466 | " print(\"guid: %s\" % (example.guid))\n", 467 | " print(\"tokens: %s\" % \" \".join([str(x) for x in tokens]))\n", 468 | " print(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n", 469 | " print(\"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n", 470 | " print(\"segment_ids: %s\" % \" \".join([str(x) for x in segment_ids]))\n", 471 | " print(\"label: %s (id = %d)\" % (example.label, label_id))\n", 472 | "\n", 473 | " features.append(InputFeatures(\n", 474 | " input_ids=input_ids,\n", 475 | " input_mask=input_mask,\n", 476 | " segment_ids=segment_ids,\n", 477 | " label_id=label_id\n", 478 | " ))\n", 479 | " \n", 480 | " return features" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [ 489 | "def _truncate_seq_pair(tokens_a, tokens_b, max_length):\n", 490 | " \"\"\"Truncates a sequence pair in place to the maximum length.\"\"\"\n", 491 | "\n", 492 | " # This is a simple heuristic which will always truncate the longer sequence\n", 493 | " # one token at a time. This makes more sense than truncating an equal percent\n", 494 | " # of tokens from each, since if one sequence is very short then each token\n", 495 | " # that's truncated likely contains more information than a longer sequence.\n", 496 | " while True:\n", 497 | " total_length = len(tokens_a) + len(tokens_b)\n", 498 | " if total_length <= max_length:\n", 499 | " break\n", 500 | " if len(tokens_a) > len(tokens_b):\n", 501 | " tokens_a.pop()\n", 502 | " else:\n", 503 | " tokens_b.pop()" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "def simple_accuracy(preds, labels):\n", 513 | " return (preds == labels).mean()\n", 514 | "\n", 515 | "\n", 516 | "def acc_and_f1(preds, labels):\n", 517 | " acc = simple_accuracy(preds, labels)\n", 518 | " f1 = f1_score(y_true=labels, y_pred=preds)\n", 519 | " return {\n", 520 | " \"acc\": acc,\n", 521 | " \"f1\": f1,\n", 522 | " \"acc_and_f1\": (acc + f1) / 2,\n", 523 | " }" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "### Parameters" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "data_dir = '/home/aromanov/projects/bert_explained/data/datasets/glue/mrpc/'\n", 540 | "bert_model = 'bert-base-uncased'\n", 541 | "task_name = 'mrpc'\n", 542 | "output_dir = 'tmp/mrpc/'\n", 543 | "max_seq_length = 128\n", 544 | "do_lower_case = True\n", 545 | "train_batch_size = 16\n", 546 | "eval_batch_size = 64\n", 547 | "learning_rate = 5e-5\n", 548 | "num_train_epochs = 1\n", 549 | "warmup_proportion = 0.1" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": {}, 556 
| "outputs": [], 557 | "source": [ 558 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": {}, 565 | "outputs": [], 566 | "source": [ 567 | "print(f\"device: {device}\")" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": null, 573 | "metadata": {}, 574 | "outputs": [], 575 | "source": [ 576 | "if not os.path.exists(output_dir):\n", 577 | " os.makedirs(output_dir)" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [ 586 | "processor = MrpcProcessor()" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "label_list = processor.get_labels()\n", 596 | "num_labels = len(label_list)" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": null, 602 | "metadata": {}, 603 | "outputs": [], 604 | "source": [ 605 | "tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": null, 611 | "metadata": {}, 612 | "outputs": [], 613 | "source": [ 614 | "train_examples = None\n", 615 | "num_train_optimization_steps = None\n", 616 | "\n", 617 | "train_examples = processor.get_train_examples(data_dir)\n", 618 | "num_train_optimization_steps = int(len(train_examples) / train_batch_size) * num_train_epochs" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "metadata": {}, 625 | "outputs": [], 626 | "source": [ 627 | "# Prepare model\n", 628 | "model = BertForSequenceClassification.from_pretrained(bert_model, num_labels=num_labels)\n", 629 | "model = model.to(device)" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "metadata": {}, 636 | "outputs": [], 637 | "source": [ 638 | "# Prepare optimizer\n", 639 | "param_optimizer = list(model.named_parameters())\n", 640 | "no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']\n", 641 | "optimizer_grouped_parameters = [\n", 642 | " {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},\n", 643 | " {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}\n", 644 | "]\n", 645 | "\n", 646 | "optimizer = BertAdam(optimizer_grouped_parameters,\n", 647 | " lr=learning_rate,\n", 648 | " warmup=warmup_proportion,\n", 649 | " t_total=num_train_optimization_steps)" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": null, 655 | "metadata": {}, 656 | "outputs": [], 657 | "source": [ 658 | "global_step = 0\n", 659 | "nb_tr_steps = 0\n", 660 | "tr_loss = 0" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": null, 666 | "metadata": {}, 667 | "outputs": [], 668 | "source": [ 669 | "train_features = convert_examples_to_features(train_examples, label_list, max_seq_length, tokenizer)" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "print(f\" Num examples = {len(train_examples)}\")\n", 679 | "print(f\" Batch size = {train_batch_size}\")\n", 680 | "print(f\" Num steps = {num_train_optimization_steps}\")" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": null, 686 | "metadata": {}, 
687 | "outputs": [], 688 | "source": [ 689 | "all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)\n", 690 | "all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)\n", 691 | "all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)\n", 692 | "all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)\n", 693 | "\n", 694 | "train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)\n", 695 | "train_sampler = RandomSampler(train_data)\n", 696 | "train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": null, 702 | "metadata": {}, 703 | "outputs": [], 704 | "source": [ 705 | "model.train()\n", 706 | "\n", 707 | "for _ in tqdm_notebook(range(num_train_epochs), desc=\"Epoch\"):\n", 708 | " tr_loss = 0\n", 709 | " nb_tr_examples, nb_tr_steps = 0, 0\n", 710 | " for step, batch in enumerate(tqdm_notebook(train_dataloader, desc=\"Iteration\")):\n", 711 | " batch = tuple(t.to(device) for t in batch)\n", 712 | " input_ids, input_mask, segment_ids, label_ids = batch\n", 713 | "\n", 714 | " logits = model(input_ids, segment_ids, input_mask, labels=None)\n", 715 | "\n", 716 | " loss_fct = CrossEntropyLoss()\n", 717 | " loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))\n", 718 | " loss.backward()\n", 719 | "\n", 720 | " tr_loss += loss.item()\n", 721 | " nb_tr_examples += input_ids.size(0)\n", 722 | " nb_tr_steps += 1\n", 723 | "\n", 724 | " optimizer.step()\n", 725 | " optimizer.zero_grad()\n", 726 | " global_step += 1" 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": null, 732 | "metadata": {}, 733 | "outputs": [], 734 | "source": [ 735 | "# Save a trained model, configuration and tokenizer\n", 736 | "model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self\n", 737 | "\n", 738 | "# If we save using the predefined names, we can load using `from_pretrained`\n", 739 | "output_model_file = os.path.join(output_dir, WEIGHTS_NAME)\n", 740 | "output_config_file = os.path.join(output_dir, CONFIG_NAME)\n", 741 | "\n", 742 | "torch.save(model_to_save.state_dict(), output_model_file)\n", 743 | "model_to_save.config.to_json_file(output_config_file)\n", 744 | "tokenizer.save_vocabulary(output_dir)" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": null, 750 | "metadata": {}, 751 | "outputs": [], 752 | "source": [ 753 | "# Load a trained model and vocabulary that you have fine-tuned\n", 754 | "model = BertForSequenceClassification.from_pretrained(output_dir, num_labels=num_labels)\n", 755 | "tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=do_lower_case)\n", 756 | "model = model.to(device)" 757 | ] 758 | }, 759 | { 760 | "cell_type": "code", 761 | "execution_count": null, 762 | "metadata": {}, 763 | "outputs": [], 764 | "source": [ 765 | "eval_examples = processor.get_dev_examples(data_dir)\n", 766 | "eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer)" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": null, 772 | "metadata": {}, 773 | "outputs": [], 774 | "source": [ 775 | "print(\" Num examples = {len(eval_examples)}\")\n", 776 | "print(\" Batch size = {eval_batch_size}\")" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 
null, 782 | "metadata": {}, 783 | "outputs": [], 784 | "source": [ 785 | "all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)\n", 786 | "all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)\n", 787 | "all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)\n", 788 | "all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)\n", 789 | "\n", 790 | "eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)\n", 791 | "eval_sampler = SequentialSampler(eval_data)\n", 792 | "eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": null, 798 | "metadata": {}, 799 | "outputs": [], 800 | "source": [ 801 | "model.eval()\n", 802 | "\n", 803 | "eval_loss = 0\n", 804 | "nb_eval_steps = 0\n", 805 | "preds = []\n", 806 | "\n", 807 | "for input_ids, input_mask, segment_ids, label_ids in tqdm_notebook(eval_dataloader, desc=\"Evaluating\"):\n", 808 | " input_ids = input_ids.to(device)\n", 809 | " input_mask = input_mask.to(device)\n", 810 | " segment_ids = segment_ids.to(device)\n", 811 | " label_ids = label_ids.to(device)\n", 812 | "\n", 813 | " with torch.no_grad():\n", 814 | " logits = model(input_ids, segment_ids, input_mask, labels=None)\n", 815 | "\n", 816 | " # compute the eval loss and the other metrics required by the task\n", 817 | " loss_fct = CrossEntropyLoss()\n", 818 | " tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))\n", 819 | " eval_loss += tmp_eval_loss.mean().item()\n", 820 | " \n", 821 | " nb_eval_steps += 1\n", 822 | " if len(preds) == 0:\n", 823 | " preds.append(logits.detach().cpu().numpy())\n", 824 | " else:\n", 825 | " preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)\n", 826 | "\n", 827 | "eval_loss = eval_loss / nb_eval_steps\n", 828 | "preds = preds[0]\n", 829 | "preds = np.argmax(preds, axis=1)\n", 830 | "\n", 831 | "result = acc_and_f1(preds, all_label_ids.numpy())\n", 832 | "\n", 833 | "loss = tr_loss / nb_tr_steps\n", 834 | "\n", 835 | "result['eval_loss'] = eval_loss\n", 836 | "result['global_step'] = global_step\n", 837 | "result['loss'] = loss" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [ 846 | "output_eval_file = os.path.join(output_dir, \"eval_results.txt\")\n", 847 | "with open(output_eval_file, \"w\") as writer:\n", 848 | " print(\"***** Eval results *****\")\n", 849 | " for key in sorted(result.keys()):\n", 850 | " print(f\" {key} = {result[key]}\")\n", 851 | " writer.write(f\"{key} = {result[key]}\\n\")" 852 | ] 853 | }, 854 | { 855 | "cell_type": "code", 856 | "execution_count": null, 857 | "metadata": {}, 858 | "outputs": [], 859 | "source": [] 860 | } 861 | ], 862 | "metadata": { 863 | "kernelspec": { 864 | "display_name": "Python 3", 865 | "language": "python", 866 | "name": "python3" 867 | }, 868 | "language_info": { 869 | "codemirror_mode": { 870 | "name": "ipython", 871 | "version": 3 872 | }, 873 | "file_extension": ".py", 874 | "mimetype": "text/x-python", 875 | "name": "python", 876 | "nbconvert_exporter": "python", 877 | "pygments_lexer": "ipython3", 878 | "version": "3.7.3" 879 | } 880 | }, 881 | "nbformat": 4, 882 | "nbformat_minor": 2 883 | } 884 | --------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial5_serving_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | import zipfile 5 | import gzip 6 | import pickle 7 | import itertools 8 | import urllib.parse 9 | import urllib.request 10 | from collections import Counter 11 | import functools 12 | 13 | import numpy as np 14 | import torch 15 | import torch.utils.data 16 | from nltk import word_tokenize 17 | 18 | 19 | from flask import Flask, render_template, request, jsonify, Response 20 | 21 | 22 | ### CODE BELOW IS COPIED FROM THE NOTEBOOK ### 23 | 24 | class Vocab(object): 25 | """Vocabulary class to provide token-to-id correspondence""" 26 | END_TOKEN = '</s>' 27 | START_TOKEN = '<s>' 28 | PAD_TOKEN = '<pad>' 29 | UNK_TOKEN = '<unk>' 30 | 31 | def __init__(self, special_tokens=None): 32 | """ 33 | Initialize the vocabulary class 34 | 35 | :param special_tokens: (Default value = None) A list of special tokens. The PAD token should be the first in the list, if used. 36 | 37 | """ 38 | super().__init__() 39 | 40 | self.special_tokens = special_tokens 41 | 42 | self.token2id = {} 43 | self.id2token = {} 44 | 45 | self.token_counts = Counter() 46 | 47 | if self.special_tokens is not None: 48 | self.add_document(self.special_tokens) 49 | 50 | def add_document(self, document, rebuild=True): 51 | """ 52 | Process the document and add its tokens to the vocabulary 53 | 54 | :param document: A list of tokens in the document 55 | :param rebuild: (Default value = True) Whether to rebuild the token2id correspondence or not 56 | 57 | """ 58 | for token in document: 59 | self.token_counts[token] += 1 60 | 61 | if token not in self.token2id: 62 | self.token2id[token] = len(self.token2id) 63 | 64 | if rebuild: 65 | self._rebuild_id2token() 66 | 67 | def add_documents(self, documents): 68 | """ 69 | Process a list of documents and add their tokens to the vocabulary 70 | 71 | :param documents: A list of documents, where each document is a list of tokens 72 | 73 | """ 74 | for doc in documents: 75 | self.add_document(doc, rebuild=False) 76 | 77 | self._rebuild_id2token() 78 | 79 | def _rebuild_id2token(self): 80 | """Rebuild the id-to-token correspondence""" 81 | self.id2token = {i: t for t, i in self.token2id.items()} 82 | 83 | def get(self, item, default=None): 84 | """ 85 | Given a token, return the corresponding id 86 | 87 | :param item: A token 88 | :param default: (Default value = None) Default value to return if the token is not present in the vocabulary 89 | 90 | """ 91 | return self.token2id.get(item, default) 92 | 93 | def __getitem__(self, item): 94 | """ 95 | Given a token, return the corresponding id 96 | 97 | :param item: A token 98 | 99 | """ 100 | return self.token2id[item] 101 | 102 | def __contains__(self, item): 103 | """ 104 | Check if a token is present in the vocabulary 105 | 106 | :param item: A token 107 | 108 | """ 109 | return item in self.token2id 110 | 111 | def __len__(self): 112 | """Return the number of tokens in the vocabulary""" 113 | return len(self.token2id) 114 | 115 | def __str__(self): 116 | """Get a string representation of the vocabulary""" 117 | return f'{len(self)} tokens' 118 | 119 | def save(self, filename): 120 | """ 121 | Save the vocabulary to a csv file. See the `load` method.
119 | def save(self, filename):
120 | """
121 | Save the vocabulary to a csv file. See the `load` method.
122 |
123 | :param filename: Path to the file
124 |
125 | """
126 | with open(filename, 'w') as csv_file:
127 | writer = csv.DictWriter(csv_file, fieldnames=['token', 'counts', 'is_special'])
128 | writer.writeheader()
129 | for idx in range(len(self.token2id)):
130 | token = self.id2token[idx]
131 | is_special = 1 if token in self.special_tokens else 0
132 | writer.writerow({'token': token, 'counts': self.token_counts[token], 'is_special': is_special})
133 |
134 | @staticmethod
135 | def load(filename):
136 | """
137 | Load the vocabulary from a csv file. See the `save` method.
138 |
139 | :param filename: Path to the csv file
140 |
141 | """
142 | with open(filename, 'r') as csv_file:
143 | token2id = {}
144 | tokens_counts = {}
145 | special_tokens = []
146 | reader = csv.DictReader(csv_file)
147 | for i, row in enumerate(reader):
148 | token2id[row['token']] = i
149 | tokens_counts[row['token']] = int(row['counts'])
150 | if bool(int(row['is_special'])):
151 | special_tokens.append(row['token'])
152 |
153 | vocab = Vocab()
154 | vocab.token2id = token2id
155 | vocab.token_counts = Counter(tokens_counts)
156 | vocab.special_tokens = special_tokens
157 | vocab._rebuild_id2token()
158 |
159 | return vocab
160 |
161 |
162 | class SubtitlesDialogDataset(torch.utils.data.Dataset):
163 | """ A conversational dialog dataset with query-response pairs """
164 | def __init__(self, filename, vocab=None, max_lines=1000, max_len=50, max_vocab_size=50000):
165 | """
166 | Initialize a conversational dialog dataset with query-response pairs
167 |
168 | :param filename: Path to the OpenSubtitles dataset
169 | :param vocab: (Default value = None) Vocabulary, will be created if None
170 | :param max_lines: (Default value = 1000) Limit the number of lines to read from the dataset file
171 | :param max_len: (Default value = 50) Maximum length of the sentences
172 | :param max_vocab_size: (Default value = 50000) Maximum size of the vocabulary
173 |
174 | """
175 |
176 | self.lines = []
177 | with gzip.open(filename, 'rb') as f:
178 | for i, line in enumerate(f):
179 | if i >= max_lines:
180 | break
181 |
182 | tokens = word_tokenize(line.decode('utf-8'))
183 | self.lines.append(tokens)
184 |
185 | self.max_lines = min(len(self.lines), max_lines)
186 |
187 | if vocab is None:
188 | vocab = Vocab(special_tokens=[Vocab.PAD_TOKEN, Vocab.START_TOKEN, Vocab.END_TOKEN, Vocab.UNK_TOKEN])
189 | vocab.add_documents(self.lines)
190 | vocab.prune_vocab(max_vocab_size)  # NOTE: prune_vocab is not defined on this file's Vocab copy; this branch only runs when no vocab is passed in, and the app below always passes a saved vocab
191 |
192 | print(f'Created vocab: {vocab}')
193 |
194 |
195 | if max_len is None:
196 | max_len = max(len(s) for s in self.lines)
197 | print(f'Calculated max len: {max_len}')
198 |
199 | self.vocab = vocab
200 | self.max_len = max_len
201 |
202 | def _pad_sentence(self, sent):
203 | """
204 | Cut the sentence if needed and pad it to the maximum length
205 |
206 | :param sent: The input sentence
207 |
208 | """
209 | sent = sent[:self.max_len - 1] + [Vocab.END_TOKEN,]
210 |
211 | nb_pad = self.max_len - len(sent)
212 | sent = sent + [Vocab.PAD_TOKEN,] * nb_pad
213 |
214 | return sent
215 |
216 | def _process_sent(self, sent):
217 | """
218 | Cut, pad, and convert the sentence from tokens to indices using the vocabulary
219 |
220 | :param sent: The input sentence
221 |
222 | """
223 | sent = self._pad_sentence(sent)
224 | sent = [self.vocab[t] if t in self.vocab else self.vocab[Vocab.UNK_TOKEN] for t in sent]
225 |
226 | sent = np.array(sent, dtype=np.long)
227 | return sent
228 |
229 | def __getitem__(self, index):
230 | """
231 | Create a pair
of query-response using two consecutive lines in the dataset and return it
232 |
233 | :param index: Index of the query line. The response is the next line.
234 |
235 | """
236 | query = self.lines[index]
237 | response = self.lines[index+1]
238 |
239 | query = self._process_sent(query)
240 | response = self._process_sent(response)
241 |
242 | return query, response
243 |
244 | def __len__(self):
245 | """ Return the total length of the dataset """
246 | return self.max_lines - 1
247 |
248 | def softmax_masked(inputs, mask, dim=1, epsilon=0.000001):
249 | """
250 | Perform the softmax operation on a batch of masked sequences of different lengths
251 |
252 | :param inputs: Input sequences, a 2d array of the shape (batch_size, max_seq_len)
253 | :param mask: Mask, an array of 1 and 0
254 | :param dim: (Default value = 1) Dimension of the softmax operation
255 | :param epsilon: (Default value = 0.000001) Small constant to avoid division by zero
256 |
257 | """
258 | inputs_exp = torch.exp(inputs)
259 | inputs_exp = inputs_exp * mask.float()
260 | inputs_exp_sum = inputs_exp.sum(dim=dim)
261 | inputs_attention = inputs_exp / (inputs_exp_sum.unsqueeze(dim) + epsilon)
262 |
263 | return inputs_attention
264 |
265 | class Seq2SeqAttentionModel(torch.nn.Module):
266 | """ A more advanced GRU-based sequence-to-sequence model with attention """
267 | def __init__(self, vocab_size, embedding_size, hidden_size, teacher_forcing,
268 | max_len, trainable_embeddings, start_index, end_index, pad_index, W_emb=None):
269 | """
270 | Initialize the model
271 |
272 | :param vocab_size: The size of the vocabulary
273 | :param embedding_size: Dimension of the embeddings
274 | :param hidden_size: The size of the hidden layers, including GRU
275 | :param teacher_forcing: The probability of teacher forcing
276 | :param max_len: Maximum length of the sequences
277 | :param trainable_embeddings: Whether the embedding layer will be trainable or frozen
278 | :param start_index: Index of the START token in the vocabulary
279 | :param end_index: Index of the END token in the vocabulary
280 | :param pad_index: Index of the PAD token in the vocabulary
281 | :param W_emb: (Default value = None) Initial values of the embedding layer, a numpy array
282 |
283 | """
284 |
285 | super().__init__()
286 |
287 | self.teacher_forcing = teacher_forcing
288 | self.max_len = max_len
289 | self.start_index = start_index
290 | self.end_index = end_index
291 | self.pad_index = pad_index
292 |
293 | self.embedding = torch.nn.Embedding(vocab_size, embedding_size, padding_idx=pad_index)
294 | if W_emb is not None:
295 | self.embedding.weight.data.copy_(torch.from_numpy(W_emb))
296 | if not trainable_embeddings:
297 | self.embedding.weight.requires_grad = False
298 |
299 | self.encoder = torch.nn.GRU(embedding_size, hidden_size, batch_first=True)
300 | self.decoder = torch.nn.GRUCell(embedding_size, hidden_size)
301 |
302 | self.attention_decoder = torch.nn.Linear(hidden_size, hidden_size)
303 | self.attention_encoder = torch.nn.Linear(hidden_size, hidden_size)
304 | self.attention_reduce = torch.nn.Linear(hidden_size, 1, bias=False)
305 | self.decoder_hidden_combine = torch.nn.Linear(hidden_size * 2, hidden_size)
306 |
307 | self.decoder_projection = torch.nn.Linear(hidden_size, vocab_size)
308 |
309 |
310 | def encode(self, inputs):
311 | """
312 | Encode the input sentence and return all hidden states and the input mask
313 |
314 | :param inputs: The input sentence
315 |
316 | """
317 | batch_size = inputs.size(0)
318 | inputs_mask = (inputs != self.pad_index).long()
319 | inputs_lengths = torch.sum(inputs_mask, dim=1)
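# inputs_mask marks real tokens with 1 and padding with 0; it is returned so
# the attention in decode() can ignore padded positions. inputs_lengths is
# computed here but not used further in this method.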
320 |
321 | inputs_emb = self.embedding(inputs)
322 | outputs, h = self.encoder(inputs_emb)
323 |
324 | return outputs, inputs_mask
325 |
326 | def decode(self, encoder_hiddens, inputs_mask, targets=None):
327 | """
328 | Decode the response given all hidden states of the encoder
329 |
330 | :param encoder_hiddens: Hidden states of the encoder
331 | :param inputs_mask: Input mask
332 | :param targets: (Default value = None) True decoding targets to be used for teacher forcing
333 |
334 | """
335 | batch_size = encoder_hiddens.size(0)
336 |
337 | outputs_logits = []
338 | decoder_hidden = torch.zeros_like(encoder_hiddens[:,0,:])
339 | decoder_inputs = torch.full_like(decoder_hidden[:, 0], self.start_index).long()
340 | for i in range(self.max_len):
341 | decoder_inputs_emb = self.embedding(decoder_inputs)
342 |
343 | att_enc = self.attention_encoder(encoder_hiddens)
344 | att_dec = self.attention_decoder(decoder_hidden)
345 | att = torch.tanh(att_enc + att_dec.unsqueeze(1))
346 | att_reduced = self.attention_reduce(att).squeeze(-1)
347 | att_normalized = softmax_masked(att_reduced, inputs_mask)
348 |
349 | decoder_hidden_att = torch.sum(encoder_hiddens * att_normalized.unsqueeze(-1), dim=1)
350 | decoder_hidden_combined = self.decoder_hidden_combine(torch.cat([decoder_hidden, decoder_hidden_att], dim=-1))
351 |
352 | decoder_hidden = self.decoder(decoder_inputs_emb, decoder_hidden_combined)
353 |
354 | decoder_output_logit = self.decoder_projection(decoder_hidden)
355 |
356 | if np.random.rand() < self.teacher_forcing and targets is not None:
357 | decoder_inputs = targets[:, i]
358 | else:
359 | decoder_inputs = decoder_output_logit.argmax(dim=1).long()
360 |
361 | outputs_logits.append(decoder_output_logit)
362 |
363 | outputs_logits = torch.stack(outputs_logits, dim=1)
364 |
365 | return outputs_logits
366 |
367 | def forward(self, inputs, targets=None):
368 | """
369 | Encode the input query and decode the response
370 |
371 | :param inputs: The input sentence
372 | :param targets: (Default value = None) True decoding targets
373 |
374 | """
375 | encoder_hiddens, inputs_mask = self.encode(inputs)
376 | outputs_logits = self.decode(encoder_hiddens, inputs_mask, targets)
377 |
378 | return outputs_logits
379 |
380 | def load_model(model_class, filename):
381 | """
382 | Create the model of the given class and load the checkpoint from the given file
383 |
384 | :param model_class: Model class
385 | :param filename: Path to the checkpoint
386 |
387 | """
388 | def _map_location(storage, loc):
389 | """ Map storage so that a model trained on a GPU can be loaded on the CPU """
390 | return storage
391 |
392 | # load GPU-trained checkpoints onto the CPU
393 | map_location = None
394 | if not torch.cuda.is_available():
395 | map_location = _map_location
396 |
397 | state = torch.load(str(filename), map_location=map_location)
398 |
399 | model = model_class(**state['model_params'])
400 | model.load_state_dict(state['model_state'])
401 |
402 | return model
403 |
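# Checkpoint format expected by load_model (a sketch — the training code is
# assumed to have saved both the constructor kwargs and the weights):
#   torch.save({'model_params': params_dict,
#               'model_state': model.state_dict()}, filename)
# so that the architecture can be rebuilt before the state dict is loaded.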
404 | def generate_response(query):
405 | """
406 | Generate a response from the model for a given query. The model and the dataset will be taken from the app cache.
407 |
408 | :param query: Query to generate the response to
409 |
410 | """
411 |
412 | if not isinstance(query, list):
413 | query = word_tokenize(query)
414 |
415 | dataset = app_cache['dataset']
416 | model = app_cache['model']
417 |
418 | query = dataset._process_sent(query)
419 | query = torch.tensor(query)
420 |
421 | response_logits = model(query.view(1, -1)).squeeze(0)
422 | response_indices = response_logits.argmax(dim=-1).cpu().numpy()
423 |
424 | response = [dataset.vocab.id2token[int(idx)] for idx in response_indices]
425 | response = [t for t in response if t not in dataset.vocab.special_tokens]
426 | response = ' '.join(response)
427 |
428 | return response
429 |
430 | ### END CODE FROM THE NOTEBOOK ###
431 |
432 |
433 | app = Flask(__name__)
434 | app.config.from_object(__name__)
435 |
436 | app.config.update(dict(
437 | model_filename='tmp/seq2seq_dialog_att.pt',
438 | vocab_filename='tmp/seq2seq_dialog.vocab.csv',
439 | dataset_filename='OpenSubtitles.en.gz',
440 | ))
441 | app.config.from_envvar('SEQ2SEQ_DIALOG_SETTINGS', silent=True)
442 |
443 |
444 | def init_dataset():
445 | """ Initialize the dataset from the parameters in the app config and return it """
446 | dataset_filename = app.config['dataset_filename']
447 | vocab_filename = app.config['vocab_filename']
448 |
449 | vocab = Vocab.load(vocab_filename)
450 | dataset = SubtitlesDialogDataset(dataset_filename, max_lines=1, vocab=vocab, max_len=50)
451 |
452 | return dataset
453 |
454 |
455 | def init_model():
456 | """ Initialize the model from the parameters in the app config and return it """
457 | model_filename = app.config['model_filename']
458 | model = load_model(Seq2SeqAttentionModel, model_filename)
459 |
460 | return model
461 |
462 |
463 | app_cache = dict(
464 | dataset=init_dataset(),
465 | model=init_model(),
466 | )
467 |
468 |
469 | @app.route('/dialog/', methods=['GET'])
470 | def dialog():
471 | """ Take the query from the GET parameter `query`, generate the response, and return a JSON object """
472 | query = request.args.get('query')
473 | response = generate_response(query)
474 |
475 | result = dict(
476 | query=query,
477 | response=response,
478 | )
479 |
480 | return jsonify(**result)
481 |
482 |
483 | if __name__ == '__main__':
484 | app.run(host='0.0.0.0', port=8080)
485 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial5_telegram.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Time to Learn Telegram Chatbots\n",
8 | "\n",
9 | "\n",
10 | "Install the Telegram package with:\n",
11 | "\n",
12 | "**pip install python-telegram-bot==12.0.0b1 --upgrade**\n",
13 | "\n",
14 | "This tutorial was modified from the Telegram bot GitHub page. 
[See there for more information.](https://github.com/python-telegram-bot/python-telegram-bot)\n", 15 | "\n", 16 | "Now we will go through the basics of creating your bot:" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# the telegram package holds everything we need for this tutorial\n", 26 | "import telegram\n", 27 | "from telegram.ext import Updater\n", 28 | "from telegram.ext import CommandHandler\n", 29 | "from telegram.ext import MessageHandler, Filters" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "# Authorization token needed - @BotFather\n", 39 | "TOKEN = \"764368673:AAGqBzI4RbIJYne35MwPIKMbEXsHKK_Dh0U\"" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# Create our bot using the token\n", 49 | "\n", 50 | "bot = telegram.Bot(token=TOKEN)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Does the bot look correct? Let's check the details\n", 60 | "print(bot.get_me())" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "# If things go wrong, we won't know unless we enable logging\n", 70 | "import logging\n", 71 | "logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',\n", 72 | " level=logging.INFO)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "# We have a bot - now let's handle incoming messages\n", 80 | "\n", 81 | "We will now create function handlers for different input commands from the user.\n", 82 | "\n", 83 | "This will be achieved using a Telegram \"Updater\" object." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# We will create an updater to update the conversation between user and bot\n", 93 | "updater = Updater(token=TOKEN, use_context=True)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# Let's create a handler function to handle when the user says \"/start\"\n", 103 | "def start(update, context):\n", 104 | " # Here, we blindly respond to the user\n", 105 | " # The /start command could have come with arguments, we ignore those\n", 106 | " context.bot.send_message(chat_id=update.message.chat_id, text=\"I'm a bot, please talk to me!\")" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "# A \"dispatcher\" object allows us to add this command handler\n", 116 | "dispatcher = updater.dispatcher\n", 117 | "start_handler = CommandHandler('start', start)\n", 118 | "dispatcher.add_handler(start_handler)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "# We have a bot, we have a command handler, let's start this thing up!\n", 128 | "updater.start_polling()" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "Now let's add another handler." 
136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# Okay, now let's respond to a user input command \"caps\"\n", 145 | "def caps(update, context):\n", 146 | " text_caps = ' '.join(context.args).upper()\n", 147 | " context.bot.send_message(chat_id=update.message.chat_id, text=text_caps)\n", 148 | "\n", 149 | "caps_handler = CommandHandler('caps', caps)\n", 150 | "dispatcher.add_handler(caps_handler)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "# But what to do in your project?\n", 158 | "\n", 159 | "Let's explore responding to arbitrary messages." 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "# This simple bot will simply tell the user what they said - easy peasy\n", 169 | "def respond(update, context):\n", 170 | " # This is how you access the user input message\n", 171 | " message = update.message.text\n", 172 | " context.bot.send_message(chat_id=update.message.chat_id, text='You said: %s' % message)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "Now we have our handler. Let's add it." 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "# Filters.text allows the handler to only respond to text messages\n", 189 | "msg_handler = MessageHandler(Filters.text, respond)\n", 190 | "dispatcher.add_handler(msg_handler)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "# We are tired of this bot. Let's shut it down!\n", 200 | "updater.stop()" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "# Now let's wrap it all into a class structure!\n", 208 | "\n", 209 | "We will call this class Chatbot." 
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "class Chatbot:\n",
219 | "    def __init__(self, token):\n",
220 | "        \"\"\"This chatbot takes a Telegram authorization token and deploys a\n",
221 | "        Telegram chatbot that responds to user messages with that token.\n",
222 | "        \n",
223 | "        token - a string authorization token provided by @BotFather on Telegram\n",
224 | "        Handler functions that respond to user inputs are added with the add_handler method.\n",
225 | "        \"\"\"\n",
226 | "        self.token = token\n",
227 | "        self.bot = telegram.Bot(token=token)\n",
228 | "        self.updater = Updater(token=token, use_context=True)\n",
229 | "        self.dispatcher = self.updater.dispatcher\n",
230 | "        self.updater.start_polling()\n",
231 | "    \n",
232 | "    def stop(self):\n",
233 | "        \"\"\"Stop the Telegram bot\"\"\"\n",
234 | "        self.updater.stop()\n",
235 | "    \n",
236 | "    def add_handler(self, handler):\n",
237 | "        \"\"\"Add a handler function to extend bot functionality\"\"\"\n",
238 | "        self.dispatcher.add_handler(handler)\n",
239 | "    \n",
240 | "    "
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "def reverse(update, context):\n",
250 | "    \"\"\"Whatever the user says, reverse their message and repeat it back to them.\"\"\"\n",
251 | "    message = update.message.text\n",
252 | "    rev_message = \"\".join(reversed(message)) \n",
253 | "    context.bot.send_message(chat_id=update.message.chat_id, text=rev_message)\n",
254 | "    \n",
255 | "def greeting(update, context):\n",
256 | "    \"\"\"Greet the user and ask for their name.\"\"\"\n",
257 | "    context.bot.send_message(chat_id=update.message.chat_id, text=\"Hello there! What is your name?\")\n",
258 | "\n",
259 | "bot = Chatbot(TOKEN)\n",
260 | "bot.add_handler(MessageHandler(Filters.text, reverse))\n",
261 | "bot.add_handler(CommandHandler('start', greeting))"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": null,
267 | "metadata": {},
268 | "outputs": [],
269 | "source": [
270 | "bot.stop()"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {},
276 | "source": [
277 | "# This is all good, but what about a longer message history?\n",
278 | "\n",
279 | "A more intelligent bot doesn't respond to only the previous message. What if we want the whole history?"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "history = []\n",
289 | "def repeat_history(update, context):\n",
290 | "    \"\"\"Append the user's message to the history and echo the joined history back to them.\"\"\"\n",
291 | "    message = update.message.text\n",
292 | "    history.append(message)\n",
293 | "    \n",
294 | "    # here is where you insert your chatbot\n",
295 | "    output = \" # \".join(history)\n",
296 | "    \n",
297 | "    context.bot.send_message(chat_id=update.message.chat_id, text=output)\n",
298 | "    \n",
299 | "bot = Chatbot(TOKEN)\n",
300 | "bot.add_handler(MessageHandler(Filters.text, repeat_history))"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "metadata": {},
307 | "outputs": [],
308 | "source": [
309 | "bot.stop()"
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "metadata": {},
315 | "source": [
316 | "# And that's it! 
Now you can create your own chatbot.\n",
317 | "\n",
318 | "Now, in your own projects: create a model that produces a response to a user's conversation history, and create a message handler so it can be served over Telegram. Good to go!"
319 | ]
320 | }
321 | ],
322 | "metadata": {
323 | "kernelspec": {
324 | "display_name": "Python 3",
325 | "language": "python",
326 | "name": "python3"
327 | },
328 | "language_info": {
329 | "codemirror_mode": {
330 | "name": "ipython",
331 | "version": 3
332 | },
333 | "file_extension": ".py",
334 | "mimetype": "text/x-python",
335 | "name": "python",
336 | "nbconvert_exporter": "python",
337 | "pygments_lexer": "ipython3",
338 | "version": "3.6.5"
339 | }
340 | },
341 | "nbformat": 4,
342 | "nbformat_minor": 2
343 | }
344 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "dataset_filename": "data_small",
3 | "output_dir": "run_1/",
4 | "old_model_dir": "pretrained_model",
5 | "num_epochs": 5,
6 | "history_len": 50,
7 | "response_len": 15,
8 | "embedding_dim": 512,
9 | "model_dim": 512,
10 | "inner_dim": 2048,
11 | "num_layers": 6,
12 | "num_heads": 8,
13 | "dim_k": 64,
14 | "dim_v": 64,
15 | "dropout": 0.3,
16 | "min_count": 1,
17 | "train_batch_size": 200,
18 | "val_batch_size": 25,
19 | "warmup_steps": 4000,
20 | "a_nice_note": "baseline test",
21 | "label_smoothing": false,
22 | "train_len": 1999,
23 | "vocab_size": 11507,
24 | "device": "cpu",
25 | "beam_size": 4,
26 | "n_best": 4,
27 | "choose_best": false
28 | }
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/dataset.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | import nltk
3 | from nltk.tokenize import TweetTokenizer
4 | import numpy as np
5 | import torch.utils.data
6 | import csv
7 | import json
8 | from transformer import Constants
9 |
10 | # get history, response data from csv file
11 | def _read_file(filename):
12 | history = list()
13 | response = list()
14 | ids = list()
15 | i = 1
16 | with open(filename, 'r') as fp:
17 | reader = csv.reader(fp)
18 | for row in reader:
19 | if i == 1:  # skip the header row
20 | i = 0
21 | continue
22 | ids.append(row[0])
23 | history.append(row[1].split(" "))
24 | response.append(row[2].split(" "))
25 |
26 | return history, response, ids
27 |
28 |
29 |
30 | class Vocab(object):
31 |
32 | def __init__(self, special_tokens=None):
33 | super(Vocab, self).__init__()
34 |
35 | self.nb_tokens = 0
36 |
37 | # vocab mapping
38 | self.token2id = {}
39 | self.id2token = {}
40 |
41 | self.token_counts = Counter()
42 |
43 | self.special_tokens = []
44 | if special_tokens is not None:
45 | self.special_tokens = special_tokens
46 | self.add_document(self.special_tokens)
47 |
48 | # updates the vocab with an example
49 | def add_document(self, document):
50 | for token in document:
51 | self.token_counts[token] += 1
52 |
53 | if token not in self.token2id:
54 | self.token2id[token] = self.nb_tokens
55 | self.id2token[self.nb_tokens] = token
56 | self.nb_tokens += 1
57 |
58 | def add_documents(self, documents):
59 | for doc in documents:
60 | self.add_document(doc)
61 |
62 | # prune tokens that occur fewer than min_count times
63 | def prune_vocab(self, min_count=2):
64 | nb_tokens_before = len(self.token2id)
65 |
66 | tokens_to_delete = set([t for t, c in self.token_counts.items() if c < min_count])
67 | tokens_to_delete -= set(self.special_tokens)
68 |
69 | for token in tokens_to_delete:
70 | self.token_counts.pop(token)
71 |
72 | self.token2id = {t: i for i, t in enumerate(self.token_counts.keys())}
73 | self.id2token = {i: t for t, i in self.token2id.items()}
74 | self.nb_tokens = len(self.token2id)
75 |
76 | print('Vocab pruned: {} -> {}'.format(nb_tokens_before, self.nb_tokens))
77 |
78 | # load token2id from json file, useful when using pretrained model
79 | def load_from_dict(self, filename):
80 | with open(filename, 'r') as f:
81 | self.token2id = json.load(f)
82 | self.id2token = {i: t for t, i in self.token2id.items()}
83 | self.nb_tokens = len(self.token2id)
84 |
85 | # Save token2id to json file
86 | def save_to_dict(self, filename):
87 | with open(filename, 'w') as f:
88 | json.dump(self.token2id, f)
89 |
90 | def __getitem__(self, item):
91 | return self.token2id[item]
92 |
93 | def __contains__(self, item):
94 | return item in self.token2id
95 |
96 | def __len__(self):
97 | return self.nb_tokens
98 |
99 | def __str__(self):
100 | return 'Vocab: {} tokens'.format(self.nb_tokens)
101 |
102 |
103 | class DialogueDataset(torch.utils.data.Dataset):
104 | PAD_WORD = '<pad>'  # token strings assumed: the angle-bracket markup was stripped from this copy
105 | UNK_WORD = '<unk>'
106 | SEP_WORD = '<sep>'
107 | EOS_WORD = '<eos>'
108 | CLS_WORD = '<cls>'
109 |
110 | def __init__(self, filename, history_len=50, response_len=15, vocab=None, update_vocab=True):
111 | """
112 | Initialize the dialogue dataset.
113 |
114 | Get examples, and create/update vocab
115 |
116 | Examples:
117 | History: hello ! hi , how are you ?
118 | Response: i am good , thank you !
119 |
120 | Args:
121 | filename: Filename of csv file with the data
122 | history_len: Maximum token length for the history. Will be
123 | pruned/padded to this length
124 | response_len: Maximum length for the response.
125 | vocab: Optional vocab object to use for this dataset
126 | update_vocab: Set to false to not update the vocab with the new
127 | examples
128 | """
129 | self.history, self.response, self.ids = _read_file(filename)
130 |
131 | self.history_len = history_len
132 | self.response_len = response_len
133 |
134 | if vocab is None:
135 | # Create new vocab object
136 | self.vocab = Vocab(special_tokens=[DialogueDataset.PAD_WORD,
137 | DialogueDataset.UNK_WORD,
138 | DialogueDataset.SEP_WORD,
139 | DialogueDataset.EOS_WORD,
140 | DialogueDataset.CLS_WORD])
141 | else:
142 | self.vocab = vocab
143 |
144 | # do not want to update vocab for running old model
145 | if update_vocab:
146 | self.vocab.add_documents(self.history)
147 | self.vocab.add_documents(self.response)
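# Typical usage (a sketch; the csv path is hypothetical):
#   dataset = DialogueDataset('data_small/train.csv', history_len=50, response_len=15)
#   loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)
#   h_seq, h_pos, h_seg, r_seq, r_pos = next(iter(loader))
# which yields the five batched feature tensors produced by __getitem__ below.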
148 |
149 | def _process_history(self, history):
150 | """
151 | Creates token encodings for the word embeddings, positional encodings,
152 | and segment encodings for the dialogue history
153 |
154 | Examples:
155 | History: hello ! hi , how are you ?
156 | self.history_len = 15
157 |
158 | h_seq = np.array([4, 34, 65, 2, 23, 44, 455, 97, 56, 10, 3, 0, 0, 0, 0])
159 | h_pos = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0])
160 | h_seg = np.array([1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0])
161 |
162 | Args:
163 | history: list of tokens in the history
164 | Returns:
165 | h_seq: token encodings for the history
166 | h_pos: positional encoding for the history
167 | h_seg: segment encoding for the history
168 | """
169 | history = history[-self.history_len+1:]
170 | history.append(DialogueDataset.EOS_WORD)
171 |
172 | needed_pads = self.history_len - len(history)
173 | if needed_pads > 0:
174 | history = history + [DialogueDataset.PAD_WORD] * needed_pads
175 |
176 | history = [
177 | self.vocab[token] if token in self.vocab else self.vocab[DialogueDataset.UNK_WORD]
178 | for token in history
179 | ]
180 |
181 | # create position embeddings, make zero if it is the pad token (0)
182 | h_pos = np.array([pos_i+1 if w_i != 0 else 0
183 | for pos_i, w_i in enumerate(history)])
184 |
185 | # create segment embeddings
186 | seg = list()
187 | i = 1
188 | for j, token in enumerate(history):
189 | if token == self.vocab[DialogueDataset.PAD_WORD]:
190 | break
191 | seg.append(i)
192 | if token == self.vocab[DialogueDataset.SEP_WORD]:
193 | i += 1
194 | seg += [0] * needed_pads
195 | h_seg = np.array(seg, dtype=np.long)
196 |
197 | h_seq = np.array(history, dtype=np.long)
198 |
199 | return h_seq, h_pos, h_seg
200 |
201 | def _process_response(self, response):
202 | """
203 | Creates token encodings for the word embeddings, and positional
204 | encodings for the response
205 |
206 | Examples:
207 | Response: i am good , thank you !
208 | self.response_len = 10
209 |
210 | r_seq = np.array([4, 43, 52, 77, 9, 65, 93, 5, 3, 0])
211 | r_pos = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 0])
212 |
213 | Args:
214 | response: list of tokens in the response
215 | Returns:
216 | r_seq: token encodings for the response
217 | r_pos: positional encoding for the response
218 | """
219 | response = response[:self.response_len - 1]
220 | response.append(DialogueDataset.EOS_WORD)
221 | #response.insert(0, DialogueDataset.CLS_WORD)
222 |
223 | needed_pads = self.response_len - len(response)
224 | if needed_pads > 0:
225 | response = response + [DialogueDataset.PAD_WORD] * needed_pads
226 |
227 | response = [
228 | self.vocab[token] if token in self.vocab else self.vocab[DialogueDataset.UNK_WORD]
229 | for token in response
230 | ]
231 | # create position embeddings
232 | r_pos = np.array([pos_i + 1 if w_i != 0 else 0
233 | for pos_i, w_i in enumerate(response)])
234 | r_seq = np.array(response, dtype=np.long)
235 | return r_seq, r_pos
236 |
237 | def get_input_features(self, history):
238 | """ Get input features for the chatbot """
239 | tokenizer = TweetTokenizer()
240 | all_history = list()
241 | all_history.append(DialogueDataset.CLS_WORD)
242 | for line in history:
243 | all_history += list(tokenizer.tokenize(line))
244 | all_history.append(DialogueDataset.SEP_WORD)
245 | h_seq, h_pos, h_seg = self._process_history(all_history[:-1])
246 | return torch.from_numpy(h_seq).unsqueeze(0), torch.from_numpy(h_pos).unsqueeze(0), torch.from_numpy(h_seg).unsqueeze(0)
247 |
248 | def __getitem__(self, index):
249 | """
250 | Returns the features for an example in the dataset
251 |
252 | Args:
253 | index: index of example in dataset
254 |
255 | Returns:
256 | h_seq: token encodings for the history
257 | h_pos: positional encoding for the history
258 | h_seg: segment encoding for the
history 259 | r_seq: token encodings for the response 260 | r_pos: positional encoding for the response 261 | """ 262 | h_seq, h_pos, h_seg = self._process_history(self.history[index]) 263 | r_seq, r_pos = self._process_response(self.response[index]) 264 | id = self.ids[index] 265 | return h_seq, h_pos, h_seg, r_seq, r_pos 266 | 267 | def __len__(self): 268 | return len(self.history) 269 | -------------------------------------------------------------------------------- /tutorials/pytorch_track/tutorial_6/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | nltk 3 | tqdm 4 | sklearn 5 | ipywidgets 6 | -------------------------------------------------------------------------------- /tutorials/pytorch_track/tutorial_6/transformer/Beam.py: -------------------------------------------------------------------------------- 1 | """ Manage beam search info structure. 2 | 3 | Heavily borrowed from OpenNMT-py. 4 | For code in OpenNMT-py, please check the following link: 5 | https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/Beam.py 6 | """ 7 | 8 | import torch 9 | import numpy as np 10 | import transformer.Constants as Constants 11 | 12 | class Beam(): 13 | ''' Beam search ''' 14 | 15 | def __init__(self, size, device=False): 16 | 17 | self.size = size 18 | self._done = False 19 | 20 | # The score for each translation on the beam. 21 | self.scores = torch.zeros((size,), dtype=torch.float, device=device) 22 | self.all_scores = [] 23 | 24 | # The backpointers at each time-step. 25 | self.prev_ks = [] 26 | 27 | # The outputs at each time-step. 28 | self.next_ys = [torch.full((size,), Constants.PAD, dtype=torch.long, device=device)] 29 | self.next_ys[0][0] = Constants.CLS 30 | 31 | def get_current_state(self): 32 | "Get the outputs for the current timestep." 33 | return self.get_tentative_hypothesis() 34 | 35 | def get_current_origin(self): 36 | "Get the backpointers for the current timestep." 37 | return self.prev_ks[-1] 38 | 39 | @property 40 | def done(self): 41 | return self._done 42 | 43 | def advance(self, word_prob): 44 | "Update beam status and check if finished or not." 45 | num_words = word_prob.size(1) 46 | 47 | # Sum the previous scores. 48 | if len(self.prev_ks) > 0: 49 | beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob) 50 | else: 51 | beam_lk = word_prob[0] 52 | 53 | flat_beam_lk = beam_lk.view(-1) 54 | 55 | best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, True) # 1st sort 56 | best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, True) # 2nd sort 57 | 58 | self.all_scores.append(self.scores) 59 | self.scores = best_scores 60 | 61 | # bestScoresId is flattened as a (beam x word) array, 62 | # so we need to calculate which word and beam each score came from 63 | prev_k = best_scores_id / num_words 64 | self.prev_ks.append(prev_k) 65 | self.next_ys.append(best_scores_id - prev_k * num_words) 66 | 67 | # End condition is when top-of-beam is EOS. 68 | if self.next_ys[-1][0].item() == Constants.EOS: 69 | self._done = True 70 | self.all_scores.append(self.scores) 71 | 72 | return self._done 73 | 74 | def sort_scores(self): 75 | "Sort the scores." 76 | return torch.sort(self.scores, 0, True) 77 | 78 | def get_the_best_score_and_idx(self): 79 | "Get the score of the best in the beam." 80 | scores, ids = self.sort_scores() 81 | return scores[1], ids[1] 82 | 83 | def get_tentative_hypothesis(self): 84 | "Get the decoded sequence for the current timestep." 
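# next_ys holds the chosen word ids at every decoding step and prev_ks the
# backpointers into the previous beam; walking them in reverse (see
# get_hypothesis below) reassembles a complete token sequence per beam.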
85 |
86 | if len(self.next_ys) == 1:
87 | dec_seq = self.next_ys[0].unsqueeze(1)
88 | else:
89 | _, keys = self.sort_scores()
90 | hyps = [self.get_hypothesis(k) for k in keys]
91 | hyps = [[Constants.CLS] + h for h in hyps]
92 | dec_seq = torch.LongTensor(hyps)
93 |
94 | return dec_seq
95 |
96 | def get_hypothesis(self, k):
97 | """ Walk back to construct the full hypothesis. """
98 | hyp = []
99 | for j in range(len(self.prev_ks) - 1, -1, -1):
100 | hyp.append(self.next_ys[j+1][k])
101 | k = self.prev_ks[j][k]
102 |
103 | return list(map(lambda x: x.item(), hyp[::-1]))
104 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/transformer/Constants.py:
--------------------------------------------------------------------------------
1 |
2 | PAD = 0
3 | UNK = 1
4 | SEP = 2
5 | EOS = 3
6 | CLS = 4
7 |
8 |
9 | PAD_WORD = '<pad>'  # token strings assumed: the angle-bracket markup was stripped from this copy
10 | UNK_WORD = '<unk>'
11 | SEP_WORD = '<sep>'
12 | EOS_WORD = '<eos>'
13 | CLS_WORD = '<cls>'
14 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/transformer/Layers.py:
--------------------------------------------------------------------------------
1 | ''' Define the Layers '''
2 | import torch.nn as nn
3 | from transformer.SubLayers import MultiHeadAttention, PositionwiseFeedForward
4 |
5 | __author__ = "Yu-Hsiang Huang"
6 |
7 |
8 | class EncoderLayer(nn.Module):
9 | ''' Compose with two layers '''
10 |
11 | def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
12 | super(EncoderLayer, self).__init__()
13 | self.slf_attn = MultiHeadAttention(
14 | n_head, d_model, d_k, d_v, dropout=dropout)
15 | self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
16 |
17 | def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
18 | """
19 | First performs self-attention on the input; the result is then passed
20 | through a feed-forward network to get the output
21 |
22 | Args:
23 | enc_input: vector input
24 |
25 | Returns:
26 | enc_output: vector output from encoder layer
27 | """
28 | # Multi-Head Attention (w/ Add and Norm)
29 | enc_output, enc_slf_attn = self.slf_attn(
30 | enc_input, enc_input, enc_input, mask=slf_attn_mask)
31 | enc_output *= non_pad_mask
32 |
33 | # Feed forward (w/ Add and Norm)
34 | enc_output = self.pos_ffn(enc_output)
35 | enc_output *= non_pad_mask
36 |
37 | return enc_output, enc_slf_attn
38 |
39 |
40 | class DecoderLayer(nn.Module):
41 | ''' Compose with three layers '''
42 |
43 | def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
44 | super(DecoderLayer, self).__init__()
45 | self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
46 | self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
47 | self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
48 |
49 | def forward(self, dec_input, enc_output, non_pad_mask=None, slf_attn_mask=None, dec_enc_attn_mask=None):
50 | """
51 | First performs masked self-attention on the input.
52 |
53 | Then performs attention
54 | where the query is the output from the previous layer, and the keys
55 | and values are the encoder output
56 |
57 | Finally, the result is passed through a feed-forward network to get
58 | the output
59 |
60 | Args:
61 | dec_input: input to the decoder
62 | enc_output: output from encoder
63 |
64 | Returns:
65 | dec_output: output from decoder
66 | """
67 | # Masked Multi-Head Attention (w/ Add and Norm)
68 | dec_output, dec_slf_attn = self.slf_attn(
69 | dec_input, dec_input, dec_input, mask=slf_attn_mask)
70 | dec_output *= non_pad_mask
71 |
72 | # Multi-Head Attention (w/ Add and Norm)
73 | dec_output, dec_enc_attn = self.enc_attn(
74 | dec_output, enc_output, enc_output, mask=dec_enc_attn_mask)
75 | dec_output *= non_pad_mask
76 |
77 | # Feed forward (w/ Add and Norm)
78 | dec_output = self.pos_ffn(dec_output)
79 | dec_output *= non_pad_mask
80 |
81 | return dec_output, dec_slf_attn, dec_enc_attn
82 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/transformer/Models.py:
--------------------------------------------------------------------------------
1 | ''' Define the Transformer model '''
2 | import torch
3 | import torch.nn as nn
4 | import numpy as np
5 | import transformer.Constants as Constants
6 | from transformer.Layers import EncoderLayer, DecoderLayer
7 |
8 | __author__ = "Yu-Hsiang Huang"
9 |
10 | def get_non_pad_mask(seq):
11 | assert seq.dim() == 2
12 | return seq.ne(Constants.PAD).type(torch.float).unsqueeze(-1)
13 |
14 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
15 | ''' Sinusoid position encoding table '''
16 |
17 | def cal_angle(position, hid_idx):
18 | return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)
19 |
20 | def get_posi_angle_vec(position):
21 | return [cal_angle(position, hid_j) for hid_j in range(d_hid)]
22 |
23 | sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
24 |
25 | sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
26 | sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
27 |
28 | if padding_idx is not None:
29 | # zero vector for padding dimension
30 | sinusoid_table[padding_idx] = 0.
31 |
32 | return torch.FloatTensor(sinusoid_table)
33 |
34 | def get_attn_key_pad_mask(seq_k, seq_q):
35 | ''' For masking out the padding part of key sequence. '''
36 |
37 | # Expand to fit the shape of key query attention matrix.
38 | len_q = seq_q.size(1)
39 | padding_mask = seq_k.eq(Constants.PAD)
40 | padding_mask = padding_mask.unsqueeze(1).expand(-1, len_q, -1) # b x lq x lk
41 |
42 | return padding_mask
43 |
44 | def get_subsequent_mask(seq):
45 | ''' For masking out the subsequent info. '''
46 |
47 | sz_b, len_s = seq.size()
48 | subsequent_mask = torch.triu(
49 | torch.ones((len_s, len_s), device=seq.device, dtype=torch.uint8), diagonal=1)
50 | subsequent_mask = subsequent_mask.unsqueeze(0).expand(sz_b, -1, -1) # b x ls x ls
51 |
52 | return subsequent_mask
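# Worked example: for len_s = 4, get_subsequent_mask produces (per batch row)
#   [[0, 1, 1, 1],
#    [0, 0, 1, 1],
#    [0, 0, 0, 1],
#    [0, 0, 0, 0]]
# where 1 marks a masked position, i.e. decoding step i may only attend to
# steps <= i.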
53 |
54 | class Encoder(nn.Module):
55 | ''' An encoder model with a self-attention mechanism. '''
56 |
57 | def __init__(
58 | self,
59 | n_src_vocab, len_max_seq, d_word_vec,
60 | n_layers, n_head, d_k, d_v,
61 | d_model, d_inner, dropout=0.1, pretrained_embeddings=None):
62 |
63 | super().__init__()
64 |
65 | n_position = len_max_seq + 1
66 |
67 | if pretrained_embeddings is None:
68 | self.src_word_emb = nn.Embedding(
69 | n_src_vocab, d_word_vec, padding_idx=Constants.PAD)
70 | else:
71 | self.src_word_emb = nn.Embedding.from_pretrained(
72 | pretrained_embeddings, padding_idx=Constants.PAD)
73 |
74 | self.position_enc = nn.Embedding.from_pretrained(
75 | get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
76 | freeze=True)
77 |
78 | self.segment_enc = nn.Embedding(int(n_position/2), d_word_vec, padding_idx=0)
79 |
80 | self.layer_stack = nn.ModuleList([
81 | EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
82 | for _ in range(n_layers)])
83 |
84 | def forward(self, src_seq, src_pos, src_seg, return_attns=False):
85 | """
86 | First creates an input embedding from the seq, pos, and seg encodings,
87 | then runs the encoder layer n_layers times and returns the final vectors
88 |
89 | Args:
90 | src_seq: Encodings for the words in the history
91 | src_pos: Positional encodings for the words in the history
92 | src_seg: Segment encodings for turns in the history
93 | Returns:
94 | enc_output: vector output from encoder
95 | """
96 | enc_slf_attn_list = []
97 |
98 | # -- Prepare masks
99 | slf_attn_mask = get_attn_key_pad_mask(seq_k=src_seq, seq_q=src_seq)
100 | non_pad_mask = get_non_pad_mask(src_seq)
101 |
102 | # -- Get input embeddings
103 | enc_output = self.src_word_emb(src_seq) + self.position_enc(src_pos) \
104 | + self.segment_enc(src_seg)
105 |
106 | # Nx encoder layer
107 | for enc_layer in self.layer_stack:
108 | enc_output, enc_slf_attn = enc_layer(
109 | enc_output,
110 | non_pad_mask=non_pad_mask,
111 | slf_attn_mask=slf_attn_mask)
112 | if return_attns:
113 | enc_slf_attn_list += [enc_slf_attn]
114 |
115 | if return_attns:
116 | return enc_output, enc_slf_attn_list
117 | return enc_output,
118 |
119 | class Decoder(nn.Module):
120 | ''' A decoder model with a self-attention mechanism. '''
121 |
122 | def __init__(
123 | self,
124 | n_tgt_vocab, len_max_seq, d_word_vec,
125 | n_layers, n_head, d_k, d_v,
126 | d_model, d_inner, dropout=0.1, pretrained_embeddings=None):
127 |
128 | super().__init__()
129 | n_position = len_max_seq + 1
130 |
131 | if pretrained_embeddings is None:
132 | self.tgt_word_emb = nn.Embedding(
133 | n_tgt_vocab, d_word_vec, padding_idx=Constants.PAD)
134 | else:
135 | self.tgt_word_emb = nn.Embedding.from_pretrained(
136 | pretrained_embeddings, padding_idx=Constants.PAD)
137 |
138 | self.position_enc = nn.Embedding.from_pretrained(
139 | get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
140 | freeze=True)
141 |
142 | self.layer_stack = nn.ModuleList([
143 | DecoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
144 | for _ in range(n_layers)])
145 |
146 | def forward(self, tgt_seq, tgt_pos, src_seq, enc_output, return_attns=False):
147 | """
148 | Starts by building the input embedding from the target seq and pos
149 | encodings. Then runs the decoder.
150 |
151 | Args:
152 | tgt_seq: Encodings for the words in the target response
153 | tgt_pos: Positional encodings for the words in the target response
154 | src_seq: Encodings for the words in the history
155 | enc_output: Output from the Encoder
156 | Returns:
157 | dec_output: vector outputs from decoder, one for each word in the response
158 |
159 | """
160 | dec_slf_attn_list, dec_enc_attn_list = [], []
161 |
162 | # -- Prepare masks
163 | non_pad_mask = get_non_pad_mask(tgt_seq)
164 |
165 | slf_attn_mask_subseq = get_subsequent_mask(tgt_seq)
166 | slf_attn_mask_keypad = get_attn_key_pad_mask(seq_k=tgt_seq, seq_q=tgt_seq)
167 | slf_attn_mask = (slf_attn_mask_keypad + slf_attn_mask_subseq).gt(0)
168 |
169 | dec_enc_attn_mask = get_attn_key_pad_mask(seq_k=src_seq, seq_q=tgt_seq)
170 |
171 | # -- Forward
172 | dec_output = self.tgt_word_emb(tgt_seq) + self.position_enc(tgt_pos)
173 |
174 | # Nx decoder layer
175 | for dec_layer in self.layer_stack:
176 | dec_output, dec_slf_attn, dec_enc_attn = dec_layer(
177 | dec_output, enc_output,
178 | non_pad_mask=non_pad_mask,
179 | slf_attn_mask=slf_attn_mask,
180 | dec_enc_attn_mask=dec_enc_attn_mask)
181 |
182 | if return_attns:
183 | dec_slf_attn_list += [dec_slf_attn]
184 | dec_enc_attn_list += [dec_enc_attn]
185 |
186 | if return_attns:
187 | return dec_output, dec_slf_attn_list, dec_enc_attn_list
188 | return dec_output,
189 |
190 | class Transformer(nn.Module):
191 | ''' A sequence to sequence model with attention mechanism. '''
192 |
193 | def __init__(
194 | self,
195 | n_src_vocab, n_tgt_vocab, len_max_seq_enc, len_max_seq_dec,
196 | d_word_vec=512, d_model=512, d_inner=2048,
197 | n_layers=6, n_head=8, d_k=64, d_v=64, dropout=0.1,
198 | tgt_emb_prj_weight_sharing=True,
199 | emb_src_tgt_weight_sharing=True,
200 | pretrained_embeddings=None):
201 |
202 | super().__init__()
203 |
204 | self.encoder = Encoder(
205 | n_src_vocab=n_src_vocab, len_max_seq=len_max_seq_enc,
206 | d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
207 | n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
208 | dropout=dropout, pretrained_embeddings=pretrained_embeddings)
209 |
210 | self.decoder = Decoder(
211 | n_tgt_vocab=n_tgt_vocab, len_max_seq=len_max_seq_dec,
212 | d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
213 | n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
214 | dropout=dropout, pretrained_embeddings=pretrained_embeddings)
215 |
216 | self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
217 | nn.init.xavier_normal_(self.tgt_word_prj.weight)
218 |
219 | assert d_model == d_word_vec, \
220 | 'To facilitate the residual connections, \
221 | the dimensions of all module outputs shall be the same.'
222 |
223 | if tgt_emb_prj_weight_sharing:
224 | # Share the weight matrix between target word embedding & the final logit dense layer
225 | self.tgt_word_prj.weight = self.decoder.tgt_word_emb.weight
226 | self.x_logit_scale = (d_model ** -0.5)
227 | else:
228 | self.x_logit_scale = 1.
229 |
230 | if emb_src_tgt_weight_sharing:
231 | # Share the weight matrix between source & target word embeddings
232 | assert n_src_vocab == n_tgt_vocab, \
233 | "To share word embedding table, the vocabulary size of src/tgt shall be the same."
234 | self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight
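# With tgt_emb_prj_weight_sharing on, the output projection reuses the target
# embedding matrix, and the logits in forward() are scaled by d_model ** -0.5
# to keep the shared weights at a comparable magnitude — mirroring the
# embedding/projection weight sharing used in Attention Is All You Need.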
235 |
236 | def forward(self, src_seq, src_pos, src_seg, tgt_seq, tgt_pos):
237 | """
238 | Takes in the input features for the history and response, and returns a prediction.
239 |
240 | First encodes the history, and then decodes it before mapping the output to the vocabulary
241 |
242 | Args:
243 | src_seq: Encodings for the words in the history
244 | src_pos: Positional encodings for the words in the history
245 | src_seg: Segment encodings for turns in the history
246 | tgt_seq: Encodings for the words in the target response
247 | tgt_pos: Positional encodings for the words in the target response
248 | Returns:
249 | outputs: Unnormalized scores (logits) over the vocabulary for each position in the response
250 | """
251 |
252 | tgt_seq, tgt_pos = tgt_seq[:, :-1], tgt_pos[:, :-1]
253 |
254 | enc_output, *_ = self.encoder(src_seq, src_pos, src_seg)
255 | dec_output, *_ = self.decoder(tgt_seq, tgt_pos, src_seq, enc_output)
256 | outputs = self.tgt_word_prj(dec_output) * self.x_logit_scale
257 |
258 | return outputs.view(-1, outputs.size(2))
259 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/transformer/Modules.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 |
5 | __author__ = "Yu-Hsiang Huang"
6 |
7 | class ScaledDotProductAttention(nn.Module):
8 | ''' Scaled Dot-Product Attention '''
9 |
10 | def __init__(self, temperature, attn_dropout=0.1):
11 | super().__init__()
12 | self.temperature = temperature
13 | self.dropout = nn.Dropout(attn_dropout)
14 | self.softmax = nn.Softmax(dim=2)
15 |
16 | def forward(self, q, k, v, mask=None):
17 | """
18 | Gets the queries, keys, and values for each attention head.
19 |
20 | The queries and keys are multiplied, and the result is scaled, masked,
21 | softmaxed, and passed through dropout to get the attention weights
22 |
23 | These weights are applied to the values via matrix multiplication
24 |
25 | Args:
26 | q: Query
27 | k: Key
28 | v: Value
29 | mask: (Default value = None) Positions to exclude from attention
30 |
31 | Returns:
32 | output, attn: the attention-weighted values and the attention weights
33 | """
34 |
35 | # MatMul
36 | attn = torch.bmm(q, k.transpose(1, 2))
37 | # Scale
38 | attn = attn / self.temperature
39 |
40 | # Mask
41 | if mask is not None:
42 | attn = attn.masked_fill(mask, -np.inf)
43 |
44 | # softmax/dropout
45 | attn = self.softmax(attn)
46 | attn = self.dropout(attn)
47 |
48 | # Matmul
49 | output = torch.bmm(attn, v)
50 |
51 | return output, attn
52 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/transformer/Optim.py:
--------------------------------------------------------------------------------
1 | '''A wrapper class for optimizer '''
2 | import numpy as np
3 |
4 | class ScheduledOptim():
5 | '''A simple wrapper class for learning rate scheduling'''
6 |
7 | def __init__(self, optimizer, d_model, n_warmup_steps):
8 | self.optimizer = optimizer
9 | self.n_warmup_steps = n_warmup_steps
10 | self.n_current_steps = 0
11 | self.init_lr = np.power(d_model, -0.5)
12 |
13 | def step_and_update_lr(self):
14 | "Step with the inner optimizer"
15 | self._update_learning_rate()
16 | self.optimizer.step()
17 |
18 | def zero_grad(self):
19 | "Zero out the gradients with the inner optimizer"
20 | self.optimizer.zero_grad()
21 |
22 | def _get_lr_scale(self):
23 | return np.min([
24 | np.power(self.n_current_steps, -0.5),
25 | np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])
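# The schedule implemented by _get_lr_scale and _update_learning_rate is the
# "Noam" schedule from Attention Is All You Need:
#   lr = d_model**-0.5 * min(step**-0.5, step * n_warmup_steps**-1.5)
# i.e. a linear warmup for n_warmup_steps followed by 1/sqrt(step) decay.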
26 |
27 | def _update_learning_rate(self):
28 | ''' Learning rate scheduling per step '''
29 |
30 | self.n_current_steps += 1
31 | lr = self.init_lr * self._get_lr_scale()
32 |
33 | for param_group in self.optimizer.param_groups:
34 | param_group['lr'] = lr
35 |
36 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/transformer/SubLayers.py:
--------------------------------------------------------------------------------
1 | ''' Define the sublayers in encoder/decoder layer '''
2 | import numpy as np
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from transformer.Modules import ScaledDotProductAttention
6 |
7 | __author__ = "Yu-Hsiang Huang"
8 |
9 | class MultiHeadAttention(nn.Module):
10 | ''' Multi-Head Attention module '''
11 |
12 | def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
13 | super().__init__()
14 |
15 | self.n_head = n_head
16 | self.d_k = d_k
17 | self.d_v = d_v
18 |
19 | self.w_qs = nn.Linear(d_model, n_head * d_k)
20 | self.w_ks = nn.Linear(d_model, n_head * d_k)
21 | self.w_vs = nn.Linear(d_model, n_head * d_v)
22 | nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
23 | nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
24 | nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))
25 |
26 | self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
27 | self.layer_norm = nn.LayerNorm(d_model)
28 |
29 | self.fc = nn.Linear(n_head * d_v, d_model)
30 | nn.init.xavier_normal_(self.fc.weight)
31 |
32 | self.dropout = nn.Dropout(dropout)
33 |
34 |
35 | def forward(self, q, k, v, mask=None):
36 | """
37 | First passes the queries, keys, and values through linear layers to get
38 | n_head inputs for scaled dot-product attention
39 |
40 | then performs scaled dot-product attention and applies a residual
41 | connection and layer normalization before returning the output
42 |
43 | Args:
44 | q: Query
45 | k: Key
46 | v: Value
47 | mask: (Default value = None) Attention mask
48 |
49 | Returns:
50 | output: output from multi-head attention
51 | """
52 |
53 | d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
54 |
55 | sz_b, len_q, _ = q.size()
56 | sz_b, len_k, _ = k.size()
57 | sz_b, len_v, _ = v.size()
58 |
59 | residual = q
60 |
61 | # Linear
62 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
63 | k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
64 | v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
65 |
66 | q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk
67 | k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk
68 | v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv
69 |
70 | mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x ..
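# After the reshape above, q/k/v are (n_head * batch) x len x d, so the single
# batched matmul inside self.attention below covers every head at once; the
# mask is tiled n_head times so each head sees the same padding/subsequent
# pattern.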
71 |
72 | # Scaled Dot-Product Attention
73 | output, attn = self.attention(q, k, v, mask=mask)
74 |
75 | output = output.view(n_head, sz_b, len_q, d_v)
76 | output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv)
77 |
78 | output = self.dropout(self.fc(output))
79 |
80 | # Add and Norm
81 | output = self.layer_norm(output + residual)
82 |
83 | return output, attn
84 |
85 | class PositionwiseFeedForward(nn.Module):
86 | ''' A two-feed-forward-layer module '''
87 |
88 | def __init__(self, d_in, d_hid, dropout=0.1):
89 | super().__init__()
90 | self.w_1 = nn.Conv1d(d_in, d_hid, 1) # position-wise
91 | self.w_2 = nn.Conv1d(d_hid, d_in, 1) # position-wise
92 | self.layer_norm = nn.LayerNorm(d_in)
93 | self.dropout = nn.Dropout(dropout)
94 |
95 | def forward(self, x):
96 | """
97 | A position-wise feed-forward layer that is applied after attention in the
98 | encoder and decoder
99 | Args:
100 | x: input
101 |
102 | Returns:
103 | output: transformed input, same shape as x
104 | """
105 | # feed forward
106 | residual = x
107 | output = x.transpose(1, 2)
108 | output = self.w_2(F.relu(self.w_1(output)))
109 | output = output.transpose(1, 2)
110 | output = self.dropout(output)
111 |
112 | # Add and norm
113 | output = self.layer_norm(output + residual)
114 | return output
115 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/transformer/Translator.py:
--------------------------------------------------------------------------------
1 | ''' This module will handle the text generation with beam search. '''
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 | from transformer.Models import Transformer
8 | from transformer.Beam import Beam
9 |
10 | class Chatbot(object):
11 | ''' Load a trained model and handle the beam search '''
12 |
13 | def __init__(self, config, model):
14 | self.config = config
15 | self.device = torch.device(config["device"])
16 |
17 | model.word_prob_prj = nn.LogSoftmax(dim=1)
18 |
19 | model = model.to(self.device)
20 |
21 | self.model = model
22 | self.model.eval()
23 |
24 | def translate_batch(self, src_seq, src_pos, src_seg):
25 | ''' Run translation for one batch '''
26 |
27 | def get_inst_idx_to_tensor_position_map(inst_idx_list):
28 | ''' Indicate the position of an instance in a tensor. '''
29 | return {inst_idx: tensor_position for tensor_position, inst_idx in enumerate(inst_idx_list)}
30 |
31 | def collect_active_part(beamed_tensor, curr_active_inst_idx, n_prev_active_inst, n_bm):
32 | ''' Collect tensor parts associated with active instances. '''
33 |
34 | _, *d_hs = beamed_tensor.size()
35 | n_curr_active_inst = len(curr_active_inst_idx)
36 | new_shape = (n_curr_active_inst * n_bm, *d_hs)
37 |
38 | beamed_tensor = beamed_tensor.view(n_prev_active_inst, -1)
39 | beamed_tensor = beamed_tensor.index_select(0, curr_active_inst_idx)
40 | beamed_tensor = beamed_tensor.view(*new_shape)
41 |
42 | return beamed_tensor
43 |
44 | def collate_active_info(
45 | src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list):
46 | # Sentences which are still active are collected,
47 | # so the decoder will not run on completed sentences.
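# inst_idx_to_position_map maps each original batch index to its current row
# in the shrinking active tensors; rebuilding it every decoding step keeps the
# beams aligned after finished instances are dropped from the batch.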
48 |             n_prev_active_inst = len(inst_idx_to_position_map)
49 |             active_inst_idx = [inst_idx_to_position_map[k] for k in active_inst_idx_list]
50 |             active_inst_idx = torch.LongTensor(active_inst_idx).to(self.device)
51 | 
52 |             active_src_seq = collect_active_part(src_seq, active_inst_idx, n_prev_active_inst, n_bm)
53 |             active_src_enc = collect_active_part(src_enc, active_inst_idx, n_prev_active_inst, n_bm)
54 |             active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)
55 | 
56 |             return active_src_seq, active_src_enc, active_inst_idx_to_position_map
57 | 
58 |         def beam_decode_step(
59 |                 inst_dec_beams, len_dec_seq, src_seq, enc_output, inst_idx_to_position_map, n_bm):
60 |             ''' Decode and update beam status, and then return active beam idx '''
61 | 
62 |             def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq):
63 |                 dec_partial_seq = [b.get_current_state() for b in inst_dec_beams if not b.done]
64 |                 dec_partial_seq = torch.stack(dec_partial_seq).to(self.device)
65 |                 dec_partial_seq = dec_partial_seq.view(-1, len_dec_seq)
66 |                 return dec_partial_seq
67 | 
68 |             def prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm):
69 |                 dec_partial_pos = torch.arange(1, len_dec_seq + 1, dtype=torch.long, device=self.device)
70 |                 dec_partial_pos = dec_partial_pos.unsqueeze(0).repeat(n_active_inst * n_bm, 1)
71 |                 return dec_partial_pos
72 | 
73 |             def predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm):
74 |                 dec_output, *_ = self.model.decoder(dec_seq, dec_pos, src_seq, enc_output)
75 |                 dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
76 |                 word_prob = F.log_softmax(self.model.tgt_word_prj(dec_output), dim=1)
77 |                 word_prob = word_prob.view(n_active_inst, n_bm, -1)
78 | 
79 |                 return word_prob
80 | 
81 |             def collect_active_inst_idx_list(inst_beams, word_prob, inst_idx_to_position_map):
82 |                 active_inst_idx_list = []
83 |                 for inst_idx, inst_position in inst_idx_to_position_map.items():
84 |                     is_inst_complete = inst_beams[inst_idx].advance(word_prob[inst_position])
85 |                     if not is_inst_complete:
86 |                         active_inst_idx_list += [inst_idx]
87 | 
88 |                 return active_inst_idx_list
89 | 
90 |             n_active_inst = len(inst_idx_to_position_map)
91 | 
92 |             dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq)
93 |             dec_pos = prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm)
94 |             word_prob = predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm)
95 | 
96 |             # Update the beam with predicted word prob information and collect incomplete instances
97 |             active_inst_idx_list = collect_active_inst_idx_list(
98 |                 inst_dec_beams, word_prob, inst_idx_to_position_map)
99 | 
100 |             return active_inst_idx_list
101 | 
102 |         def collect_hypothesis_and_scores(inst_dec_beams, n_best):
103 |             all_hyp, all_scores = [], []
104 |             for inst_idx in range(len(inst_dec_beams)):
105 |                 scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores()
106 |                 all_scores += [scores[:n_best]]
107 | 
108 |                 hyps = [inst_dec_beams[inst_idx].get_hypothesis(i) for i in tail_idxs[:n_best]]
109 |                 all_hyp += [hyps]
110 |             return all_hyp, all_scores
111 | 
112 |         with torch.no_grad():
113 |             #-- Encode
114 |             src_seq, src_pos, src_seg = src_seq.to(self.device), src_pos.to(self.device), src_seg.to(self.device)
115 |             src_enc, *_ = self.model.encoder(src_seq, src_pos, src_seg)
116 | 
117 |             #-- Repeat data for beam search
118 |             n_bm = self.config["beam_size"]
119 |             n_inst, len_s, d_h = src_enc.size()
120 |             src_seq = src_seq.repeat(1, n_bm).view(n_inst * n_bm, len_s)
121 |             src_enc = src_enc.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, d_h)
122 | 
123 |             #-- Prepare beams
124 |             inst_dec_beams = [Beam(n_bm, device=self.device) for _ in range(n_inst)]
125 | 
126 |             #-- Bookkeeping for active or not
127 |             active_inst_idx_list = list(range(n_inst))
128 |             inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)
129 | 
130 |             #-- Decode
131 |             for len_dec_seq in range(1, self.config["response_len"] + 1):
132 | 
133 |                 active_inst_idx_list = beam_decode_step(
134 |                     inst_dec_beams, len_dec_seq, src_seq, src_enc, inst_idx_to_position_map, n_bm)
135 | 
136 |                 if not active_inst_idx_list:
137 |                     break  # all instances have finished their path to <EOS>
138 | 
139 |                 src_seq, src_enc, inst_idx_to_position_map = collate_active_info(
140 |                     src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list)
141 | 
142 |             batch_hyp, batch_scores = collect_hypothesis_and_scores(inst_dec_beams, self.config["n_best"])
143 | 
144 |             return batch_hyp, batch_scores
145 | 
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/transformer_tutorial.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# Tutorial: Transformer for Dialogue\n",
8 |     "This tutorial will go over the process of implementing a transformer for dialogue. \n",
9 |     "\n",
10 |     "Before running, make sure you have \"data_small\" and \"pretrained_model\" in the same directory as this file. These folders can be downloaded from this Dropbox: https://www.dropbox.com/sh/3clajk8a3gr3qde/AADInNzuRyDI7YCDVYSvo0cxa?dl=0\n",
11 |     "\n",
12 |     "The transformer is described in the paper Attention Is All You Need\n",
13 |     "(https://arxiv.org/abs/1706.03762 )\n",
14 |     "\n",
15 |     "Dataset: OpenSubtitles - http://opus.nlpl.eu/OpenSubtitles-v2018.php\n",
16 |     "\n",
17 |     "Transformer code - https://github.com/jadore801120/attention-is-all-you-need-pytorch"
18 |    ]
19 |   },
20 |   {
21 |    "cell_type": "markdown",
22 |    "metadata": {},
23 |    "source": [
24 |     "# Import libraries"
25 |    ]
26 |   },
27 |   {
28 |    "cell_type": "code",
29 |    "execution_count": null,
30 |    "metadata": {},
31 |    "outputs": [],
32 |    "source": [
33 |     "import numpy as np\n",
34 |     "import json\n",
35 |     "import torch\n",
36 |     "import torch.nn.functional as F\n",
37 |     "import os\n",
38 |     "import random\n",
39 |     "from tqdm import tqdm\n",
40 |     "import ipywidgets as widgets\n",
41 |     "\n",
42 |     "import transformer\n",
43 |     "from transformer.Models import Transformer\n",
44 |     "from transformer.Translator import Chatbot\n",
45 |     "from dataset import DialogueDataset, Vocab"
46 |    ]
47 |   },
48 |   {
49 |    "cell_type": "markdown",
50 |    "metadata": {},
51 |    "source": [
52 |     "# Load config \n",
53 |     "\n",
54 |     "\n", "Now, load the config file. This file contains all of the hyperparameters for the experiment.\n",
55 |     "\n",
56 |     "If you want to change the parameters, change them in the config.json file."
57 |    ]
58 |   },
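For orientation, config.json needs to define at least the keys this notebook reads later. A minimal sketch of its contents, written as a Python dict; only the key names are taken from this notebook, while the values are illustrative guesses, not the shipped defaults:

```python
# Hypothetical config values -- check the shipped config.json for the real ones.
config_example = {
    "device": "cuda",                  # or "cpu"
    "output_dir": "output",
    "dataset_filename": "data_small",  # folder containing train.csv / val.csv
    "history_len": 100, "response_len": 30,
    "train_batch_size": 64, "val_batch_size": 64,
    "embedding_dim": 512, "model_dim": 512, "inner_dim": 2048,
    "num_layers": 6, "num_heads": 8, "dim_k": 64, "dim_v": 64,
    "dropout": 0.1, "warmup_steps": 4000, "label_smoothing": True,
    "num_epochs": 10, "old_model_dir": None,
    "beam_size": 5, "n_best": 5, "choose_best": False,
}
```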
\n", 55 | "\n", 56 | "If you want to change the parameters, change them in the config.json file" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# load config\n", 66 | "with open(\"config.json\", \"r\") as f:\n", 67 | " config = json.load(f)\n", 68 | "\n", 69 | "for key, data in config.items():\n", 70 | " print(\"{}: {}\".format(key, data))" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# create output dir to save model, and results in\n", 80 | "if not os.path.exists(config[\"output_dir\"]):\n", 81 | " os.mkdir(config[\"output_dir\"])" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "# Load Data\n", 89 | "\n", 90 | "Next we will create our training and validation dataset objects.\n", 91 | "\n", 92 | "The dataset takes the dataset filename, the max length for the history, and the max length for the response. you can initialize the vocab with an already existing vocab object by passing the vocab object. There is also a setting to not update the vocab with the new documents-this is useful for running pretrianed models where you need to have the same vocab as the old model.\n", 93 | "\n", 94 | "We want the 2 datasets to have the same vocab, so the validation dataset will be initialized with the trianing vocab, and the updated vocab from the val dataset is set to the train dataset." 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "scrolled": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# create train dataset\n", 106 | "train_dataset = DialogueDataset(\n", 107 | " os.path.join(config[\"dataset_filename\"], \"train.csv\"),\n", 108 | " config[\"history_len\"],\n", 109 | " config[\"response_len\"])\n", 110 | "\n", 111 | "# creat validation dataset\n", 112 | "val_dataset = DialogueDataset(\n", 113 | " os.path.join(config[\"dataset_filename\"], \"val.csv\"),\n", 114 | " config[\"history_len\"],\n", 115 | " config[\"response_len\"],\n", 116 | " train_dataset.vocab)\n", 117 | "\n", 118 | "# set vocab:\n", 119 | "vocab = val_dataset.vocab\n", 120 | "train_dataset.vocab = vocab\n", 121 | "config[\"vocab_size\"] = len(vocab)\n", 122 | "vocab.save_to_dict(os.path.join(config[\"output_dir\"], \"vocab.json\"))\n", 123 | "\n", 124 | "# print info\n", 125 | "print(\"train_len: {}\\nval_len: {}\\nvocab_size: {}\".format(len(train_dataset), len(val_dataset), len(vocab)))" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "Dataloaders for the model are initialized with the datasets\n", 133 | "\n", 134 | "We want to shuffle the train dataset, but it does not matter for validation" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# initialize dataloaders\n", 144 | "data_loader_train = torch.utils.data.DataLoader(\n", 145 | " train_dataset, config[\"train_batch_size\"], shuffle=True)\n", 146 | "data_loader_val = torch.utils.data.DataLoader(\n", 147 | " val_dataset, config[\"val_batch_size\"], shuffle=False)\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "# Create Model\n", 155 | "The transformer model is initialized with the parameters in the config file. 
150 |   {
151 |    "cell_type": "markdown",
152 |    "metadata": {},
153 |    "source": [
154 |     "# Create Model\n",
155 |     "The transformer model is initialized with the parameters in the config file. You can change these parameters to improve the model."
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "# initialize device ('cuda', or 'cpu')\n",
165 |     "device = torch.device(config[\"device\"])"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": null,
171 |    "metadata": {},
172 |    "outputs": [],
173 |    "source": [
174 |     "# create model\n",
175 |     "model = Transformer(\n",
176 |     "    config[\"vocab_size\"],\n",
177 |     "    config[\"vocab_size\"],\n",
178 |     "    config[\"history_len\"],\n",
179 |     "    config[\"response_len\"],\n",
180 |     "    d_word_vec=config[\"embedding_dim\"],\n",
181 |     "    d_model=config[\"model_dim\"],\n",
182 |     "    d_inner=config[\"inner_dim\"],\n",
183 |     "    n_layers=config[\"num_layers\"],\n",
184 |     "    n_head=config[\"num_heads\"],\n",
185 |     "    d_k=config[\"dim_k\"],\n",
186 |     "    d_v=config[\"dim_v\"],\n",
187 |     "    dropout=config[\"dropout\"]\n",
188 |     ").to(device)"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "markdown",
193 |    "metadata": {},
194 |    "source": [
195 |     "# Create Optimizer\n",
196 |     "\n",
197 |     "The transformer paper updates the learning rate during training: it warms up linearly for the first warmup steps, then decays with the inverse square root of the step number. To do this, we will make a scheduled optimizer wrapper class.\n",
198 |     "\n",
199 |     "We use an Adam optimizer."
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": null,
205 |    "metadata": {},
206 |    "outputs": [],
207 |    "source": [
208 |     "# optimizer class for updating the learning rate\n",
209 |     "class ScheduledOptim():\n",
210 |     "    '''A simple wrapper class for learning rate scheduling'''\n",
211 |     "\n",
212 |     "    def __init__(self, optimizer, d_model, n_warmup_steps):\n",
213 |     "        self.optimizer = optimizer\n",
214 |     "        self.n_warmup_steps = n_warmup_steps\n",
215 |     "        self.n_current_steps = 0\n",
216 |     "        self.init_lr = np.power(d_model, -0.5)\n",
217 |     "\n",
218 |     "    def step_and_update_lr(self):\n",
219 |     "        \"Step with the inner optimizer\"\n",
220 |     "        self._update_learning_rate()\n",
221 |     "        self.optimizer.step()\n",
222 |     "\n",
223 |     "    def zero_grad(self):\n",
224 |     "        \"Zero out the gradients with the inner optimizer\"\n",
225 |     "        self.optimizer.zero_grad()\n",
226 |     "\n",
227 |     "    def _get_lr_scale(self):\n",
228 |     "        return np.min([\n",
229 |     "            np.power(self.n_current_steps, -0.5),\n",
230 |     "            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])\n",
231 |     "\n",
232 |     "    def _update_learning_rate(self):\n",
233 |     "        ''' Learning rate scheduling per step '''\n",
234 |     "\n",
235 |     "        self.n_current_steps += 1\n",
236 |     "        lr = self.init_lr * self._get_lr_scale()\n",
237 |     "\n",
238 |     "        for param_group in self.optimizer.param_groups:\n",
239 |     "            param_group['lr'] = lr\n"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": null,
245 |    "metadata": {},
246 |    "outputs": [],
247 |    "source": [
248 |     "# create optimizer\n",
249 |     "optimizer = torch.optim.Adam(\n",
250 |     "    filter(lambda x: x.requires_grad, model.parameters()),\n",
251 |     "    betas=(0.9, 0.98), eps=1e-09)\n",
252 |     "# create a scheduled optimizer object\n",
253 |     "optimizer = ScheduledOptim(\n",
254 |     "    optimizer, config[\"model_dim\"], config[\"warmup_steps\"])"
255 |    ]
256 |   },
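The schedule implemented by ScheduledOptim is lr = d_model^(-0.5) * min(step^(-0.5), step * warmup^(-1.5)): linear warmup followed by inverse-square-root decay. A small standalone sketch that prints the curve at a few steps (the d_model and warmup values here are made up, not necessarily the config's):

```python
# Standalone illustration of the ScheduledOptim learning-rate curve.
d_model, warmup = 512, 4000  # illustrative values
for step in [1, 1000, 4000, 16000, 64000]:
    lr = d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)
    print(f"step {step:>6}: lr = {lr:.2e}")  # rises until step == warmup, then decays
```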
257 |   {
258 |    "cell_type": "markdown",
259 |    "metadata": {},
260 |    "source": [
261 |     "# Load Pretrained Model\n",
262 |     "If you want to run a pretrained model, change \"old_model_dir\" from None to the directory containing the pretrained model.\n",
263 |     "\n",
264 |     "You must have the same vocab as the old model, so that is loaded as well."
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "code",
269 |    "execution_count": null,
270 |    "metadata": {},
271 |    "outputs": [],
272 |    "source": [
273 |     "def save_checkpoint(filename, model, optimizer):\n",
274 |     "    '''\n",
275 |     "    saves the model into a state dict, along with its training statistics\n",
276 |     "    and parameters\n",
277 |     "    :param filename: path to save the checkpoint to\n",
278 |     "    :param model:\n",
279 |     "    :param optimizer:\n",
280 |     "    '''\n",
281 |     "    state = {\n",
282 |     "        'model': model.state_dict(),\n",
283 |     "        'optimizer': optimizer.state_dict(),\n",
284 |     "    }\n",
285 |     "    torch.save(state, filename)"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": null,
291 |    "metadata": {},
292 |    "outputs": [],
293 |    "source": [
294 |     "def load_checkpoint(filename, model, optimizer, device):\n",
295 |     "    '''\n",
296 |     "    loads a previous model\n",
297 |     "    :param filename: file name of the model\n",
298 |     "    :param model: model that has the same parameters as the one you are loading\n",
299 |     "    :param optimizer:\n",
300 |     "    :return: loaded model and optimizer (unchanged if the file does not exist)\n",
301 |     "    '''\n",
302 |     "    if os.path.isfile(filename):\n",
303 |     "        checkpoint = torch.load(filename, map_location=device)\n",
304 |     "        model.load_state_dict(checkpoint['model'])\n",
305 |     "        optimizer.load_state_dict(checkpoint['optimizer'])\n",
306 |     "    return model, optimizer\n"
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "code",
311 |    "execution_count": null,
312 |    "metadata": {},
313 |    "outputs": [],
314 |    "source": [
315 |     "if config[\"old_model_dir\"] is not None:\n",
316 |     "    model, optimizer.optimizer = load_checkpoint(os.path.join(config[\"old_model_dir\"], \"model.bin\"),\n",
317 |     "                                                 model, optimizer.optimizer, device)\n",
318 |     "    vocab.load_from_dict(os.path.join(config[\"old_model_dir\"], \"vocab.json\"))"
319 |    ]
320 |   },
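A minimal sketch of how the two helpers pair up; the model.bin filename follows the convention used for the pretrained model above, so adjust it if you save elsewhere:

```python
# Illustrative round trip: save a checkpoint, then restore it.
ckpt_path = os.path.join(config["output_dir"], "model.bin")
save_checkpoint(ckpt_path, model, optimizer.optimizer)
model, optimizer.optimizer = load_checkpoint(
    ckpt_path, model, optimizer.optimizer, device)
```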
321 |   {
322 |    "cell_type": "markdown",
323 |    "metadata": {},
324 |    "source": [
325 |     "# Output an Example\n",
326 |     "Sometimes it is useful to see what the model is doing, so we will create a function that outputs an example from the validation set, along with the prediction from the model."
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": null,
332 |    "metadata": {},
333 |    "outputs": [],
334 |    "source": [
335 |     "def output_example(model, val_dataset, device, vocab):\n",
336 |     "    '''output an example and the model's prediction for that example'''\n",
337 |     "    random_index = random.randint(0, len(val_dataset) - 1)  # randint is inclusive on both ends\n",
338 |     "    example = val_dataset[random_index]\n",
339 |     "\n",
340 |     "    # prepare data\n",
341 |     "    h_seq, h_pos, h_seg, r_seq, r_pos = map(\n",
342 |     "        lambda x: torch.from_numpy(x).to(device).unsqueeze(0), example)\n",
343 |     "\n",
344 |     "    # drop the first (start) token from the target: the model predicts the next token at each step\n",
345 |     "    gold = r_seq[:, 1:]\n",
346 |     "\n",
347 |     "    # forward\n",
348 |     "    pred = model(h_seq, h_pos, h_seg, r_seq, r_pos)\n",
349 |     "    output = torch.argmax(pred, dim=1)\n",
350 |     "\n",
351 |     "    # get history text\n",
352 |     "    string = \"history: \"\n",
353 |     "    seg = -1\n",
354 |     "    for i, idx in enumerate(h_seg.squeeze()):\n",
355 |     "        if seg != idx.item():\n",
356 |     "            string += \"\\n\"\n",
357 |     "            seg = idx.item()\n",
358 |     "        token = vocab.id2token[h_seq.squeeze()[i].item()]\n",
359 |     "        if token != '':\n",
360 |     "            string += \"{} \".format(token)\n",
361 |     "\n",
362 |     "    # get target text\n",
363 |     "    string += \"\\nTarget:\\n\"\n",
364 |     "    for idx in gold.squeeze():\n",
365 |     "        token = vocab.id2token[idx.item()]\n",
366 |     "        string += \"{} \".format(token)\n",
367 |     "\n",
368 |     "    # get prediction\n",
369 |     "    string += \"\\n\\nPrediction:\\n\"\n",
370 |     "    for idx in output:\n",
371 |     "        token = vocab.id2token[idx.item()]\n",
372 |     "        string += \"{} \".format(token)\n",
373 |     "\n",
374 |     "    # print\n",
375 |     "    print(\"\\n------------------------\\n\")\n",
376 |     "    print(string)\n",
377 |     "    print(\"\\n------------------------\\n\")"
378 |    ]
379 |   },
380 |   {
381 |    "cell_type": "markdown",
382 |    "metadata": {},
383 |    "source": [
384 |     "# Calculate Performance\n",
385 |     "\n",
386 |     "First calculate the loss, with or without smoothing.\n",
387 |     "\n",
388 |     "In Attention Is All You Need, label smoothing is applied to the loss function. This makes the model more \"unsure\", which improves accuracy, but it hurts perplexity, since the smoothed targets raise the cross-entropy that perplexity is computed from.\n",
389 |     "\n",
390 |     "Then calculate the number of correctly predicted tokens, to compute accuracy later."
391 |    ]
392 |   },
393 |   {
394 |    "cell_type": "code",
395 |    "execution_count": null,
396 |    "metadata": {},
397 |    "outputs": [],
398 |    "source": [
399 |     "def cal_performance(pred, gold, smoothing=False):\n",
400 |     "    ''' Apply label smoothing if needed '''\n",
401 |     "\n",
402 |     "    loss = cal_loss(pred, gold, smoothing)\n",
403 |     "\n",
404 |     "    pred = pred.max(1)[1]\n",
405 |     "    gold = gold.contiguous().view(-1)\n",
406 |     "    non_pad_mask = gold.ne(transformer.Constants.PAD)\n",
407 |     "    # eq computes element-wise equality\n",
408 |     "    n_correct = pred.eq(gold)\n",
409 |     "    n_correct = n_correct.masked_select(non_pad_mask).sum().item()\n",
410 |     "\n",
411 |     "    return loss, n_correct"
412 |    ]
413 |   },
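To see concretely what the smoothing in cal_loss below does to a one-hot target: with eps = 0.1, the true class keeps 1 - eps of the probability mass, and the rest is spread evenly over the other classes. A tiny worked example with 5 classes:

```python
# Worked example of the label-smoothing formula used in cal_loss below.
eps, n_class, true_class = 0.1, 5, 2
smoothed = [(1 - eps) if c == true_class else eps / (n_class - 1)
            for c in range(n_class)]
print(smoothed)       # [0.025, 0.025, 0.9, 0.025, 0.025]
print(sum(smoothed))  # 1.0 -- still a valid probability distribution
```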
'''\n", 422 | "\n", 423 | " gold = gold.contiguous().view(-1)\n", 424 | "\n", 425 | " if smoothing:\n", 426 | " eps = 0.1\n", 427 | " n_class = pred.size(1)\n", 428 | "\n", 429 | " one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1)\n", 430 | " one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)\n", 431 | " log_prb = F.log_softmax(pred, dim=1)\n", 432 | "\n", 433 | " non_pad_mask = gold.ne(transformer.Constants.PAD)\n", 434 | " loss = -(one_hot * log_prb).sum(dim=1)\n", 435 | " #loss = loss.masked_select(non_pad_mask).sum() # average later\n", 436 | " loss = loss.masked_select(non_pad_mask).mean()\n", 437 | " else:\n", 438 | " loss = F.cross_entropy(pred, gold, ignore_index=transformer.Constants.PAD, reduction='mean')\n", 439 | " return loss" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "# Forward Pass\n", 447 | "First prepares the inputs by sending the features to the respective device\n", 448 | "-src_seq: input word encodings\n", 449 | "-src_pos: input positional encodings\n", 450 | "-src_seg: input sequence encodings, for the turns in dialogue history\n", 451 | "-tgt_seq: target word encodings\n", 452 | "-tgt_pos: target positional encodings\n", 453 | "\n", 454 | "gold is the target but without the CLS token at the begining\n", 455 | "\n", 456 | "If you are training, you want to clear the gradients before getting the output" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "# forward\n", 466 | "def forward(phase, batch, model, optimizer):\n", 467 | " h_seq, h_pos, h_seg, r_seq, r_pos = map(\n", 468 | " lambda x: x.to(device), batch)\n", 469 | "\n", 470 | " gold = r_seq[:, 1:]\n", 471 | "\n", 472 | " # forward\n", 473 | " if phase == \"train\":\n", 474 | " optimizer.zero_grad()\n", 475 | " pred = model(h_seq, h_pos, h_seg, r_seq, r_pos)\n", 476 | " \n", 477 | " return pred, gold\n", 478 | " " 479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": {}, 484 | "source": [ 485 | "# Backward Pass\n", 486 | "The backward pass computes the loss, and updates the models parameters if it is training\n", 487 | "\n", 488 | "returns the loss, and the number of correct outputs" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "# backward\n", 498 | "def backward(phase, pred, gold, config):\n", 499 | " # get loss\n", 500 | " loss, n_correct = cal_performance(pred, gold,\n", 501 | " smoothing=config[\"label_smoothing\"])\n", 502 | " \n", 503 | " if phase == \"train\":\n", 504 | " # backward\n", 505 | " loss.backward()\n", 506 | "\n", 507 | " # update parameters, and learning rate\n", 508 | " optimizer.step_and_update_lr()\n", 509 | "\n", 510 | " return float(loss), n_correct" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "# Training Loop\n", 518 | "For every epoch, the loop runs training and evaluation.\n", 519 | "\n", 520 | "Setting the model to eval mode vs training mode disables things like dropout layers, and other things you do not want during evaluation\n", 521 | "\n", 522 | "Metrics are initialized, and saved to the output file\n", 523 | "\n", 524 | "after running validation, we want to save the weights of the model only if the validation loss is lower than it has been before. This means we will only save the best model." 
527 |   {
528 |    "cell_type": "markdown",
529 |    "metadata": {},
530 |    "source": [
531 |     "The next step before running training is to initialize a dictionary for the results of training. It is important to be organized with experiment results.\n",
532 |     "\n",
533 |     "We want to save the weights of the model only when the validation loss is lower than it has been before, so the lowest loss is initialized to an arbitrarily large number. If the validation loss is lower than the lowest loss, save the weights and set the lowest loss to the validation loss."
534 |    ]
535 |   },
536 |   {
537 |    "cell_type": "code",
538 |    "execution_count": null,
539 |    "metadata": {},
540 |    "outputs": [],
541 |    "source": [
542 |     "# initialize results, add config to them\n",
543 |     "results = dict()\n",
544 |     "results[\"config\"] = config\n",
545 |     "\n",
546 |     "# initialize lowest validation loss, used to decide when to save weights\n",
547 |     "lowest_loss = 999"
548 |    ]
549 |   },
550 |   {
551 |    "cell_type": "code",
552 |    "execution_count": null,
553 |    "metadata": {},
554 |    "outputs": [],
555 |    "source": [
556 |     "import time  # needed for the phase timing below\n", "# begin training\n",
557 |     "for epoch in range(config[\"num_epochs\"]):\n",
558 |     "    epoch_metrics = dict()\n",
559 |     "    # output an example\n",
560 |     "    output_example(model, val_dataset, device, vocab)\n",
561 |     "    # run each phase per epoch\n",
562 |     "    for phase in [\"train\", \"val\"]:\n",
563 |     "        if phase == \"train\":\n",
564 |     "            # set model to training mode\n",
565 |     "            model.train()\n",
566 |     "            dataloader = data_loader_train\n",
567 |     "            batch_size = config[\"train_batch_size\"]\n",
568 |     "        else:\n",
569 |     "            # set model to evaluation mode\n",
570 |     "            model.eval()\n",
571 |     "            dataloader = data_loader_val\n",
572 |     "            batch_size = config[\"val_batch_size\"]\n",
573 |     "\n", "        start = time.time()  # start the phase timer\n",
574 |     "        # initialize metrics\n",
575 |     "        phase_metrics = dict()\n",
576 |     "        epoch_loss = list()\n",
577 |     "        average_epoch_loss = None\n",
578 |     "        n_word_total = 0\n",
579 |     "        n_correct = 0\n",
580 |     "        n_word_correct = 0\n",
581 |     "        for i, batch in enumerate(tqdm(dataloader, mininterval=2, desc=phase, leave=False)):\n",
582 |     "            # forward\n",
583 |     "            pred, gold = forward(phase, batch, model, optimizer)\n",
584 |     "            # backward\n",
585 |     "            loss, n_correct = backward(phase, pred, gold, config)\n",
586 |     "\n",
587 |     "            # record loss\n",
588 |     "            epoch_loss.append(loss)\n",
589 |     "            average_epoch_loss = np.mean(epoch_loss)\n",
590 |     "\n",
591 |     "            # get accuracy\n",
592 |     "            non_pad_mask = gold.ne(transformer.Constants.PAD)\n",
593 |     "            n_word = non_pad_mask.sum().item()\n",
594 |     "            n_word_total += n_word\n",
595 |     "            n_word_correct += n_correct\n",
596 |     "\n",
597 |     "        # record metrics\n",
598 |     "        phase_metrics[\"loss\"] = average_epoch_loss\n",
599 |     "        phase_metrics[\"token_accuracy\"] = n_word_correct / n_word_total\n",
600 |     "\n",
601 |     "        # get perplexity\n",
602 |     "        perplexity = np.exp(average_epoch_loss)\n",
603 |     "        phase_metrics[\"perplexity\"] = perplexity\n",
604 |     "\n",
605 |     "        phase_metrics[\"time_taken\"] = time.time() - start  # elapsed wall-clock time for this phase\n",
606 |     "\n",
607 |     "        epoch_metrics[phase] = phase_metrics\n",
608 |     "\n",
609 |     "        # save model to the output dir if val loss is lower than in any previous epoch\n",
610 |     "        if phase == \"val\":\n",
611 |     "            if average_epoch_loss <= lowest_loss:\n",
612 |     "                save_checkpoint(os.path.join(config[\"output_dir\"], \"model.bin\"), model, optimizer.optimizer)\n",
613 |     "                lowest_loss = average_epoch_loss\n",
614 |     "\n",
615 |     "    results[\"epoch_{}\".format(epoch)] = epoch_metrics"
616 |    ]
617 |   },
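Since the perplexity recorded above is just the exponential of the average per-token cross-entropy, small changes in loss move it a lot. A one-line check:

```python
import numpy as np
print(np.exp(4.0), np.exp(3.5))  # ~54.6 vs ~33.1 -- half a nat of loss matters
```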
"outputs": [], 623 | "source": [ 624 | "# save results to file\n", 625 | "with open(os.path.join(config[\"output_dir\"], \"results.json\"), 'w') as f:\n", 626 | " json.dump(results, f)" 627 | ] 628 | }, 629 | { 630 | "cell_type": "markdown", 631 | "metadata": {}, 632 | "source": [ 633 | "# Chat With Your Model\n", 634 | "\n", 635 | "Next, we can make a demo chatbot with the transformer. This is slightly different, and will use beam search. The inputs to the chatbot will be all the previous dialogue turns, the queries and responses. \n", 636 | "\n", 637 | "The chatbot does a beam search, and returns the n_best responses. If chose_best is true, it will output the response with the highest score. This may cause the model to be not interesting, so setting chose_best to false will cause the model to output something it may consider less probable, but possibly something different.\n", 638 | "\n", 639 | "The pretrained model will also output many tokens because it was trained on a large dataset with a small vocab, so many examples have these tokens, and it will predict them. (You can come up a word to replace the token in your head to make things more fun for yourself). You can also increase the number of possible results with beam_size, and n_best.\n", 640 | "\n", 641 | "With the vocab mapping, it creates the output sentence from the final result" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "metadata": {}, 648 | "outputs": [], 649 | "source": [ 650 | "# create chatbot object\n", 651 | "chatbot = Chatbot(config, model)\n", 652 | "history = list()\n", 653 | "\n", 654 | "def generate_response(query, chatbot, dataset):\n", 655 | " # get input features for the dialogue history\n", 656 | " h_seq, h_pos, h_seg = dataset.get_input_features(history)\n", 657 | " \n", 658 | " # get response from model\n", 659 | " response = chatbot.translate_batch(h_seq, h_pos, h_seg)\n", 660 | " return response\n", 661 | "\n", 662 | "# print the response from the input\n", 663 | "def print_response(text_widget):\n", 664 | " # get query, add to the end of history \n", 665 | " query = text_widget.value\n", 666 | " history.append(query)\n", 667 | " # generate responses\n", 668 | " responses, scores = generate_response(history, chatbot, val_dataset)\n", 669 | " # chose response\n", 670 | " if config[\"choose_best\"]:\n", 671 | " response = responses[0][0]\n", 672 | " else:\n", 673 | " # pick a random result from the n_best\n", 674 | " idx=random.randint(0, max(config[\"n_best\"], config[\"beam_size\"])-1)\n", 675 | " response = responses[0][idx]\n", 676 | " \n", 677 | " # uncomment this line to see all the scores\n", 678 | " # print(\"scores in log prob: {}\\n\".format(scores[0]))\n", 679 | " \n", 680 | " # create output string\n", 681 | " output = \"\"\n", 682 | " for idx in response[:-1]:\n", 683 | " token = vocab.id2token[idx]\n", 684 | " output += \"{} \".format(token)\n", 685 | " print(f'{query} -> {output}')\n", 686 | " history.append(output)\n", 687 | "\n", 688 | "text_input = widgets.Text(placeholder='Type something',\n", 689 | " description='String:',\n", 690 | " disabled=False)\n", 691 | "\n", 692 | "text_input.on_submit(print_response)\n" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [ 701 | "text_input" 702 | ] 703 | }, 704 | { 705 | "cell_type": "markdown", 706 | "metadata": {}, 707 | "source": [ 708 | "# Extra exercise 1: Sample results according to their probability 
scores\n", 709 | "Set config[\"choose_best\"] to be False and uncomment the command to print scores. You should see different scores associated with the responses. They are all negative numbers because log probablity values are used.\n", 710 | "\n", 711 | "### Can you modify the function again, so the results are sampled according to their probability scores?\n", 712 | "Hint: check here for a function to use https://pytorch.org/docs/stable/_modules/torch/distributions/categorical.html\n", 713 | "\n", 714 | "In fact it is easy to write your own sampling function too. e.g. Suppose we have a random variable X with P(X=a) = 0.6 and P(X=b) = 0.4. To sample from X, we can randomly draw a number r between [0, 1]. If r < 0.6, we pick a as our outcome, otherwise pick b.\n", 715 | "\n", 716 | "In our case, you need to convert the log prob scores in probability space using exp(), normalize them (so they sum to 1) and them construct the intervals." 717 | ] 718 | }, 719 | { 720 | "cell_type": "markdown", 721 | "metadata": {}, 722 | "source": [ 723 | "# Extra exercise 2: Rewrite the Position-wise Feed-Forward Network\n", 724 | "The Transformer has a Position-wise Feed-Forward Network in each encoder and decoder layer. The source code can be found in transformer/SubLayers.py" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": null, 730 | "metadata": {}, 731 | "outputs": [], 732 | "source": [ 733 | "import torch.nn as nn\n", 734 | "\n", 735 | "class PositionwiseFeedForward(nn.Module):\n", 736 | " ''' A two-feed-forward-layer module '''\n", 737 | "\n", 738 | " def __init__(self, d_in, d_hid, dropout=0.1):\n", 739 | " super().__init__()\n", 740 | " self.w_1 = nn.Conv1d(d_in, d_hid, 1) # position-wise\n", 741 | " self.w_2 = nn.Conv1d(d_hid, d_in, 1) # position-wise\n", 742 | " self.layer_norm = nn.LayerNorm(d_in)\n", 743 | " self.dropout = nn.Dropout(dropout)\n", 744 | "\n", 745 | " def forward(self, x):\n", 746 | " \"\"\"\n", 747 | " just a feed forward linear layer that is used after attention in the\n", 748 | " encoder and decoder\n", 749 | " Args:\n", 750 | " x: input\n", 751 | " Returns:\n", 752 | " \"\"\"\n", 753 | " # feed forward\n", 754 | " residual = x\n", 755 | " output = x.transpose(1, 2)\n", 756 | " output = self.w_2(F.relu(self.w_1(output)))\n", 757 | " output = output.transpose(1, 2)\n", 758 | " output = self.dropout(output)\n", 759 | "\n", 760 | " # Add and norm\n", 761 | " output = self.layer_norm(output + residual)\n", 762 | " return output" 763 | ] 764 | }, 765 | { 766 | "cell_type": "markdown", 767 | "metadata": {}, 768 | "source": [ 769 | "### Answer questions:\n", 770 | "1. What is the purpose of x.transpose(1, 2)? \n", 771 | "2. If we do not use nn.Conv1d(), can you achieve the same goal with nn.Linear()?" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": null, 777 | "metadata": {}, 778 | "outputs": [], 779 | "source": [] 780 | } 781 | ], 782 | "metadata": { 783 | "kernelspec": { 784 | "display_name": "Python 3", 785 | "language": "python", 786 | "name": "python3" 787 | }, 788 | "language_info": { 789 | "codemirror_mode": { 790 | "name": "ipython", 791 | "version": 3 792 | }, 793 | "file_extension": ".py", 794 | "mimetype": "text/x-python", 795 | "name": "python", 796 | "nbconvert_exporter": "python", 797 | "pygments_lexer": "ipython3", 798 | "version": "3.6.7" 799 | } 800 | }, 801 | "nbformat": 4, 802 | "nbformat_minor": 2 803 | } 804 | --------------------------------------------------------------------------------