├── .gitignore ├── README.md ├── lectures ├── Alexander Rush. Conditional Text Generation and Pretraining.pdf ├── Joao Sedoc. Evaluating Conversational Agents.pdf └── README.md ├── project_ideas.md ├── remote_jupyter.md └── tutorials ├── README.md ├── deeppavlov_track ├── README.md ├── Tutorial_1_Introduction_to_Tensorflow.ipynb ├── Tutorial_2_Sentence_classification_with_word_embeddings.ipynb ├── Tutorial_Day_2_seq2seq.ipynb ├── Tutorial_Day_3_Fine_Tuning_BERT.ipynb ├── Tutorial_Day_4_Transformer_BERT_text_generation.ipynb ├── Tutorial_Day_5_Serving_with_DeepPavlov.ipynb └── img │ ├── beam_search_vs_human.png │ ├── bert_ner_diagram.png │ ├── decoding.png │ └── seq2seq_training.png └── pytorch_track ├── tutorial1_intro_pytorch.ipynb ├── tutorial2_sentnece_classification.ipynb ├── tutorial3_seq2seq_dialog.ipynb ├── tutorial4_finetuning_bert.ipynb ├── tutorial5_serving_models.py ├── tutorial5_telegram.ipynb └── tutorial_6 ├── config.json ├── dataset.py ├── requirements.txt ├── transformer ├── Beam.py ├── Constants.py ├── Layers.py ├── Models.py ├── Modules.py ├── Optim.py ├── SubLayers.py └── Translator.py └── transformer_tutorial.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Linux template 3 | *~ 4 | 5 | # temporary files which can be created if a process still has a handle open of a deleted file 6 | .fuse_hidden* 7 | 8 | # KDE directory preferences 9 | .directory 10 | 11 | # Linux trash folder which might appear on any partition or disk 12 | .Trash-* 13 | 14 | # .nfs files are created when an open file is removed but is still being accessed 15 | .nfs* 16 | ### macOS template 17 | # General 18 | .DS_Store 19 | .AppleDouble 20 | .LSOverride 21 | 22 | # Icon must end with two \r 23 | Icon 24 | 25 | # Thumbnails 26 | ._* 27 | 28 | # Files that might appear in the root of a volume 29 | .DocumentRevisions-V100 30 | .fseventsd 31 | .Spotlight-V100 32 | .TemporaryItems 33 | .Trashes 34 | .VolumeIcon.icns 35 | .com.apple.timemachine.donotpresent 36 | 37 | # Directories potentially created on remote AFP share 38 | .AppleDB 39 | .AppleDesktop 40 | Network Trash Folder 41 | Temporary Items 42 | .apdisk 43 | ### JetBrains template 44 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 45 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 46 | 47 | # User-specific stuff 48 | .idea/**/workspace.xml 49 | .idea/**/tasks.xml 50 | .idea/**/usage.statistics.xml 51 | .idea/**/dictionaries 52 | .idea/**/shelf 53 | 54 | # Sensitive or high-churn files 55 | .idea/**/dataSources/ 56 | .idea/**/dataSources.ids 57 | .idea/**/dataSources.local.xml 58 | .idea/**/sqlDataSources.xml 59 | .idea/**/dynamic.xml 60 | .idea/**/uiDesigner.xml 61 | .idea/**/dbnavigator.xml 62 | 63 | # Gradle 64 | .idea/**/gradle.xml 65 | .idea/**/libraries 66 | 67 | # Gradle and Maven with auto-import 68 | # When using Gradle or Maven with auto-import, you should exclude module files, 69 | # since they will be recreated, and may cause churn. Uncomment if using 70 | # auto-import. 
71 | # .idea/modules.xml 72 | # .idea/*.iml 73 | # .idea/modules 74 | 75 | # CMake 76 | cmake-build-*/ 77 | 78 | # Mongo Explorer plugin 79 | .idea/**/mongoSettings.xml 80 | 81 | # File-based project format 82 | *.iws 83 | 84 | # IntelliJ 85 | out/ 86 | 87 | # mpeltonen/sbt-idea plugin 88 | .idea_modules/ 89 | 90 | # JIRA plugin 91 | atlassian-ide-plugin.xml 92 | 93 | # Cursive Clojure plugin 94 | .idea/replstate.xml 95 | 96 | # Crashlytics plugin (for Android Studio and IntelliJ) 97 | com_crashlytics_export_strings.xml 98 | crashlytics.properties 99 | crashlytics-build.properties 100 | fabric.properties 101 | 102 | # Editor-based Rest Client 103 | .idea/httpRequests 104 | ### Python template 105 | # Byte-compiled / optimized / DLL files 106 | __pycache__/ 107 | *.py[cod] 108 | *$py.class 109 | 110 | # C extensions 111 | *.so 112 | 113 | # Distribution / packaging 114 | .Python 115 | build/ 116 | develop-eggs/ 117 | dist/ 118 | downloads/ 119 | eggs/ 120 | .eggs/ 121 | lib/ 122 | lib64/ 123 | parts/ 124 | sdist/ 125 | var/ 126 | wheels/ 127 | *.egg-info/ 128 | .installed.cfg 129 | *.egg 130 | MANIFEST 131 | 132 | # PyInstaller 133 | # Usually these files are written by a python script from a template 134 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 135 | *.manifest 136 | *.spec 137 | 138 | # Installer logs 139 | pip-log.txt 140 | pip-delete-this-directory.txt 141 | 142 | # Unit test / coverage reports 143 | htmlcov/ 144 | .tox/ 145 | .coverage 146 | .coverage.* 147 | .cache 148 | nosetests.xml 149 | coverage.xml 150 | *.cover 151 | .hypothesis/ 152 | .pytest_cache/ 153 | 154 | # Translations 155 | *.mo 156 | *.pot 157 | 158 | # Django stuff: 159 | *.log 160 | local_settings.py 161 | db.sqlite3 162 | 163 | # Flask stuff: 164 | instance/ 165 | .webassets-cache 166 | 167 | # Scrapy stuff: 168 | .scrapy 169 | 170 | # Sphinx documentation 171 | docs/_build/ 172 | 173 | # PyBuilder 174 | target/ 175 | 176 | # Jupyter Notebook 177 | .ipynb_checkpoints 178 | 179 | # pyenv 180 | .python-version 181 | 182 | # celery beat schedule file 183 | celerybeat-schedule 184 | 185 | # SageMath parsed files 186 | *.sage.py 187 | 188 | # Environments 189 | .env 190 | .venv 191 | env/ 192 | venv/ 193 | ENV/ 194 | env.bak/ 195 | venv.bak/ 196 | 197 | # Spyder project settings 198 | .spyderproject 199 | .spyproject 200 | 201 | # Rope project settings 202 | .ropeproject 203 | 204 | # mkdocs documentation 205 | /site 206 | 207 | # mypy 208 | .mypy_cache/ 209 | 210 | 211 | data/ 212 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | CISS 2019 Materials 2 | =================== 3 | 4 | Lectures: 5 | * [lectures/](lectures/) 6 | 7 | Tutorials: 8 | * [tutorials/](tutorials/) 9 | 10 | Materials: 11 | * [Project Ideas](project_ideas.md) 12 | 13 | 14 | -------------------------------------------------------------------------------- /lectures/Alexander Rush. Conditional Text Generation and Pretraining.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/text-machine-lab/ciss2_materials/0f2423eac4c4cbdb1c3e19452203b7d6f207b1dc/lectures/Alexander Rush. Conditional Text Generation and Pretraining.pdf -------------------------------------------------------------------------------- /lectures/Joao Sedoc. 
Evaluating Conversational Agents.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/text-machine-lab/ciss2_materials/0f2423eac4c4cbdb1c3e19452203b7d6f207b1dc/lectures/Joao Sedoc. Evaluating Conversational Agents.pdf -------------------------------------------------------------------------------- /lectures/README.md: -------------------------------------------------------------------------------- 1 | ## Day 1: 2 | * Lecture 1: Neural Networks ([slides](https://docs.google.com/presentation/d/1NlNsgNPN2MiVWW8HOIk2CWi_IkXPnHHTV8MGmQEmMss/edit?usp=sharing), [video](https://echo360.org/media/98fb71cf-b104-444c-a114-43870e6a90c2/public)) 3 | 4 | * Lecture 2: Representing words ([slides](https://docs.google.com/presentation/d/12MFFqeaMw8uaME_eqVjx9Ua29V8HoG1GhOSpJ_Y0nP4/edit?usp=sharing), [video](https://echo360.org/media/3cd73a3c-6c74-4a36-897d-e37e80f472b2/public)) 5 | 6 | * Invited talk: Alexander Rush, Conditional Text Generation and Pretraining. ([slides](Alexander%20Rush.%20Conditional%20Text%20Generation%20and%20Pretraining.pdf), [video](https://uml.mediasite.com/Mediasite/Play/9b77f879b01a4679a5122f109957506d1d) (participants will receive the password via email)) 7 | 8 | ## Day 2 9 | * Lecture 3: Convolutional Neural Networks ([slides](https://docs.google.com/presentation/d/1G60Wv4eEpcouO2848A-8KxDAz4C7-mS1BpsEg2msKsk/edit?usp=sharing), [video](https://echo360.org/media/237e4c2e-e402-4ba3-9e4f-c13b511da39e/public)) 10 | 11 | * Lecture 4: Recurrent Neural Networks ([slides](https://docs.google.com/presentation/d/1FRWbtzmaSj-adKm_QLRYZntI6BJYrgcnTYxhP-O2hJY/edit?usp=sharing), [video](https://echo360.org/media/951e7be3-a5d2-464c-ae25-f2456ce55442/public)) 12 | 13 | * Invited talk: João Sedoc, Evaluating Conversational Agents ([slides](Joao%20Sedoc.%20Evaluating%20Conversational%20Agents.pdf), [video](https://echo360.org/media/2cdc409e-3075-4d6e-8fa0-bd287363b587/public)) 14 | 15 | ## Day 3 16 | 17 | * Lecture 4.75: Attention: A Quick Recap ([slides](https://docs.google.com/presentation/d/1_PLMA-c_hSs_0tS10yVU1kX4N6gLMXmy6vZCE8chWEk/edit?usp=sharing), [video](https://echo360.org/media/6212dbbc-21c8-418e-b0ae-85ea95dc41e2/public)) 18 | 19 | * Lecture 5: Transformers ([slides](https://docs.google.com/presentation/d/1cg18KSHtgtkewC5srMuRTGuFL8k3rmaweuwlINSLXbs/edit?usp=sharing), [video](https://echo360.org/media/9aa918f6-99b0-4f31-b264-19c6111c8759/public)) 20 | 21 | * Lecture 6: Contextualized embeddings: ELMo, GPT, BERT ([slides](https://docs.google.com/presentation/d/14dsuG-btGgvQ6IUF2ZNRjRZ9qma1oQCUrVuFcn5vVAw/edit?usp=sharing), [video](https://echo360.org/media/c986c7eb-f8da-4720-8e63-d4913c2fdb12/public)) 22 | 23 | * Invited talk: Kate Saenko, Grounding Language in Pixels ([slides](https://drive.google.com/file/d/1G6DXv5JHtrvpuJtxbDLUJuKBrm6vJT8a/view?usp=drive_web), [video](https://echo360.org/media/436cb96f-5585-4c20-82ff-89cd8d093490/public)) 24 | 25 | ## Day 4 26 | 27 | * Lecture 7: Memory-based models. 
External knowledge integration ([slides](https://docs.google.com/presentation/d/10ENMJINp50US2VLTRbv_gIv3N3vogNbwFmKQBq_C9B4/edit?usp=sharing), [video](https://echo360.org/media/3a8e9c57-11bd-4441-9301-c577cd0676d9/public))
28 | * Lecture 8: Deep Question Answering ([slides](https://docs.google.com/presentation/d/1Gy-SWO18fJo3mFEta6lq9ovEjEhcH05ZN876qwF1W48/edit?usp=sharing), [video](https://echo360.org/media/c33d748f-66b4-465b-abe4-0aeda24d5049/public))
29 |
30 | ## Day 5
31 |
32 | * Lecture 10: Multi-skill conversational agents ([slides](https://docs.google.com/presentation/d/1vp_-V_Qe9HmA0j_yHw11X0Fg0vC0BVvSCiUcEqETk8Y/edit?usp=sharing), [video](https://echo360.org/media/17cf25aa-a82d-4159-b4a4-76eadf1f7715/public))
33 |
34 | * Lecture 11: Hierarchy in neural dialogue models ([slides](https://docs.google.com/presentation/d/1K9IAnExUJD5FdL3cHVMIFrtaFO8tt1NJIxkqVPh9kn4/edit?usp=sharing), [video](https://echo360.org/media/f15053bb-1ce6-4762-b72e-833c3e120ab3/public))
35 |
36 | * Lecture 12: Dialogue diversity ([slides](https://docs.google.com/presentation/d/1qQT3ihVJtHyczyJnKZSVxHl8CK-lq2TKF6wuqPR-NDM/edit?usp=sharing), [video](https://echo360.org/media/e08a929d-e75c-4212-a635-f9b109fa825b/public))
37 |
38 | * Invited talk: Jason Weston, Putting Together the Threads of Conversational AI? ([video](https://echo360.org/media/55668c92-97cc-44a5-acac-354dc383c840/public))
39 |
--------------------------------------------------------------------------------
/project_ideas.md:
--------------------------------------------------------------------------------
1 | # Project Ideas
2 |
3 | This page collects several possible project ideas to help you figure out what you might want to work on.
4 |
5 |
6 | ## Ideas
7 |
8 | - Encoder/decoder transformer-based chatbot
9 |   - This [huggingface/pytorch-pretrained-BERT](https://github.com/huggingface/pytorch-pretrained-BERT) repository contains several Transformer-based models that you can use.
10 | - HRED-based sequence-to-sequence architecture
11 |   - There is a PyTorch implementation of [HRED](https://github.com/hsgodhia/hred)
12 | - A chatbot with a persona
13 |   - [How to build a State-of-the-Art Conversational AI with Transfer Learning](https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313)
14 | - Emotions and emojis 🙈
15 |   - [Understanding emotions — from Keras to pyTorch](https://medium.com/huggingface/understanding-emotions-from-keras-to-pytorch-3ccb61d5a983)
16 | - Dialog State Tracking Challenge
17 |   - [Web-page](https://www.microsoft.com/en-us/research/event/dialog-state-tracking-challenge/)
18 | - Visual Dialog Challenge
19 |   - [Web-page](https://visualdialog.org/challenge/2018)
20 | - Visual Question Answering
21 |   - [Web-page](https://visualqa.org/)
22 | - Generate discussions on current affairs (like two or more agents posting on a forum)
23 |   - You can start with this [Web-page](https://www.kaggle.com/aashita/training-a-bot-to-comment-on-current-affairs)
24 |
25 |
26 | ## Be creative!
27 |
28 | Choose a project you have always wanted to work on but never had the time for! Remember, anything* can be framed as a question answering task, and, furthermore, as a dialog!
29 |
30 | \*According to Socher
31 |
32 | ## Datasets
33 |
34 | Below is a list of datasets that you can use in your projects:
35 |
36 | https://docs.google.com/document/d/1QVVX0YV5_ebH5M9XUtT7VveD0v2wVQy69DeKkWqhcz4/edit?usp=drivesdk
37 |
38 |
39 |
40 | ## Tips
41 | The tutorial sessions will show you how to do basic things, such as loading data, constructing a model, training and testing.
42 | Unless you are already an advanced learner, it may be a good idea to expand the code from the tutorials for your project.
43 |
44 | The dataset can be huge, and our time is limited. In early development, it may be wise to use a small subset of your training data. Do not make each development cycle hours long.
45 |
46 | An innovative project may have an innovative task, or an innovative model, or both. Something is better than nothing. After you make things work, you will have more ideas, and you can always add things on top of it.
47 |
--------------------------------------------------------------------------------
/remote_jupyter.md:
--------------------------------------------------------------------------------
1 | You can set up Jupyter in such a way that you can connect to it remotely
2 | from any computer (e.g., from home). This is very convenient,
3 | as you don't need to be in the lab to work on your project.
4 |
5 | Since the lab machines are not accessible from the outside network,
6 | you need to first connect to the cs server (cs.uml.edu) and then connect
7 | to a lab machine (e.g. dan417-01.uml.edu). Moreover, you need to
8 | forward the Jupyter port (8888) to your local machine.
9 |
10 | Below are instructions on how to achieve this on a Mac or Linux system.
11 | Windows users can do the same using [putty](https://putty.org/).
12 |
13 | Essentially, we need to edit the ssh client config file,
14 | located in your home dir: `~/.ssh/config`.
15 | This file contains configuration options that are going to be used
16 | while connecting to a specific server. Note that this is a file on your local machine (your laptop).
17 |
18 | Open this file in your favourite editor (or create it if it does not exist) and type the following:
19 | ```
20 | Host cs
21 |     HostName cs.uml.edu
22 |     User your_username
23 | ```
24 | where `your_username` is your actual user name on the cs server. Save the changes and close the file.
25 | Now, if you type `ssh cs` in the terminal, the ssh client understands that the hostname should be `cs.uml.edu`,
26 | and the username should be `your_username`. Similarly, you can specify other connection options
27 | under the `Host cs` directive.
28 |
29 | Next, we will specify the port forwarding option, as well as the proxy connection using the cs server.
30 | Open the `~/.ssh/config` file again and insert the code below at the end of the file:
31 | ```
32 | Host dan417-01
33 |     Hostname dan417-01.uml.edu
34 |     User your_username
35 |     LocalForward 8888 127.0.0.1:8888
36 |     ProxyJump cs
37 | ```
38 |
39 | The `LocalForward` option specifies that connections to port `8888` on your local machine should be
40 | forwarded to the address `127.0.0.1:8888` on the remote machine. Since Jupyter listens on this address by default,
41 | if you open the browser on your local machine and go to http://127.0.0.1:8888, you will connect to the Jupyter
42 | running on the remote server.
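
With this setup, connecting takes a single command. Below is a minimal sketch (it assumes Jupyter is already installed on the lab machine; pick a different port if 8888 is taken):
```
# on your laptop: connect to the lab machine through the cs server
ssh dan417-01

# then, on the lab machine: start Jupyter without opening a browser
jupyter notebook --no-browser --port 8888
```
Jupyter will print a URL containing an access token; open that URL in the browser on your laptop.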
43 | -------------------------------------------------------------------------------- /tutorials/README.md: -------------------------------------------------------------------------------- 1 | # Videos: 2 | 3 | ## PyTorch track 4 | 5 | * [Intro to PyTorch](https://uml.mediasite.com/Mediasite/Play/66b4d7b9a4f44b2187809abcdc63c4bf1d) 6 | * [Sentence classification with word embeddings](https://uml.mediasite.com/Mediasite/Play/43c9bf608cb7480883626e8d97541a8a1d) 7 | * [Seq2seq](https://uml.mediasite.com/Mediasite/Play/d39705e0241f43cd868e2d1b152cd31e1d) 8 | * [BERT](https://uml.mediasite.com/Mediasite/Play/209e6b07904d429dbce1d0fdb92d59781d) 9 | * [Transformer-based chit-chat](https://uml.mediasite.com/Mediasite/Play/049fe8d3bc7b4e75b7dc145eead7568d1d) 10 | 11 | ## TensorFlow track 12 | * [Intro to TensorFlow](https://uml.mediasite.com/Mediasite/Play/42e45dc1b01245d2a2c23b4de984618d1d) 13 | * [Sentence classification with word embeddings](https://uml.mediasite.com/Mediasite/Play/4eb5149c7021491bbc4119dd456f9d2a1d) 14 | * [Seq2seq](https://uml.mediasite.com/Mediasite/Play/cd131dff73dd404ab8dfaef21aeaa0301d) 15 | * [BERT](https://uml.mediasite.com/Mediasite/Play/c0dfd827d65449388360468e10eaf8f61d) 16 | * [Transformer-based chit-chat](https://uml.mediasite.com/Mediasite/Play/bc26c5b8c10e4e429cbb38c7ad03dd501d) 17 | -------------------------------------------------------------------------------- /tutorials/deeppavlov_track/README.md: -------------------------------------------------------------------------------- 1 | # Tutorials links 2 | 3 | 1. Tutorial 1. Introduction to TensorFlow: https://colab.research.google.com/drive/10i1tovcAXjIRoPI8IP5flrhoGuZLUSRe 4 | 5 | 2. Tutorial 2. Sentence classification with word embeddings: https://colab.research.google.com/drive/1Dnr3wC3FBf4KS0GOVNlEbp5fg74f0FM1 6 | 7 | 3. Tutorial 3. Sequence to sequence: https://colab.research.google.com/drive/135BsS9VWUgIHwfTviKgFWRuBvMDHJAjD 8 | 9 | 4. Tutorial 4. [Fine Tuning BERT](https://colab.research.google.com/github/text-machine-lab/ciss2_materials/blob/master/tutorials/deeppavlov_track/Tutorial_Day_3_Fine_Tuning_BERT.ipynb) 10 | 11 | 5. Tutorial 5. [Transformer BERT for text generation](https://colab.research.google.com/github/text-machine-lab/ciss2_materials/blob/master/tutorials/deeppavlov_track/Tutorial_Day_4_Transformer_BERT_text_generation.ipynb) 12 | 13 | 6. Tutorial 6. 
[Serving with DeepPavlov](https://colab.research.google.com/github/text-machine-lab/ciss2_materials/blob/master/tutorials/deeppavlov_track/Tutorial_Day_5_Serving_with_DeepPavlov.ipynb)
14 |
--------------------------------------------------------------------------------
/tutorials/deeppavlov_track/Tutorial_Day_5_Serving_with_DeepPavlov.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "5dGp8dHi_1BU"
8 | },
9 | "source": [
10 | "# Models serving with DeepPavlov\n",
11 | "\n",
12 | "DeepPavlov supports out-of-the-box serving for both pre-trained and custom models.\n",
13 | "Serving can be done with:\n",
14 | "* [REST API](http://docs.deeppavlov.ai/en/master/intro/features.html#examples-of-some-components)\n",
15 | "* [Telegram](http://docs.deeppavlov.ai/en/master/intro/features.html#examples-of-some-components)\n",
16 | "* [Amazon Alexa](http://docs.deeppavlov.ai/en/master/devguides/amazon_alexa.html)\n",
17 | "* [Microsoft Bot Framework](http://docs.deeppavlov.ai/en/master/devguides/ms_bot_integration.html)\n",
18 | "  * Bing, Cortana, Email, Facebook Messenger, Slack, GroupMe, Microsoft Teams, Skype, Telegram, Twilio, Web Chat\n",
19 | "* [Yandex Alice](http://docs.deeppavlov.ai/en/master/devguides/yandex_alice.html)\n"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {
25 | "colab_type": "text",
26 | "id": "xDMqRTTORX3k"
27 | },
28 | "source": [
29 | "## Serving DeepPavlov pre-trained models\n",
30 | "\n",
31 | "\n",
32 | "DeepPavlov has one-line commands to serve models.\n",
33 | "\n",
34 | "Run a model in the CLI:\n",
35 | "```\n",
36 | "python -m deeppavlov interact model_config\n",
37 | "```\n",
38 | "\n",
39 | "Serve a model with a REST API:\n",
40 | "```\n",
41 | "python -m deeppavlov riseapi model_config\n",
42 | "```\n",
43 | "\n",
44 | "Serve a model with Telegram:\n",
45 | "```\n",
46 | "python -m deeppavlov interactbot model_config -t \n",
47 | "```\n",
48 | "\n",
49 | "\n",
50 | "Let's try some of them for the Goal Oriented bot trained on the DSTC 2 dataset. This bot is trained to suggest restaurants in the Cambridge area.\n"
51 | ]
52 | },
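{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sketch of the REST option (not part of the original tutorial; the exact route, port, and payload depend on your DeepPavlov version, so check the `riseapi` startup log), once `python -m deeppavlov riseapi gobot_dstc2` is running you could query the model from Python with the `requests` package:\n",
"```python\n",
"import requests\n",
"\n",
"# 'x' is the input variable name used in DeepPavlov chainer configs\n",
"resp = requests.post('http://127.0.0.1:5000/model', json={'x': ['i want some cheap food']})\n",
"print(resp.json())\n",
"```"
]
},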
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {
56 | "colab_type": "text",
57 | "id": "7ggNsOzzVqHu"
58 | },
59 | "source": [
60 | "Install the DeepPavlov library:"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {
67 | "colab": {},
68 | "colab_type": "code",
69 | "id": "WzLfe9wBUjYU"
70 | },
71 | "outputs": [],
72 | "source": [
73 | "! pip install deeppavlov"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {
80 | "colab": {},
81 | "colab_type": "code",
82 | "id": "d9iIw6wnVlxH"
83 | },
84 | "outputs": [],
85 | "source": [
86 | "import deeppavlov"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {
92 | "colab_type": "text",
93 | "id": "bbEEbgbJVwLU"
94 | },
95 | "source": [
96 | "Install the requirements for the Goal Oriented bot:"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {
103 | "colab": {},
104 | "colab_type": "code",
105 | "id": "wxBXKBMWVjBt"
106 | },
107 | "outputs": [],
108 | "source": [
109 | "! python -m deeppavlov install gobot_dstc2"
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "metadata": {
115 | "colab_type": "text",
116 | "id": "OGTYPJzeWV0T"
117 | },
118 | "source": [
119 | "Download the pre-trained model:"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {
126 | "colab": {},
127 | "colab_type": "code",
128 | "id": "z9glfej-WgBw"
129 | },
130 | "outputs": [],
131 | "source": [
132 | "! python -m deeppavlov download gobot_dstc2"
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {
138 | "colab_type": "text",
139 | "id": "futyYStQWmNi"
140 | },
141 | "source": [
142 | "Run it with the CLI:"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {
149 | "colab": {},
150 | "colab_type": "code",
151 | "id": "HUoI6tKjW_vI"
152 | },
153 | "outputs": [],
154 | "source": [
155 | "! python -m deeppavlov interact gobot_dstc2"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {
161 | "colab_type": "text",
162 | "id": "XfywF_PNY4Jm"
163 | },
164 | "source": [
165 | "Serving with Telegram:\n",
166 | "```\n",
167 | "python -m deeppavlov interactbot gobot_dstc2 -t \n",
168 | "```\n",
169 | "\n",
170 | "A Telegram token can be created with the @BotFather bot; see the details at this [link](https://core.telegram.org/bots#3-how-do-i-create-a-bot).\n",
171 | "\n",
172 | "Once you have a Telegram token, you can run the Goal Oriented bot."
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {
179 | "colab": {},
180 | "colab_type": "code",
181 | "id": "KHNZB6CUZvqu"
182 | },
183 | "outputs": [],
184 | "source": [
185 | "! python -m deeppavlov interactbot gobot_dstc2 -t "
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {
191 | "colab_type": "text",
192 | "id": "VfkgYpbfYsNh"
193 | },
194 | "source": [
195 | "## Serving custom models\n",
196 | "\n",
197 | "We have already discussed how to serve pre-trained DeepPavlov models. But how do we use DeepPavlov to serve custom ones?"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {
203 | "colab_type": "text",
204 | "id": "BrIbVM-ye5ig"
205 | },
206 | "source": [
207 | "### Say Hi Example\n",
208 | "\n",
209 | "Let's consider a simple example:"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {
216 | "colab": {},
217 | "colab_type": "code",
218 | "id": "qkDKZndxaNaw"
219 | },
220 | "outputs": [],
221 | "source": [
222 | "class SayHiModel:\n",
223 | "    def __init__(self, *args, **kwargs):\n",
224 | "        pass\n",
225 | "    \n",
226 | "    def __call__(self, input_texts):\n",
227 | "        '''\n",
228 | "        __call__ method should return responses for each utterance in input_texts\n",
229 | "        '''\n",
230 | "        output_text = []\n",
231 | "        for text in input_texts:\n",
232 | "            output_text.append('Hi!')\n",
233 | "        return output_text"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {
239 | "colab_type": "text",
240 | "id": "V9RV6HgIcpPZ"
241 | },
242 | "source": [
243 | "Here we define a utility function that generates a minimal model configuration; the DeepPavlov library needs a configuration of this kind in order to serve a model.\n",
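"\n",
"For example, for `SayHiModel` the function below produces a dict equivalent to the following sketch (`__main__:SayHiModel` tells DeepPavlov to look the class up in the current module):\n",
"```python\n",
"{'chainer': {'in': ['x'],\n",
"             'out': ['y'],\n",
"             'pipe': [{'class_name': '__main__:SayHiModel',\n",
"                       'in': ['x'],\n",
"                       'out': ['y']}]}}\n",
"```"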
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {
250 | "colab": {},
251 | "colab_type": "code",
252 | "id": "7efSG1yhbVXi"
253 | },
254 | "outputs": [],
255 | "source": [
256 | "def generate_config(class_name):\n",
257 | "    \"\"\"generate the minimal required DeepPavlov model configuration\"\"\"\n",
258 | "    \n",
259 | "    config = {\n",
260 | "        'chainer': {\n",
261 | "            'in': ['x'],\n",
262 | "            'out': ['y'],\n",
263 | "            'pipe': [\n",
264 | "                {\n",
265 | "                    'class_name': f'__main__:{class_name}',\n",
266 | "                    'in': ['x'],\n",
267 | "                    'out': ['y']\n",
268 | "                }\n",
269 | "            ]\n",
270 | "        }\n",
271 | "    }\n",
272 | "    return config"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {
278 | "colab_type": "text",
279 | "id": "P1ITXwmQeOey"
280 | },
281 | "source": [
282 | "Serving with the Python API:"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": null,
288 | "metadata": {
289 | "colab": {},
290 | "colab_type": "code",
291 | "id": "eGvf2K8WcB0A"
292 | },
293 | "outputs": [],
294 | "source": [
295 | "# to interact with the CLI\n",
296 | "from deeppavlov.core.commands.infer import interact_model\n",
297 | "# to interact with Telegram\n",
298 | "from deeppavlov.utils.telegram.telegram_ui import interact_model_by_telegram\n"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {
305 | "colab": {},
306 | "colab_type": "code",
307 | "id": "_Op0z9yGdqfy"
308 | },
309 | "outputs": [],
310 | "source": [
311 | "interact_model(generate_config('SayHiModel'))"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "metadata": {
318 | "colab": {},
319 | "colab_type": "code",
320 | "id": "APiZBIdheU7o"
321 | },
322 | "outputs": [],
323 | "source": [
324 | "interact_model_by_telegram(generate_config('SayHiModel'), token='YOUR_TOKEN')"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {
330 | "colab_type": "text",
331 | "id": "0zFMQtPRfyrD"
332 | },
333 | "source": [
334 | "### Serving the BERT Generator from the Day 4 Tutorial\n",
335 | "\n",
336 | "Install the requirements and download the model:"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "metadata": {
343 | "colab": {},
344 | "colab_type": "code",
345 | "id": "02JpuqCLithf"
346 | },
347 | "outputs": [],
348 | "source": [
349 | "! pip install git+https://github.com/deepmipt/bert.git@feat/multi_gpu\n",
350 | "! wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip\n",
351 | "! unzip uncased_L-12_H-768_A-12.zip"
352 | ]
353 | },
354 | {
355 | "cell_type": "markdown",
356 | "metadata": {
357 | "colab_type": "text",
358 | "id": "ZJPCe0xFi8LI"
359 | },
360 | "source": [
361 | "Define all the required code from the Day 4 tutorial in a single cell:"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": null,
367 | "metadata": {
368 | "colab": {},
369 | "colab_type": "code",
370 | "id": "n0DTAAp6i58M"
371 | },
372 | "outputs": [],
373 | "source": [
374 | "import deeppavlov\n",
375 | "from deeppavlov.models.preprocessors.bert_preprocessor import BertPreprocessor\n",
376 | "\n",
377 | "from bert_dp import modeling\n",
378 | "\n",
379 | "\n",
380 | "BERT_MODEL_PATH = './uncased_L-12_H-768_A-12/'\n",
381 | "\n",
382 | "bert_config = modeling.BertConfig.from_json_file(BERT_MODEL_PATH + 'bert_config.json')\n",
383 | "\n",
384 | "import tensorflow as tf\n",
385 | "\n",
386 | "# we should define placeholders for the BERT model\n",
387 | "input_ids_ph = tf.placeholder(shape=(None, None), dtype=tf.int32)\n",
388 | "input_masks_ph = tf.placeholder(shape=(None, None), dtype=tf.int32)\n",
389 | "token_types_ph = tf.placeholder(shape=(None, None), dtype=tf.int32)\n",
390 | "is_train_ph = tf.placeholder_with_default(False, shape=[])\n",
391 | "\n",
392 | "# this will build the TensorFlow graph for the BERT model\n",
393 | "bert_model = modeling.BertModel(config=bert_config,\n",
394 | "                                is_training=is_train_ph,\n",
395 | "                                input_ids=input_ids_ph,\n",
396 | "                                input_mask=input_masks_ph,\n",
397 | "                                token_type_ids=token_types_ph,\n",
398 | "                                use_one_hot_embeddings=False)\n",
399 | "\n",
400 | "def gather_indexes(sequence_tensor, positions):\n",
401 | "    \"\"\"Gathers the vectors at the specific positions over a minibatch.\"\"\"\n",
402 | "    sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3)\n",
403 | "    batch_size = sequence_shape[0]\n",
404 | "    seq_length = sequence_shape[1]\n",
405 | "    width = sequence_shape[2]\n",
406 | "\n",
407 | "    flat_offsets = tf.reshape(\n",
408 | "        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])\n",
409 | "    flat_positions = tf.reshape(positions + flat_offsets, [-1])\n",
410 | "    flat_sequence_tensor = tf.reshape(sequence_tensor,\n",
411 | "                                      [batch_size * seq_length, width])\n",
412 | "    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)\n",
413 | "    return output_tensor\n",
414 | "\n",
415 | "def get_masked_lm_output(bert_config, input_tensor, output_weights, positions):\n",
416 | "    \"\"\"Get probabilities for the masked LM.\n",
417 | "    \n",
418 | "    bert_config - instance of BertConfig\n",
419 | "    input_tensor - output of bert_model.get_sequence_output()\n",
420 | "    output_weights - projection matrix, here we use the embeddings matrix and then transpose it\n",
421 | "    positions - positions of MASKED tokens, i.e. at which positions we want to make predictions\n",
422 | "    \"\"\"\n",
423 | "    input_tensor = gather_indexes(input_tensor, positions)\n",
424 | "\n",
425 | "    with tf.variable_scope(\"cls/predictions\"):\n",
426 | "        # We apply one more non-linear transformation before the output layer.\n",
427 | "        with tf.variable_scope(\"transform\"):\n",
428 | "            input_tensor = tf.layers.dense(\n",
429 | "                input_tensor,\n",
430 | "                units=bert_config.hidden_size,\n",
431 | "                activation=modeling.get_activation(bert_config.hidden_act),\n",
432 | "                kernel_initializer=modeling.create_initializer(\n",
433 | "                    bert_config.initializer_range))\n",
434 | "            input_tensor = modeling.layer_norm(input_tensor)\n",
435 | "\n",
436 | "        # The output weights are the same as the input embeddings, but there is\n",
437 | "        # an output-only bias for each token.\n",
438 | "        output_bias = tf.get_variable(\n",
439 | "            \"output_bias\",\n",
440 | "            shape=[bert_config.vocab_size],\n",
441 | "            initializer=tf.zeros_initializer())\n",
442 | "        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)\n",
443 | "        logits = tf.nn.bias_add(logits, output_bias)\n",
444 | "        probs = tf.nn.softmax(logits, axis=-1)\n",
445 | "\n",
446 | "    return probs\n",
447 | "    \n",
448 | "# define a placeholder for MASKED token positions\n",
449 | "masked_lm_positions_ph = tf.placeholder(shape=(None, None), dtype=tf.int32)\n",
450 | "\n",
451 | "# define predictions for MASKED tokens \n",
452 | "masked_lm_probs = get_masked_lm_output(bert_config, \n",
453 | "                                       bert_model.get_sequence_output(),\n",
454 | "                                       bert_model.get_embedding_table(),\n",
455 | "                                       masked_lm_positions_ph)\n",
456 | "\n",
457 | "# define the TensorFlow session\n",
458 | "sess_config = tf.ConfigProto(allow_soft_placement=True)\n",
459 | "sess_config.gpu_options.allow_growth = True\n",
460 | "sess = tf.Session(config=sess_config)\n",
461 | "\n",
462 | "init_checkpoint = BERT_MODEL_PATH + 'bert_model.ckpt'\n",
463 | "\n",
464 | "# load from the checkpoint\n",
465 | "tvars = tf.trainable_variables()\n",
466 | "assignment_map, initialized_variable_names = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)\n",
467 | "tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
468 | "\n",
469 | "sess.run(tf.global_variables_initializer())\n",
470 | "\n",
471 | "from bert_dp import tokenization\n",
472 | "\n",
473 | "tokenizer = tokenization.FullTokenizer(\n",
474 | "    vocab_file=BERT_MODEL_PATH + 'vocab.txt',\n",
475 | "    do_lower_case=True,\n",
476 | ")\n",
477 | "\n",
478 | "MASK_TOKEN = '[MASK]'\n",
479 | "MASK_ID = tokenizer.convert_tokens_to_ids([MASK_TOKEN])[0]\n",
480 | "\n",
481 | "from copy import deepcopy\n",
482 | "import numpy as np\n",
483 | "\n",
484 | "def append_tokens(input_example, token=MASK_TOKEN, token_id=MASK_ID, n=3):\n",
485 | "    \"\"\"\n",
486 | "    This function appends `token` to `input_example` `n` times.\n",
487 | "    Also, it maintains correct values for `input_mask`, `input_ids`, `input_type_ids`.\n",
488 | "    Don't forget that the [SEP] token is always the last token.\n",
489 | "    \n",
490 | "    input_example - result of BertPreprocessor with tokens, input_ids, ...\n",
491 | "    token - token to append\n",
492 | "    token_id - token id to append\n",
493 | "    n - how many times to append token to input_example\n",
494 | "    \"\"\"\n",
495 | "    input_example = deepcopy(input_example)\n",
496 | "    max_seq_len = len(input_example.input_mask)\n",
497 | "    input_len = sum(input_example.input_mask)\n",
498 | "    \n",
499 | "    # new_tokens = YOUR CODE HERE\n",
500 | "    new_tokens = (input_example.tokens[:input_len - 1] + [token] * n + input_example.tokens[input_len-1:])[:max_seq_len]\n",
501 | "    input_example.tokens = new_tokens\n",
502 | "    assert len(new_tokens) <= max_seq_len\n",
503 | "    \n",
504 | "    # new_input_mask = YOUR CODE HERE\n",
505 | "    new_input_mask = (input_example.input_mask[:input_len - 1] + [1] * n + input_example.input_mask[input_len-1:])[:max_seq_len]\n",
506 | "    input_example.input_mask = new_input_mask\n",
507 | "    assert len(new_input_mask) <= max_seq_len\n",
508 | "    \n",
509 | "    # new_input_ids = YOUR CODE HERE\n",
510 | "    new_input_ids = (input_example.input_ids[:input_len - 1] + [token_id] * n + input_example.input_ids[input_len-1:])[:max_seq_len]\n",
511 | "    input_example.input_ids = new_input_ids\n",
512 | "    assert len(new_input_ids) <= max_seq_len\n",
513 | "    \n",
514 | "    # new_input_type_ids = YOUR CODE HERE\n",
515 | "    new_input_type_ids = (input_example.input_type_ids[:input_len - 1] + [1] * n + input_example.input_type_ids[input_len-1:])[:max_seq_len]\n",
516 | "    input_example.input_type_ids = new_input_type_ids\n",
517 | "    assert len(new_input_type_ids) <= max_seq_len\n",
518 | "    \n",
519 | "    return input_example, [i for i in range(len(input_example.tokens)) if input_example.tokens[i] == MASK_TOKEN]\n",
520 | "    \n",
521 | "\n",
522 | "def generate_text(input_example, sampling_method='greedy', mask_tokens_n=3, max_generated_tokens=15):\n",
523 | "    \"\"\"\n",
524 | "    This function generates text using input_example as the initial text.\n",
525 | "    \n",
526 | "    Text generation stops when one of the ['.', '?', '!'] symbols is predicted or\n",
527 | "    the number of `max_generated_tokens` is reached.\n",
528 | "    \"\"\"\n",
529 | "    generated_example = deepcopy(input_example)\n",
530 | "    for i in range(max_generated_tokens):\n",
531 | "        # Firstly, we append [MASK] tokens to the end of the text.\n",
532 | "        # If mask_tokens_n is too small (e.g., 1) then the model will predict \".\" and generation will stop.\n",
533 | "        # This happens because BERT has learned that the last token in a sentence is usually \".\".\n",
534 | "        masked_input_example, masked_lm_positions = append_tokens(generated_example, n=mask_tokens_n)\n",
535 | "        \n",
536 | "        # get the distribution over the vocabulary for the first masked token\n",
537 | "        probs = sess.run(masked_lm_probs, feed_dict={\n",
538 | "            input_ids_ph: [masked_input_example.input_ids],\n",
539 | "            input_masks_ph: [masked_input_example.input_mask],\n",
540 | "            token_types_ph: [masked_input_example.input_type_ids],\n",
541 | "            masked_lm_positions_ph: [masked_lm_positions],\n",
542 | "        })[0]\n",
543 | "        \n",
544 | "        # sample a token from the vocabulary using probs\n",
545 | "        if sampling_method == 'greedy':\n",
546 | "            next_token_id = np.argmax(probs)\n",
547 | "        else:\n",
548 | "            next_token_id = sampling_method(probs)\n",
549 | "        \n",
550 | "        # append the generated token to the text\n",
551 | "        next_token = tokenizer.convert_ids_to_tokens([next_token_id])[0] \n",
552 | "        generated_example, _ = append_tokens(generated_example, token=next_token, token_id=next_token_id, n=1)\n",
553 | "        \n",
554 | "        if generated_example.tokens[-2] in ['.', '?', '!']:\n",
555 | "            break\n",
556 | "\n",
557 | "    return generated_example\n",
558 | "    \n",
559 | "\n",
560 | "def top_k_sampling(probs, k=10):\n",
561 | "    \"\"\"\n",
562 | "    Sample from the k tokens with the highest probabilities.\n",
563 | "    Don't forget to re-normalize the top k probs.\n",
564 | "    \"\"\"\n",
565 | "    #### YOUR CODE HERE START ####\n",
566 | "    # get the top k indices from probs\n",
567 | "    top_k_tokens_ids = np.argsort(probs)[::-1][:k]\n",
568 | "    # get the top k probabilities using top_k_tokens_ids\n",
569 | "    top_k_probs = probs[top_k_tokens_ids]\n",
570 | "    # make sure that the sum of top_k_probs == 1\n",
571 | "    top_k_probs = top_k_probs / sum(top_k_probs)\n",
572 | "    #### YOUR CODE HERE END ####\n",
573 | "    return top_k_tokens_ids[np.argmax(np.random.multinomial(n=1, pvals=top_k_probs))]\n"
574 | ]
575 | },
576 | {
577 | "cell_type": "markdown",
578 | "metadata": {
579 | "colab_type": "text",
580 | "id": "o8_tHTBWoiaW"
581 | },
582 | "source": [
583 | "Define the BERT generator model:"
584 | ]
585 | },
586 | {
587 | "cell_type": "code",
588 | "execution_count": null,
589 | "metadata": {
590 | "colab": {},
591 | "colab_type": "code",
592 | "id": "IDhsH35-rd9o"
593 | },
594 | "outputs": [],
595 | "source": [
596 | "class BertGenerator:\n",
597 | "    def __init__(self, *args, **kwargs):\n",
598 | "        self.bp = BertPreprocessor(vocab_file=BERT_MODEL_PATH + 'vocab.txt', do_lower_case=True, max_seq_length=32)\n",
599 | "    \n",
600 | "    def __call__(self, input_texts):\n",
601 | "        '''\n",
602 | "        __call__ method should return responses for each utterance in input_texts\n",
603 | "        '''\n",
604 | "        output_text = []\n",
605 | "        for text in input_texts:\n",
606 | "            input_example = self.bp(texts_a = [f'- {text}'], texts_b = ['- '])[0]\n",
607 | "\n",
608 | "            top_k_10_sampling = lambda x: top_k_sampling(x, 10)\n",
609 | "            generated_example = generate_text(input_example, sampling_method=top_k_10_sampling)\n",
610 | "            sep_index = generated_example.tokens.index('[SEP]')\n",
611 | "            response = ' '.join(generated_example.tokens[sep_index + 2:-1]).replace(' ##', '').replace('##', '')\n",
612 | "            output_text.append(response)\n",
613 | "        return output_text"
614 | ]
615 | },
616 | {
617 | "cell_type": "markdown",
618 | "metadata": {
619 | "colab_type": "text",
620 | "id": "i2yIy9bkoq4J"
621 | },
622 | "source": [
623 | "Interact with the CLI:"
624 | ]
625 | },
626 | {
627 | "cell_type": "code",
628 | "execution_count": null,
629 | "metadata": {
630 | "colab": {},
631 | "colab_type": "code",
632 | "id": "p5_il86NkmTR"
633 | },
634 | "outputs": [],
635 | "source": [
636 | "interact_model(generate_config('BertGenerator'))"
637 | ]
638 | },
639 | {
640 | "cell_type": "markdown",
641 | "metadata": {
642 | "colab_type": "text",
643 | "id": "NZiaw9d1ottZ"
644 | },
645 | "source": [
646 | "Interact with Telegram:"
647 | ]
648 | },
649 | {
650 | "cell_type": "code",
651 | "execution_count": null,
652 | "metadata": {
653 | "colab": {},
654 | "colab_type": "code",
655 | "id": "FMuIALP-tERC"
656 | },
657 | "outputs": [],
658 | "source": [
659 | "interact_model_by_telegram(generate_config('BertGenerator'), token='YOUR_TOKEN')"
660 | ]
661 | },
662 | {
663 | "cell_type": "code",
664 | "execution_count": null,
665 | "metadata": {},
666 | "outputs": [],
667 | "source": []
668 | }
669 | ],
670 | "metadata": {
671 | "accelerator": "GPU",
672 | "colab": {
673 | "name": "Copy of Tutorial_Day_4_Transformer_BERT_text_generation.ipynb",
674 | "provenance": [],
675 | "toc_visible": true,
676 | "version": "0.3.2"
677 | },
678 | "kernelspec": {
679 | "display_name": "Python 3",
680 | "language": "python",
681 | "name": "python3"
682 | },
683 | "language_info": {
684 | "codemirror_mode": {
685 | "name": "ipython",
686 | "version": 3
687 | },
688 | "file_extension": ".py",
689 | "mimetype": "text/x-python",
690 | "name": "python",
691 | "nbconvert_exporter": "python",
692 | "pygments_lexer": "ipython3",
693 | "version": "3.6.7"
694 | }
695 | },
"nbformat": 4, 697 | "nbformat_minor": 2 698 | } 699 | -------------------------------------------------------------------------------- /tutorials/deeppavlov_track/img/beam_search_vs_human.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/text-machine-lab/ciss2_materials/0f2423eac4c4cbdb1c3e19452203b7d6f207b1dc/tutorials/deeppavlov_track/img/beam_search_vs_human.png -------------------------------------------------------------------------------- /tutorials/deeppavlov_track/img/bert_ner_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/text-machine-lab/ciss2_materials/0f2423eac4c4cbdb1c3e19452203b7d6f207b1dc/tutorials/deeppavlov_track/img/bert_ner_diagram.png -------------------------------------------------------------------------------- /tutorials/deeppavlov_track/img/decoding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/text-machine-lab/ciss2_materials/0f2423eac4c4cbdb1c3e19452203b7d6f207b1dc/tutorials/deeppavlov_track/img/decoding.png -------------------------------------------------------------------------------- /tutorials/deeppavlov_track/img/seq2seq_training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/text-machine-lab/ciss2_materials/0f2423eac4c4cbdb1c3e19452203b7d6f207b1dc/tutorials/deeppavlov_track/img/seq2seq_training.png -------------------------------------------------------------------------------- /tutorials/pytorch_track/tutorial1_intro_pytorch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import torch\n", 11 | "import torch.utils.data\n", 12 | "import sklearn.datasets\n", 13 | "from sklearn.metrics import accuracy_score" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Introduction" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "As NumPy, PyTorch provides basic functions for creating tensors and common operations on them." 
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "a.device"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "a = a.to('cuda')"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "a.device"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {},
106 | "outputs": [],
107 | "source": [
108 | "# a + b"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "b = torch.full_like(a, 5)"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "a + b"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 | "## Neural Networks"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "Since PyTorch allows automatic differentiation, building neural networks with PyTorch is very easy.\n",
141 | "\n",
142 | "All the models implemented in PyTorch should subclass the [`torch.nn.Module` class](https://pytorch.org/docs/stable/nn.html?highlight=module#torch.nn.Module). The main method of this class (which is used by a lot of other PyTorch classes) is `forward()`. This is the core method that defines how your model is going to run and what outputs it should produce given the inputs. \n",
143 | "In the constructor of your model (the `__init__` method) you should initialize all the layers you are going to use. PyTorch provides a large number of commonly used layers that are very easy to use. Please refer to the [documentation of PyTorch](https://pytorch.org/docs/stable/nn.html) for a complete list of layers.\n",
144 | "\n",
145 | "Below we are going to declare a simple neural network with two layers and a ReLU activation function between them."
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "class Net(torch.nn.Module):\n",
155 | "    \"\"\"A basic neural network model with one hidden layer\"\"\"\n",
156 | "    def __init__(self, nb_features, hidden_size, nb_classes):\n",
157 | "        \"\"\"\n",
158 | "        Initialize the model class\n",
159 | "\n",
160 | "        :param nb_features: Number of input features\n",
161 | "        :param hidden_size: The size of the hidden layer\n",
162 | "        :param nb_classes: Number of classes for classification\n",
163 | "\n",
164 | "        \"\"\"\n",
165 | "\n",
166 | "        super().__init__()\n",
167 | "\n",
168 | "        self.fc1 = torch.nn.Linear(nb_features, hidden_size)\n",
169 | "        self.fc1_activ = torch.nn.ReLU()\n",
170 | "\n",
171 | "        self.fc_logits = torch.nn.Linear(hidden_size, nb_classes)\n",
172 | "\n",
173 | "    def forward(self, inputs):\n",
174 | "        \"\"\"\n",
175 | "        Perform the forward pass on the input data\n",
176 | "\n",
177 | "        :param inputs: input data\n",
178 | "\n",
179 | "        \"\"\"\n",
180 | "        z1 = self.fc1(inputs)\n",
181 | "        z1_active = self.fc1_activ(z1)\n",
182 | "\n",
183 | "        logits = self.fc_logits(z1_active)\n",
184 | "\n",
185 | "        return logits"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "metadata": {},
192 | "outputs": [],
193 | "source": [
194 | "model = Net(nb_features=4, hidden_size=8, nb_classes=3)"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "model"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "Let's test the model on a random input. Notice how the size of the input data corresponds to the size of the first layer and the size of the output corresponds to the size of the last layer."
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {},
217 | "outputs": [],
218 | "source": [
219 | "inputs = torch.rand(1, 4)"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {},
226 | "outputs": [],
227 | "source": [
228 | "inputs"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {},
235 | "outputs": [],
236 | "source": [
237 | "outputs = model(inputs)"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "outputs"
247 | ]
248 | },
249 | {
250 | "cell_type": "markdown",
251 | "metadata": {},
252 | "source": [
253 | "### Loss calculation"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {},
259 | "source": [
260 | "PyTorch has quite a few pre-defined loss functions that we can use. The most common loss functions are enumerated below:\n",
261 | " - [Mean Squared Error loss](https://pytorch.org/docs/stable/nn.html#torch.nn.MSELoss)\n",
262 | " - [Cross Entropy loss](https://pytorch.org/docs/stable/nn.html#torch.nn.CrossEntropyLoss)\n",
263 | " - [Binary Cross Entropy loss](https://pytorch.org/docs/stable/nn.html#torch.nn.BCELoss)\n",
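"\n",
"Note that `CrossEntropyLoss` expects raw logits (it applies the softmax internally) together with integer class indices rather than one-hot vectors. A minimal sketch:\n",
"```python\n",
"criterion = torch.nn.CrossEntropyLoss()\n",
"logits = torch.randn(4, 3)           # a batch of 4 examples with 3 classes, raw scores\n",
"labels = torch.tensor([0, 2, 1, 0])  # integer class indices\n",
"loss = criterion(logits, labels)\n",
"```"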
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": null,
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "targets = torch.rand_like(outputs)"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": null,
278 | "metadata": {},
279 | "outputs": [],
280 | "source": [
281 | "targets"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": null,
287 | "metadata": {},
288 | "outputs": [],
289 | "source": [
290 | "criterion = torch.nn.MSELoss()\n",
291 | "loss = criterion(outputs, targets)"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": null,
297 | "metadata": {},
298 | "outputs": [],
299 | "source": [
300 | "loss"
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {},
306 | "source": [
307 | "### Gradients"
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "metadata": {},
313 | "source": [
314 | "After calling `loss.backward()`, PyTorch performs the backward pass of the network and stores the gradients of the weights."
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": null,
320 | "metadata": {},
321 | "outputs": [],
322 | "source": [
323 | "model.zero_grad()"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": null,
329 | "metadata": {},
330 | "outputs": [],
331 | "source": [
332 | "print('fc1.bias before backward')\n",
333 | "print(model.fc1.bias.grad)"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {},
340 | "outputs": [],
341 | "source": [
342 | "loss.backward()"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": null,
348 | "metadata": {},
349 | "outputs": [],
350 | "source": [
351 | "print('fc1.bias after backward')\n",
352 | "print(model.fc1.bias.grad)"
353 | ]
354 | },
355 | {
356 | "cell_type": "markdown",
357 | "metadata": {},
358 | "source": [
359 | "### Parameter update"
360 | ]
361 | },
362 | {
363 | "cell_type": "markdown",
364 | "metadata": {},
365 | "source": [
366 | "Alongside the loss functions, PyTorch provides several different optimizers, ranging from the classical [Stochastic Gradient Descent](https://pytorch.org/docs/stable/optim.html#torch.optim.SGD) to [RMSprop](https://pytorch.org/docs/stable/optim.html#torch.optim.RMSprop) and [Adam](https://pytorch.org/docs/stable/optim.html#torch.optim.Adam)"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": null,
372 | "metadata": {},
373 | "outputs": [],
374 | "source": [
375 | "optimizer = torch.optim.Adam(model.parameters(), lr=0.001)"
376 | ]
377 | },
378 | {
379 | "cell_type": "markdown",
380 | "metadata": {},
381 | "source": [
382 | "In general, a training loop consists of the following parts:\n",
383 | "1. Clearing the gradients\n",
384 | "2. Obtaining inputs and targets, and, possibly, moving them to the GPU\n",
385 | "3. Performing the forward pass of the model\n",
386 | "4. Calculating the loss\n",
387 | "5. Performing the backward pass\n",
388 | "6. Updating the weights of the network"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": null,
394 | "metadata": {},
395 | "outputs": [],
396 | "source": [
397 | "optimizer.zero_grad()\n",
398 | "\n",
399 | "inputs = torch.rand(1, 4)\n",
400 | "targets = torch.rand(1, 3)\n",
401 | "\n",
402 | "outputs = model(inputs)\n",
403 | "\n",
404 | "loss = criterion(outputs, targets)\n",
405 | "\n",
406 | "loss.backward()\n",
407 | "optimizer.step()"
408 | ]
409 | },
410 | {
411 | "cell_type": "markdown",
412 | "metadata": {},
413 | "source": [
414 | "## Data loading"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {},
421 | "outputs": [],
422 | "source": [
423 | "iris_data = sklearn.datasets.load_iris()"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "metadata": {},
430 | "outputs": [],
431 | "source": [
432 | "iris_data.feature_names"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": null,
438 | "metadata": {},
439 | "outputs": [],
440 | "source": [
441 | "iris_data.data[:10,:]"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": null,
447 | "metadata": {},
448 | "outputs": [],
449 | "source": [
450 | "iris_data.target_names"
451 | ]
452 | },
453 | {
454 | "cell_type": "code",
455 | "execution_count": null,
456 | "metadata": {
457 | "scrolled": true
458 | },
459 | "outputs": [],
460 | "source": [
461 | "iris_data.target"
462 | ]
463 | },
464 | {
465 | "cell_type": "markdown",
466 | "metadata": {},
467 | "source": [
468 | "The [Dataset](https://pytorch.org/docs/stable/data.html?highlight=dataloader#torch.utils.data.Dataset) class provided by PyTorch is an abstract class representing any dataset used as input to a model. It is conveniently designed so that any class subclassing it only has to override the `__len__` and `__getitem__` methods. The goal of the `__getitem__` method is, given an index, to return the corresponding input data.\n",
469 | "\n",
470 | "You might find it useful to have a look at the official [Data Loading and Processing Tutorial](https://pytorch.org/tutorials/beginner/data_loading_tutorial.html) on the PyTorch website."
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": null,
476 | "metadata": {},
477 | "outputs": [],
478 | "source": [
479 | "class IrisDataset(torch.utils.data.Dataset):\n",
480 | "    \"\"\"A PyTorch dataset for the Scikit-learn Iris data\"\"\"\n",
481 | "    def __init__(self, data):\n",
482 | "        \"\"\"\n",
483 | "        Initialize the dataset class\n",
484 | "\n",
485 | "        :param data: Scikit-learn Iris data\n",
486 | "\n",
487 | "        \"\"\"\n",
488 | "        self.features_names = data.feature_names\n",
489 | "        self.target_names = data.target_names\n",
490 | "        self.X = data.data.astype(np.float32)\n",
491 | "        self.y = data.target\n",
492 | "\n",
493 | "    def __getitem__(self, index):\n",
494 | "        \"\"\"\n",
495 | "        Return the item by its index\n",
496 | "\n",
497 | "        :param index: index of the item\n",
498 | "\n",
499 | "        \"\"\"\n",
500 | "        X = self.X[index]\n",
501 | "        y = self.y[index]\n",
502 | "\n",
503 | "        return X, y\n",
504 | "\n",
505 | "    def __len__(self):\n",
506 | "        \"\"\" Return the length of the dataset \"\"\"\n",
507 | "        return len(self.y)"
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": null,
513 | "metadata": {},
514 | "outputs": [],
515 | "source": [
516 | "dataset = IrisDataset(iris_data)"
517 | ]
518 | },
519 | {
520 | "cell_type": "code",
521 | "execution_count": null,
522 | "metadata": {},
523 | "outputs": [],
524 | "source": [
525 | "len(dataset)"
526 | ]
527 | },
528 | {
529 | "cell_type": "code",
530 | "execution_count": null,
531 | "metadata": {},
532 | "outputs": [],
533 | "source": [
534 | "dataset[0]"
535 | ]
536 | },
537 | {
538 | "cell_type": "markdown",
539 | "metadata": {},
540 | "source": [
541 | "[DataLoader](https://pytorch.org/docs/stable/data.html?highlight=dataloader#torch.utils.data.DataLoader) is another useful PyTorch class that combines a dataset and a sampler, and provides single- or multi-process iterators over the dataset. The goal of the data loader is to create batches of training examples for the network by sampling the dataset and collating the sampled items together.\n",
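"\n",
"After the loader is created below, a quick way to peek at a single batch is shown in this sketch (the shapes assume the `IrisDataset` above with `batch_size=10`):\n",
"```python\n",
"X_batch, y_batch = next(iter(dataloader))\n",
"X_batch.shape, y_batch.shape  # (torch.Size([10, 4]), torch.Size([10]))\n",
"```"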
542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [ 550 | "dataloader = torch.utils.data.DataLoader(dataset, batch_size=10, shuffle=True)" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "len(dataloader)" 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": {}, 565 | "source": [ 566 | "## Training loop" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": null, 572 | "metadata": {}, 573 | "outputs": [], 574 | "source": [ 575 | "nb_features = dataset.X.shape[1]\n", 576 | "hidden_size = 32\n", 577 | "nb_classes = len(set(dataset.y))\n", 578 | "\n", 579 | "model = Net(nb_features, hidden_size, nb_classes)\n", 580 | "model = model.to('cuda')" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": {}, 587 | "outputs": [], 588 | "source": [ 589 | "model" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": null, 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [ 598 | "criterion = torch.nn.CrossEntropyLoss()\n", 599 | "optimizer = torch.optim.Adam(model.parameters(), lr=0.001)" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "metadata": {}, 606 | "outputs": [], 607 | "source": [ 608 | "nb_epochs = 9\n", 609 | "\n", 610 | "for i in range(nb_epochs):\n", 611 | " epoch_losses = []\n", 612 | " for X_batch, y_batch in dataloader:\n", 613 | " model.train()\n", 614 | " optimizer.zero_grad()\n", 615 | " \n", 616 | " X_batch = X_batch.to('cuda')\n", 617 | " y_batch = y_batch.to('cuda')\n", 618 | " \n", 619 | " logits = model(X_batch)\n", 620 | " loss = criterion(logits, y_batch)\n", 621 | " \n", 622 | " loss.backward()\n", 623 | " optimizer.step()\n", 624 | " \n", 625 | " epoch_losses.append(loss.item())\n", 626 | " \n", 627 | " epoch_loss = np.mean(epoch_losses)\n", 628 | " print(f'Epoch: {i+1}, loss: {epoch_loss:.3f}')" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": null, 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": null, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [] 644 | } 645 | ], 646 | "metadata": { 647 | "kernelspec": { 648 | "display_name": "Python 3", 649 | "language": "python", 650 | "name": "python3" 651 | }, 652 | "language_info": { 653 | "codemirror_mode": { 654 | "name": "ipython", 655 | "version": 3 656 | }, 657 | "file_extension": ".py", 658 | "mimetype": "text/x-python", 659 | "name": "python", 660 | "nbconvert_exporter": "python", 661 | "pygments_lexer": "ipython3", 662 | "version": "3.7.3" 663 | } 664 | }, 665 | "nbformat": 4, 666 | "nbformat_minor": 2 667 | } 668 | -------------------------------------------------------------------------------- /tutorials/pytorch_track/tutorial2_sentnece_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import csv\n", 11 | "import shutil\n", 12 | "import zipfile\n", 13 | "import pickle\n", 14 | "import itertools\n", 15 | "import urllib.parse\n", 16 | "import urllib.request\n", 17 | "from collections import Counter\n", 18 | "\n", 19 | "import numpy as np\n", 
20 | "import torch\n", 21 | "import torch.utils.data\n", 22 | "import sklearn.datasets\n", 23 | "from sklearn.metrics import accuracy_score" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Introduction" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "In this turorial, we will build a simple neural network for sentence classification using word embeddings. The model simply sums up the embeddings of the tokens in the sentence and pass it through several fully connected layers." 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Dataset" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "We will use the [Stanford Sentiment Treebank](https://nlp.stanford.edu/sentiment/index.html) dataset, converted into a two-way classification problem, where the goal is given an input sentence to determine is it positive or negative." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "def maybe_download_and_unzip_file(file_url, file_name=None):\n", 61 | " \"\"\"\n", 62 | " Download and unzip a remote archive if it does not exists yet\n", 63 | "\n", 64 | " :param file_url: Url of the archive\n", 65 | " :param file_name: (Default value = None) The filename to save the content\n", 66 | "\n", 67 | " \"\"\"\n", 68 | " if file_name is None:\n", 69 | " file_name = os.path.basename(file_url)\n", 70 | " \n", 71 | " if not os.path.exists(file_name):\n", 72 | " print(f'Downloading: {file_name}')\n", 73 | " \n", 74 | " with urllib.request.urlopen(file_url) as response, open(file_name, 'wb') as target_file:\n", 75 | " shutil.copyfileobj(response, target_file)\n", 76 | "\n", 77 | " print(f'Downloaded: {file_name}')\n", 78 | " \n", 79 | " if os.path.splitext(file_name)[1] == '.zip':\n", 80 | " print(f'Extracting: {file_name}')\n", 81 | " with zipfile.ZipFile(file_name, 'r') as zip_file:\n", 82 | " zip_file.extractall('.')\n", 83 | " \n", 84 | " else:\n", 85 | " print(f'Exists: {file_name}')" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "dataset_url = 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8'\n", 95 | "dataset_filename = 'SST-2.zip'" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "train_filename = 'SST-2/train.tsv'\n", 105 | "val_filename = 'SST-2/dev.tsv'" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "maybe_download_and_unzip_file(dataset_url, dataset_filename)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "### Vocabulary\n", 122 | "\n", 123 | "Before the data gets loaded into the model, it has to be converted from raw text to a numeric representation. One way to achieve this is to introduce a token-to-id mapping. More specifically, we will use a vocabulary class that maintains the mapping between tokens and their IDs, and that is able to flexibly add tokens and prune the vocabulary based on the token counts. 
When the input dataset is very large, vocabulary pruning is widely used in practice for more efficient memory usage." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "class Vocab(object):\n", 133 | " \"\"\" Vocabulary class to provide token-to-id correspondence \"\"\"\n", 134 | " END_TOKEN = '</s>'\n", 135 | " START_TOKEN = '<s>'\n", 136 | " PAD_TOKEN = '<pad>'\n", 137 | " UNK_TOKEN = '<unk>'\n", 138 | "\n", 139 | " def __init__(self, special_tokens=None):\n", 140 | " \"\"\"\n", 141 | " Initialize the vocabulary class\n", 142 | "\n", 143 | " :param special_tokens: (Default value = None) A list of special tokens. The PAD token should be the first in the list, if used.\n", 144 | "\n", 145 | " \"\"\"\n", 146 | " super().__init__()\n", 147 | "\n", 148 | " self.special_tokens = special_tokens\n", 149 | "\n", 150 | " self.token2id = {}\n", 151 | " self.id2token = {}\n", 152 | "\n", 153 | " self.token_counts = Counter()\n", 154 | "\n", 155 | " if self.special_tokens is not None:\n", 156 | " self.add_document(self.special_tokens)\n", 157 | "\n", 158 | " def add_document(self, document, rebuild=True):\n", 159 | " \"\"\"\n", 160 | " Process the document and add its tokens to the vocabulary\n", 161 | "\n", 162 | " :param document: A list of tokens in the document\n", 163 | " :param rebuild: (Default value = True) Whether to rebuild the token2id correspondence or not\n", 164 | "\n", 165 | " \"\"\"\n", 166 | " for token in document:\n", 167 | " self.token_counts[token] += 1\n", 168 | "\n", 169 | " if token not in self.token2id:\n", 170 | " self.token2id[token] = len(self.token2id)\n", 171 | "\n", 172 | " if rebuild:\n", 173 | " self._rebuild_id2token()\n", 174 | "\n", 175 | " def add_documents(self, documents):\n", 176 | " \"\"\"\n", 177 | " Process a list of documents and add their tokens to the vocabulary\n", 178 | "\n", 179 | " :param documents: A list of documents, where each document is a list of tokens\n", 180 | "\n", 181 | " \"\"\"\n", 182 | " for doc in documents:\n", 183 | " self.add_document(doc, rebuild=False)\n", 184 | "\n", 185 | " self._rebuild_id2token()\n", 186 | "\n", 187 | " def _rebuild_id2token(self):\n", 188 | " \"\"\" Rebuild the id-to-token correspondence \"\"\"\n", 189 | " self.id2token = {i: t for t, i in self.token2id.items()}\n", 190 | "\n", 191 | " def get(self, item, default=None):\n", 192 | " \"\"\"\n", 193 | " Given a token, return the corresponding id\n", 194 | "\n", 195 | " :param item: A token\n", 196 | " :param default: (Default value = None) Default value to return if the token is not present in the vocabulary\n", 197 | "\n", 198 | " \"\"\"\n", 199 | " return self.token2id.get(item, default)\n", 200 | "\n", 201 | " def __getitem__(self, item):\n", 202 | " \"\"\"\n", 203 | " Given a token, return the corresponding id\n", 204 | "\n", 205 | " :param item: A token\n", 206 | "\n", 207 | " \"\"\"\n", 208 | " return self.token2id[item]\n", 209 | "\n", 210 | " def __contains__(self, item):\n", 211 | " \"\"\"\n", 212 | " Check if a token is present in the vocabulary\n", 213 | "\n", 214 | " :param item: A token\n", 215 | "\n", 216 | " \"\"\"\n", 217 | " return item in self.token2id\n", 218 | "\n", 219 | " def __len__(self):\n", 220 | " \"\"\" Return the length of the vocabulary \"\"\"\n", 221 | " return len(self.token2id)\n", 222 | "\n", 223 | " def __str__(self):\n", 224 | " \"\"\" Get a string representation of the vocabulary \"\"\"\n", 225 | " return f'{len(self)} tokens'" 226 | ]
227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "Now, let's create a dataset class. Notice how the vocabulary can be shared between the train and the test datasets." 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "class SSTDataset(torch.utils.data.Dataset):\n", 242 | " \"\"\" A PyTorch dataset for the SST-2 sentence classification task \"\"\"\n", 243 | " def __init__(self, filename, vocab=None, max_len=None):\n", 244 | " \"\"\"\n", 245 | " Initialize the Stanford Sentiment Treebank Dataset\n", 246 | "\n", 247 | " :param filename: Path to the dataset from the GLUE benchmark\n", 248 | " :param vocab: (Default value = None) Vocabulary to use, will be created if None\n", 249 | " :param max_len: (Default value = None) Maximum length of the sentences; longer sentences will be truncated\n", 250 | "\n", 251 | " \"\"\"\n", 252 | " super().__init__()\n", 253 | " \n", 254 | " data = self._load_file(filename)\n", 255 | " \n", 256 | " self.sentences = [sent.split(' ') for sent, label in data]\n", 257 | " self.labels = [int(label) for sent, label in data]\n", 258 | " \n", 259 | " print(f'Sentences: {len(self.sentences)}')\n", 260 | " print(f'Labels: {len(self.labels)}')\n", 261 | " \n", 262 | " if vocab is None: \n", 263 | " vocab = Vocab(special_tokens=[Vocab.PAD_TOKEN, Vocab.UNK_TOKEN])\n", 264 | " vocab.add_documents(self.sentences)\n", 265 | " print(f'Creating vocab: {vocab}')\n", 266 | " \n", 267 | " if max_len is None:\n", 268 | " max_len = max(len(s) for s in self.sentences)\n", 269 | " print(f'Calculating max len: {max_len}')\n", 270 | " \n", 271 | " self.max_len = max_len\n", 272 | " self.vocab = vocab\n", 273 | " \n", 274 | " def _load_file(self, filename):\n", 275 | " \"\"\"\n", 276 | " Read the dataset from the file\n", 277 | "\n", 278 | " :param filename: Path to the dataset\n", 279 | "\n", 280 | " \"\"\"\n", 281 | " with open(filename, 'r') as csv_file:\n", 282 | " reader = csv.DictReader(csv_file, delimiter='\\t')\n", 283 | " data = [(r['sentence'].strip(), r['label']) for r in reader]\n", 284 | " \n", 285 | " return data\n", 286 | " \n", 287 | " def _pad_sentence(self, sent):\n", 288 | " \"\"\"\n", 289 | " Cut the sentence if needed and pad it to the maximum length\n", 290 | "\n", 291 | " :param sent: The input sentence\n", 292 | "\n", 293 | " \"\"\"\n", 294 | " sent = sent[:self.max_len]\n", 295 | " \n", 296 | " nb_pad = self.max_len - len(sent)\n", 297 | " sent = sent + [Vocab.PAD_TOKEN,] * nb_pad\n", 298 | " \n", 299 | " return sent\n", 300 | " \n", 301 | " def __getitem__(self, index):\n", 302 | " \"\"\"\n", 303 | " Return a processed, ready-to-be-batched item from the dataset by its index\n", 304 | "\n", 305 | " :param index: The index of the sentence in the dataset\n", 306 | "\n", 307 | " \"\"\"\n", 308 | " sent = self.sentences[index]\n", 309 | " label = self.labels[index]\n", 310 | " \n", 311 | " sent = self._pad_sentence(sent)\n", 312 | " sent = [self.vocab[t] if t in self.vocab else self.vocab[Vocab.UNK_TOKEN] for t in sent]\n", 313 | " sent = np.array(sent, dtype=np.long)\n", 314 | " \n", 315 | " return sent, label\n", 316 | " \n", 317 | " def __len__(self):\n", 318 | " \"\"\" Return the length of the dataset \"\"\"\n", 319 | " return len(self.labels)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "dataset_train = SSTDataset(train_filename)" 329 | ] 330 | },
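{ "cell_type": "markdown", "metadata": {}, "source": [ "As a quick sanity check, we can query the freshly built vocabulary directly (illustrative: the exact ids depend on the order in which tokens were added, but the PAD token gets id 0 because it is the first special token, and `get` falls back to the UNK id if a token is missing):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(dataset_train.vocab)\n", "print(dataset_train.vocab[Vocab.PAD_TOKEN])\n", "print(dataset_train.vocab.get('movie', dataset_train.vocab[Vocab.UNK_TOKEN]))" ] },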
331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "dataset_val = SSTDataset(val_filename, vocab=dataset_train.vocab, max_len=dataset_train.max_len)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "dataset_train[0]" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "## Word embeddings" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "We'll use the [fastText](https://fasttext.cc/) embeddings, trained on Common Crawl. We've converted them into a dictionary and pickled them using the standard `pickle` module." 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "embeddings_url = 'https://mednli.blob.core.windows.net/shared/word_embeddings/crawl-300d-2M.pickled'\n", 370 | "embeddings_filename = 'crawl-300d-2M.pickled'" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "scrolled": true 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "maybe_download_and_unzip_file(embeddings_url, embeddings_filename)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "with open(embeddings_filename, 'rb') as pkl_file:\n", 391 | " word_embeddings = pickle.load(pkl_file)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "print(f'Word embeddings: {len(word_embeddings)} tokens, shape {word_embeddings[list(word_embeddings.keys())[0]].shape}')" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "list(word_embeddings.keys())[:10]" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "word_embeddings['cat'].shape" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "word_embeddings['cat'][:20]" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "### Embedding matrix" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "Since we do not need all the embeddings, let's create a matrix, where each row will correspond to a token in the vocabulary and will contain the corresponding embedding."
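, "\n", "\n", "Concretely, row `i` of the matrix will hold the embedding of `vocab.id2token[i]`: the PAD row is all zeros, tokens missing from fastText get a small random vector, and every other row is copied straight from the dictionary. For example (a sketch, assuming `'cat'` occurs in the training sentences):\n", "\n", "```python\n", "i = dataset_train.vocab['cat']\n", "W_emb[i]  # == word_embeddings['cat'], a 300-dimensional float32 vector\n", "```"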
442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "def create_embeddings_matrix(word_embeddings, vocab):\n", 451 | " \"\"\"\n", 452 | " Given a word embeddings dictionary and the vocabulary, construct the embeddings matrix, where each row corresponds to a token and contains the embedding of this token\n", 453 | "\n", 454 | " :param word_embeddings: Word embeddings dictionary, token -> numpy array\n", 455 | " :param vocab: Vocabulary\n", 456 | "\n", 457 | " \"\"\"\n", 458 | " embedding_size = word_embeddings[list(word_embeddings.keys())[0]].shape[0]\n", 459 | "\n", 460 | " W_emb = np.zeros((len(vocab), embedding_size), dtype=np.float32)\n", 461 | " \n", 462 | " special_tokens = {\n", 463 | " t: np.random.uniform(-0.3, 0.3, (embedding_size,))\n", 464 | " for t in (Vocab.UNK_TOKEN, )\n", 465 | " }\n", 466 | " special_tokens[Vocab.PAD_TOKEN] = np.zeros((embedding_size,))\n", 467 | "\n", 468 | " nb_unk = 0\n", 469 | " for i, t in vocab.id2token.items():\n", 470 | " if t in special_tokens:\n", 471 | " W_emb[i] = special_tokens[t]\n", 472 | " else:\n", 473 | " if t in word_embeddings:\n", 474 | " W_emb[i] = word_embeddings[t]\n", 475 | " else:\n", 476 | " W_emb[i] = np.random.uniform(-0.3, 0.3, embedding_size)\n", 477 | " nb_unk += 1\n", 478 | "\n", 479 | " print(f'Nb unk: {nb_unk}')\n", 480 | "\n", 481 | " return W_emb" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [ 490 | "len(dataset_train.vocab)" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "W_emb = create_embeddings_matrix(word_embeddings, dataset_train.vocab)" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "## Model" 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": {}, 512 | "source": [ 513 | "Finally, let's declare a simple model. Notice how we put fully connected layers inside a `torch.nn.Sequential` container."
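, "\n", "\n", "The \"bag-of-words\" part of the model is the pooling step in `forward`: the token embeddings are summed and divided by the number of real (non-PAD) tokens, i.e. averaged, before going into the classifier. Since the PAD rows of the embedding matrix are zeros, they do not affect the sum. A sketch of that step, with the same names as in the `forward` method below:\n", "\n", "```python\n", "embedded = self.embedding(inputs)  # (batch, seq_len, emb_size)\n", "inputs_lengths = torch.sum(inputs != self.pad_index, dim=1).long()\n", "z = torch.sum(embedded, dim=1) / inputs_lengths.unsqueeze(-1).float()  # (batch, emb_size)\n", "```"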
514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [ 522 | "class BOWModel(torch.nn.Module):\n", 523 | " \"\"\" A simple bag-of-words sentence classifier \"\"\"\n", 524 | " def __init__(self, vocab_size, embedding_size, hidden_size, dropout, trainable_embeddings, nb_classes, pad_index, W_emb=None):\n", 525 | " \"\"\"\n", 526 | " Initialize a simple feedforward Bag-of-words model with several hidden layers\n", 527 | "\n", 528 | " :param vocab_size: Vocabulary size\n", 529 | " :param embedding_size: Dimension of the embeddings\n", 530 | " :param hidden_size: The size of the hidden layers\n", 531 | " :param dropout: Dropout probability\n", 532 | " :param trainable_embeddings: Whether the embedding layer will be trainable or frozen\n", 533 | " :param nb_classes: Number of the classes to classify the input to\n", 534 | " :param pad_index: Index of the PAD token\n", 535 | " :param W_emb: (Default value = None) Initial values of the embedding layer, a numpy array\n", 536 | "\n", 537 | " \"\"\"\n", 538 | " super().__init__()\n", 539 | "\n", 540 | " self.pad_index = pad_index\n", 541 | " \n", 542 | " self.embedding = torch.nn.Embedding(vocab_size, embedding_size, padding_idx=pad_index)\n", 543 | " if W_emb is not None:\n", 544 | " self.embedding.weight.data.copy_(torch.from_numpy(W_emb))\n", 545 | " if not trainable_embeddings:\n", 546 | " self.embedding.weight.requires_grad = False\n", 547 | "\n", 548 | " self.classifier = torch.nn.Sequential(\n", 549 | " torch.nn.Linear(embedding_size, hidden_size),\n", 550 | " torch.nn.ReLU(),\n", 551 | " torch.nn.Dropout(dropout),\n", 552 | " torch.nn.Linear(hidden_size, hidden_size),\n", 553 | " torch.nn.ReLU(),\n", 554 | " torch.nn.Dropout(dropout),\n", 555 | " torch.nn.Linear(hidden_size, nb_classes),\n", 556 | " )\n", 557 | "\n", 558 | " \n", 559 | " def forward(self, inputs):\n", 560 | " \"\"\"\n", 561 | " Perform the forward pass of the model\n", 562 | "\n", 563 | " :param inputs: Input sentences, as a batch of token-id sequences\n", 564 | "\n", 565 | " \"\"\"\n", 566 | " embedded = self.embedding(inputs)\n", 567 | " inputs_lengths = torch.sum(inputs != self.pad_index, dim=1).long()\n", 568 | " \n", 569 | " z = torch.sum(embedded, dim=1) / inputs_lengths.unsqueeze(-1).float()\n", 570 | " \n", 571 | " logits = self.classifier(z)\n", 572 | " \n", 573 | " return logits" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "hidden_size = 128\n", 583 | "dropout = 0.3\n", 584 | "trainable_embeddings = False" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "model = BOWModel(\n", 594 | " vocab_size=len(dataset_train.vocab), \n", 595 | " embedding_size=W_emb.shape[1], \n", 596 | " hidden_size=hidden_size, \n", 597 | " dropout=dropout, \n", 598 | " trainable_embeddings=trainable_embeddings, \n", 599 | " nb_classes=len(set(dataset_train.labels)), \n", 600 | " pad_index=dataset_train.vocab[Vocab.PAD_TOKEN], \n", 601 | " W_emb=W_emb\n", 602 | ")" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [ 611 | "model = model.to('cuda')" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": null, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [ 620 | "model" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata":
{}, 626 | "source": [ 627 | "## Training" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": null, 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "batch_size=256\n", 637 | "nb_epochs = 5\n", 638 | "learning_rate=0.001\n", 639 | "weight_decay = 0.00001" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [ 648 | "dataloader_train = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, shuffle=True)\n", 649 | "dataloader_val = torch.utils.data.DataLoader(dataset_val, batch_size=batch_size, shuffle=False)" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": null, 655 | "metadata": {}, 656 | "outputs": [], 657 | "source": [ 658 | "criterion = torch.nn.CrossEntropyLoss()\n", 659 | "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)" 660 | ] 661 | }, 662 | { 663 | "cell_type": "markdown", 664 | "metadata": {}, 665 | "source": [ 666 | "Run the training!" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": null, 672 | "metadata": {}, 673 | "outputs": [], 674 | "source": [ 675 | "for i in range(nb_epochs):\n", 676 | " epoch_losses_train = []\n", 677 | " epoch_losses_val = []\n", 678 | " epoch_predictions = []\n", 679 | " epoch_targets = []\n", 680 | " \n", 681 | " for inputs, targets in dataloader_train:\n", 682 | " model.train()\n", 683 | " optimizer.zero_grad()\n", 684 | " \n", 685 | " inputs = inputs.to('cuda')\n", 686 | " targets = targets.to('cuda')\n", 687 | " \n", 688 | " logits = model(inputs)\n", 689 | " loss = criterion(logits, targets)\n", 690 | " \n", 691 | " loss.backward()\n", 692 | " optimizer.step()\n", 693 | " \n", 694 | " epoch_losses_train.append(loss.item())\n", 695 | "\n", 696 | " # calc accuracy on the dev set\n", 697 | " for inputs, targets in dataloader_val:\n", 698 | " model.eval()\n", 699 | " \n", 700 | " with torch.no_grad():\n", 701 | " inputs = inputs.to('cuda')\n", 702 | " targets = targets.to('cuda')\n", 703 | "\n", 704 | " logits = model(inputs)\n", 705 | " loss = criterion(logits, targets)\n", 706 | " pred = torch.argmax(logits, dim=1)\n", 707 | "\n", 708 | " epoch_losses_val.append(loss.item())\n", 709 | " epoch_predictions.append(pred.cpu().numpy())\n", 710 | " epoch_targets.append(targets.cpu().numpy())\n", 711 | " \n", 712 | " epoch_predictions = np.concatenate(epoch_predictions, axis=0)\n", 713 | " epoch_targets = np.concatenate(epoch_targets, axis=0)\n", 714 | " epoch_accuracy = accuracy_score(epoch_targets, epoch_predictions)\n", 715 | " epoch_loss_train = np.mean(epoch_losses_train)\n", 716 | " epoch_loss_val = np.mean(epoch_losses_val) \n", 717 | " \n", 718 | " print(f'Epoch: {i+1}, train loss: {epoch_loss_train:.3f}, val loss: {epoch_loss_val:.3f}, accuracy: {epoch_accuracy:.3f}')" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": null, 724 | "metadata": {}, 725 | "outputs": [], 726 | "source": [] 727 | } 728 | ], 729 | "metadata": { 730 | "kernelspec": { 731 | "display_name": "Python 3", 732 | "language": "python", 733 | "name": "python3" 734 | }, 735 | "language_info": { 736 | "codemirror_mode": { 737 | "name": "ipython", 738 | "version": 3 739 | }, 740 | "file_extension": ".py", 741 | "mimetype": "text/x-python", 742 | "name": "python", 743 | "nbconvert_exporter": "python", 744 | "pygments_lexer": "ipython3", 745 | "version": "3.7.3" 746 | } 747 | }, 748 | "nbformat": 4, 749 | 
"nbformat_minor": 2 750 | } 751 | -------------------------------------------------------------------------------- /tutorials/pytorch_track/tutorial4_finetuning_bert.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import csv\n", 10 | "import logging\n", 11 | "import os\n", 12 | "import random\n", 13 | "import sys\n", 14 | "\n", 15 | "import numpy as np\n", 16 | "import torch\n", 17 | "from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset\n", 18 | "from torch.utils.data.distributed import DistributedSampler\n", 19 | "from tqdm import tqdm, trange, tqdm_notebook\n", 20 | "\n", 21 | "from torch.nn import CrossEntropyLoss\n", 22 | "from sklearn.metrics import f1_score\n", 23 | "\n", 24 | "from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME\n", 25 | "from pytorch_pretrained_bert.modeling import BertModel, BertForMaskedLM, BertForSequenceClassification, BertConfig\n", 26 | "from pytorch_pretrained_bert.tokenization import BertTokenizer\n", 27 | "from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Introduction" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "In this turorial, we will fine-tune the famous BERT model to solve specific tasks, such as classification, question answering, and NER\n", 42 | "\n", 43 | "\n", 44 | "This turorial uses a BERT implementation by [Hugging Face](https://huggingface.co/) and is based on an example from [the GitHub repository](https://github.com/huggingface/pytorch-pretrained-BERT) which can be installed by running `pip install https://github.com/huggingface/pytorch-pretrained-BERT/releases/download/v0.6.2/pytorch_pretrained_bert-0.6.2-py3-none-any.whl`" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## BERT model" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "Let's see the BERT in action on a task of masked language modelling." 
59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "First, we need to choose which model we are going to use:\n", 66 | "\n", 67 | "- `bert-base-uncased`: 12-layer, 768-hidden, 12-heads, 110M parameters\n", 68 | "- `bert-large-uncased`: 24-layer, 1024-hidden, 16-heads, 340M parameters\n", 69 | "- `bert-base-cased`: 12-layer, 768-hidden, 12-heads, 110M parameters\n", 70 | "- `bert-large-cased`: 24-layer, 1024-hidden, 16-heads, 340M parameters\n", 71 | "- `bert-base-multilingual-uncased`: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters\n", 72 | "- `bert-base-multilingual-cased`: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "bert_model = 'bert-base-uncased'" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# Load pre-trained model tokenizer (vocabulary)\n", 91 | "tokenizer = BertTokenizer.from_pretrained(bert_model)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "text = '[CLS] This summer school is great ! [SEP] I love it ! [SEP]'" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "tokenized_text = tokenizer.tokenize(text)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "Let's inspect the tokenized text. Note that WordPiece tokenization can split rare words into multiple subword tokens." 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "tokenized_text" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# Mask a token that we will try to predict back with `BertForMaskedLM`\n", 135 | "masked_index = 5\n", 136 | "tokenized_text[masked_index] = '[MASK]'" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "tokenized_text" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "# Convert tokens to vocabulary indices\n", 155 | "indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n", 156 | "# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)\n", 157 | "segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "indexed_tokens" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# Convert inputs to PyTorch tensors\n", 176 | "tokens_tensor = torch.tensor([indexed_tokens])\n", 177 | "segments_tensors = torch.tensor([segments_ids])" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "tokens_tensor" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192
| "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "segments_tensors" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "# Load pre-trained model (weights)\n", 205 | "model = BertForMaskedLM.from_pretrained(bert_model)\n", 206 | "model.eval();" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# If you have a GPU, put everything on cuda\n", 216 | "tokens_tensor = tokens_tensor.to('cuda')\n", 217 | "segments_tensors = segments_tensors.to('cuda')\n", 218 | "model = model.to('cuda')" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "# Predict all tokens\n", 228 | "with torch.no_grad():\n", 229 | " predictions = model(tokens_tensor, segments_tensors)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "predictions.shape" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "Get the prediction - it should be 'henson'" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "# confirm we were able to predict 'henson'\n", 255 | "predicted_index = torch.argmax(predictions[0, masked_index]).item()\n", 256 | "predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "predicted_token" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "## Fine-tuning BERT" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "Now, let's fine-tune BERT on a classification task. We are going to use the [Microsoft Research Paraphrase Corpus](https://www.microsoft.com/en-us/download/details.aspx?id=52398), reformatted for the GLUE benchmark, where the goal is given two sentneces, predict whether they are a paraphrase or not. [This script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) helps with the formatting of the MRPC dataset." 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "class InputExample(object):\n", 289 | " \"\"\"A single training/test example for simple sequence classification.\"\"\"\n", 290 | " def __init__(self, guid, text_a, text_b=None, label=None):\n", 291 | " \"\"\"Constructs a InputExample.\n", 292 | " Args:\n", 293 | " guid: Unique id for the example.\n", 294 | " text_a: string. The untokenized text of the first sequence. For single sequence tasks, only this sequence must be specified.\n", 295 | " text_b: (Optional) string. The untokenized text of the second sequence. Only must be specified for sequence pair tasks.\n", 296 | " label: (Optional) string. The label of the example. 
This should be specified for train and dev examples, but not for test examples.\n", 297 | " \"\"\" \n", 298 | " self.guid = guid\n", 299 | " self.text_a = text_a\n", 300 | " self.text_b = text_b\n", 301 | " self.label = label" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "class InputFeatures(object):\n", 311 | " \"\"\"A single set of features of data, ready to be used by the model\"\"\"\n", 312 | " def __init__(self, input_ids, input_mask, segment_ids, label_id):\n", 313 | " self.input_ids = input_ids\n", 314 | " self.input_mask = input_mask\n", 315 | " self.segment_ids = segment_ids\n", 316 | " self.label_id = label_id" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "class DataProcessor(object):\n", 326 | " \"\"\"Base class for data converters for sequence classification data sets.\"\"\"\n", 327 | "\n", 328 | " def get_train_examples(self, data_dir):\n", 329 | " \"\"\"Gets a collection of `InputExample`s for the train set.\"\"\"\n", 330 | " raise NotImplementedError()\n", 331 | "\n", 332 | " def get_dev_examples(self, data_dir):\n", 333 | " \"\"\"Gets a collection of `InputExample`s for the dev set.\"\"\"\n", 334 | " raise NotImplementedError()\n", 335 | "\n", 336 | " def get_labels(self):\n", 337 | " \"\"\"Gets the list of labels for this data set.\"\"\"\n", 338 | " raise NotImplementedError()\n", 339 | "\n", 340 | " @classmethod\n", 341 | " def _read_tsv(cls, input_file, quotechar=None):\n", 342 | " \"\"\"Reads a tab separated value file.\"\"\"\n", 343 | " with open(input_file, \"r\", encoding=\"utf-8\") as f:\n", 344 | " reader = csv.reader(f, delimiter=\"\\t\", quotechar=quotechar)\n", 345 | " lines = []\n", 346 | " for line in reader:\n", 347 | " if sys.version_info[0] == 2:\n", 348 | " line = list(unicode(cell, 'utf-8') for cell in line)\n", 349 | " lines.append(line)\n", 350 | " \n", 351 | " return lines" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "class MrpcProcessor(DataProcessor):\n", 361 | " \"\"\"Processor for the MRPC data set (GLUE version).\"\"\"\n", 362 | "\n", 363 | " def get_train_examples(self, data_dir):\n", 364 | " \"\"\"See base class.\"\"\"\n", 365 | " print(\"LOOKING AT {}\".format(os.path.join(data_dir, \"train.tsv\")))\n", 366 | " return self._create_examples(self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")\n", 367 | "\n", 368 | " def get_dev_examples(self, data_dir):\n", 369 | " \"\"\"See base class.\"\"\"\n", 370 | " return self._create_examples(self._read_tsv(os.path.join(data_dir, \"dev.tsv\")), \"dev\")\n", 371 | "\n", 372 | " def get_labels(self):\n", 373 | " \"\"\"See base class.\"\"\"\n", 374 | " return [\"0\", \"1\"]\n", 375 | "\n", 376 | " def _create_examples(self, lines, set_type):\n", 377 | " \"\"\"Creates examples for the training and dev sets.\"\"\"\n", 378 | " examples = []\n", 379 | " for (i, line) in enumerate(lines):\n", 380 | " if i == 0:\n", 381 | " continue\n", 382 | " guid = \"%s-%s\" % (set_type, i)\n", 383 | " text_a = line[3]\n", 384 | " text_b = line[4]\n", 385 | " label = line[0]\n", 386 | " examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n", 387 | " \n", 388 | " return examples" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | 
"metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):\n", 398 | " \"\"\"Loads a data file into a list of `InputBatch`s.\"\"\"\n", 399 | "\n", 400 | " label_map = {label : i for i, label in enumerate(label_list)}\n", 401 | "\n", 402 | " features = []\n", 403 | " for (ex_index, example) in enumerate(examples):\n", 404 | " if ex_index % 10000 == 0:\n", 405 | " print(\"Creating example %d of %d\" % (ex_index, len(examples)))\n", 406 | "\n", 407 | " tokens_a = tokenizer.tokenize(example.text_a)\n", 408 | "\n", 409 | " tokens_b = None\n", 410 | " if example.text_b:\n", 411 | " tokens_b = tokenizer.tokenize(example.text_b)\n", 412 | " # Modifies `tokens_a` and `tokens_b` in place so that the total\n", 413 | " # length is less than the specified length.\n", 414 | " # Account for [CLS], [SEP], [SEP] with \"- 3\"\n", 415 | " _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)\n", 416 | " else:\n", 417 | " # Account for [CLS] and [SEP] with \"- 2\"\n", 418 | " if len(tokens_a) > max_seq_length - 2:\n", 419 | " tokens_a = tokens_a[:(max_seq_length - 2)]\n", 420 | "\n", 421 | " # The convention in BERT is:\n", 422 | " # (a) For sequence pairs:\n", 423 | " # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n", 424 | " # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1\n", 425 | " # (b) For single sequences:\n", 426 | " # tokens: [CLS] the dog is hairy . [SEP]\n", 427 | " # type_ids: 0 0 0 0 0 0 0\n", 428 | " #\n", 429 | " # Where \"type_ids\" are used to indicate whether this is the first\n", 430 | " # sequence or the second sequence. The embedding vectors for `type=0` and\n", 431 | " # `type=1` were learned during pre-training and are added to the wordpiece\n", 432 | " # embedding vector (and position vector). This is not *strictly* necessary\n", 433 | " # since the [SEP] token unambiguously separates the sequences, but it makes\n", 434 | " # it easier for the model to learn the concept of sequences.\n", 435 | " #\n", 436 | " # For classification tasks, the first vector (corresponding to [CLS]) is\n", 437 | " # used as as the \"sentence vector\". Note that this only makes sense because\n", 438 | " # the entire model is fine-tuned.\n", 439 | " tokens = [\"[CLS]\"] + tokens_a + [\"[SEP]\"]\n", 440 | " segment_ids = [0] * len(tokens)\n", 441 | "\n", 442 | " if tokens_b:\n", 443 | " tokens += tokens_b + [\"[SEP]\"]\n", 444 | " segment_ids += [1] * (len(tokens_b) + 1)\n", 445 | "\n", 446 | " input_ids = tokenizer.convert_tokens_to_ids(tokens)\n", 447 | "\n", 448 | " # The mask has 1 for real tokens and 0 for padding tokens. 
Only real\n", 449 | " # tokens are attended to.\n", 450 | " input_mask = [1] * len(input_ids)\n", 451 | "\n", 452 | " # Zero-pad up to the sequence length.\n", 453 | " padding = [0] * (max_seq_length - len(input_ids))\n", 454 | " input_ids += padding\n", 455 | " input_mask += padding\n", 456 | " segment_ids += padding\n", 457 | "\n", 458 | " assert len(input_ids) == max_seq_length\n", 459 | " assert len(input_mask) == max_seq_length\n", 460 | " assert len(segment_ids) == max_seq_length\n", 461 | "\n", 462 | " label_id = label_map[example.label]\n", 463 | "\n", 464 | " if ex_index < 1:\n", 465 | " print(\"*** Example ***\")\n", 466 | " print(\"guid: %s\" % (example.guid))\n", 467 | " print(\"tokens: %s\" % \" \".join([str(x) for x in tokens]))\n", 468 | " print(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n", 469 | " print(\"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n", 470 | " print(\"segment_ids: %s\" % \" \".join([str(x) for x in segment_ids]))\n", 471 | " print(\"label: %s (id = %d)\" % (example.label, label_id))\n", 472 | "\n", 473 | " features.append(InputFeatures(\n", 474 | " input_ids=input_ids,\n", 475 | " input_mask=input_mask,\n", 476 | " segment_ids=segment_ids,\n", 477 | " label_id=label_id\n", 478 | " ))\n", 479 | " \n", 480 | " return features" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [ 489 | "def _truncate_seq_pair(tokens_a, tokens_b, max_length):\n", 490 | " \"\"\"Truncates a sequence pair in place to the maximum length.\"\"\"\n", 491 | "\n", 492 | " # This is a simple heuristic which will always truncate the longer sequence\n", 493 | " # one token at a time. This makes more sense than truncating an equal percent\n", 494 | " # of tokens from each, since if one sequence is very short then each token\n", 495 | " # that's truncated likely contains more information than a longer sequence.\n", 496 | " while True:\n", 497 | " total_length = len(tokens_a) + len(tokens_b)\n", 498 | " if total_length <= max_length:\n", 499 | " break\n", 500 | " if len(tokens_a) > len(tokens_b):\n", 501 | " tokens_a.pop()\n", 502 | " else:\n", 503 | " tokens_b.pop()" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "def simple_accuracy(preds, labels):\n", 513 | " return (preds == labels).mean()\n", 514 | "\n", 515 | "\n", 516 | "def acc_and_f1(preds, labels):\n", 517 | " acc = simple_accuracy(preds, labels)\n", 518 | " f1 = f1_score(y_true=labels, y_pred=preds)\n", 519 | " return {\n", 520 | " \"acc\": acc,\n", 521 | " \"f1\": f1,\n", 522 | " \"acc_and_f1\": (acc + f1) / 2,\n", 523 | " }" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "### Parameters" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "data_dir = '/home/aromanov/projects/bert_explained/data/datasets/glue/mrpc/'\n", 540 | "bert_model = 'bert-base-uncased'\n", 541 | "task_name = 'mrpc'\n", 542 | "output_dir = 'tmp/mrpc/'\n", 543 | "max_seq_length = 128\n", 544 | "do_lower_case = True\n", 545 | "train_batch_size = 16\n", 546 | "eval_batch_size = 64\n", 547 | "learning_rate = 5e-5\n", 548 | "num_train_epochs = 1\n", 549 | "warmup_proportion = 0.1" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": {}, 556 
| "outputs": [], 557 | "source": [ 558 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": {}, 565 | "outputs": [], 566 | "source": [ 567 | "print(f\"device: {device}\")" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": null, 573 | "metadata": {}, 574 | "outputs": [], 575 | "source": [ 576 | "if not os.path.exists(output_dir):\n", 577 | " os.makedirs(output_dir)" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [ 586 | "processor = MrpcProcessor()" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "label_list = processor.get_labels()\n", 596 | "num_labels = len(label_list)" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": null, 602 | "metadata": {}, 603 | "outputs": [], 604 | "source": [ 605 | "tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": null, 611 | "metadata": {}, 612 | "outputs": [], 613 | "source": [ 614 | "train_examples = None\n", 615 | "num_train_optimization_steps = None\n", 616 | "\n", 617 | "train_examples = processor.get_train_examples(data_dir)\n", 618 | "num_train_optimization_steps = int(len(train_examples) / train_batch_size) * num_train_epochs" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "metadata": {}, 625 | "outputs": [], 626 | "source": [ 627 | "# Prepare model\n", 628 | "model = BertForSequenceClassification.from_pretrained(bert_model, num_labels=num_labels)\n", 629 | "model = model.to(device)" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "metadata": {}, 636 | "outputs": [], 637 | "source": [ 638 | "# Prepare optimizer\n", 639 | "param_optimizer = list(model.named_parameters())\n", 640 | "no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']\n", 641 | "optimizer_grouped_parameters = [\n", 642 | " {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},\n", 643 | " {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}\n", 644 | "]\n", 645 | "\n", 646 | "optimizer = BertAdam(optimizer_grouped_parameters,\n", 647 | " lr=learning_rate,\n", 648 | " warmup=warmup_proportion,\n", 649 | " t_total=num_train_optimization_steps)" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": null, 655 | "metadata": {}, 656 | "outputs": [], 657 | "source": [ 658 | "global_step = 0\n", 659 | "nb_tr_steps = 0\n", 660 | "tr_loss = 0" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": null, 666 | "metadata": {}, 667 | "outputs": [], 668 | "source": [ 669 | "train_features = convert_examples_to_features(train_examples, label_list, max_seq_length, tokenizer)" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "print(f\" Num examples = {len(train_examples)}\")\n", 679 | "print(f\" Batch size = {train_batch_size}\")\n", 680 | "print(f\" Num steps = {num_train_optimization_steps}\")" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": null, 686 | "metadata": {}, 
687 | "outputs": [], 688 | "source": [ 689 | "all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)\n", 690 | "all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)\n", 691 | "all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)\n", 692 | "all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)\n", 693 | "\n", 694 | "train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)\n", 695 | "train_sampler = RandomSampler(train_data)\n", 696 | "train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": null, 702 | "metadata": {}, 703 | "outputs": [], 704 | "source": [ 705 | "model.train()\n", 706 | "\n", 707 | "for _ in tqdm_notebook(range(num_train_epochs), desc=\"Epoch\"):\n", 708 | " tr_loss = 0\n", 709 | " nb_tr_examples, nb_tr_steps = 0, 0\n", 710 | " for step, batch in enumerate(tqdm_notebook(train_dataloader, desc=\"Iteration\")):\n", 711 | " batch = tuple(t.to(device) for t in batch)\n", 712 | " input_ids, input_mask, segment_ids, label_ids = batch\n", 713 | "\n", 714 | " logits = model(input_ids, segment_ids, input_mask, labels=None)\n", 715 | "\n", 716 | " loss_fct = CrossEntropyLoss()\n", 717 | " loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))\n", 718 | " loss.backward()\n", 719 | "\n", 720 | " tr_loss += loss.item()\n", 721 | " nb_tr_examples += input_ids.size(0)\n", 722 | " nb_tr_steps += 1\n", 723 | "\n", 724 | " optimizer.step()\n", 725 | " optimizer.zero_grad()\n", 726 | " global_step += 1" 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": null, 732 | "metadata": {}, 733 | "outputs": [], 734 | "source": [ 735 | "# Save a trained model, configuration and tokenizer\n", 736 | "model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self\n", 737 | "\n", 738 | "# If we save using the predefined names, we can load using `from_pretrained`\n", 739 | "output_model_file = os.path.join(output_dir, WEIGHTS_NAME)\n", 740 | "output_config_file = os.path.join(output_dir, CONFIG_NAME)\n", 741 | "\n", 742 | "torch.save(model_to_save.state_dict(), output_model_file)\n", 743 | "model_to_save.config.to_json_file(output_config_file)\n", 744 | "tokenizer.save_vocabulary(output_dir)" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": null, 750 | "metadata": {}, 751 | "outputs": [], 752 | "source": [ 753 | "# Load a trained model and vocabulary that you have fine-tuned\n", 754 | "model = BertForSequenceClassification.from_pretrained(output_dir, num_labels=num_labels)\n", 755 | "tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=do_lower_case)\n", 756 | "model = model.to(device)" 757 | ] 758 | }, 759 | { 760 | "cell_type": "code", 761 | "execution_count": null, 762 | "metadata": {}, 763 | "outputs": [], 764 | "source": [ 765 | "eval_examples = processor.get_dev_examples(data_dir)\n", 766 | "eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer)" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": null, 772 | "metadata": {}, 773 | "outputs": [], 774 | "source": [ 775 | "print(\" Num examples = {len(eval_examples)}\")\n", 776 | "print(\" Batch size = {eval_batch_size}\")" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 
null, 782 | "metadata": {}, 783 | "outputs": [], 784 | "source": [ 785 | "all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)\n", 786 | "all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)\n", 787 | "all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)\n", 788 | "all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)\n", 789 | "\n", 790 | "eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)\n", 791 | "eval_sampler = SequentialSampler(eval_data)\n", 792 | "eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": null, 798 | "metadata": {}, 799 | "outputs": [], 800 | "source": [ 801 | "model.eval()\n", 802 | "\n", 803 | "eval_loss = 0\n", 804 | "nb_eval_steps = 0\n", 805 | "preds = []\n", 806 | "\n", 807 | "for input_ids, input_mask, segment_ids, label_ids in tqdm_notebook(eval_dataloader, desc=\"Evaluating\"):\n", 808 | " input_ids = input_ids.to(device)\n", 809 | " input_mask = input_mask.to(device)\n", 810 | " segment_ids = segment_ids.to(device)\n", 811 | " label_ids = label_ids.to(device)\n", 812 | "\n", 813 | " with torch.no_grad():\n", 814 | " logits = model(input_ids, segment_ids, input_mask, labels=None)\n", 815 | "\n", 816 | " # compute the eval loss and the other metrics required by the task\n", 817 | " loss_fct = CrossEntropyLoss()\n", 818 | " tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))\n", 819 | " eval_loss += tmp_eval_loss.mean().item()\n", 820 | " \n", 821 | " nb_eval_steps += 1\n", 822 | " if len(preds) == 0:\n", 823 | " preds.append(logits.detach().cpu().numpy())\n", 824 | " else:\n", 825 | " preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)\n", 826 | "\n", 827 | "eval_loss = eval_loss / nb_eval_steps\n", 828 | "preds = preds[0]\n", 829 | "preds = np.argmax(preds, axis=1)\n", 830 | "\n", 831 | "result = acc_and_f1(preds, all_label_ids.numpy())\n", 832 | "\n", 833 | "loss = tr_loss / nb_tr_steps\n", 834 | "\n", 835 | "result['eval_loss'] = eval_loss\n", 836 | "result['global_step'] = global_step\n", 837 | "result['loss'] = loss" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [ 846 | "output_eval_file = os.path.join(output_dir, \"eval_results.txt\")\n", 847 | "with open(output_eval_file, \"w\") as writer:\n", 848 | " print(\"***** Eval results *****\")\n", 849 | " for key in sorted(result.keys()):\n", 850 | " print(f\" {key} = {result[key]}\")\n", 851 | " writer.write(f\"{key} = {result[key]}\\n\")" 852 | ] 853 | }, 854 | { 855 | "cell_type": "code", 856 | "execution_count": null, 857 | "metadata": {}, 858 | "outputs": [], 859 | "source": [] 860 | } 861 | ], 862 | "metadata": { 863 | "kernelspec": { 864 | "display_name": "Python 3", 865 | "language": "python", 866 | "name": "python3" 867 | }, 868 | "language_info": { 869 | "codemirror_mode": { 870 | "name": "ipython", 871 | "version": 3 872 | }, 873 | "file_extension": ".py", 874 | "mimetype": "text/x-python", 875 | "name": "python", 876 | "nbconvert_exporter": "python", 877 | "pygments_lexer": "ipython3", 878 | "version": "3.7.3" 879 | } 880 | }, 881 | "nbformat": 4, 882 | "nbformat_minor": 2 883 | } 884 | --------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial5_serving_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | import zipfile 5 | import gzip 6 | import pickle 7 | import itertools 8 | import urllib.parse 9 | import urllib.request 10 | from collections import Counter 11 | import functools 12 | 13 | import numpy as np 14 | import torch 15 | import torch.utils.data 16 | from nltk import word_tokenize 17 | 18 | 19 | from flask import Flask, render_template, request, jsonify, Response 20 | 21 | 22 | ### CODE BELOW IS COPIED FROM THE NOTEBOOK ### 23 | 24 | class Vocab(object): 25 | """Vocabulary class to provide token-to-id correspondence""" 26 | END_TOKEN = '</s>' 27 | START_TOKEN = '<s>' 28 | PAD_TOKEN = '<pad>' 29 | UNK_TOKEN = '<unk>' 30 | 31 | def __init__(self, special_tokens=None): 32 | """ 33 | Initialize the vocabulary class 34 | 35 | :param special_tokens: (Default value = None) A list of special tokens. The PAD token should be the first in the list, if used. 36 | 37 | """ 38 | super().__init__() 39 | 40 | self.special_tokens = special_tokens 41 | 42 | self.token2id = {} 43 | self.id2token = {} 44 | 45 | self.token_counts = Counter() 46 | 47 | if self.special_tokens is not None: 48 | self.add_document(self.special_tokens) 49 | 50 | def add_document(self, document, rebuild=True): 51 | """ 52 | Process the document and add its tokens to the vocabulary 53 | 54 | :param document: A list of tokens in the document 55 | :param rebuild: (Default value = True) Whether to rebuild the token2id correspondence or not 56 | 57 | """ 58 | for token in document: 59 | self.token_counts[token] += 1 60 | 61 | if token not in self.token2id: 62 | self.token2id[token] = len(self.token2id) 63 | 64 | if rebuild: 65 | self._rebuild_id2token() 66 | 67 | def add_documents(self, documents): 68 | """ 69 | Process a list of documents and add their tokens to the vocabulary 70 | 71 | :param documents: A list of documents, where each document is a list of tokens 72 | 73 | """ 74 | for doc in documents: 75 | self.add_document(doc, rebuild=False) 76 | 77 | self._rebuild_id2token() 78 | 79 | def _rebuild_id2token(self): 80 | """Rebuild the id-to-token correspondence""" 81 | self.id2token = {i: t for t, i in self.token2id.items()} 82 | 83 | def get(self, item, default=None): 84 | """ 85 | Given a token, return the corresponding id 86 | 87 | :param item: A token 88 | :param default: (Default value = None) Default value to return if the token is not present in the vocabulary 89 | 90 | """ 91 | return self.token2id.get(item, default) 92 | 93 | def __getitem__(self, item): 94 | """ 95 | Given a token, return the corresponding id 96 | 97 | :param item: A token 98 | 99 | """ 100 | return self.token2id[item] 101 | 102 | def __contains__(self, item): 103 | """ 104 | Check if a token is present in the vocabulary 105 | 106 | :param item: A token 107 | 108 | """ 109 | return item in self.token2id 110 | 111 | def __len__(self): 112 | """Return the number of tokens in the vocabulary""" 113 | return len(self.token2id) 114 | 115 | def __str__(self): 116 | """Get a string representation of the vocabulary""" 117 | return f'{len(self)} tokens' 118 | 119 | def save(self, filename): 120 | """ 121 | Save the vocabulary to a csv file. See the `load` method.
119 | def save(self, filename):
120 | """
121 | Save the vocabulary to a csv file. See the `load` method.
122 |
123 | :param filename: Path to the file
124 |
125 | """
126 | with open(filename, 'w') as csv_file:
127 | writer = csv.DictWriter(csv_file, fieldnames=['token', 'counts', 'is_special'])
128 | writer.writeheader()
129 | for idx in range(len(self.token2id)):
130 | token = self.id2token[idx]
131 | is_special = 1 if token in self.special_tokens else 0
132 | writer.writerow({'token': token, 'counts': self.token_counts[token], 'is_special': is_special})
133 |
134 | @staticmethod
135 | def load(filename):
136 | """
137 | Load the vocabulary from a csv file. See the `save` method.
138 |
139 | :param filename: Path to the csv file
140 |
141 | """
142 | with open(filename, 'r') as csv_file:
143 | token2id = {}
144 | tokens_counts = {}
145 | special_tokens = []
146 | reader = csv.DictReader(csv_file)
147 | for i, row in enumerate(reader):
148 | token2id[row['token']] = i
149 | tokens_counts[row['token']] = int(row['counts'])
150 | if bool(int(row['is_special'])):
151 | special_tokens.append(row['token'])
152 |
153 | vocab = Vocab()
154 | vocab.token2id = token2id
155 | vocab.token_counts = Counter(tokens_counts)
156 | vocab.special_tokens = special_tokens
157 | vocab._rebuild_id2token()
158 |
159 | return vocab
160 |
161 |
162 | class SubtitlesDialogDataset(torch.utils.data.Dataset):
163 | """ A conversational dialog dataset with query-response pairs """
164 | def __init__(self, filename, vocab=None, max_lines=1000, max_len=50, max_vocab_size=50000):
165 | """
166 | Initialize a conversational dialog dataset with query-response pairs
167 |
168 | :param filename: Path to the OpenSubtitles dataset
169 | :param vocab: (Default value = None) Vocabulary, will be created if None
170 | :param max_lines: (Default value = 1000) Limit the number of lines to read from the dataset file
171 | :param max_len: (Default value = 50) Maximum length of the sentences
172 | :param max_vocab_size: (Default value = 50000) Maximum size of the vocabulary
173 |
174 | """
175 |
176 | self.lines = []
177 | with gzip.open(filename, 'rb') as f:
178 | for i, line in enumerate(f):
179 | if i >= max_lines:
180 | break
181 |
182 | tokens = word_tokenize(line.decode('utf-8'))
183 | self.lines.append(tokens)
184 |
185 | self.max_lines = min(len(self.lines), max_lines)
186 |
187 | if vocab is None:
188 | vocab = Vocab(special_tokens=[Vocab.PAD_TOKEN, Vocab.START_TOKEN, Vocab.END_TOKEN, Vocab.UNK_TOKEN])
189 | vocab.add_documents(self.lines)
190 | vocab.prune_vocab(max_vocab_size)  # NOTE: prune_vocab is not defined on this file's Vocab copy; this branch only runs when no vocab is passed in, and the app below always passes a saved vocab
191 |
192 | print(f'Created vocab: {vocab}')
193 |
194 |
195 | if max_len is None:
196 | max_len = max(len(s) for s in self.lines)
197 | print(f'Calculated max len: {max_len}')
198 |
199 | self.vocab = vocab
200 | self.max_len = max_len
201 |
202 | def _pad_sentence(self, sent):
203 | """
204 | Cut the sentence if needed and pad it to the maximum length
205 |
206 | :param sent: The input sentence
207 |
208 | """
209 | sent = sent[:self.max_len - 1] + [Vocab.END_TOKEN,]
210 |
211 | nb_pad = self.max_len - len(sent)
212 | sent = sent + [Vocab.PAD_TOKEN,] * nb_pad
213 |
214 | return sent
215 |
216 | def _process_sent(self, sent):
217 | """
218 | Cut, pad, and convert the sentence from tokens to indices using the vocabulary
219 |
220 | :param sent: The input sentence
221 |
222 | """
223 | sent = self._pad_sentence(sent)
224 | sent = [self.vocab[t] if t in self.vocab else self.vocab[Vocab.UNK_TOKEN] for t in sent]
225 |
226 | sent = np.array(sent, dtype=np.long)
227 | return sent
228 |
229 | def __getitem__(self, index):
230 | """
231 | Create a pair
of query-response using two consecutive lines in the dataset and return it
232 |
233 | :param index: Index of the query line. The response is the next line.
234 |
235 | """
236 | query = self.lines[index]
237 | response = self.lines[index+1]
238 |
239 | query = self._process_sent(query)
240 | response = self._process_sent(response)
241 |
242 | return query, response
243 |
244 | def __len__(self):
245 | """ Return the total length of the dataset """
246 | return self.max_lines - 1
247 |
248 | def softmax_masked(inputs, mask, dim=1, epsilon=0.000001):
249 | """
250 | Perform the softmax operation on a batch of masked sequences of different lengths
251 |
252 | :param inputs: Input sequences, a 2d array of the shape (batch_size, max_seq_len)
253 | :param mask: Mask, an array of 1 and 0
254 | :param dim: (Default value = 1) Dimension of the softmax operation
255 | :param epsilon: (Default value = 0.000001) Small constant to avoid division by zero
256 |
257 | """
258 | inputs_exp = torch.exp(inputs)
259 | inputs_exp = inputs_exp * mask.float()
260 | inputs_exp_sum = inputs_exp.sum(dim=dim)
261 | inputs_attention = inputs_exp / (inputs_exp_sum.unsqueeze(dim) + epsilon)
262 |
263 | return inputs_attention
264 |
265 | class Seq2SeqAttentionModel(torch.nn.Module):
266 | """ A more advanced GRU-based sequence-to-sequence model with attention """
267 | def __init__(self, vocab_size, embedding_size, hidden_size, teacher_forcing,
268 | max_len, trainable_embeddings, start_index, end_index, pad_index, W_emb=None):
269 | """
270 | Initialize the model
271 |
272 | :param vocab_size: The size of the vocabulary
273 | :param embedding_size: Dimension of the embeddings
274 | :param hidden_size: The size of the hidden layers, including GRU
275 | :param teacher_forcing: The probability of teacher forcing
276 | :param max_len: Maximum length of the sequences
277 | :param trainable_embeddings: Whether the embedding layer will be trainable or frozen
278 | :param start_index: Index of the START token in the vocabulary
279 | :param end_index: Index of the END token in the vocabulary
280 | :param pad_index: Index of the PAD token in the vocabulary
281 | :param W_emb: (Default value = None) Initial values of the embedding layer, a numpy array
282 |
283 | """
284 |
285 | super().__init__()
286 |
287 | self.teacher_forcing = teacher_forcing
288 | self.max_len = max_len
289 | self.start_index = start_index
290 | self.end_index = end_index
291 | self.pad_index = pad_index
292 |
293 | self.embedding = torch.nn.Embedding(vocab_size, embedding_size, padding_idx=pad_index)
294 | if W_emb is not None:
295 | self.embedding.weight.data.copy_(torch.from_numpy(W_emb))
296 | if not trainable_embeddings:
297 | self.embedding.weight.requires_grad = False
298 |
299 | self.encoder = torch.nn.GRU(embedding_size, hidden_size, batch_first=True)
300 | self.decoder = torch.nn.GRUCell(embedding_size, hidden_size)
301 |
302 | self.attention_decoder = torch.nn.Linear(hidden_size, hidden_size)
303 | self.attention_encoder = torch.nn.Linear(hidden_size, hidden_size)
304 | self.attention_reduce = torch.nn.Linear(hidden_size, 1, bias=False)
305 | self.decoder_hidden_combine = torch.nn.Linear(hidden_size * 2, hidden_size)
306 |
307 | self.decoder_projection = torch.nn.Linear(hidden_size, vocab_size)
308 |
309 |
310 | def encode(self, inputs):
311 | """
312 | Encode the input sentence and return all hidden states and the input mask
313 |
314 | :param inputs: The input sentence
315 |
316 | """
317 | batch_size = inputs.size(0)
318 | inputs_mask = (inputs != self.pad_index).long()
319 | inputs_lengths = torch.sum(inputs_mask, dim=1)
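# inputs_mask marks real tokens with 1 and padding with 0; it is returned so
# the attention in decode() can ignore padded positions. inputs_lengths is
# computed here but not used further in this method.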
320 |
321 | inputs_emb = self.embedding(inputs)
322 | outputs, h = self.encoder(inputs_emb)
323 |
324 | return outputs, inputs_mask
325 |
326 | def decode(self, encoder_hiddens, inputs_mask, targets=None):
327 | """
328 | Decode the response given all hidden states of the encoder
329 |
330 | :param encoder_hiddens: Hidden states of the encoder
331 | :param inputs_mask: Input mask
332 | :param targets: (Default value = None) True decoding targets to be used for teacher forcing
333 |
334 | """
335 | batch_size = encoder_hiddens.size(0)
336 |
337 | outputs_logits = []
338 | decoder_hidden = torch.zeros_like(encoder_hiddens[:,0,:])
339 | decoder_inputs = torch.full_like(decoder_hidden[:, 0], self.start_index).long()
340 | for i in range(self.max_len):
341 | decoder_inputs_emb = self.embedding(decoder_inputs)
342 |
343 | att_enc = self.attention_encoder(encoder_hiddens)
344 | att_dec = self.attention_decoder(decoder_hidden)
345 | att = torch.tanh(att_enc + att_dec.unsqueeze(1))
346 | att_reduced = self.attention_reduce(att).squeeze(-1)
347 | att_normalized = softmax_masked(att_reduced, inputs_mask)
348 |
349 | decoder_hidden_att = torch.sum(encoder_hiddens * att_normalized.unsqueeze(-1), dim=1)
350 | decoder_hidden_combined = self.decoder_hidden_combine(torch.cat([decoder_hidden, decoder_hidden_att], dim=-1))
351 |
352 | decoder_hidden = self.decoder(decoder_inputs_emb, decoder_hidden_combined)
353 |
354 | decoder_output_logit = self.decoder_projection(decoder_hidden)
355 |
356 | if np.random.rand() < self.teacher_forcing and targets is not None:
357 | decoder_inputs = targets[:, i]
358 | else:
359 | decoder_inputs = decoder_output_logit.argmax(dim=1).long()
360 |
361 | outputs_logits.append(decoder_output_logit)
362 |
363 | outputs_logits = torch.stack(outputs_logits, dim=1)
364 |
365 | return outputs_logits
366 |
367 | def forward(self, inputs, targets=None):
368 | """
369 | Encode the input query and decode the response
370 |
371 | :param inputs: The input sentence
372 | :param targets: (Default value = None) True decoding targets
373 |
374 | """
375 | encoder_hiddens, inputs_mask = self.encode(inputs)
376 | outputs_logits = self.decode(encoder_hiddens, inputs_mask, targets)
377 |
378 | return outputs_logits
379 |
380 | def load_model(model_class, filename):
381 | """
382 | Create the model of the given class and load the checkpoint from the given file
383 |
384 | :param model_class: Model class
385 | :param filename: Path to the checkpoint
386 |
387 | """
388 | def _map_location(storage, loc):
389 | """ Map storage so that a model trained on a GPU can be loaded on the CPU """
390 | return storage
391 |
392 | # load GPU-trained checkpoints onto the CPU
393 | map_location = None
394 | if not torch.cuda.is_available():
395 | map_location = _map_location
396 |
397 | state = torch.load(str(filename), map_location=map_location)
398 |
399 | model = model_class(**state['model_params'])
400 | model.load_state_dict(state['model_state'])
401 |
402 | return model
403 |
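# Checkpoint format expected by load_model (a sketch — the training code is
# assumed to have saved both the constructor kwargs and the weights):
#   torch.save({'model_params': params_dict,
#               'model_state': model.state_dict()}, filename)
# so that the architecture can be rebuilt before the state dict is loaded.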
404 | def generate_response(query):
405 | """
406 | Generate a response from the model for a given query. The model and the dataset will be taken from the app cache.
407 |
408 | :param query: Query to generate the response to
409 |
410 | """
411 |
412 | if not isinstance(query, list):
413 | query = word_tokenize(query)
414 |
415 | dataset = app_cache['dataset']
416 | model = app_cache['model']
417 |
418 | query = dataset._process_sent(query)
419 | query = torch.tensor(query)
420 |
421 | response_logits = model(query.view(1, -1)).squeeze(0)
422 | response_indices = response_logits.argmax(dim=-1).cpu().numpy()
423 |
424 | response = [dataset.vocab.id2token[int(idx)] for idx in response_indices]
425 | response = [t for t in response if t not in dataset.vocab.special_tokens]
426 | response = ' '.join(response)
427 |
428 | return response
429 |
430 | ### END CODE FROM THE NOTEBOOK ###
431 |
432 |
433 | app = Flask(__name__)
434 | app.config.from_object(__name__)
435 |
436 | app.config.update(dict(
437 | model_filename='tmp/seq2seq_dialog_att.pt',
438 | vocab_filename='tmp/seq2seq_dialog.vocab.csv',
439 | dataset_filename='OpenSubtitles.en.gz',
440 | ))
441 | app.config.from_envvar('SEQ2SEQ_DIALOG_SETTINGS', silent=True)
442 |
443 |
444 | def init_dataset():
445 | """ Initialize the dataset from the parameters in the app config and return it """
446 | dataset_filename = app.config['dataset_filename']
447 | vocab_filename = app.config['vocab_filename']
448 |
449 | vocab = Vocab.load(vocab_filename)
450 | dataset = SubtitlesDialogDataset(dataset_filename, max_lines=1, vocab=vocab, max_len=50)
451 |
452 | return dataset
453 |
454 |
455 | def init_model():
456 | """ Initialize the model from the parameters in the app config and return it """
457 | model_filename = app.config['model_filename']
458 | model = load_model(Seq2SeqAttentionModel, model_filename)
459 |
460 | return model
461 |
462 |
463 | app_cache = dict(
464 | dataset=init_dataset(),
465 | model=init_model(),
466 | )
467 |
468 |
469 | @app.route('/dialog/', methods=['GET'])
470 | def dialog():
471 | """ Take the query from the GET parameter `query`, generate the response, and return a JSON object """
472 | query = request.args.get('query')
473 | response = generate_response(query)
474 |
475 | result = dict(
476 | query=query,
477 | response=response,
478 | )
479 |
480 | return jsonify(**result)
481 |
482 |
483 | if __name__ == '__main__':
484 | app.run(host='0.0.0.0', port=8080)
485 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial5_telegram.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Time to Learn Telegram Chatbots\n",
8 | "\n",
9 | "\n",
10 | "Install the Telegram package with:\n",
11 | "\n",
12 | "**pip install python-telegram-bot==12.0.0b1 --upgrade**\n",
13 | "\n",
14 | "This tutorial was modified from the Telegram bot GitHub page. 
[See there for more information.](https://github.com/python-telegram-bot/python-telegram-bot)\n", 15 | "\n", 16 | "Now we will go through the basics of creating your bot:" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# the telegram package holds everything we need for this tutorial\n", 26 | "import telegram\n", 27 | "from telegram.ext import Updater\n", 28 | "from telegram.ext import CommandHandler\n", 29 | "from telegram.ext import MessageHandler, Filters" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "# Authorization token needed - @BotFather\n", 39 | "TOKEN = \"764368673:AAGqBzI4RbIJYne35MwPIKMbEXsHKK_Dh0U\"" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# Create our bot using the token\n", 49 | "\n", 50 | "bot = telegram.Bot(token=TOKEN)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Does the bot look correct? Let's check the details\n", 60 | "print(bot.get_me())" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "# If things go wrong, we won't know unless we enable logging\n", 70 | "import logging\n", 71 | "logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',\n", 72 | " level=logging.INFO)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "# We have a bot - now let's handle incoming messages\n", 80 | "\n", 81 | "We will now create function handlers for different input commands from the user.\n", 82 | "\n", 83 | "This will be achieved using a Telegram \"Updater\" object." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# We will create an updater to update the conversation between user and bot\n", 93 | "updater = Updater(token=TOKEN, use_context=True)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# Let's create a handler function to handle when the user says \"/start\"\n", 103 | "def start(update, context):\n", 104 | " # Here, we blindly respond to the user\n", 105 | " # The /start command could have come with arguments, we ignore those\n", 106 | " context.bot.send_message(chat_id=update.message.chat_id, text=\"I'm a bot, please talk to me!\")" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "# A \"dispatcher\" object allows us to add this command handler\n", 116 | "dispatcher = updater.dispatcher\n", 117 | "start_handler = CommandHandler('start', start)\n", 118 | "dispatcher.add_handler(start_handler)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "# We have a bot, we have a command handler, let's start this thing up!\n", 128 | "updater.start_polling()" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "Now let's add another handler." 
136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# Okay, now let's respond to a user input command \"caps\"\n", 145 | "def caps(update, context):\n", 146 | " text_caps = ' '.join(context.args).upper()\n", 147 | " context.bot.send_message(chat_id=update.message.chat_id, text=text_caps)\n", 148 | "\n", 149 | "caps_handler = CommandHandler('caps', caps)\n", 150 | "dispatcher.add_handler(caps_handler)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "# But what to do in your project?\n", 158 | "\n", 159 | "Let's explore responding to arbitrary messages." 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "# This simple bot will simply tell the user what they said - easy peasy\n", 169 | "def respond(update, context):\n", 170 | " # This is how you access the user input message\n", 171 | " message = update.message.text\n", 172 | " context.bot.send_message(chat_id=update.message.chat_id, text='You said: %s' % message)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "Now we have our handler. Let's add it." 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "# Filters.text allows the handler to only respond to text messages\n", 189 | "msg_handler = MessageHandler(Filters.text, respond)\n", 190 | "dispatcher.add_handler(msg_handler)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "# We are tired of this bot. Let's shut it down!\n", 200 | "updater.stop()" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "# Now let's wrap it all into a class structure!\n", 208 | "\n", 209 | "We will call this class Chatbot." 
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "class Chatbot:\n",
219 | "    def __init__(self, token):\n",
220 | "        \"\"\"This chatbot takes a Telegram authorization token and deploys a\n",
221 | "        Telegram chatbot that responds to user messages with that token.\n",
222 | "        \n",
223 | "        token - a string authorization token provided by @BotFather on Telegram\n",
224 | "        Handler functions that respond to user inputs are added with the add_handler method.\n",
225 | "        \"\"\"\n",
226 | "        self.token = token\n",
227 | "        self.bot = telegram.Bot(token=token)\n",
228 | "        self.updater = Updater(token=token, use_context=True)\n",
229 | "        self.dispatcher = self.updater.dispatcher\n",
230 | "        self.updater.start_polling()\n",
231 | "    \n",
232 | "    def stop(self):\n",
233 | "        \"\"\"Stop the Telegram bot\"\"\"\n",
234 | "        self.updater.stop()\n",
235 | "    \n",
236 | "    def add_handler(self, handler):\n",
237 | "        \"\"\"Add a handler function to extend bot functionality\"\"\"\n",
238 | "        self.dispatcher.add_handler(handler)\n",
239 | "    \n",
240 | "    "
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "def reverse(update, context):\n",
250 | "    \"\"\"Whatever the user says, reverse their message and repeat it back to them.\"\"\"\n",
251 | "    message = update.message.text\n",
252 | "    rev_message = \"\".join(reversed(message)) \n",
253 | "    context.bot.send_message(chat_id=update.message.chat_id, text=rev_message)\n",
254 | "    \n",
255 | "def greeting(update, context):\n",
256 | "    \"\"\"Greet the user and ask for their name.\"\"\"\n",
257 | "    context.bot.send_message(chat_id=update.message.chat_id, text=\"Hello there! What is your name?\")\n",
258 | "\n",
259 | "bot = Chatbot(TOKEN)\n",
260 | "bot.add_handler(MessageHandler(Filters.text, reverse))\n",
261 | "bot.add_handler(CommandHandler('start', greeting))"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": null,
267 | "metadata": {},
268 | "outputs": [],
269 | "source": [
270 | "bot.stop()"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {},
276 | "source": [
277 | "# This is all good, but what about a longer message history?\n",
278 | "\n",
279 | "A more intelligent bot doesn't respond to only the previous message. What if we want the whole history?"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "history = []\n",
289 | "def repeat_history(update, context):\n",
290 | "    \"\"\"Append the user's message to the history and echo the joined history back to them.\"\"\"\n",
291 | "    message = update.message.text\n",
292 | "    history.append(message)\n",
293 | "    \n",
294 | "    # here is where you insert your chatbot\n",
295 | "    output = \" # \".join(history)\n",
296 | "    \n",
297 | "    context.bot.send_message(chat_id=update.message.chat_id, text=output)\n",
298 | "    \n",
299 | "bot = Chatbot(TOKEN)\n",
300 | "bot.add_handler(MessageHandler(Filters.text, repeat_history))"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "metadata": {},
307 | "outputs": [],
308 | "source": [
309 | "bot.stop()"
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "metadata": {},
315 | "source": [
316 | "# And that's it! 
Now you can create your own chatbot.\n",
317 | "\n",
318 | "Now, in your own projects: create a model that produces a response to a user's conversation history, and create a message handler so it can be served over Telegram. Good to go!"
319 | ]
320 | }
321 | ],
322 | "metadata": {
323 | "kernelspec": {
324 | "display_name": "Python 3",
325 | "language": "python",
326 | "name": "python3"
327 | },
328 | "language_info": {
329 | "codemirror_mode": {
330 | "name": "ipython",
331 | "version": 3
332 | },
333 | "file_extension": ".py",
334 | "mimetype": "text/x-python",
335 | "name": "python",
336 | "nbconvert_exporter": "python",
337 | "pygments_lexer": "ipython3",
338 | "version": "3.6.5"
339 | }
340 | },
341 | "nbformat": 4,
342 | "nbformat_minor": 2
343 | }
344 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "dataset_filename": "data_small",
3 | "output_dir": "run_1/",
4 | "old_model_dir": "pretrained_model",
5 | "num_epochs": 5,
6 | "history_len": 50,
7 | "response_len": 15,
8 | "embedding_dim": 512,
9 | "model_dim": 512,
10 | "inner_dim": 2048,
11 | "num_layers": 6,
12 | "num_heads": 8,
13 | "dim_k": 64,
14 | "dim_v": 64,
15 | "dropout": 0.3,
16 | "min_count": 1,
17 | "train_batch_size": 200,
18 | "val_batch_size": 25,
19 | "warmup_steps": 4000,
20 | "a_nice_note": "baseline test",
21 | "label_smoothing": false,
22 | "train_len": 1999,
23 | "vocab_size": 11507,
24 | "device": "cpu",
25 | "beam_size": 4,
26 | "n_best": 4,
27 | "choose_best": false
28 | }
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/dataset.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | import nltk
3 | from nltk.tokenize import TweetTokenizer
4 | import numpy as np
5 | import torch.utils.data
6 | import csv
7 | import json
8 | from transformer import Constants
9 |
10 | # get history, response data from csv file
11 | def _read_file(filename):
12 | history = list()
13 | response = list()
14 | ids = list()
15 | i = 1
16 | with open(filename, 'r') as fp:
17 | reader = csv.reader(fp)
18 | for row in reader:
19 | if i == 1:  # skip the header row
20 | i = 0
21 | continue
22 | ids.append(row[0])
23 | history.append(row[1].split(" "))
24 | response.append(row[2].split(" "))
25 |
26 | return history, response, ids
27 |
28 |
29 |
30 | class Vocab(object):
31 |
32 | def __init__(self, special_tokens=None):
33 | super(Vocab, self).__init__()
34 |
35 | self.nb_tokens = 0
36 |
37 | # vocab mapping
38 | self.token2id = {}
39 | self.id2token = {}
40 |
41 | self.token_counts = Counter()
42 |
43 | self.special_tokens = []
44 | if special_tokens is not None:
45 | self.special_tokens = special_tokens
46 | self.add_document(self.special_tokens)
47 |
48 | # updates the vocab with an example
49 | def add_document(self, document):
50 | for token in document:
51 | self.token_counts[token] += 1
52 |
53 | if token not in self.token2id:
54 | self.token2id[token] = self.nb_tokens
55 | self.id2token[self.nb_tokens] = token
56 | self.nb_tokens += 1
57 |
58 | def add_documents(self, documents):
59 | for doc in documents:
60 | self.add_document(doc)
61 |
62 | # prune tokens that occur fewer than min_count times
63 | def prune_vocab(self, min_count=2):
64 | nb_tokens_before = len(self.token2id)
65 |
66 | tokens_to_delete = set([t for t, c in self.token_counts.items() if c < min_count])
67 | tokens_to_delete -= set(self.special_tokens)
68 |
69 | for token in tokens_to_delete:
70 | self.token_counts.pop(token)
71 |
72 | self.token2id = {t: i for i, t in enumerate(self.token_counts.keys())}
73 | self.id2token = {i: t for t, i in self.token2id.items()}
74 | self.nb_tokens = len(self.token2id)
75 |
76 | print('Vocab pruned: {} -> {}'.format(nb_tokens_before, self.nb_tokens))
77 |
78 | # load token2id from json file, useful when using pretrained model
79 | def load_from_dict(self, filename):
80 | with open(filename, 'r') as f:
81 | self.token2id = json.load(f)
82 | self.id2token = {i: t for t, i in self.token2id.items()}
83 | self.nb_tokens = len(self.token2id)
84 |
85 | # Save token2id to json file
86 | def save_to_dict(self, filename):
87 | with open(filename, 'w') as f:
88 | json.dump(self.token2id, f)
89 |
90 | def __getitem__(self, item):
91 | return self.token2id[item]
92 |
93 | def __contains__(self, item):
94 | return item in self.token2id
95 |
96 | def __len__(self):
97 | return self.nb_tokens
98 |
99 | def __str__(self):
100 | return 'Vocab: {} tokens'.format(self.nb_tokens)
101 |
102 |
103 | class DialogueDataset(torch.utils.data.Dataset):
104 | PAD_WORD = '<pad>'  # token strings assumed: the angle-bracket markup was stripped from this copy
105 | UNK_WORD = '<unk>'
106 | SEP_WORD = '<sep>'
107 | EOS_WORD = '<eos>'
108 | CLS_WORD = '<cls>'
109 |
110 | def __init__(self, filename, history_len=50, response_len=15, vocab=None, update_vocab=True):
111 | """
112 | Initialize the dialogue dataset.
113 |
114 | Get examples, and create/update vocab
115 |
116 | Examples:
117 | History: hello ! hi , how are you ?
118 | Response: i am good , thank you !
119 |
120 | Args:
121 | filename: Filename of csv file with the data
122 | history_len: Maximum token length for the history. Will be
123 | pruned/padded to this length
124 | response_len: Maximum length for the response.
125 | vocab: Optional vocab object to use for this dataset
126 | update_vocab: Set to false to not update the vocab with the new
127 | examples
128 | """
129 | self.history, self.response, self.ids = _read_file(filename)
130 |
131 | self.history_len = history_len
132 | self.response_len = response_len
133 |
134 | if vocab is None:
135 | # Create new vocab object
136 | self.vocab = Vocab(special_tokens=[DialogueDataset.PAD_WORD,
137 | DialogueDataset.UNK_WORD,
138 | DialogueDataset.SEP_WORD,
139 | DialogueDataset.EOS_WORD,
140 | DialogueDataset.CLS_WORD])
141 | else:
142 | self.vocab = vocab
143 |
144 | # do not want to update vocab for running old model
145 | if update_vocab:
146 | self.vocab.add_documents(self.history)
147 | self.vocab.add_documents(self.response)
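# Typical usage (a sketch; the csv path is hypothetical):
#   dataset = DialogueDataset('data_small/train.csv', history_len=50, response_len=15)
#   loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)
#   h_seq, h_pos, h_seg, r_seq, r_pos = next(iter(loader))
# which yields the five batched feature tensors produced by __getitem__ below.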
148 |
149 | def _process_history(self, history):
150 | """
151 | Creates token encodings for the word embeddings, positional encodings,
152 | and segment encodings for the dialogue history
153 |
154 | Examples:
155 | History: hello ! hi , how are you ?
156 | self.history_len = 15
157 |
158 | h_seq = np.array([4, 34, 65, 2, 23, 44, 455, 97, 56, 10, 3, 0, 0, 0, 0])
159 | h_pos = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0])
160 | h_seg = np.array([1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0])
161 |
162 | Args:
163 | history: list of tokens in the history
164 | Returns:
165 | h_seq: token encodings for the history
166 | h_pos: positional encoding for the history
167 | h_seg: segment encoding for the history
168 | """
169 | history = history[-self.history_len+1:]
170 | history.append(DialogueDataset.EOS_WORD)
171 |
172 | needed_pads = self.history_len - len(history)
173 | if needed_pads > 0:
174 | history = history + [DialogueDataset.PAD_WORD] * needed_pads
175 |
176 | history = [
177 | self.vocab[token] if token in self.vocab else self.vocab[DialogueDataset.UNK_WORD]
178 | for token in history
179 | ]
180 |
181 | # create position embeddings, make zero if it is the pad token (0)
182 | h_pos = np.array([pos_i+1 if w_i != 0 else 0
183 | for pos_i, w_i in enumerate(history)])
184 |
185 | # create segment embeddings
186 | seg = list()
187 | i = 1
188 | for j, token in enumerate(history):
189 | if token == self.vocab[DialogueDataset.PAD_WORD]:
190 | break
191 | seg.append(i)
192 | if token == self.vocab[DialogueDataset.SEP_WORD]:
193 | i += 1
194 | seg += [0] * needed_pads
195 | h_seg = np.array(seg, dtype=np.long)
196 |
197 | h_seq = np.array(history, dtype=np.long)
198 |
199 | return h_seq, h_pos, h_seg
200 |
201 | def _process_response(self, response):
202 | """
203 | Creates token encodings for the word embeddings, and positional
204 | encodings for the response
205 |
206 | Examples:
207 | Response: i am good , thank you !
208 | self.response_len = 10
209 |
210 | r_seq = np.array([4, 43, 52, 77, 9, 65, 93, 5, 3, 0])
211 | r_pos = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 0])
212 |
213 | Args:
214 | response: list of tokens in the response
215 | Returns:
216 | r_seq: token encodings for the response
217 | r_pos: positional encoding for the response
218 | """
219 | response = response[:self.response_len - 1]
220 | response.append(DialogueDataset.EOS_WORD)
221 | #response.insert(0, DialogueDataset.CLS_WORD)
222 |
223 | needed_pads = self.response_len - len(response)
224 | if needed_pads > 0:
225 | response = response + [DialogueDataset.PAD_WORD] * needed_pads
226 |
227 | response = [
228 | self.vocab[token] if token in self.vocab else self.vocab[DialogueDataset.UNK_WORD]
229 | for token in response
230 | ]
231 | # create position embeddings
232 | r_pos = np.array([pos_i + 1 if w_i != 0 else 0
233 | for pos_i, w_i in enumerate(response)])
234 | r_seq = np.array(response, dtype=np.long)
235 | return r_seq, r_pos
236 |
237 | def get_input_features(self, history):
238 | """ Get input features for the chatbot """
239 | tokenizer = TweetTokenizer()
240 | all_history = list()
241 | all_history.append(DialogueDataset.CLS_WORD)
242 | for line in history:
243 | all_history += list(tokenizer.tokenize(line))
244 | all_history.append(DialogueDataset.SEP_WORD)
245 | h_seq, h_pos, h_seg = self._process_history(all_history[:-1])
246 | return torch.from_numpy(h_seq).unsqueeze(0), torch.from_numpy(h_pos).unsqueeze(0), torch.from_numpy(h_seg).unsqueeze(0)
247 |
248 | def __getitem__(self, index):
249 | """
250 | Returns the features for an example in the dataset
251 |
252 | Args:
253 | index: index of example in dataset
254 |
255 | Returns:
256 | h_seq: token encodings for the history
257 | h_pos: positional encoding for the history
258 | h_seg: segment encoding for the
history 259 | r_seq: token encodings for the response 260 | r_pos: positional encoding for the response 261 | """ 262 | h_seq, h_pos, h_seg = self._process_history(self.history[index]) 263 | r_seq, r_pos = self._process_response(self.response[index]) 264 | id = self.ids[index] 265 | return h_seq, h_pos, h_seg, r_seq, r_pos 266 | 267 | def __len__(self): 268 | return len(self.history) 269 | -------------------------------------------------------------------------------- /tutorials/pytorch_track/tutorial_6/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | nltk 3 | tqdm 4 | sklearn 5 | ipywidgets 6 | -------------------------------------------------------------------------------- /tutorials/pytorch_track/tutorial_6/transformer/Beam.py: -------------------------------------------------------------------------------- 1 | """ Manage beam search info structure. 2 | 3 | Heavily borrowed from OpenNMT-py. 4 | For code in OpenNMT-py, please check the following link: 5 | https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/Beam.py 6 | """ 7 | 8 | import torch 9 | import numpy as np 10 | import transformer.Constants as Constants 11 | 12 | class Beam(): 13 | ''' Beam search ''' 14 | 15 | def __init__(self, size, device=False): 16 | 17 | self.size = size 18 | self._done = False 19 | 20 | # The score for each translation on the beam. 21 | self.scores = torch.zeros((size,), dtype=torch.float, device=device) 22 | self.all_scores = [] 23 | 24 | # The backpointers at each time-step. 25 | self.prev_ks = [] 26 | 27 | # The outputs at each time-step. 28 | self.next_ys = [torch.full((size,), Constants.PAD, dtype=torch.long, device=device)] 29 | self.next_ys[0][0] = Constants.CLS 30 | 31 | def get_current_state(self): 32 | "Get the outputs for the current timestep." 33 | return self.get_tentative_hypothesis() 34 | 35 | def get_current_origin(self): 36 | "Get the backpointers for the current timestep." 37 | return self.prev_ks[-1] 38 | 39 | @property 40 | def done(self): 41 | return self._done 42 | 43 | def advance(self, word_prob): 44 | "Update beam status and check if finished or not." 45 | num_words = word_prob.size(1) 46 | 47 | # Sum the previous scores. 48 | if len(self.prev_ks) > 0: 49 | beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob) 50 | else: 51 | beam_lk = word_prob[0] 52 | 53 | flat_beam_lk = beam_lk.view(-1) 54 | 55 | best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, True) # 1st sort 56 | best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, True) # 2nd sort 57 | 58 | self.all_scores.append(self.scores) 59 | self.scores = best_scores 60 | 61 | # bestScoresId is flattened as a (beam x word) array, 62 | # so we need to calculate which word and beam each score came from 63 | prev_k = best_scores_id / num_words 64 | self.prev_ks.append(prev_k) 65 | self.next_ys.append(best_scores_id - prev_k * num_words) 66 | 67 | # End condition is when top-of-beam is EOS. 68 | if self.next_ys[-1][0].item() == Constants.EOS: 69 | self._done = True 70 | self.all_scores.append(self.scores) 71 | 72 | return self._done 73 | 74 | def sort_scores(self): 75 | "Sort the scores." 76 | return torch.sort(self.scores, 0, True) 77 | 78 | def get_the_best_score_and_idx(self): 79 | "Get the score of the best in the beam." 80 | scores, ids = self.sort_scores() 81 | return scores[1], ids[1] 82 | 83 | def get_tentative_hypothesis(self): 84 | "Get the decoded sequence for the current timestep." 
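# next_ys holds the chosen word ids at every decoding step and prev_ks the
# backpointers into the previous beam; walking them in reverse (see
# get_hypothesis below) reassembles a complete token sequence per beam.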
85 |
86 | if len(self.next_ys) == 1:
87 | dec_seq = self.next_ys[0].unsqueeze(1)
88 | else:
89 | _, keys = self.sort_scores()
90 | hyps = [self.get_hypothesis(k) for k in keys]
91 | hyps = [[Constants.CLS] + h for h in hyps]
92 | dec_seq = torch.LongTensor(hyps)
93 |
94 | return dec_seq
95 |
96 | def get_hypothesis(self, k):
97 | """ Walk back to construct the full hypothesis. """
98 | hyp = []
99 | for j in range(len(self.prev_ks) - 1, -1, -1):
100 | hyp.append(self.next_ys[j+1][k])
101 | k = self.prev_ks[j][k]
102 |
103 | return list(map(lambda x: x.item(), hyp[::-1]))
104 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/transformer/Constants.py:
--------------------------------------------------------------------------------
1 |
2 | PAD = 0
3 | UNK = 1
4 | SEP = 2
5 | EOS = 3
6 | CLS = 4
7 |
8 |
9 | PAD_WORD = '<pad>'  # token strings assumed: the angle-bracket markup was stripped from this copy
10 | UNK_WORD = '<unk>'
11 | SEP_WORD = '<sep>'
12 | EOS_WORD = '<eos>'
13 | CLS_WORD = '<cls>'
14 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/transformer/Layers.py:
--------------------------------------------------------------------------------
1 | ''' Define the Layers '''
2 | import torch.nn as nn
3 | from transformer.SubLayers import MultiHeadAttention, PositionwiseFeedForward
4 |
5 | __author__ = "Yu-Hsiang Huang"
6 |
7 |
8 | class EncoderLayer(nn.Module):
9 | ''' Compose with two layers '''
10 |
11 | def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
12 | super(EncoderLayer, self).__init__()
13 | self.slf_attn = MultiHeadAttention(
14 | n_head, d_model, d_k, d_v, dropout=dropout)
15 | self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
16 |
17 | def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
18 | """
19 | First performs self-attention on the input; the result is then passed
20 | through a feed-forward network to get the output
21 |
22 | Args:
23 | enc_input: vector input
24 |
25 | Returns:
26 | enc_output: vector output from encoder layer
27 | """
28 | # Multi-Head Attention (w/ Add and Norm)
29 | enc_output, enc_slf_attn = self.slf_attn(
30 | enc_input, enc_input, enc_input, mask=slf_attn_mask)
31 | enc_output *= non_pad_mask
32 |
33 | # Feed forward (w/ Add and Norm)
34 | enc_output = self.pos_ffn(enc_output)
35 | enc_output *= non_pad_mask
36 |
37 | return enc_output, enc_slf_attn
38 |
39 |
40 | class DecoderLayer(nn.Module):
41 | ''' Compose with three layers '''
42 |
43 | def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
44 | super(DecoderLayer, self).__init__()
45 | self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
46 | self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
47 | self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
48 |
49 | def forward(self, dec_input, enc_output, non_pad_mask=None, slf_attn_mask=None, dec_enc_attn_mask=None):
50 | """
51 | First performs masked self-attention on the input.
52 |
53 | Then performs attention
54 | where the query is the output from the previous layer, and the keys
55 | and values are the encoder output
56 |
57 | Finally, the result is passed through a feed-forward network to get
58 | the output
59 |
60 | Args:
61 | dec_input: input to the decoder
62 | enc_output: output from encoder
63 |
64 | Returns:
65 | dec_output: output from decoder
66 | """
67 | # Masked Multi-Head Attention (w/ Add and Norm)
68 | dec_output, dec_slf_attn = self.slf_attn(
69 | dec_input, dec_input, dec_input, mask=slf_attn_mask)
70 | dec_output *= non_pad_mask
71 |
72 | # Multi-Head Attention (w/ Add and Norm)
73 | dec_output, dec_enc_attn = self.enc_attn(
74 | dec_output, enc_output, enc_output, mask=dec_enc_attn_mask)
75 | dec_output *= non_pad_mask
76 |
77 | # Feed forward (w/ Add and Norm)
78 | dec_output = self.pos_ffn(dec_output)
79 | dec_output *= non_pad_mask
80 |
81 | return dec_output, dec_slf_attn, dec_enc_attn
82 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/transformer/Models.py:
--------------------------------------------------------------------------------
1 | ''' Define the Transformer model '''
2 | import torch
3 | import torch.nn as nn
4 | import numpy as np
5 | import transformer.Constants as Constants
6 | from transformer.Layers import EncoderLayer, DecoderLayer
7 |
8 | __author__ = "Yu-Hsiang Huang"
9 |
10 | def get_non_pad_mask(seq):
11 | assert seq.dim() == 2
12 | return seq.ne(Constants.PAD).type(torch.float).unsqueeze(-1)
13 |
14 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
15 | ''' Sinusoid position encoding table '''
16 |
17 | def cal_angle(position, hid_idx):
18 | return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)
19 |
20 | def get_posi_angle_vec(position):
21 | return [cal_angle(position, hid_j) for hid_j in range(d_hid)]
22 |
23 | sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
24 |
25 | sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
26 | sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
27 |
28 | if padding_idx is not None:
29 | # zero vector for padding dimension
30 | sinusoid_table[padding_idx] = 0.
31 |
32 | return torch.FloatTensor(sinusoid_table)
33 |
34 | def get_attn_key_pad_mask(seq_k, seq_q):
35 | ''' For masking out the padding part of key sequence. '''
36 |
37 | # Expand to fit the shape of key query attention matrix.
38 | len_q = seq_q.size(1)
39 | padding_mask = seq_k.eq(Constants.PAD)
40 | padding_mask = padding_mask.unsqueeze(1).expand(-1, len_q, -1) # b x lq x lk
41 |
42 | return padding_mask
43 |
44 | def get_subsequent_mask(seq):
45 | ''' For masking out the subsequent info. '''
46 |
47 | sz_b, len_s = seq.size()
48 | subsequent_mask = torch.triu(
49 | torch.ones((len_s, len_s), device=seq.device, dtype=torch.uint8), diagonal=1)
50 | subsequent_mask = subsequent_mask.unsqueeze(0).expand(sz_b, -1, -1) # b x ls x ls
51 |
52 | return subsequent_mask
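# Worked example: for len_s = 4, get_subsequent_mask produces (per batch row)
#   [[0, 1, 1, 1],
#    [0, 0, 1, 1],
#    [0, 0, 0, 1],
#    [0, 0, 0, 0]]
# where 1 marks a masked position, i.e. decoding step i may only attend to
# steps <= i.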
53 |
54 | class Encoder(nn.Module):
55 | ''' An encoder model with a self-attention mechanism. '''
56 |
57 | def __init__(
58 | self,
59 | n_src_vocab, len_max_seq, d_word_vec,
60 | n_layers, n_head, d_k, d_v,
61 | d_model, d_inner, dropout=0.1, pretrained_embeddings=None):
62 |
63 | super().__init__()
64 |
65 | n_position = len_max_seq + 1
66 |
67 | if pretrained_embeddings is None:
68 | self.src_word_emb = nn.Embedding(
69 | n_src_vocab, d_word_vec, padding_idx=Constants.PAD)
70 | else:
71 | self.src_word_emb = nn.Embedding.from_pretrained(
72 | pretrained_embeddings, padding_idx=Constants.PAD)
73 |
74 | self.position_enc = nn.Embedding.from_pretrained(
75 | get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
76 | freeze=True)
77 |
78 | self.segment_enc = nn.Embedding(int(n_position/2), d_word_vec, padding_idx=0)
79 |
80 | self.layer_stack = nn.ModuleList([
81 | EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
82 | for _ in range(n_layers)])
83 |
84 | def forward(self, src_seq, src_pos, src_seg, return_attns=False):
85 | """
86 | First creates an input embedding from the seq, pos, and seg encodings,
87 | then runs the encoder layer n_layers times and returns the final vectors
88 |
89 | Args:
90 | src_seq: Encodings for the words in the history
91 | src_pos: Positional encodings for the words in the history
92 | src_seg: Segment encodings for turns in the history
93 | Returns:
94 | enc_output: vector output from encoder
95 | """
96 | enc_slf_attn_list = []
97 |
98 | # -- Prepare masks
99 | slf_attn_mask = get_attn_key_pad_mask(seq_k=src_seq, seq_q=src_seq)
100 | non_pad_mask = get_non_pad_mask(src_seq)
101 |
102 | # -- Get input embeddings
103 | enc_output = self.src_word_emb(src_seq) + self.position_enc(src_pos) \
104 | + self.segment_enc(src_seg)
105 |
106 | # Nx encoder layer
107 | for enc_layer in self.layer_stack:
108 | enc_output, enc_slf_attn = enc_layer(
109 | enc_output,
110 | non_pad_mask=non_pad_mask,
111 | slf_attn_mask=slf_attn_mask)
112 | if return_attns:
113 | enc_slf_attn_list += [enc_slf_attn]
114 |
115 | if return_attns:
116 | return enc_output, enc_slf_attn_list
117 | return enc_output,
118 |
119 | class Decoder(nn.Module):
120 | ''' A decoder model with a self-attention mechanism. '''
121 |
122 | def __init__(
123 | self,
124 | n_tgt_vocab, len_max_seq, d_word_vec,
125 | n_layers, n_head, d_k, d_v,
126 | d_model, d_inner, dropout=0.1, pretrained_embeddings=None):
127 |
128 | super().__init__()
129 | n_position = len_max_seq + 1
130 |
131 | if pretrained_embeddings is None:
132 | self.tgt_word_emb = nn.Embedding(
133 | n_tgt_vocab, d_word_vec, padding_idx=Constants.PAD)
134 | else:
135 | self.tgt_word_emb = nn.Embedding.from_pretrained(
136 | pretrained_embeddings, padding_idx=Constants.PAD)
137 |
138 | self.position_enc = nn.Embedding.from_pretrained(
139 | get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
140 | freeze=True)
141 |
142 | self.layer_stack = nn.ModuleList([
143 | DecoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
144 | for _ in range(n_layers)])
145 |
146 | def forward(self, tgt_seq, tgt_pos, src_seq, enc_output, return_attns=False):
147 | """
148 | Starts by building the input embedding from the target seq and pos
149 | encodings. Then runs the decoder.
150 |
151 | Args:
152 | tgt_seq: Encodings for the words in the target response
153 | tgt_pos: Positional encodings for the words in the target response
154 | src_seq: Encodings for the words in the history
155 | enc_output: Output from the Encoder
156 | Returns:
157 | dec_output: vector outputs from decoder, one for each word in the response
158 |
159 | """
160 | dec_slf_attn_list, dec_enc_attn_list = [], []
161 |
162 | # -- Prepare masks
163 | non_pad_mask = get_non_pad_mask(tgt_seq)
164 |
165 | slf_attn_mask_subseq = get_subsequent_mask(tgt_seq)
166 | slf_attn_mask_keypad = get_attn_key_pad_mask(seq_k=tgt_seq, seq_q=tgt_seq)
167 | slf_attn_mask = (slf_attn_mask_keypad + slf_attn_mask_subseq).gt(0)
168 |
169 | dec_enc_attn_mask = get_attn_key_pad_mask(seq_k=src_seq, seq_q=tgt_seq)
170 |
171 | # -- Forward
172 | dec_output = self.tgt_word_emb(tgt_seq) + self.position_enc(tgt_pos)
173 |
174 | # Nx decoder layer
175 | for dec_layer in self.layer_stack:
176 | dec_output, dec_slf_attn, dec_enc_attn = dec_layer(
177 | dec_output, enc_output,
178 | non_pad_mask=non_pad_mask,
179 | slf_attn_mask=slf_attn_mask,
180 | dec_enc_attn_mask=dec_enc_attn_mask)
181 |
182 | if return_attns:
183 | dec_slf_attn_list += [dec_slf_attn]
184 | dec_enc_attn_list += [dec_enc_attn]
185 |
186 | if return_attns:
187 | return dec_output, dec_slf_attn_list, dec_enc_attn_list
188 | return dec_output,
189 |
190 | class Transformer(nn.Module):
191 | ''' A sequence to sequence model with attention mechanism. '''
192 |
193 | def __init__(
194 | self,
195 | n_src_vocab, n_tgt_vocab, len_max_seq_enc, len_max_seq_dec,
196 | d_word_vec=512, d_model=512, d_inner=2048,
197 | n_layers=6, n_head=8, d_k=64, d_v=64, dropout=0.1,
198 | tgt_emb_prj_weight_sharing=True,
199 | emb_src_tgt_weight_sharing=True,
200 | pretrained_embeddings=None):
201 |
202 | super().__init__()
203 |
204 | self.encoder = Encoder(
205 | n_src_vocab=n_src_vocab, len_max_seq=len_max_seq_enc,
206 | d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
207 | n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
208 | dropout=dropout, pretrained_embeddings=pretrained_embeddings)
209 |
210 | self.decoder = Decoder(
211 | n_tgt_vocab=n_tgt_vocab, len_max_seq=len_max_seq_dec,
212 | d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
213 | n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
214 | dropout=dropout, pretrained_embeddings=pretrained_embeddings)
215 |
216 | self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
217 | nn.init.xavier_normal_(self.tgt_word_prj.weight)
218 |
219 | assert d_model == d_word_vec, \
220 | 'To facilitate the residual connections, \
221 | the dimensions of all module outputs shall be the same.'
222 |
223 | if tgt_emb_prj_weight_sharing:
224 | # Share the weight matrix between target word embedding & the final logit dense layer
225 | self.tgt_word_prj.weight = self.decoder.tgt_word_emb.weight
226 | self.x_logit_scale = (d_model ** -0.5)
227 | else:
228 | self.x_logit_scale = 1.
229 |
230 | if emb_src_tgt_weight_sharing:
231 | # Share the weight matrix between source & target word embeddings
232 | assert n_src_vocab == n_tgt_vocab, \
233 | "To share word embedding table, the vocabulary size of src/tgt shall be the same."
234 | self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight
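# With tgt_emb_prj_weight_sharing on, the output projection reuses the target
# embedding matrix, and the logits in forward() are scaled by d_model ** -0.5
# to keep the shared weights at a comparable magnitude — mirroring the
# embedding/projection weight sharing used in Attention Is All You Need.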
235 |
236 | def forward(self, src_seq, src_pos, src_seg, tgt_seq, tgt_pos):
237 | """
238 | Takes in the input features for the history and response, and returns a prediction.
239 |
240 | First encodes the history, and then decodes it before mapping the output to the vocabulary
241 |
242 | Args:
243 | src_seq: Encodings for the words in the history
244 | src_pos: Positional encodings for the words in the history
245 | src_seg: Segment encodings for turns in the history
246 | tgt_seq: Encodings for the words in the target response
247 | tgt_pos: Positional encodings for the words in the target response
248 | Returns:
249 | outputs: Unnormalized scores (logits) over the vocabulary for each position in the response
250 | """
251 |
252 | tgt_seq, tgt_pos = tgt_seq[:, :-1], tgt_pos[:, :-1]
253 |
254 | enc_output, *_ = self.encoder(src_seq, src_pos, src_seg)
255 | dec_output, *_ = self.decoder(tgt_seq, tgt_pos, src_seq, enc_output)
256 | outputs = self.tgt_word_prj(dec_output) * self.x_logit_scale
257 |
258 | return outputs.view(-1, outputs.size(2))
259 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/transformer/Modules.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 |
5 | __author__ = "Yu-Hsiang Huang"
6 |
7 | class ScaledDotProductAttention(nn.Module):
8 | ''' Scaled Dot-Product Attention '''
9 |
10 | def __init__(self, temperature, attn_dropout=0.1):
11 | super().__init__()
12 | self.temperature = temperature
13 | self.dropout = nn.Dropout(attn_dropout)
14 | self.softmax = nn.Softmax(dim=2)
15 |
16 | def forward(self, q, k, v, mask=None):
17 | """
18 | Gets the queries, keys, and values for each attention head.
19 |
20 | The queries and keys are multiplied, and the result is scaled, masked,
21 | softmaxed, and passed through dropout to get the attention weights
22 |
23 | These weights are applied to the values via matrix multiplication
24 |
25 | Args:
26 | q: Query
27 | k: Key
28 | v: Value
29 | mask: (Default value = None) Positions to exclude from attention
30 |
31 | Returns:
32 | output, attn: the attention-weighted values and the attention weights
33 | """
34 |
35 | # MatMul
36 | attn = torch.bmm(q, k.transpose(1, 2))
37 | # Scale
38 | attn = attn / self.temperature
39 |
40 | # Mask
41 | if mask is not None:
42 | attn = attn.masked_fill(mask, -np.inf)
43 |
44 | # softmax/dropout
45 | attn = self.softmax(attn)
46 | attn = self.dropout(attn)
47 |
48 | # Matmul
49 | output = torch.bmm(attn, v)
50 |
51 | return output, attn
52 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/transformer/Optim.py:
--------------------------------------------------------------------------------
1 | '''A wrapper class for optimizer '''
2 | import numpy as np
3 |
4 | class ScheduledOptim():
5 | '''A simple wrapper class for learning rate scheduling'''
6 |
7 | def __init__(self, optimizer, d_model, n_warmup_steps):
8 | self.optimizer = optimizer
9 | self.n_warmup_steps = n_warmup_steps
10 | self.n_current_steps = 0
11 | self.init_lr = np.power(d_model, -0.5)
12 |
13 | def step_and_update_lr(self):
14 | "Step with the inner optimizer"
15 | self._update_learning_rate()
16 | self.optimizer.step()
17 |
18 | def zero_grad(self):
19 | "Zero out the gradients with the inner optimizer"
20 | self.optimizer.zero_grad()
21 |
22 | def _get_lr_scale(self):
23 | return np.min([
24 | np.power(self.n_current_steps, -0.5),
25 | np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])
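# The schedule implemented by _get_lr_scale and _update_learning_rate is the
# "Noam" schedule from Attention Is All You Need:
#   lr = d_model**-0.5 * min(step**-0.5, step * n_warmup_steps**-1.5)
# i.e. a linear warmup for n_warmup_steps followed by 1/sqrt(step) decay.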
26 |
27 | def _update_learning_rate(self):
28 | ''' Learning rate scheduling per step '''
29 |
30 | self.n_current_steps += 1
31 | lr = self.init_lr * self._get_lr_scale()
32 |
33 | for param_group in self.optimizer.param_groups:
34 | param_group['lr'] = lr
35 |
36 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/transformer/SubLayers.py:
--------------------------------------------------------------------------------
1 | ''' Define the sublayers in encoder/decoder layer '''
2 | import numpy as np
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from transformer.Modules import ScaledDotProductAttention
6 |
7 | __author__ = "Yu-Hsiang Huang"
8 |
9 | class MultiHeadAttention(nn.Module):
10 | ''' Multi-Head Attention module '''
11 |
12 | def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
13 | super().__init__()
14 |
15 | self.n_head = n_head
16 | self.d_k = d_k
17 | self.d_v = d_v
18 |
19 | self.w_qs = nn.Linear(d_model, n_head * d_k)
20 | self.w_ks = nn.Linear(d_model, n_head * d_k)
21 | self.w_vs = nn.Linear(d_model, n_head * d_v)
22 | nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
23 | nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
24 | nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))
25 |
26 | self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
27 | self.layer_norm = nn.LayerNorm(d_model)
28 |
29 | self.fc = nn.Linear(n_head * d_v, d_model)
30 | nn.init.xavier_normal_(self.fc.weight)
31 |
32 | self.dropout = nn.Dropout(dropout)
33 |
34 |
35 | def forward(self, q, k, v, mask=None):
36 | """
37 | First passes the queries, keys, and values through linear layers to get
38 | n_head inputs for scaled dot-product attention
39 |
40 | then performs scaled dot-product attention and applies a residual
41 | connection and layer normalization before returning the output
42 |
43 | Args:
44 | q: Query
45 | k: Key
46 | v: Value
47 | mask: (Default value = None) Attention mask
48 |
49 | Returns:
50 | output: output from multi-head attention
51 | """
52 |
53 | d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
54 |
55 | sz_b, len_q, _ = q.size()
56 | sz_b, len_k, _ = k.size()
57 | sz_b, len_v, _ = v.size()
58 |
59 | residual = q
60 |
61 | # Linear
62 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
63 | k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
64 | v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
65 |
66 | q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk
67 | k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk
68 | v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv
69 |
70 | mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x ..
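# After the reshape above, q/k/v are (n_head * batch) x len x d, so the single
# batched matmul inside self.attention below covers every head at once; the
# mask is tiled n_head times so each head sees the same padding/subsequent
# pattern.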
71 |
72 | # Scaled Dot-Product Attention
73 | output, attn = self.attention(q, k, v, mask=mask)
74 |
75 | output = output.view(n_head, sz_b, len_q, d_v)
76 | output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv)
77 |
78 | output = self.dropout(self.fc(output))
79 |
80 | # Add and Norm
81 | output = self.layer_norm(output + residual)
82 |
83 | return output, attn
84 |
85 | class PositionwiseFeedForward(nn.Module):
86 | ''' A two-feed-forward-layer module '''
87 |
88 | def __init__(self, d_in, d_hid, dropout=0.1):
89 | super().__init__()
90 | self.w_1 = nn.Conv1d(d_in, d_hid, 1) # position-wise
91 | self.w_2 = nn.Conv1d(d_hid, d_in, 1) # position-wise
92 | self.layer_norm = nn.LayerNorm(d_in)
93 | self.dropout = nn.Dropout(dropout)
94 |
95 | def forward(self, x):
96 | """
97 | A position-wise feed-forward layer that is applied after attention in the
98 | encoder and decoder
99 | Args:
100 | x: input
101 |
102 | Returns:
103 | output: transformed input, same shape as x
104 | """
105 | # feed forward
106 | residual = x
107 | output = x.transpose(1, 2)
108 | output = self.w_2(F.relu(self.w_1(output)))
109 | output = output.transpose(1, 2)
110 | output = self.dropout(output)
111 |
112 | # Add and norm
113 | output = self.layer_norm(output + residual)
114 | return output
115 |
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/transformer/Translator.py:
--------------------------------------------------------------------------------
1 | ''' This module will handle the text generation with beam search. '''
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 | from transformer.Models import Transformer
8 | from transformer.Beam import Beam
9 |
10 | class Chatbot(object):
11 | ''' Load a trained model and handle the beam search '''
12 |
13 | def __init__(self, config, model):
14 | self.config = config
15 | self.device = torch.device(config["device"])
16 |
17 | model.word_prob_prj = nn.LogSoftmax(dim=1)
18 |
19 | model = model.to(self.device)
20 |
21 | self.model = model
22 | self.model.eval()
23 |
24 | def translate_batch(self, src_seq, src_pos, src_seg):
25 | ''' Run translation for one batch '''
26 |
27 | def get_inst_idx_to_tensor_position_map(inst_idx_list):
28 | ''' Indicate the position of an instance in a tensor. '''
29 | return {inst_idx: tensor_position for tensor_position, inst_idx in enumerate(inst_idx_list)}
30 |
31 | def collect_active_part(beamed_tensor, curr_active_inst_idx, n_prev_active_inst, n_bm):
32 | ''' Collect tensor parts associated with active instances. '''
33 |
34 | _, *d_hs = beamed_tensor.size()
35 | n_curr_active_inst = len(curr_active_inst_idx)
36 | new_shape = (n_curr_active_inst * n_bm, *d_hs)
37 |
38 | beamed_tensor = beamed_tensor.view(n_prev_active_inst, -1)
39 | beamed_tensor = beamed_tensor.index_select(0, curr_active_inst_idx)
40 | beamed_tensor = beamed_tensor.view(*new_shape)
41 |
42 | return beamed_tensor
43 |
44 | def collate_active_info(
45 | src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list):
46 | # Sentences which are still active are collected,
47 | # so the decoder will not run on completed sentences.
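# inst_idx_to_position_map maps each original batch index to its current row
# in the shrinking active tensors; rebuilding it every decoding step keeps the
# beams aligned after finished instances are dropped from the batch.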
48 |             n_prev_active_inst = len(inst_idx_to_position_map)
49 |             active_inst_idx = [inst_idx_to_position_map[k] for k in active_inst_idx_list]
50 |             active_inst_idx = torch.LongTensor(active_inst_idx).to(self.device)
51 | 
52 |             active_src_seq = collect_active_part(src_seq, active_inst_idx, n_prev_active_inst, n_bm)
53 |             active_src_enc = collect_active_part(src_enc, active_inst_idx, n_prev_active_inst, n_bm)
54 |             active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)
55 | 
56 |             return active_src_seq, active_src_enc, active_inst_idx_to_position_map
57 | 
58 |         def beam_decode_step(
59 |                 inst_dec_beams, len_dec_seq, src_seq, enc_output, inst_idx_to_position_map, n_bm):
60 |             ''' Decode and update beam status, and then return active beam idx '''
61 | 
62 |             def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq):
63 |                 dec_partial_seq = [b.get_current_state() for b in inst_dec_beams if not b.done]
64 |                 dec_partial_seq = torch.stack(dec_partial_seq).to(self.device)
65 |                 dec_partial_seq = dec_partial_seq.view(-1, len_dec_seq)
66 |                 return dec_partial_seq
67 | 
68 |             def prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm):
69 |                 dec_partial_pos = torch.arange(1, len_dec_seq + 1, dtype=torch.long, device=self.device)
70 |                 dec_partial_pos = dec_partial_pos.unsqueeze(0).repeat(n_active_inst * n_bm, 1)
71 |                 return dec_partial_pos
72 | 
73 |             def predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm):
74 |                 dec_output, *_ = self.model.decoder(dec_seq, dec_pos, src_seq, enc_output)
75 |                 dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
76 |                 word_prob = F.log_softmax(self.model.tgt_word_prj(dec_output), dim=1)
77 |                 word_prob = word_prob.view(n_active_inst, n_bm, -1)
78 | 
79 |                 return word_prob
80 | 
81 |             def collect_active_inst_idx_list(inst_beams, word_prob, inst_idx_to_position_map):
82 |                 active_inst_idx_list = []
83 |                 for inst_idx, inst_position in inst_idx_to_position_map.items():
84 |                     is_inst_complete = inst_beams[inst_idx].advance(word_prob[inst_position])
85 |                     if not is_inst_complete:
86 |                         active_inst_idx_list += [inst_idx]
87 | 
88 |                 return active_inst_idx_list
89 | 
90 |             n_active_inst = len(inst_idx_to_position_map)
91 | 
92 |             dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq)
93 |             dec_pos = prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm)
94 |             word_prob = predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm)
95 | 
96 |             # Update the beam with predicted word prob information and collect incomplete instances
97 |             active_inst_idx_list = collect_active_inst_idx_list(
98 |                 inst_dec_beams, word_prob, inst_idx_to_position_map)
99 | 
100 |             return active_inst_idx_list
101 | 
102 |         def collect_hypothesis_and_scores(inst_dec_beams, n_best):
103 |             all_hyp, all_scores = [], []
104 |             for inst_idx in range(len(inst_dec_beams)):
105 |                 scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores()
106 |                 all_scores += [scores[:n_best]]
107 | 
108 |                 hyps = [inst_dec_beams[inst_idx].get_hypothesis(i) for i in tail_idxs[:n_best]]
109 |                 all_hyp += [hyps]
110 |             return all_hyp, all_scores
111 | 
112 |         with torch.no_grad():
113 |             #-- Encode
114 |             src_seq, src_pos, src_seg = src_seq.to(self.device), src_pos.to(self.device), src_seg.to(self.device)
115 |             src_enc, *_ = self.model.encoder(src_seq, src_pos, src_seg)
116 | 
117 |             #-- Repeat data for beam search
118 |             n_bm = self.config["beam_size"]
119 |             n_inst, len_s, d_h = src_enc.size()
120 |             src_seq = src_seq.repeat(1, n_bm).view(n_inst * n_bm, len_s)
121 |             src_enc = src_enc.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, d_h)
122 | 
123 |             #-- Prepare beams
124 |             inst_dec_beams = [Beam(n_bm, device=self.device) for _ in range(n_inst)]
125 | 
126 |             #-- Bookkeeping for active or not
127 |             active_inst_idx_list = list(range(n_inst))
128 |             inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)
129 | 
130 |             #-- Decode
131 |             for len_dec_seq in range(1, self.config["response_len"] + 1):
132 | 
133 |                 active_inst_idx_list = beam_decode_step(
134 |                     inst_dec_beams, len_dec_seq, src_seq, src_enc, inst_idx_to_position_map, n_bm)
135 | 
136 |                 if not active_inst_idx_list:
137 |                     break  # all instances have finished their path to <EOS>
138 | 
139 |                 src_seq, src_enc, inst_idx_to_position_map = collate_active_info(
140 |                     src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list)
141 | 
142 |             batch_hyp, batch_scores = collect_hypothesis_and_scores(inst_dec_beams, self.config["n_best"])
143 | 
144 |             return batch_hyp, batch_scores
145 | 
--------------------------------------------------------------------------------
/tutorials/pytorch_track/tutorial_6/transformer_tutorial.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# Tutorial: Transformer for Dialogue\n",
8 |     "This tutorial will go over the process of implementing a transformer for dialogue. \n",
9 |     "\n",
10 |     "Before running, make sure you have \"data_small\" and \"pretrained_model\" in the same directory as this file. These folders can be downloaded from this Dropbox: https://www.dropbox.com/sh/3clajk8a3gr3qde/AADInNzuRyDI7YCDVYSvo0cxa?dl=0\n",
11 |     "\n",
12 |     "The transformer is described in the paper Attention Is All You Need\n",
13 |     "(https://arxiv.org/abs/1706.03762 )\n",
14 |     "\n",
15 |     "Dataset: OpenSubtitles - http://opus.nlpl.eu/OpenSubtitles-v2018.php\n",
16 |     "\n",
17 |     "Transformer code - https://github.com/jadore801120/attention-is-all-you-need-pytorch"
18 |    ]
19 |   },
20 |   {
21 |    "cell_type": "markdown",
22 |    "metadata": {},
23 |    "source": [
24 |     "# Import libraries"
25 |    ]
26 |   },
27 |   {
28 |    "cell_type": "code",
29 |    "execution_count": null,
30 |    "metadata": {},
31 |    "outputs": [],
32 |    "source": [
33 |     "import numpy as np\n",
34 |     "import json\n",
35 |     "import torch\n",
36 |     "import torch.nn.functional as F\n",
37 |     "import os\n",
38 |     "import random\n",
39 |     "from tqdm import tqdm\n",
40 |     "import ipywidgets as widgets\n",
41 |     "\n",
42 |     "import transformer\n",
43 |     "from transformer.Models import Transformer\n",
44 |     "from transformer.Translator import Chatbot\n",
45 |     "from dataset import DialogueDataset, Vocab"
46 |    ]
47 |   },
48 |   {
49 |    "cell_type": "markdown",
50 |    "metadata": {},
51 |    "source": [
52 |     "# Load config \n",
53 |     "\n",
54 |     "\n", "Now, load the config file. This file contains all of the hyperparameters for the experiment.\n",
55 |     "\n",
56 |     "If you want to change the parameters, change them in the config.json file."
57 |    ]
58 |   },
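For orientation, config.json needs to define at least the keys this notebook reads later. A minimal sketch of its contents, written as a Python dict; only the key names are taken from this notebook, while the values are illustrative guesses, not the shipped defaults:

```python
# Hypothetical config values -- check the shipped config.json for the real ones.
config_example = {
    "device": "cuda",                  # or "cpu"
    "output_dir": "output",
    "dataset_filename": "data_small",  # folder containing train.csv / val.csv
    "history_len": 100, "response_len": 30,
    "train_batch_size": 64, "val_batch_size": 64,
    "embedding_dim": 512, "model_dim": 512, "inner_dim": 2048,
    "num_layers": 6, "num_heads": 8, "dim_k": 64, "dim_v": 64,
    "dropout": 0.1, "warmup_steps": 4000, "label_smoothing": True,
    "num_epochs": 10, "old_model_dir": None,
    "beam_size": 5, "n_best": 5, "choose_best": False,
}
```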
\n", 55 | "\n", 56 | "If you want to change the parameters, change them in the config.json file" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# load config\n", 66 | "with open(\"config.json\", \"r\") as f:\n", 67 | " config = json.load(f)\n", 68 | "\n", 69 | "for key, data in config.items():\n", 70 | " print(\"{}: {}\".format(key, data))" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# create output dir to save model, and results in\n", 80 | "if not os.path.exists(config[\"output_dir\"]):\n", 81 | " os.mkdir(config[\"output_dir\"])" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "# Load Data\n", 89 | "\n", 90 | "Next we will create our training and validation dataset objects.\n", 91 | "\n", 92 | "The dataset takes the dataset filename, the max length for the history, and the max length for the response. you can initialize the vocab with an already existing vocab object by passing the vocab object. There is also a setting to not update the vocab with the new documents-this is useful for running pretrianed models where you need to have the same vocab as the old model.\n", 93 | "\n", 94 | "We want the 2 datasets to have the same vocab, so the validation dataset will be initialized with the trianing vocab, and the updated vocab from the val dataset is set to the train dataset." 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "scrolled": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# create train dataset\n", 106 | "train_dataset = DialogueDataset(\n", 107 | " os.path.join(config[\"dataset_filename\"], \"train.csv\"),\n", 108 | " config[\"history_len\"],\n", 109 | " config[\"response_len\"])\n", 110 | "\n", 111 | "# creat validation dataset\n", 112 | "val_dataset = DialogueDataset(\n", 113 | " os.path.join(config[\"dataset_filename\"], \"val.csv\"),\n", 114 | " config[\"history_len\"],\n", 115 | " config[\"response_len\"],\n", 116 | " train_dataset.vocab)\n", 117 | "\n", 118 | "# set vocab:\n", 119 | "vocab = val_dataset.vocab\n", 120 | "train_dataset.vocab = vocab\n", 121 | "config[\"vocab_size\"] = len(vocab)\n", 122 | "vocab.save_to_dict(os.path.join(config[\"output_dir\"], \"vocab.json\"))\n", 123 | "\n", 124 | "# print info\n", 125 | "print(\"train_len: {}\\nval_len: {}\\nvocab_size: {}\".format(len(train_dataset), len(val_dataset), len(vocab)))" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "Dataloaders for the model are initialized with the datasets\n", 133 | "\n", 134 | "We want to shuffle the train dataset, but it does not matter for validation" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# initialize dataloaders\n", 144 | "data_loader_train = torch.utils.data.DataLoader(\n", 145 | " train_dataset, config[\"train_batch_size\"], shuffle=True)\n", 146 | "data_loader_val = torch.utils.data.DataLoader(\n", 147 | " val_dataset, config[\"val_batch_size\"], shuffle=False)\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "# Create Model\n", 155 | "The transformer model is initialized with the parameters in the config file. 
150 |   {
151 |    "cell_type": "markdown",
152 |    "metadata": {},
153 |    "source": [
154 |     "# Create Model\n",
155 |     "The transformer model is initialized with the parameters in the config file. You can change these parameters to improve the model."
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "# initialize device ('cuda', or 'cpu')\n",
165 |     "device = torch.device(config[\"device\"])"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": null,
171 |    "metadata": {},
172 |    "outputs": [],
173 |    "source": [
174 |     "# create model\n",
175 |     "model = Transformer(\n",
176 |     "    config[\"vocab_size\"],\n",
177 |     "    config[\"vocab_size\"],\n",
178 |     "    config[\"history_len\"],\n",
179 |     "    config[\"response_len\"],\n",
180 |     "    d_word_vec=config[\"embedding_dim\"],\n",
181 |     "    d_model=config[\"model_dim\"],\n",
182 |     "    d_inner=config[\"inner_dim\"],\n",
183 |     "    n_layers=config[\"num_layers\"],\n",
184 |     "    n_head=config[\"num_heads\"],\n",
185 |     "    d_k=config[\"dim_k\"],\n",
186 |     "    d_v=config[\"dim_v\"],\n",
187 |     "    dropout=config[\"dropout\"]\n",
188 |     ").to(device)"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "markdown",
193 |    "metadata": {},
194 |    "source": [
195 |     "# Create Optimizer\n",
196 |     "\n",
197 |     "The transformer paper updates the learning rate during training: it warms up linearly for the first warmup steps, then decays with the inverse square root of the step number. To do this, we will make a scheduled optimizer wrapper class.\n",
198 |     "\n",
199 |     "We use an Adam optimizer."
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": null,
205 |    "metadata": {},
206 |    "outputs": [],
207 |    "source": [
208 |     "# optimizer class for updating the learning rate\n",
209 |     "class ScheduledOptim():\n",
210 |     "    '''A simple wrapper class for learning rate scheduling'''\n",
211 |     "\n",
212 |     "    def __init__(self, optimizer, d_model, n_warmup_steps):\n",
213 |     "        self.optimizer = optimizer\n",
214 |     "        self.n_warmup_steps = n_warmup_steps\n",
215 |     "        self.n_current_steps = 0\n",
216 |     "        self.init_lr = np.power(d_model, -0.5)\n",
217 |     "\n",
218 |     "    def step_and_update_lr(self):\n",
219 |     "        \"Step with the inner optimizer\"\n",
220 |     "        self._update_learning_rate()\n",
221 |     "        self.optimizer.step()\n",
222 |     "\n",
223 |     "    def zero_grad(self):\n",
224 |     "        \"Zero out the gradients with the inner optimizer\"\n",
225 |     "        self.optimizer.zero_grad()\n",
226 |     "\n",
227 |     "    def _get_lr_scale(self):\n",
228 |     "        return np.min([\n",
229 |     "            np.power(self.n_current_steps, -0.5),\n",
230 |     "            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])\n",
231 |     "\n",
232 |     "    def _update_learning_rate(self):\n",
233 |     "        ''' Learning rate scheduling per step '''\n",
234 |     "\n",
235 |     "        self.n_current_steps += 1\n",
236 |     "        lr = self.init_lr * self._get_lr_scale()\n",
237 |     "\n",
238 |     "        for param_group in self.optimizer.param_groups:\n",
239 |     "            param_group['lr'] = lr\n"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": null,
245 |    "metadata": {},
246 |    "outputs": [],
247 |    "source": [
248 |     "# create optimizer\n",
249 |     "optimizer = torch.optim.Adam(\n",
250 |     "    filter(lambda x: x.requires_grad, model.parameters()),\n",
251 |     "    betas=(0.9, 0.98), eps=1e-09)\n",
252 |     "# create a scheduled optimizer object\n",
253 |     "optimizer = ScheduledOptim(\n",
254 |     "    optimizer, config[\"model_dim\"], config[\"warmup_steps\"])"
255 |    ]
256 |   },
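The schedule implemented by ScheduledOptim is lr = d_model^(-0.5) * min(step^(-0.5), step * warmup^(-1.5)): linear warmup followed by inverse-square-root decay. A small standalone sketch that prints the curve at a few steps (the d_model and warmup values here are made up, not necessarily the config's):

```python
# Standalone illustration of the ScheduledOptim learning-rate curve.
d_model, warmup = 512, 4000  # illustrative values
for step in [1, 1000, 4000, 16000, 64000]:
    lr = d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)
    print(f"step {step:>6}: lr = {lr:.2e}")  # rises until step == warmup, then decays
```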
257 |   {
258 |    "cell_type": "markdown",
259 |    "metadata": {},
260 |    "source": [
261 |     "# Load Pretrained Model\n",
262 |     "If you want to run a pretrained model, change \"old_model_dir\" from None to the directory containing the pretrained model.\n",
263 |     "\n",
264 |     "You must have the same vocab as the old model, so that is loaded as well."
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "code",
269 |    "execution_count": null,
270 |    "metadata": {},
271 |    "outputs": [],
272 |    "source": [
273 |     "def save_checkpoint(filename, model, optimizer):\n",
274 |     "    '''\n",
275 |     "    saves the model into a state dict, along with its training statistics\n",
276 |     "    and parameters\n",
277 |     "    :param filename: path to save the checkpoint to\n",
278 |     "    :param model:\n",
279 |     "    :param optimizer:\n",
280 |     "    '''\n",
281 |     "    state = {\n",
282 |     "        'model': model.state_dict(),\n",
283 |     "        'optimizer': optimizer.state_dict(),\n",
284 |     "    }\n",
285 |     "    torch.save(state, filename)"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": null,
291 |    "metadata": {},
292 |    "outputs": [],
293 |    "source": [
294 |     "def load_checkpoint(filename, model, optimizer, device):\n",
295 |     "    '''\n",
296 |     "    loads a previous model\n",
297 |     "    :param filename: file name of the model\n",
298 |     "    :param model: model that has the same parameters as the one you are loading\n",
299 |     "    :param optimizer:\n",
300 |     "    :return: loaded model and optimizer (unchanged if the file does not exist)\n",
301 |     "    '''\n",
302 |     "    if os.path.isfile(filename):\n",
303 |     "        checkpoint = torch.load(filename, map_location=device)\n",
304 |     "        model.load_state_dict(checkpoint['model'])\n",
305 |     "        optimizer.load_state_dict(checkpoint['optimizer'])\n",
306 |     "    return model, optimizer\n"
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "code",
311 |    "execution_count": null,
312 |    "metadata": {},
313 |    "outputs": [],
314 |    "source": [
315 |     "if config[\"old_model_dir\"] is not None:\n",
316 |     "    model, optimizer.optimizer = load_checkpoint(os.path.join(config[\"old_model_dir\"], \"model.bin\"),\n",
317 |     "                                                 model, optimizer.optimizer, device)\n",
318 |     "    vocab.load_from_dict(os.path.join(config[\"old_model_dir\"], \"vocab.json\"))"
319 |    ]
320 |   },
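A minimal sketch of how the two helpers pair up; the model.bin filename follows the convention used for the pretrained model above, so adjust it if you save elsewhere:

```python
# Illustrative round trip: save a checkpoint, then restore it.
ckpt_path = os.path.join(config["output_dir"], "model.bin")
save_checkpoint(ckpt_path, model, optimizer.optimizer)
model, optimizer.optimizer = load_checkpoint(
    ckpt_path, model, optimizer.optimizer, device)
```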
321 |   {
322 |    "cell_type": "markdown",
323 |    "metadata": {},
324 |    "source": [
325 |     "# Output an Example\n",
326 |     "Sometimes it is useful to see what the model is doing, so we will create a function that outputs an example from the validation set, along with the prediction from the model."
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": null,
332 |    "metadata": {},
333 |    "outputs": [],
334 |    "source": [
335 |     "def output_example(model, val_dataset, device, vocab):\n",
336 |     "    '''output an example and the model's prediction for that example'''\n",
337 |     "    random_index = random.randint(0, len(val_dataset) - 1)  # randint is inclusive on both ends\n",
338 |     "    example = val_dataset[random_index]\n",
339 |     "\n",
340 |     "    # prepare data\n",
341 |     "    h_seq, h_pos, h_seg, r_seq, r_pos = map(\n",
342 |     "        lambda x: torch.from_numpy(x).to(device).unsqueeze(0), example)\n",
343 |     "\n",
344 |     "    # drop the first (start) token from the target: the model predicts the next token at each step\n",
345 |     "    gold = r_seq[:, 1:]\n",
346 |     "\n",
347 |     "    # forward\n",
348 |     "    pred = model(h_seq, h_pos, h_seg, r_seq, r_pos)\n",
349 |     "    output = torch.argmax(pred, dim=1)\n",
350 |     "\n",
351 |     "    # get history text\n",
352 |     "    string = \"history: \"\n",
353 |     "    seg = -1\n",
354 |     "    for i, idx in enumerate(h_seg.squeeze()):\n",
355 |     "        if seg != idx.item():\n",
356 |     "            string += \"\\n\"\n",
357 |     "            seg = idx.item()\n",
358 |     "        token = vocab.id2token[h_seq.squeeze()[i].item()]\n",
359 |     "        if token != '':\n",
360 |     "            string += \"{} \".format(token)\n",
361 |     "\n",
362 |     "    # get target text\n",
363 |     "    string += \"\\nTarget:\\n\"\n",
364 |     "    for idx in gold.squeeze():\n",
365 |     "        token = vocab.id2token[idx.item()]\n",
366 |     "        string += \"{} \".format(token)\n",
367 |     "\n",
368 |     "    # get prediction\n",
369 |     "    string += \"\\n\\nPrediction:\\n\"\n",
370 |     "    for idx in output:\n",
371 |     "        token = vocab.id2token[idx.item()]\n",
372 |     "        string += \"{} \".format(token)\n",
373 |     "\n",
374 |     "    # print\n",
375 |     "    print(\"\\n------------------------\\n\")\n",
376 |     "    print(string)\n",
377 |     "    print(\"\\n------------------------\\n\")"
378 |    ]
379 |   },
380 |   {
381 |    "cell_type": "markdown",
382 |    "metadata": {},
383 |    "source": [
384 |     "# Calculate Performance\n",
385 |     "\n",
386 |     "First calculate the loss, with or without smoothing.\n",
387 |     "\n",
388 |     "In Attention Is All You Need, label smoothing is applied to the loss function. This makes the model more \"unsure\", which improves accuracy, but it hurts perplexity, since the smoothed targets raise the cross-entropy that perplexity is computed from.\n",
389 |     "\n",
390 |     "Then calculate the number of correctly predicted tokens, to compute accuracy later."
391 |    ]
392 |   },
393 |   {
394 |    "cell_type": "code",
395 |    "execution_count": null,
396 |    "metadata": {},
397 |    "outputs": [],
398 |    "source": [
399 |     "def cal_performance(pred, gold, smoothing=False):\n",
400 |     "    ''' Apply label smoothing if needed '''\n",
401 |     "\n",
402 |     "    loss = cal_loss(pred, gold, smoothing)\n",
403 |     "\n",
404 |     "    pred = pred.max(1)[1]\n",
405 |     "    gold = gold.contiguous().view(-1)\n",
406 |     "    non_pad_mask = gold.ne(transformer.Constants.PAD)\n",
407 |     "    # eq computes element-wise equality\n",
408 |     "    n_correct = pred.eq(gold)\n",
409 |     "    n_correct = n_correct.masked_select(non_pad_mask).sum().item()\n",
410 |     "\n",
411 |     "    return loss, n_correct"
412 |    ]
413 |   },
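To see concretely what the smoothing in cal_loss below does to a one-hot target: with eps = 0.1, the true class keeps 1 - eps of the probability mass, and the rest is spread evenly over the other classes. A tiny worked example with 5 classes:

```python
# Worked example of the label-smoothing formula used in cal_loss below.
eps, n_class, true_class = 0.1, 5, 2
smoothed = [(1 - eps) if c == true_class else eps / (n_class - 1)
            for c in range(n_class)]
print(smoothed)       # [0.025, 0.025, 0.9, 0.025, 0.025]
print(sum(smoothed))  # 1.0 -- still a valid probability distribution
```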
'''\n", 422 | "\n", 423 | " gold = gold.contiguous().view(-1)\n", 424 | "\n", 425 | " if smoothing:\n", 426 | " eps = 0.1\n", 427 | " n_class = pred.size(1)\n", 428 | "\n", 429 | " one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1)\n", 430 | " one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)\n", 431 | " log_prb = F.log_softmax(pred, dim=1)\n", 432 | "\n", 433 | " non_pad_mask = gold.ne(transformer.Constants.PAD)\n", 434 | " loss = -(one_hot * log_prb).sum(dim=1)\n", 435 | " #loss = loss.masked_select(non_pad_mask).sum() # average later\n", 436 | " loss = loss.masked_select(non_pad_mask).mean()\n", 437 | " else:\n", 438 | " loss = F.cross_entropy(pred, gold, ignore_index=transformer.Constants.PAD, reduction='mean')\n", 439 | " return loss" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "# Forward Pass\n", 447 | "First prepares the inputs by sending the features to the respective device\n", 448 | "-src_seq: input word encodings\n", 449 | "-src_pos: input positional encodings\n", 450 | "-src_seg: input sequence encodings, for the turns in dialogue history\n", 451 | "-tgt_seq: target word encodings\n", 452 | "-tgt_pos: target positional encodings\n", 453 | "\n", 454 | "gold is the target but without the CLS token at the begining\n", 455 | "\n", 456 | "If you are training, you want to clear the gradients before getting the output" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "# forward\n", 466 | "def forward(phase, batch, model, optimizer):\n", 467 | " h_seq, h_pos, h_seg, r_seq, r_pos = map(\n", 468 | " lambda x: x.to(device), batch)\n", 469 | "\n", 470 | " gold = r_seq[:, 1:]\n", 471 | "\n", 472 | " # forward\n", 473 | " if phase == \"train\":\n", 474 | " optimizer.zero_grad()\n", 475 | " pred = model(h_seq, h_pos, h_seg, r_seq, r_pos)\n", 476 | " \n", 477 | " return pred, gold\n", 478 | " " 479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": {}, 484 | "source": [ 485 | "# Backward Pass\n", 486 | "The backward pass computes the loss, and updates the models parameters if it is training\n", 487 | "\n", 488 | "returns the loss, and the number of correct outputs" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "# backward\n", 498 | "def backward(phase, pred, gold, config):\n", 499 | " # get loss\n", 500 | " loss, n_correct = cal_performance(pred, gold,\n", 501 | " smoothing=config[\"label_smoothing\"])\n", 502 | " \n", 503 | " if phase == \"train\":\n", 504 | " # backward\n", 505 | " loss.backward()\n", 506 | "\n", 507 | " # update parameters, and learning rate\n", 508 | " optimizer.step_and_update_lr()\n", 509 | "\n", 510 | " return float(loss), n_correct" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "# Training Loop\n", 518 | "For every epoch, the loop runs training and evaluation.\n", 519 | "\n", 520 | "Setting the model to eval mode vs training mode disables things like dropout layers, and other things you do not want during evaluation\n", 521 | "\n", 522 | "Metrics are initialized, and saved to the output file\n", 523 | "\n", 524 | "after running validation, we want to save the weights of the model only if the validation loss is lower than it has been before. This means we will only save the best model." 
527 |   {
528 |    "cell_type": "markdown",
529 |    "metadata": {},
530 |    "source": [
531 |     "The next step before running training is to initialize a dictionary for the results of training. It is important to be organized with experiment results.\n",
532 |     "\n",
533 |     "We want to save the weights of the model only when the validation loss is lower than it has been before, so the lowest loss is initialized to an arbitrarily large number. If the validation loss is lower than the lowest loss, save the weights and set the lowest loss to the validation loss."
534 |    ]
535 |   },
536 |   {
537 |    "cell_type": "code",
538 |    "execution_count": null,
539 |    "metadata": {},
540 |    "outputs": [],
541 |    "source": [
542 |     "# initialize results, add config to them\n",
543 |     "results = dict()\n",
544 |     "results[\"config\"] = config\n",
545 |     "\n",
546 |     "# initialize lowest validation loss, used to decide when to save weights\n",
547 |     "lowest_loss = 999"
548 |    ]
549 |   },
550 |   {
551 |    "cell_type": "code",
552 |    "execution_count": null,
553 |    "metadata": {},
554 |    "outputs": [],
555 |    "source": [
556 |     "import time  # needed for the phase timing below\n", "# begin training\n",
557 |     "for epoch in range(config[\"num_epochs\"]):\n",
558 |     "    epoch_metrics = dict()\n",
559 |     "    # output an example\n",
560 |     "    output_example(model, val_dataset, device, vocab)\n",
561 |     "    # run each phase per epoch\n",
562 |     "    for phase in [\"train\", \"val\"]:\n",
563 |     "        if phase == \"train\":\n",
564 |     "            # set model to training mode\n",
565 |     "            model.train()\n",
566 |     "            dataloader = data_loader_train\n",
567 |     "            batch_size = config[\"train_batch_size\"]\n",
568 |     "        else:\n",
569 |     "            # set model to evaluation mode\n",
570 |     "            model.eval()\n",
571 |     "            dataloader = data_loader_val\n",
572 |     "            batch_size = config[\"val_batch_size\"]\n",
573 |     "\n", "        start = time.time()  # start the phase timer\n",
574 |     "        # initialize metrics\n",
575 |     "        phase_metrics = dict()\n",
576 |     "        epoch_loss = list()\n",
577 |     "        average_epoch_loss = None\n",
578 |     "        n_word_total = 0\n",
579 |     "        n_correct = 0\n",
580 |     "        n_word_correct = 0\n",
581 |     "        for i, batch in enumerate(tqdm(dataloader, mininterval=2, desc=phase, leave=False)):\n",
582 |     "            # forward\n",
583 |     "            pred, gold = forward(phase, batch, model, optimizer)\n",
584 |     "            # backward\n",
585 |     "            loss, n_correct = backward(phase, pred, gold, config)\n",
586 |     "\n",
587 |     "            # record loss\n",
588 |     "            epoch_loss.append(loss)\n",
589 |     "            average_epoch_loss = np.mean(epoch_loss)\n",
590 |     "\n",
591 |     "            # get accuracy\n",
592 |     "            non_pad_mask = gold.ne(transformer.Constants.PAD)\n",
593 |     "            n_word = non_pad_mask.sum().item()\n",
594 |     "            n_word_total += n_word\n",
595 |     "            n_word_correct += n_correct\n",
596 |     "\n",
597 |     "        # record metrics\n",
598 |     "        phase_metrics[\"loss\"] = average_epoch_loss\n",
599 |     "        phase_metrics[\"token_accuracy\"] = n_word_correct / n_word_total\n",
600 |     "\n",
601 |     "        # get perplexity\n",
602 |     "        perplexity = np.exp(average_epoch_loss)\n",
603 |     "        phase_metrics[\"perplexity\"] = perplexity\n",
604 |     "\n",
605 |     "        phase_metrics[\"time_taken\"] = time.time() - start  # elapsed wall-clock time for this phase\n",
606 |     "\n",
607 |     "        epoch_metrics[phase] = phase_metrics\n",
608 |     "\n",
609 |     "        # save model to the output dir if val loss is lower than in any previous epoch\n",
610 |     "        if phase == \"val\":\n",
611 |     "            if average_epoch_loss <= lowest_loss:\n",
612 |     "                save_checkpoint(os.path.join(config[\"output_dir\"], \"model.bin\"), model, optimizer.optimizer)\n",
613 |     "                lowest_loss = average_epoch_loss\n",
614 |     "\n",
615 |     "    results[\"epoch_{}\".format(epoch)] = epoch_metrics"
616 |    ]
617 |   },
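Since the perplexity recorded above is just the exponential of the average per-token cross-entropy, small changes in loss move it a lot. A one-line check:

```python
import numpy as np
print(np.exp(4.0), np.exp(3.5))  # ~54.6 vs ~33.1 -- half a nat of loss matters
```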
"outputs": [], 623 | "source": [ 624 | "# save results to file\n", 625 | "with open(os.path.join(config[\"output_dir\"], \"results.json\"), 'w') as f:\n", 626 | " json.dump(results, f)" 627 | ] 628 | }, 629 | { 630 | "cell_type": "markdown", 631 | "metadata": {}, 632 | "source": [ 633 | "# Chat With Your Model\n", 634 | "\n", 635 | "Next, we can make a demo chatbot with the transformer. This is slightly different, and will use beam search. The inputs to the chatbot will be all the previous dialogue turns, the queries and responses. \n", 636 | "\n", 637 | "The chatbot does a beam search, and returns the n_best responses. If chose_best is true, it will output the response with the highest score. This may cause the model to be not interesting, so setting chose_best to false will cause the model to output something it may consider less probable, but possibly something different.\n", 638 | "\n", 639 | "The pretrained model will also output many tokens because it was trained on a large dataset with a small vocab, so many examples have these tokens, and it will predict them. (You can come up a word to replace the token in your head to make things more fun for yourself). You can also increase the number of possible results with beam_size, and n_best.\n", 640 | "\n", 641 | "With the vocab mapping, it creates the output sentence from the final result" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "metadata": {}, 648 | "outputs": [], 649 | "source": [ 650 | "# create chatbot object\n", 651 | "chatbot = Chatbot(config, model)\n", 652 | "history = list()\n", 653 | "\n", 654 | "def generate_response(query, chatbot, dataset):\n", 655 | " # get input features for the dialogue history\n", 656 | " h_seq, h_pos, h_seg = dataset.get_input_features(history)\n", 657 | " \n", 658 | " # get response from model\n", 659 | " response = chatbot.translate_batch(h_seq, h_pos, h_seg)\n", 660 | " return response\n", 661 | "\n", 662 | "# print the response from the input\n", 663 | "def print_response(text_widget):\n", 664 | " # get query, add to the end of history \n", 665 | " query = text_widget.value\n", 666 | " history.append(query)\n", 667 | " # generate responses\n", 668 | " responses, scores = generate_response(history, chatbot, val_dataset)\n", 669 | " # chose response\n", 670 | " if config[\"choose_best\"]:\n", 671 | " response = responses[0][0]\n", 672 | " else:\n", 673 | " # pick a random result from the n_best\n", 674 | " idx=random.randint(0, max(config[\"n_best\"], config[\"beam_size\"])-1)\n", 675 | " response = responses[0][idx]\n", 676 | " \n", 677 | " # uncomment this line to see all the scores\n", 678 | " # print(\"scores in log prob: {}\\n\".format(scores[0]))\n", 679 | " \n", 680 | " # create output string\n", 681 | " output = \"\"\n", 682 | " for idx in response[:-1]:\n", 683 | " token = vocab.id2token[idx]\n", 684 | " output += \"{} \".format(token)\n", 685 | " print(f'{query} -> {output}')\n", 686 | " history.append(output)\n", 687 | "\n", 688 | "text_input = widgets.Text(placeholder='Type something',\n", 689 | " description='String:',\n", 690 | " disabled=False)\n", 691 | "\n", 692 | "text_input.on_submit(print_response)\n" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [ 701 | "text_input" 702 | ] 703 | }, 704 | { 705 | "cell_type": "markdown", 706 | "metadata": {}, 707 | "source": [ 708 | "# Extra exercise 1: Sample results according to their probability 
scores\n", 709 | "Set config[\"choose_best\"] to be False and uncomment the command to print scores. You should see different scores associated with the responses. They are all negative numbers because log probablity values are used.\n", 710 | "\n", 711 | "### Can you modify the function again, so the results are sampled according to their probability scores?\n", 712 | "Hint: check here for a function to use https://pytorch.org/docs/stable/_modules/torch/distributions/categorical.html\n", 713 | "\n", 714 | "In fact it is easy to write your own sampling function too. e.g. Suppose we have a random variable X with P(X=a) = 0.6 and P(X=b) = 0.4. To sample from X, we can randomly draw a number r between [0, 1]. If r < 0.6, we pick a as our outcome, otherwise pick b.\n", 715 | "\n", 716 | "In our case, you need to convert the log prob scores in probability space using exp(), normalize them (so they sum to 1) and them construct the intervals." 717 | ] 718 | }, 719 | { 720 | "cell_type": "markdown", 721 | "metadata": {}, 722 | "source": [ 723 | "# Extra exercise 2: Rewrite the Position-wise Feed-Forward Network\n", 724 | "The Transformer has a Position-wise Feed-Forward Network in each encoder and decoder layer. The source code can be found in transformer/SubLayers.py" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": null, 730 | "metadata": {}, 731 | "outputs": [], 732 | "source": [ 733 | "import torch.nn as nn\n", 734 | "\n", 735 | "class PositionwiseFeedForward(nn.Module):\n", 736 | " ''' A two-feed-forward-layer module '''\n", 737 | "\n", 738 | " def __init__(self, d_in, d_hid, dropout=0.1):\n", 739 | " super().__init__()\n", 740 | " self.w_1 = nn.Conv1d(d_in, d_hid, 1) # position-wise\n", 741 | " self.w_2 = nn.Conv1d(d_hid, d_in, 1) # position-wise\n", 742 | " self.layer_norm = nn.LayerNorm(d_in)\n", 743 | " self.dropout = nn.Dropout(dropout)\n", 744 | "\n", 745 | " def forward(self, x):\n", 746 | " \"\"\"\n", 747 | " just a feed forward linear layer that is used after attention in the\n", 748 | " encoder and decoder\n", 749 | " Args:\n", 750 | " x: input\n", 751 | " Returns:\n", 752 | " \"\"\"\n", 753 | " # feed forward\n", 754 | " residual = x\n", 755 | " output = x.transpose(1, 2)\n", 756 | " output = self.w_2(F.relu(self.w_1(output)))\n", 757 | " output = output.transpose(1, 2)\n", 758 | " output = self.dropout(output)\n", 759 | "\n", 760 | " # Add and norm\n", 761 | " output = self.layer_norm(output + residual)\n", 762 | " return output" 763 | ] 764 | }, 765 | { 766 | "cell_type": "markdown", 767 | "metadata": {}, 768 | "source": [ 769 | "### Answer questions:\n", 770 | "1. What is the purpose of x.transpose(1, 2)? \n", 771 | "2. If we do not use nn.Conv1d(), can you achieve the same goal with nn.Linear()?" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": null, 777 | "metadata": {}, 778 | "outputs": [], 779 | "source": [] 780 | } 781 | ], 782 | "metadata": { 783 | "kernelspec": { 784 | "display_name": "Python 3", 785 | "language": "python", 786 | "name": "python3" 787 | }, 788 | "language_info": { 789 | "codemirror_mode": { 790 | "name": "ipython", 791 | "version": 3 792 | }, 793 | "file_extension": ".py", 794 | "mimetype": "text/x-python", 795 | "name": "python", 796 | "nbconvert_exporter": "python", 797 | "pygments_lexer": "ipython3", 798 | "version": "3.6.7" 799 | } 800 | }, 801 | "nbformat": 4, 802 | "nbformat_minor": 2 803 | } 804 | --------------------------------------------------------------------------------