├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── data └── data-readme.md ├── meta ├── banner.png ├── wandb-sentiment.jpg ├── wandb.png └── wandb.svg ├── models └── models-readme.md ├── transformers_multi_label_classification.ipynb ├── transformers_multiclass_classification.ipynb ├── transformers_ner.ipynb ├── transformers_sentiment_wandb.ipynb └── transformers_summarization_wandb.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | . 2 | .. 3 | .vscode 4 | .ipynb_checkpoints 5 | *.csv 6 | *.bin 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # celery beat schedule file 102 | celerybeat-schedule 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # Other files 135 | *.csv 136 | *.pkl 137 | .vscode 138 | .idea 139 | 140 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | We welcome and appreciate all contributions to the project! Below are some guidelines to follow when contributing. 3 | 4 | ## Issues 5 | - Before opening a new issue, please check if the issue has already been reported. 6 | - When opening a new issue, provide a clear and concise description of the problem. 
Include as much relevant information as possible (e.g. version, operating system, etc.). 7 | ## Pull Requests 8 | - Fork the repository and create a new branch for your changes. 9 | - Keep your changes in a single commit and limit the commit message to 72 characters in the subject line and a brief description in the body. 10 | - Make sure that your code follows the existing coding style and best practices. 11 | - Ensure that your code is well-documented and tested. 12 | - Open a pull request with a clear title and description of your changes. Reference the related issue (if any) and explain the changes in detail. 13 | - Wait for a maintainer to review and merge your pull request. 14 | ## Code of Conduct 15 | - By participating in this project, you agree to abide by our Code of Conduct. 16 | 17 | Thank you for your contribution! 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Abhishek Kumar Mishra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #

PyTorch Transformers Tutorials 2 | 3 | 4 | *(banner image: Transformer Tutorials)* 5 | 6 | 7 |
8 | *(badges: GitHub issues · GitHub forks · GitHub stars · GitHub license)* 9 | 10 | 11 | 12 | 13 | 14 | ### Introduction 15 | 16 | The field of **NLP** was revolutionized in 2018 by the introduction of **BERT** and its **Transformer** friends (RoBERTa, XLM, etc.). 17 | 18 | These novel transformer-based neural network architectures, together with new ways of training a neural network on natural language data, brought transfer learning to NLP. Transfer learning had already been delivering state-of-the-art results in Computer Vision for several years, and the arrival of transformer models triggered the same paradigm shift in NLP. 19 | 20 | Companies like [Google](https://github.com/google-research/bert) and [Facebook](https://github.com/pytorch/fairseq/tree/master/examples/roberta) trained their neural networks on large swathes of natural language data to grasp the intricacies of language, thereby producing language models. These models were then fine-tuned on domain-specific datasets to achieve state-of-the-art results for specific problem statements, and the trained models were published to the open-source community so that its members could fine-tune them for their own use cases. 21 | 22 | [Hugging Face](https://github.com/huggingface) made it easier for the community to access and fine-tune these models through their Python package: [Transformers](https://github.com/huggingface/transformers). 23 | 24 | ### Motivation 25 | Despite these amazing technological advancements, applying these solutions to business problems remains a challenge, given the niche knowledge required to understand and apply these methods to specific problem statements. Hence, in the following tutorials I demonstrate how to leverage these technologies, along with a few other Python tools, to fine-tune language models for specific types of tasks. 26 | 27 | Before I proceed, I would like to thank the following groups for the fantastic work they are doing and sharing, which made these notebooks and tutorials possible. 28 | 29 | Please review these amazing sources of information and subscribe to their channels/sources; the list of acknowledgements follows the short, illustrative code sketch below. 
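As a taste of how approachable the library is, here is a minimal sketch of loading a pretrained DistilBERT checkpoint and running a single headline through it. It is illustrative only: the checkpoint name, the example sentence, and the assumption of a reasonably recent `transformers` release are mine rather than anything taken from the notebooks below.

```python
# Illustrative sketch (not from the notebooks): load a pretrained DistilBERT
# checkpoint and tokenizer, then encode one toy headline.
import torch
from transformers import DistilBertModel, DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")
model.eval()

inputs = tokenizer("Markets rally after strong tech earnings", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# outputs[0] is the last hidden state with shape (batch_size, sequence_length, 768)
print(outputs[0].shape)
```

The tutorials listed further down start from this kind of building block and add task-specific heads, data pipelines, and training loops.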
30 | - [Hugging Face Team](https://huggingface.co/) 31 | - Abhishek Thakur for his amazing [Youtube videos](https://www.youtube.com/user/abhisheksvnit) 32 | 33 | The problem statements that i will be working with are: 34 | 35 | | Notebook |Github Link |Colab Link|Kaggle Kernel| 36 | |--|--|--|--| 37 | |Text Classification: Multi-Class| [Github](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)|[Kaggle](https://www.kaggle.com/eggwhites2705/transformers-multiclass-classification-ipynb)| 38 | |Text Classification: Multi-Label| [Github](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|[Kaggle](https://www.kaggle.com/eggwhites2705/transformers-multi-label-classification)| 39 | |Sentiment Classification **with Experiment Tracking in [WandB](https://app.wandb.ai/abhimishra-91/transformers_tutorials_sentiment/runs/1zwn4gbg?workspace=user-abhimishra-91)!**|[Github](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_sentiment_wandb.ipynb)|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_sentiment_wandb.ipynb)|| 40 | |Named Entity Recognition: **with TPU processing!**|[Github](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_ner.ipynb)|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_ner.ipynb)|[Kaggle](https://www.kaggle.com/eggwhites2705/transformers-ner)| 41 | |Question Answering|||| 42 | |Summary Writing: **with Experiment Tracking in [WandB](https://app.wandb.ai/abhimishra-91/transformers_tutorials_summarization?workspace=user-abhimishra-91)!**|[Github](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|[Kaggle](https://www.kaggle.com/eggwhites2705/transformers-summarization-t5/output)| 43 | 44 | 45 | 46 | ### Directory Structure 47 | 48 | 1. `data`: This folder contains all the toy data used for fine tuning. 49 | 2. `utils`: This folder will contain any miscellaneous script used to prepare for the fine tuning. 50 | 3. `models`: Folder to save all the artifacts post fine tuning. 51 | 52 | ### Further Watching/Reading 53 | 54 | I will try to cover the practical and implementation aspects of fine tuning of these language models on various NLP tasks. You can improve your knowledge on this topic by reading/watching the following resources. 
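Before the watching and reading lists, one more hedged sketch. The `models` folder above holds the artifacts written at the end of fine-tuning, and the multiclass notebook deliberately leaves inference for another day. The snippet below shows one way those artifacts could be reloaded later; the file names match what that notebook saves, but the overall pattern (a fully pickled model object plus a saved vocabulary file) is an assumption, and the notebook's `DistillBERTClass` definition must be importable for `torch.load` to work.

```python
# Hedged sketch: reload the artifacts the multiclass notebook writes to ./models/
# and classify one new headline. Assumes DistillBERTClass is importable, because
# the notebook pickles the whole model object rather than a state_dict.
import torch
from transformers import DistilBertTokenizer

model = torch.load("./models/pytorch_distilbert_news.bin", map_location="cpu")
model.eval()

# The notebook saved the vocabulary with tokenizer.save_vocabulary(); point a
# fresh tokenizer at that file.
tokenizer = DistilBertTokenizer(vocab_file="./models/vocab_distilbert_news.bin")

inputs = tokenizer.encode_plus(
    "Tech stocks rally after strong earnings report",
    add_special_tokens=True,
    return_tensors="pt",
)
with torch.no_grad():
    logits = model(inputs["input_ids"], inputs["attention_mask"])

# The predicted index maps back to the notebook's encode_dict
# (Entertainment / Business / Science / Health).
print(int(torch.argmax(logits, dim=1)))
```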
55 | 56 | 57 | - Watching 58 | - [Introduction in Simple terms](https://www.youtube.com/watch?v=gcHkxP9adiM) 59 | - [Transfer Learning in NLP](https://www.youtube.com/watch?v=0T_Qr4qBrqc) 60 | - [BERT Research Series from ChrisMcCormickAI](https://www.youtube.com/playlist?list=PLam9sigHPGwOBuH4_4fr-XvDbe5uneaf6) 61 | 62 | - Reading 63 | - [Transformers Documentation](https://huggingface.co/transformers/) 64 | - [Pytorch Documentation](https://pytorch.org/docs/stable/index.html) 65 | - [Google AI Blog](https://ai.googleblog.com/) 66 | -------------------------------------------------------------------------------- /data/data-readme.md: -------------------------------------------------------------------------------- 1 | # ReadMe for the Data folder 2 | 3 | Data Used for Text Classification: Multi Class 4 | -------------------------------------------------------------------------------- /meta/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhimishra91/transformers-tutorials/3d5a9b1d735eb68648588526d77d6dda7735d631/meta/banner.png -------------------------------------------------------------------------------- /meta/wandb-sentiment.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhimishra91/transformers-tutorials/3d5a9b1d735eb68648588526d77d6dda7735d631/meta/wandb-sentiment.jpg -------------------------------------------------------------------------------- /meta/wandb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhimishra91/transformers-tutorials/3d5a9b1d735eb68648588526d77d6dda7735d631/meta/wandb.png -------------------------------------------------------------------------------- /meta/wandb.svg: -------------------------------------------------------------------------------- 1 | Training Loss neat-totem-19 gallant-music-9 clear-hill-80100200300Step0123456
*(embedded react-vis CSS styling for the W&B training-loss chart)*
/*]]>*/ -------------------------------------------------------------------------------- /models/models-readme.md: -------------------------------------------------------------------------------- 1 | # ReadMe for the Model folder 2 | 3 | All the files for Models saved here -------------------------------------------------------------------------------- /transformers_multiclass_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Fine Tuning Transformer for MultiClass Text Classification" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Introduction\n", 15 | "\n", 16 | "In this tutorial we will be fine tuning a transformer model for the **Multiclass text classification** problem. \n", 17 | "This is one of the most common business problems where a given piece of text/sentence/document needs to be classified into one of the categories out of the given list.\n", 18 | "\n", 19 | "#### Flow of the notebook\n", 20 | "\n", 21 | "The notebook will be divided into seperate sections to provide a organized walk through for the process used. This process can be modified for individual use cases. The sections are:\n", 22 | "\n", 23 | "1. [Importing Python Libraries and preparing the environment](#section01)\n", 24 | "2. [Importing and Pre-Processing the domain data](#section02)\n", 25 | "3. [Preparing the Dataset and Dataloader](#section03)\n", 26 | "4. [Creating the Neural Network for Fine Tuning](#section04)\n", 27 | "5. [Fine Tuning the Model](#section05)\n", 28 | "6. [Validating the Model Performance](#section06)\n", 29 | "7. [Saving the model and artifacts for Inference in Future](#section07)\n", 30 | "\n", 31 | "#### Technical Details\n", 32 | "\n", 33 | "This script leverages on multiple tools designed by other teams. Details of the tools used below. Please ensure that these elements are present in your setup to successfully implement this script.\n", 34 | "\n", 35 | " - Data: \n", 36 | "\t - We are using the News aggregator dataset available at by [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/News+Aggregator)\n", 37 | "\t - We are referring only to the first csv file from the data dump: `newsCorpora.csv`\n", 38 | "\t - There are `422937` rows of data. Where each row has the following data-point: \n", 39 | "\t\t - ID Numeric ID \n", 40 | "\t\t - TITLE News title \n", 41 | "\t\t - URL Url \n", 42 | "\t\t - PUBLISHER Publisher name \n", 43 | "\t\t - CATEGORY News category (b = business, t = science and technology, e = entertainment, m = health) \n", 44 | "\t\t - STORY Alphanumeric ID of the cluster that includes news about the same story \n", 45 | "\t\t - HOSTNAME Url hostname \n", 46 | "\t\t - TIMESTAMP Approximate time the news was published, as the number of milliseconds since the epoch 00:00:00 GMT, January 1, 1970\n", 47 | "\n", 48 | "\n", 49 | " - Language Model Used:\n", 50 | "\t - DistilBERT this is a smaller transformer model as compared to BERT or Roberta. It is created by process of distillation applied to Bert. 
\n", 51 | "\t - [Blog-Post](https://medium.com/huggingface/distilbert-8cf3380435b5)\n", 52 | "\t - [Research Paper](https://arxiv.org/abs/1910.01108)\n", 53 | " - [Documentation for python](https://huggingface.co/transformers/model_doc/distilbert.html)\n", 54 | "\n", 55 | "\n", 56 | " - Hardware Requirements:\n", 57 | "\t - Python 3.6 and above\n", 58 | "\t - Pytorch, Transformers and All the stock Python ML Libraries\n", 59 | "\t - GPU enabled setup \n", 60 | "\n", 61 | "\n", 62 | " - Script Objective:\n", 63 | "\t - The objective of this script is to fine tune DistilBERT to be able to classify a news headline into the following categories:\n", 64 | "\t\t - Business\n", 65 | "\t\t - Technology\n", 66 | "\t\t - Health\n", 67 | "\t\t - Entertainment \n" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "\n", 75 | "### Importing Python Libraries and preparing the environment\n", 76 | "\n", 77 | "At this step we will be importing the libraries and modules needed to run our script. Libraries are:\n", 78 | "* Pandas\n", 79 | "* Pytorch\n", 80 | "* Pytorch Utils for Dataset and Dataloader\n", 81 | "* Transformers\n", 82 | "* DistilBERT Model and Tokenizer\n", 83 | "\n", 84 | "Followed by that we will preapre the device for CUDA execeution. This configuration is needed if you want to leverage on onboard GPU. " 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 1, 90 | "metadata": { 91 | "colab": { 92 | "base_uri": "https://localhost:8080/", 93 | "height": 114, 94 | "referenced_widgets": [ 95 | "7532a60d077248ca963f514988d41acc", 96 | "e39121739a4c4bffb16155e9a61a58cc", 97 | "ca0092edb8f5442a988c87c89930a10e", 98 | "5a975bc922a14c1b82245195b7c7f659", 99 | "f9dd503454b2450e957a96bff807c8d9", 100 | "222af74a298140ff9ec6e6f1f286a52b", 101 | "3516912b49bf4e3e98c9ce6c7b1b1469", 102 | "8335299626724853865234b86745e2a5", 103 | "fd6f40d5854c49f79056cebd16f86626", 104 | "cde06f09879848a3bfc5892390a51d36", 105 | "b13c341a69c14c059edc40e6b438ee80", 106 | "dd7699e7813d4b9f9e80990498a39539", 107 | "dd589fc95cd94cd190bc2640f9ef618c", 108 | "4641ff326eea446c88903e70b85c90d7", 109 | "c1db6876e9b04293b4077f13006b4a19", 110 | "b8728a6fcdc54b72a10c8bfdcf794fad" 111 | ] 112 | }, 113 | "colab_type": "code", 114 | "id": "wuMlXT80GAMK", 115 | "outputId": "074dad6a-a18e-45bd-8c9c-29e318962dcd" 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "# Importing the libraries needed\n", 120 | "import pandas as pd\n", 121 | "import torch\n", 122 | "import transformers\n", 123 | "from torch.utils.data import Dataset, DataLoader\n", 124 | "from transformers import DistilBertModel, DistilBertTokenizer" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 2, 130 | "metadata": { 131 | "colab": {}, 132 | "colab_type": "code", 133 | "id": "xQMKTZ4ARk12" 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "# Setting up the device for GPU usage\n", 138 | "\n", 139 | "from torch import cuda\n", 140 | "device = 'cuda' if cuda.is_available() else 'cpu'" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "\n", 148 | "### Importing and Pre-Processing the domain data\n", 149 | "\n", 150 | "We will be working with the data and preparing for fine tuning purposes. 
\n", 151 | "*Assuming that the `newCorpora.csv` is already downloaded in your `data` folder*\n", 152 | "\n", 153 | "Import the file in a dataframe and give it the headers as per the documentation.\n", 154 | "Cleaning the file to remove the unwanted columns and create an additional column for training.\n", 155 | "The final Dataframe will be something like this:\n", 156 | "\n", 157 | "|TITLE|CATEGORY|ENCODED_CAT|\n", 158 | "|--|--|--|\n", 159 | "| title_1|Entertainment | 1 |\n", 160 | "| title_2|Entertainment | 1 |\n", 161 | "| title_3|Business| 2 |\n", 162 | "| title_4|Science| 3 |\n", 163 | "| title_5|Science| 3 |\n", 164 | "| title_6|Health| 4 |" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 5, 170 | "metadata": { 171 | "colab": {}, 172 | "colab_type": "code", 173 | "id": "iNCaZ2epNcSO" 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "# Import the csv into pandas dataframe and add the headers\n", 178 | "df = pd.read_csv('./data/newsCorpora.csv', sep='\\t', names=['ID','TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])\n", 179 | "# df.head()\n", 180 | "# # Removing unwanted columns and only leaving title of news and the category which will be the target\n", 181 | "df = df[['TITLE','CATEGORY']]\n", 182 | "# df.head()\n", 183 | "\n", 184 | "# # Converting the codes to appropriate categories using a dictionary\n", 185 | "my_dict = {\n", 186 | " 'e':'Entertainment',\n", 187 | " 'b':'Business',\n", 188 | " 't':'Science',\n", 189 | " 'm':'Health'\n", 190 | "}\n", 191 | "\n", 192 | "def update_cat(x):\n", 193 | " return my_dict[x]\n", 194 | "\n", 195 | "df['CATEGORY'] = df['CATEGORY'].apply(lambda x: update_cat(x))\n", 196 | "\n", 197 | "encode_dict = {}\n", 198 | "\n", 199 | "def encode_cat(x):\n", 200 | " if x not in encode_dict.keys():\n", 201 | " encode_dict[x]=len(encode_dict)\n", 202 | " return encode_dict[x]\n", 203 | "\n", 204 | "df['ENCODE_CAT'] = df['CATEGORY'].apply(lambda x: encode_cat(x))" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "\n", 212 | "### Preparing the Dataset and Dataloader\n", 213 | "\n", 214 | "We will start with defining few key variables that will be used later during the training/fine tuning stage.\n", 215 | "Followed by creation of Dataset class - This defines how the text is pre-processed before sending it to the neural network. We will also define the Dataloader that will feed the data in batches to the neural network for suitable training and processing. \n", 216 | "Dataset and Dataloader are constructs of the PyTorch library for defining and controlling the data pre-processing and its passage to neural network. For further reading into Dataset and Dataloader read the [docs at PyTorch](https://pytorch.org/docs/stable/data.html)\n", 217 | "\n", 218 | "#### *Triage* Dataset Class\n", 219 | "- This class is defined to accept the Dataframe as input and generate tokenized output that is used by the DistilBERT model for training. \n", 220 | "- We are using the DistilBERT tokenizer to tokenize the data in the `TITLE` column of the dataframe. \n", 221 | "- The tokenizer uses the `encode_plus` method to perform tokenization and generate the necessary outputs, namely: `ids`, `attention_mask`\n", 222 | "- To read further into the tokenizer, [refer to this document](https://huggingface.co/transformers/model_doc/distilbert.html#distilberttokenizer)\n", 223 | "- `target` is the encoded category on the news headline. 
\n", 224 | "- The *Triage* class is used to create 2 datasets, for training and for validation.\n", 225 | "- *Training Dataset* is used to fine tune the model: **80% of the original data**\n", 226 | "- *Validation Dataset* is used to evaluate the performance of the model. The model has not seen this data during training. \n", 227 | "\n", 228 | "#### Dataloader\n", 229 | "- Dataloader is used to for creating training and validation dataloader that load data to the neural network in a defined manner. This is needed because all the data from the dataset cannot be loaded to the memory at once, hence the amount of dataloaded to the memory and then passed to the neural network needs to be controlled.\n", 230 | "- This control is achieved using the parameters such as `batch_size` and `max_len`.\n", 231 | "- Training and Validation dataloaders are used in the training and validation part of the flow respectively" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 4, 237 | "metadata": { 238 | "colab": {}, 239 | "colab_type": "code", 240 | "id": "JrBr2YesGdO_" 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "# Defining some key variables that will be used later on in the training\n", 245 | "MAX_LEN = 512\n", 246 | "TRAIN_BATCH_SIZE = 4\n", 247 | "VALID_BATCH_SIZE = 2\n", 248 | "EPOCHS = 1\n", 249 | "LEARNING_RATE = 1e-05\n", 250 | "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 5, 256 | "metadata": { 257 | "colab": {}, 258 | "colab_type": "code", 259 | "id": "2vX7kzaAHu39" 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "class Triage(Dataset):\n", 264 | " def __init__(self, dataframe, tokenizer, max_len):\n", 265 | " self.len = len(dataframe)\n", 266 | " self.data = dataframe\n", 267 | " self.tokenizer = tokenizer\n", 268 | " self.max_len = max_len\n", 269 | " \n", 270 | " def __getitem__(self, index):\n", 271 | " title = str(self.data.TITLE[index])\n", 272 | " title = \" \".join(title.split())\n", 273 | " inputs = self.tokenizer.encode_plus(\n", 274 | " title,\n", 275 | " None,\n", 276 | " add_special_tokens=True,\n", 277 | " max_length=self.max_len,\n", 278 | " pad_to_max_length=True,\n", 279 | " return_token_type_ids=True,\n", 280 | " truncation=True\n", 281 | " )\n", 282 | " ids = inputs['input_ids']\n", 283 | " mask = inputs['attention_mask']\n", 284 | "\n", 285 | " return {\n", 286 | " 'ids': torch.tensor(ids, dtype=torch.long),\n", 287 | " 'mask': torch.tensor(mask, dtype=torch.long),\n", 288 | " 'targets': torch.tensor(self.data.ENCODE_CAT[index], dtype=torch.long)\n", 289 | " } \n", 290 | " \n", 291 | " def __len__(self):\n", 292 | " return self.len" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 6, 298 | "metadata": { 299 | "colab": {}, 300 | "colab_type": "code", 301 | "id": "Zcwq13c0NE9c" 302 | }, 303 | "outputs": [ 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "FULL Dataset: (422419, 3)\n", 309 | "TRAIN Dataset: (337935, 3)\n", 310 | "TEST Dataset: (84484, 3)\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "# Creating the dataset and dataloader for the neural network\n", 316 | "\n", 317 | "train_size = 0.8\n", 318 | "train_dataset=df.sample(frac=train_size,random_state=200)\n", 319 | "test_dataset=df.drop(train_dataset.index).reset_index(drop=True)\n", 320 | "train_dataset = train_dataset.reset_index(drop=True)\n", 321 | "\n", 322 | "\n", 323 | "print(\"FULL Dataset: 
{}\".format(df.shape))\n", 324 | "print(\"TRAIN Dataset: {}\".format(train_dataset.shape))\n", 325 | "print(\"TEST Dataset: {}\".format(test_dataset.shape))\n", 326 | "\n", 327 | "training_set = Triage(train_dataset, tokenizer, MAX_LEN)\n", 328 | "testing_set = Triage(test_dataset, tokenizer, MAX_LEN)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 7, 334 | "metadata": { 335 | "colab": {}, 336 | "colab_type": "code", 337 | "id": "l1BgA1CkQSYa" 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "train_params = {'batch_size': TRAIN_BATCH_SIZE,\n", 342 | " 'shuffle': True,\n", 343 | " 'num_workers': 0\n", 344 | " }\n", 345 | "\n", 346 | "test_params = {'batch_size': VALID_BATCH_SIZE,\n", 347 | " 'shuffle': True,\n", 348 | " 'num_workers': 0\n", 349 | " }\n", 350 | "\n", 351 | "training_loader = DataLoader(training_set, **train_params)\n", 352 | "testing_loader = DataLoader(testing_set, **test_params)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "\n", 360 | "### Creating the Neural Network for Fine Tuning\n", 361 | "\n", 362 | "#### Neural Network\n", 363 | " - We will be creating a neural network with the `DistillBERTClass`. \n", 364 | " - This network will have the DistilBERT Language model followed by a `dropout` and finally a `Linear` layer to obtain the final outputs. \n", 365 | " - The data will be fed to the DistilBERT Language model as defined in the dataset. \n", 366 | " - Final layer outputs is what will be compared to the `encoded category` to determine the accuracy of models prediction. \n", 367 | " - We will initiate an instance of the network called `model`. This instance will be used for training and then to save the final trained model for future inference. \n", 368 | " \n", 369 | "#### Loss Function and Optimizer\n", 370 | " - `Loss Function` and `Optimizer` and defined in the next cell.\n", 371 | " - The `Loss Function` is used the calculate the difference in the output created by the model and the actual output. \n", 372 | " - `Optimizer` is used to update the weights of the neural network to improve its performance.\n", 373 | " \n", 374 | "#### Further Reading\n", 375 | "- You can refer to my [Pytorch Tutorials](https://github.com/abhimishra91/pytorch-tutorials) to get an intuition of Loss Function and Optimizer.\n", 376 | "- [Pytorch Documentation for Loss Function](https://pytorch.org/docs/stable/nn.html#loss-functions)\n", 377 | "- [Pytorch Documentation for Optimizer](https://pytorch.org/docs/stable/optim.html)\n", 378 | "- Refer to the links provided on the top of the notebook to read more about DistiBERT. " 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 8, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 
\n", 388 | "\n", 389 | "class DistillBERTClass(torch.nn.Module):\n", 390 | " def __init__(self):\n", 391 | " super(DistillBERTClass, self).__init__()\n", 392 | " self.l1 = DistilBertModel.from_pretrained(\"distilbert-base-uncased\")\n", 393 | " self.pre_classifier = torch.nn.Linear(768, 768)\n", 394 | " self.dropout = torch.nn.Dropout(0.3)\n", 395 | " self.classifier = torch.nn.Linear(768, 4)\n", 396 | "\n", 397 | " def forward(self, input_ids, attention_mask):\n", 398 | " output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)\n", 399 | " hidden_state = output_1[0]\n", 400 | " pooler = hidden_state[:, 0]\n", 401 | " pooler = self.pre_classifier(pooler)\n", 402 | " pooler = torch.nn.ReLU()(pooler)\n", 403 | " pooler = self.dropout(pooler)\n", 404 | " output = self.classifier(pooler)\n", 405 | " return output" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 9, 411 | "metadata": { 412 | "collapsed": true 413 | }, 414 | "outputs": [ 415 | { 416 | "data": { 417 | "text/plain": [ 418 | "DistillBERTClass(\n", 419 | " (l1): DistilBertModel(\n", 420 | " (embeddings): Embeddings(\n", 421 | " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n", 422 | " (position_embeddings): Embedding(512, 768)\n", 423 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 424 | " (dropout): Dropout(p=0.1, inplace=False)\n", 425 | " )\n", 426 | " (transformer): Transformer(\n", 427 | " (layer): ModuleList(\n", 428 | " (0): TransformerBlock(\n", 429 | " (attention): MultiHeadSelfAttention(\n", 430 | " (dropout): Dropout(p=0.1, inplace=False)\n", 431 | " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", 432 | " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", 433 | " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", 434 | " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", 435 | " )\n", 436 | " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 437 | " (ffn): FFN(\n", 438 | " (dropout): Dropout(p=0.1, inplace=False)\n", 439 | " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", 440 | " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", 441 | " )\n", 442 | " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 443 | " )\n", 444 | " (1): TransformerBlock(\n", 445 | " (attention): MultiHeadSelfAttention(\n", 446 | " (dropout): Dropout(p=0.1, inplace=False)\n", 447 | " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", 448 | " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", 449 | " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", 450 | " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", 451 | " )\n", 452 | " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 453 | " (ffn): FFN(\n", 454 | " (dropout): Dropout(p=0.1, inplace=False)\n", 455 | " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", 456 | " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", 457 | " )\n", 458 | " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 459 | " )\n", 460 | " (2): TransformerBlock(\n", 461 | " (attention): MultiHeadSelfAttention(\n", 462 | " (dropout): Dropout(p=0.1, inplace=False)\n", 463 | " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", 464 | " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", 465 | " (v_lin): Linear(in_features=768, 
out_features=768, bias=True)\n", 466 | " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", 467 | " )\n", 468 | " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 469 | " (ffn): FFN(\n", 470 | " (dropout): Dropout(p=0.1, inplace=False)\n", 471 | " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", 472 | " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", 473 | " )\n", 474 | " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 475 | " )\n", 476 | " (3): TransformerBlock(\n", 477 | " (attention): MultiHeadSelfAttention(\n", 478 | " (dropout): Dropout(p=0.1, inplace=False)\n", 479 | " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", 480 | " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", 481 | " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", 482 | " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", 483 | " )\n", 484 | " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 485 | " (ffn): FFN(\n", 486 | " (dropout): Dropout(p=0.1, inplace=False)\n", 487 | " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", 488 | " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", 489 | " )\n", 490 | " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 491 | " )\n", 492 | " (4): TransformerBlock(\n", 493 | " (attention): MultiHeadSelfAttention(\n", 494 | " (dropout): Dropout(p=0.1, inplace=False)\n", 495 | " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", 496 | " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", 497 | " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", 498 | " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", 499 | " )\n", 500 | " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 501 | " (ffn): FFN(\n", 502 | " (dropout): Dropout(p=0.1, inplace=False)\n", 503 | " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", 504 | " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", 505 | " )\n", 506 | " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 507 | " )\n", 508 | " (5): TransformerBlock(\n", 509 | " (attention): MultiHeadSelfAttention(\n", 510 | " (dropout): Dropout(p=0.1, inplace=False)\n", 511 | " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", 512 | " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", 513 | " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", 514 | " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", 515 | " )\n", 516 | " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 517 | " (ffn): FFN(\n", 518 | " (dropout): Dropout(p=0.1, inplace=False)\n", 519 | " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", 520 | " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", 521 | " )\n", 522 | " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 523 | " )\n", 524 | " )\n", 525 | " )\n", 526 | " )\n", 527 | " (l2): Dropout(p=0.3, inplace=False)\n", 528 | " (l3): Linear(in_features=768, out_features=1, bias=True)\n", 529 | ")" 530 | ] 531 | }, 532 | "execution_count": 9, 533 | "metadata": {}, 534 | "output_type": "execute_result" 535 | } 536 | ], 537 | "source": [ 538 | "model = DistillBERTClass()\n", 539 | "model.to(device)" 540 | ] 541 | }, 542 
| { 543 | "cell_type": "code", 544 | "execution_count": 10, 545 | "metadata": {}, 546 | "outputs": [], 547 | "source": [ 548 | "# Creating the loss function and optimizer\n", 549 | "loss_function = torch.nn.CrossEntropyLoss()\n", 550 | "optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "\n", 558 | "### Fine Tuning the Model\n", 559 | "\n", 560 | "After all the effort of loading and preparing the data and datasets, creating the model and defining its loss and optimizer. This is probably the easier steps in the process. \n", 561 | "\n", 562 | "Here we define a training function that trains the model on the training dataset created above, specified number of times (EPOCH), An epoch defines how many times the complete data will be passed through the network. \n", 563 | "\n", 564 | "Following events happen in this function to fine tune the neural network:\n", 565 | "- The dataloader passes data to the model based on the batch size. \n", 566 | "- Subsequent output from the model and the actual category are compared to calculate the loss. \n", 567 | "- Loss value is used to optimize the weights of the neurons in the network.\n", 568 | "- After every 5000 steps the loss value is printed in the console.\n", 569 | "\n", 570 | "As you can see just in 1 epoch by the final step the model was working with a miniscule loss of 0.0002485 i.e. the output is extremely close to the actual output." 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [ 579 | "# Function to calcuate the accuracy of the model\n", 580 | "\n", 581 | "def calcuate_accu(big_idx, targets):\n", 582 | " n_correct = (big_idx==targets).sum().item()\n", 583 | " return n_correct" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": 11, 589 | "metadata": {}, 590 | "outputs": [], 591 | "source": [ 592 | "# Defining the training function on the 80% of the dataset for tuning the distilbert model\n", 593 | "\n", 594 | "def train(epoch):\n", 595 | " tr_loss = 0\n", 596 | " n_correct = 0\n", 597 | " nb_tr_steps = 0\n", 598 | " nb_tr_examples = 0\n", 599 | " model.train()\n", 600 | " for _,data in enumerate(training_loader, 0):\n", 601 | " ids = data['ids'].to(device, dtype = torch.long)\n", 602 | " mask = data['mask'].to(device, dtype = torch.long)\n", 603 | " targets = data['targets'].to(device, dtype = torch.long)\n", 604 | "\n", 605 | " outputs = model(ids, mask)\n", 606 | " loss = loss_function(outputs, targets)\n", 607 | " tr_loss += loss.item()\n", 608 | " big_val, big_idx = torch.max(outputs.data, dim=1)\n", 609 | " n_correct += calcuate_accu(big_idx, targets)\n", 610 | "\n", 611 | " nb_tr_steps += 1\n", 612 | " nb_tr_examples+=targets.size(0)\n", 613 | " \n", 614 | " if _%5000==0:\n", 615 | " loss_step = tr_loss/nb_tr_steps\n", 616 | " accu_step = (n_correct*100)/nb_tr_examples \n", 617 | " print(f\"Training Loss per 5000 steps: {loss_step}\")\n", 618 | " print(f\"Training Accuracy per 5000 steps: {accu_step}\")\n", 619 | "\n", 620 | " optimizer.zero_grad()\n", 621 | " loss.backward()\n", 622 | " # # When using GPU\n", 623 | " optimizer.step()\n", 624 | "\n", 625 | " print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')\n", 626 | " epoch_loss = tr_loss/nb_tr_steps\n", 627 | " epoch_accu = (n_correct*100)/nb_tr_examples\n", 628 | " print(f\"Training Loss Epoch: 
{epoch_loss}\")\n", 629 | " print(f\"Training Accuracy Epoch: {epoch_accu}\")\n", 630 | "\n", 631 | " return " 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 12, 637 | "metadata": {}, 638 | "outputs": [ 639 | { 640 | "name": "stdout", 641 | "output_type": "stream", 642 | "text": [ 643 | "Epoch: 0, Loss: 6.332988739013672\n", 644 | "Epoch: 0, Loss: 0.0013066530227661133\n", 645 | "Epoch: 0, Loss: 0.0029534101486206055\n", 646 | "Epoch: 0, Loss: 0.005258679389953613\n", 647 | "Epoch: 0, Loss: 0.0020235776901245117\n", 648 | "Epoch: 0, Loss: 0.0023298263549804688\n", 649 | "Epoch: 0, Loss: 0.0034378767013549805\n", 650 | "Epoch: 0, Loss: 0.004993081092834473\n", 651 | "Epoch: 0, Loss: 0.008559942245483398\n", 652 | "Epoch: 0, Loss: 0.0014510154724121094\n", 653 | "Epoch: 0, Loss: 0.0028634071350097656\n", 654 | "Epoch: 0, Loss: 0.0006411075592041016\n", 655 | "Epoch: 0, Loss: 0.0012137889862060547\n", 656 | "Epoch: 0, Loss: 0.002307891845703125\n", 657 | "Epoch: 0, Loss: 0.00028586387634277344\n", 658 | "Epoch: 0, Loss: 0.0029143095016479492\n", 659 | "Epoch: 0, Loss: 0.0002485513687133789\n" 660 | ] 661 | } 662 | ], 663 | "source": [ 664 | "for epoch in range(EPOCHS):\n", 665 | " train(epoch)" 666 | ] 667 | }, 668 | { 669 | "cell_type": "markdown", 670 | "metadata": {}, 671 | "source": [ 672 | "\n", 673 | "### Validating the Model\n", 674 | "\n", 675 | "During the validation stage we pass the unseen data(Testing Dataset) to the model. This step determines how good the model performs on the unseen data. \n", 676 | "\n", 677 | "This unseen data is the 20% of `newscorpora.csv` which was seperated during the Dataset creation stage. \n", 678 | "During the validation stage the weights of the model are not updated. Only the final output is compared to the actual value. This comparison is then used to calcuate the accuracy of the model. \n", 679 | "\n", 680 | "As you can see the model is predicting the correct category of a given headline to a 99.9% accuracy." 
681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": 15, 686 | "metadata": {}, 687 | "outputs": [], 688 | "source": [ 689 | "def valid(model, testing_loader):\n", 690 | " model.eval()\n", 691 | " n_correct = 0; n_wrong = 0; total = 0\n", 692 | " with torch.no_grad():\n", 693 | " for _, data in enumerate(testing_loader, 0):\n", 694 | " ids = data['ids'].to(device, dtype = torch.long)\n", 695 | " mask = data['mask'].to(device, dtype = torch.long)\n", 696 | " targets = data['targets'].to(device, dtype = torch.long)\n", 697 | " outputs = model(ids, mask).squeeze()\n", 698 | " loss = loss_function(outputs, targets)\n", 699 | " tr_loss += loss.item()\n", 700 | " big_val, big_idx = torch.max(outputs.data, dim=1)\n", 701 | " n_correct += calcuate_accu(big_idx, targets)\n", 702 | "\n", 703 | " nb_tr_steps += 1\n", 704 | " nb_tr_examples+=targets.size(0)\n", 705 | " \n", 706 | " if _%5000==0:\n", 707 | " loss_step = tr_loss/nb_tr_steps\n", 708 | " accu_step = (n_correct*100)/nb_tr_examples\n", 709 | " print(f\"Validation Loss per 100 steps: {loss_step}\")\n", 710 | " print(f\"Validation Accuracy per 100 steps: {accu_step}\")\n", 711 | " epoch_loss = tr_loss/nb_tr_steps\n", 712 | " epoch_accu = (n_correct*100)/nb_tr_examples\n", 713 | " print(f\"Validation Loss Epoch: {epoch_loss}\")\n", 714 | " print(f\"Validation Accuracy Epoch: {epoch_accu}\")\n", 715 | " \n", 716 | " return epoch_accu\n" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": 16, 722 | "metadata": {}, 723 | "outputs": [ 724 | { 725 | "name": "stdout", 726 | "output_type": "stream", 727 | "text": [ 728 | "This is the validation section to print the accuracy and see how it performs\n", 729 | "Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch\n", 730 | "Accuracy on test data = 99.99%\n" 731 | ] 732 | } 733 | ], 734 | "source": [ 735 | "print('This is the validation section to print the accuracy and see how it performs')\n", 736 | "print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')\n", 737 | "\n", 738 | "acc = valid(model, testing_loader)\n", 739 | "print(\"Accuracy on test data = %0.2f%%\" % acc)" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": {}, 745 | "source": [ 746 | "\n", 747 | "### Saving the Trained Model Artifacts for inference\n", 748 | "\n", 749 | "This is the final step in the process of fine tuning the model. \n", 750 | "\n", 751 | "The model and its vocabulary are saved locally. These files are then used in the future to make inference on new inputs of news headlines.\n", 752 | "\n", 753 | "Please remember that a trained neural network is only useful when used in actual inference after its training. \n", 754 | "\n", 755 | "In the lifecycle of an ML projects this is only half the job done. We will leave the inference of these models for some other day. 
" 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": 20, 761 | "metadata": {}, 762 | "outputs": [ 763 | { 764 | "name": "stdout", 765 | "output_type": "stream", 766 | "text": [ 767 | "All files saved\n", 768 | "This tutorial is completed\n" 769 | ] 770 | } 771 | ], 772 | "source": [ 773 | "# Saving the files for re-use\n", 774 | "\n", 775 | "output_model_file = './models/pytorch_distilbert_news.bin'\n", 776 | "output_vocab_file = './models/vocab_distilbert_news.bin'\n", 777 | "\n", 778 | "model_to_save = model\n", 779 | "torch.save(model_to_save, output_model_file)\n", 780 | "tokenizer.save_vocabulary(output_vocab_file)\n", 781 | "\n", 782 | "print('All files saved')\n", 783 | "print('This tutorial is completed')" 784 | ] 785 | } 786 | ], 787 | "metadata": { 788 | "colab": { 789 | "collapsed_sections": [], 790 | "name": "01_transformers_multiclass_classification.ipynb", 791 | "provenance": [] 792 | }, 793 | "kernelspec": { 794 | "display_name": "Python 3", 795 | "language": "python", 796 | "name": "python3" 797 | }, 798 | "varInspector": { 799 | "cols": { 800 | "lenName": 16, 801 | "lenType": 16, 802 | "lenVar": 40 803 | }, 804 | "kernels_config": { 805 | "python": { 806 | "delete_cmd_postfix": "", 807 | "delete_cmd_prefix": "del ", 808 | "library": "var_list.py", 809 | "varRefreshCmd": "print(var_dic_list())" 810 | }, 811 | "r": { 812 | "delete_cmd_postfix": ") ", 813 | "delete_cmd_prefix": "rm(", 814 | "library": "var_list.r", 815 | "varRefreshCmd": "cat(var_dic_list()) " 816 | } 817 | }, 818 | "types_to_exclude": [ 819 | "module", 820 | "function", 821 | "builtin_function_or_method", 822 | "instance", 823 | "_Feature" 824 | ], 825 | "window_display": false 826 | }, 827 | "widgets": { 828 | "application/vnd.jupyter.widget-state+json": { 829 | "222af74a298140ff9ec6e6f1f286a52b": { 830 | "model_module": "@jupyter-widgets/base", 831 | "model_name": "LayoutModel", 832 | "state": { 833 | "_model_module": "@jupyter-widgets/base", 834 | "_model_module_version": "1.2.0", 835 | "_model_name": "LayoutModel", 836 | "_view_count": null, 837 | "_view_module": "@jupyter-widgets/base", 838 | "_view_module_version": "1.2.0", 839 | "_view_name": "LayoutView", 840 | "align_content": null, 841 | "align_items": null, 842 | "align_self": null, 843 | "border": null, 844 | "bottom": null, 845 | "display": null, 846 | "flex": null, 847 | "flex_flow": null, 848 | "grid_area": null, 849 | "grid_auto_columns": null, 850 | "grid_auto_flow": null, 851 | "grid_auto_rows": null, 852 | "grid_column": null, 853 | "grid_gap": null, 854 | "grid_row": null, 855 | "grid_template_areas": null, 856 | "grid_template_columns": null, 857 | "grid_template_rows": null, 858 | "height": null, 859 | "justify_content": null, 860 | "justify_items": null, 861 | "left": null, 862 | "margin": null, 863 | "max_height": null, 864 | "max_width": null, 865 | "min_height": null, 866 | "min_width": null, 867 | "object_fit": null, 868 | "object_position": null, 869 | "order": null, 870 | "overflow": null, 871 | "overflow_x": null, 872 | "overflow_y": null, 873 | "padding": null, 874 | "right": null, 875 | "top": null, 876 | "visibility": null, 877 | "width": null 878 | } 879 | }, 880 | "3516912b49bf4e3e98c9ce6c7b1b1469": { 881 | "model_module": "@jupyter-widgets/controls", 882 | "model_name": "DescriptionStyleModel", 883 | "state": { 884 | "_model_module": "@jupyter-widgets/controls", 885 | "_model_module_version": "1.5.0", 886 | "_model_name": "DescriptionStyleModel", 887 | "_view_count": null, 888 | "_view_module": 
"@jupyter-widgets/base", 889 | "_view_module_version": "1.2.0", 890 | "_view_name": "StyleView", 891 | "description_width": "" 892 | } 893 | }, 894 | "4641ff326eea446c88903e70b85c90d7": { 895 | "model_module": "@jupyter-widgets/base", 896 | "model_name": "LayoutModel", 897 | "state": { 898 | "_model_module": "@jupyter-widgets/base", 899 | "_model_module_version": "1.2.0", 900 | "_model_name": "LayoutModel", 901 | "_view_count": null, 902 | "_view_module": "@jupyter-widgets/base", 903 | "_view_module_version": "1.2.0", 904 | "_view_name": "LayoutView", 905 | "align_content": null, 906 | "align_items": null, 907 | "align_self": null, 908 | "border": null, 909 | "bottom": null, 910 | "display": null, 911 | "flex": null, 912 | "flex_flow": null, 913 | "grid_area": null, 914 | "grid_auto_columns": null, 915 | "grid_auto_flow": null, 916 | "grid_auto_rows": null, 917 | "grid_column": null, 918 | "grid_gap": null, 919 | "grid_row": null, 920 | "grid_template_areas": null, 921 | "grid_template_columns": null, 922 | "grid_template_rows": null, 923 | "height": null, 924 | "justify_content": null, 925 | "justify_items": null, 926 | "left": null, 927 | "margin": null, 928 | "max_height": null, 929 | "max_width": null, 930 | "min_height": null, 931 | "min_width": null, 932 | "object_fit": null, 933 | "object_position": null, 934 | "order": null, 935 | "overflow": null, 936 | "overflow_x": null, 937 | "overflow_y": null, 938 | "padding": null, 939 | "right": null, 940 | "top": null, 941 | "visibility": null, 942 | "width": null 943 | } 944 | }, 945 | "5a975bc922a14c1b82245195b7c7f659": { 946 | "model_module": "@jupyter-widgets/controls", 947 | "model_name": "HTMLModel", 948 | "state": { 949 | "_dom_classes": [], 950 | "_model_module": "@jupyter-widgets/controls", 951 | "_model_module_version": "1.5.0", 952 | "_model_name": "HTMLModel", 953 | "_view_count": null, 954 | "_view_module": "@jupyter-widgets/controls", 955 | "_view_module_version": "1.5.0", 956 | "_view_name": "HTMLView", 957 | "description": "", 958 | "description_tooltip": null, 959 | "layout": "IPY_MODEL_8335299626724853865234b86745e2a5", 960 | "placeholder": "​", 961 | "style": "IPY_MODEL_3516912b49bf4e3e98c9ce6c7b1b1469", 962 | "value": " 442/442 [00:01<00:00, 276B/s]" 963 | } 964 | }, 965 | "7532a60d077248ca963f514988d41acc": { 966 | "model_module": "@jupyter-widgets/controls", 967 | "model_name": "HBoxModel", 968 | "state": { 969 | "_dom_classes": [], 970 | "_model_module": "@jupyter-widgets/controls", 971 | "_model_module_version": "1.5.0", 972 | "_model_name": "HBoxModel", 973 | "_view_count": null, 974 | "_view_module": "@jupyter-widgets/controls", 975 | "_view_module_version": "1.5.0", 976 | "_view_name": "HBoxView", 977 | "box_style": "", 978 | "children": [ 979 | "IPY_MODEL_ca0092edb8f5442a988c87c89930a10e", 980 | "IPY_MODEL_5a975bc922a14c1b82245195b7c7f659" 981 | ], 982 | "layout": "IPY_MODEL_e39121739a4c4bffb16155e9a61a58cc" 983 | } 984 | }, 985 | "8335299626724853865234b86745e2a5": { 986 | "model_module": "@jupyter-widgets/base", 987 | "model_name": "LayoutModel", 988 | "state": { 989 | "_model_module": "@jupyter-widgets/base", 990 | "_model_module_version": "1.2.0", 991 | "_model_name": "LayoutModel", 992 | "_view_count": null, 993 | "_view_module": "@jupyter-widgets/base", 994 | "_view_module_version": "1.2.0", 995 | "_view_name": "LayoutView", 996 | "align_content": null, 997 | "align_items": null, 998 | "align_self": null, 999 | "border": null, 1000 | "bottom": null, 1001 | "display": null, 1002 | "flex": null, 1003 | 
"flex_flow": null, 1004 | "grid_area": null, 1005 | "grid_auto_columns": null, 1006 | "grid_auto_flow": null, 1007 | "grid_auto_rows": null, 1008 | "grid_column": null, 1009 | "grid_gap": null, 1010 | "grid_row": null, 1011 | "grid_template_areas": null, 1012 | "grid_template_columns": null, 1013 | "grid_template_rows": null, 1014 | "height": null, 1015 | "justify_content": null, 1016 | "justify_items": null, 1017 | "left": null, 1018 | "margin": null, 1019 | "max_height": null, 1020 | "max_width": null, 1021 | "min_height": null, 1022 | "min_width": null, 1023 | "object_fit": null, 1024 | "object_position": null, 1025 | "order": null, 1026 | "overflow": null, 1027 | "overflow_x": null, 1028 | "overflow_y": null, 1029 | "padding": null, 1030 | "right": null, 1031 | "top": null, 1032 | "visibility": null, 1033 | "width": null 1034 | } 1035 | }, 1036 | "b13c341a69c14c059edc40e6b438ee80": { 1037 | "model_module": "@jupyter-widgets/controls", 1038 | "model_name": "IntProgressModel", 1039 | "state": { 1040 | "_dom_classes": [], 1041 | "_model_module": "@jupyter-widgets/controls", 1042 | "_model_module_version": "1.5.0", 1043 | "_model_name": "IntProgressModel", 1044 | "_view_count": null, 1045 | "_view_module": "@jupyter-widgets/controls", 1046 | "_view_module_version": "1.5.0", 1047 | "_view_name": "ProgressView", 1048 | "bar_style": "success", 1049 | "description": "Downloading: 100%", 1050 | "description_tooltip": null, 1051 | "layout": "IPY_MODEL_4641ff326eea446c88903e70b85c90d7", 1052 | "max": 231508, 1053 | "min": 0, 1054 | "orientation": "horizontal", 1055 | "style": "IPY_MODEL_dd589fc95cd94cd190bc2640f9ef618c", 1056 | "value": 231508 1057 | } 1058 | }, 1059 | "b8728a6fcdc54b72a10c8bfdcf794fad": { 1060 | "model_module": "@jupyter-widgets/base", 1061 | "model_name": "LayoutModel", 1062 | "state": { 1063 | "_model_module": "@jupyter-widgets/base", 1064 | "_model_module_version": "1.2.0", 1065 | "_model_name": "LayoutModel", 1066 | "_view_count": null, 1067 | "_view_module": "@jupyter-widgets/base", 1068 | "_view_module_version": "1.2.0", 1069 | "_view_name": "LayoutView", 1070 | "align_content": null, 1071 | "align_items": null, 1072 | "align_self": null, 1073 | "border": null, 1074 | "bottom": null, 1075 | "display": null, 1076 | "flex": null, 1077 | "flex_flow": null, 1078 | "grid_area": null, 1079 | "grid_auto_columns": null, 1080 | "grid_auto_flow": null, 1081 | "grid_auto_rows": null, 1082 | "grid_column": null, 1083 | "grid_gap": null, 1084 | "grid_row": null, 1085 | "grid_template_areas": null, 1086 | "grid_template_columns": null, 1087 | "grid_template_rows": null, 1088 | "height": null, 1089 | "justify_content": null, 1090 | "justify_items": null, 1091 | "left": null, 1092 | "margin": null, 1093 | "max_height": null, 1094 | "max_width": null, 1095 | "min_height": null, 1096 | "min_width": null, 1097 | "object_fit": null, 1098 | "object_position": null, 1099 | "order": null, 1100 | "overflow": null, 1101 | "overflow_x": null, 1102 | "overflow_y": null, 1103 | "padding": null, 1104 | "right": null, 1105 | "top": null, 1106 | "visibility": null, 1107 | "width": null 1108 | } 1109 | }, 1110 | "c1db6876e9b04293b4077f13006b4a19": { 1111 | "model_module": "@jupyter-widgets/controls", 1112 | "model_name": "DescriptionStyleModel", 1113 | "state": { 1114 | "_model_module": "@jupyter-widgets/controls", 1115 | "_model_module_version": "1.5.0", 1116 | "_model_name": "DescriptionStyleModel", 1117 | "_view_count": null, 1118 | "_view_module": "@jupyter-widgets/base", 1119 | 
"_view_module_version": "1.2.0", 1120 | "_view_name": "StyleView", 1121 | "description_width": "" 1122 | } 1123 | }, 1124 | "ca0092edb8f5442a988c87c89930a10e": { 1125 | "model_module": "@jupyter-widgets/controls", 1126 | "model_name": "IntProgressModel", 1127 | "state": { 1128 | "_dom_classes": [], 1129 | "_model_module": "@jupyter-widgets/controls", 1130 | "_model_module_version": "1.5.0", 1131 | "_model_name": "IntProgressModel", 1132 | "_view_count": null, 1133 | "_view_module": "@jupyter-widgets/controls", 1134 | "_view_module_version": "1.5.0", 1135 | "_view_name": "ProgressView", 1136 | "bar_style": "success", 1137 | "description": "Downloading: 100%", 1138 | "description_tooltip": null, 1139 | "layout": "IPY_MODEL_222af74a298140ff9ec6e6f1f286a52b", 1140 | "max": 442, 1141 | "min": 0, 1142 | "orientation": "horizontal", 1143 | "style": "IPY_MODEL_f9dd503454b2450e957a96bff807c8d9", 1144 | "value": 442 1145 | } 1146 | }, 1147 | "cde06f09879848a3bfc5892390a51d36": { 1148 | "model_module": "@jupyter-widgets/base", 1149 | "model_name": "LayoutModel", 1150 | "state": { 1151 | "_model_module": "@jupyter-widgets/base", 1152 | "_model_module_version": "1.2.0", 1153 | "_model_name": "LayoutModel", 1154 | "_view_count": null, 1155 | "_view_module": "@jupyter-widgets/base", 1156 | "_view_module_version": "1.2.0", 1157 | "_view_name": "LayoutView", 1158 | "align_content": null, 1159 | "align_items": null, 1160 | "align_self": null, 1161 | "border": null, 1162 | "bottom": null, 1163 | "display": null, 1164 | "flex": null, 1165 | "flex_flow": null, 1166 | "grid_area": null, 1167 | "grid_auto_columns": null, 1168 | "grid_auto_flow": null, 1169 | "grid_auto_rows": null, 1170 | "grid_column": null, 1171 | "grid_gap": null, 1172 | "grid_row": null, 1173 | "grid_template_areas": null, 1174 | "grid_template_columns": null, 1175 | "grid_template_rows": null, 1176 | "height": null, 1177 | "justify_content": null, 1178 | "justify_items": null, 1179 | "left": null, 1180 | "margin": null, 1181 | "max_height": null, 1182 | "max_width": null, 1183 | "min_height": null, 1184 | "min_width": null, 1185 | "object_fit": null, 1186 | "object_position": null, 1187 | "order": null, 1188 | "overflow": null, 1189 | "overflow_x": null, 1190 | "overflow_y": null, 1191 | "padding": null, 1192 | "right": null, 1193 | "top": null, 1194 | "visibility": null, 1195 | "width": null 1196 | } 1197 | }, 1198 | "dd589fc95cd94cd190bc2640f9ef618c": { 1199 | "model_module": "@jupyter-widgets/controls", 1200 | "model_name": "ProgressStyleModel", 1201 | "state": { 1202 | "_model_module": "@jupyter-widgets/controls", 1203 | "_model_module_version": "1.5.0", 1204 | "_model_name": "ProgressStyleModel", 1205 | "_view_count": null, 1206 | "_view_module": "@jupyter-widgets/base", 1207 | "_view_module_version": "1.2.0", 1208 | "_view_name": "StyleView", 1209 | "bar_color": null, 1210 | "description_width": "initial" 1211 | } 1212 | }, 1213 | "dd7699e7813d4b9f9e80990498a39539": { 1214 | "model_module": "@jupyter-widgets/controls", 1215 | "model_name": "HTMLModel", 1216 | "state": { 1217 | "_dom_classes": [], 1218 | "_model_module": "@jupyter-widgets/controls", 1219 | "_model_module_version": "1.5.0", 1220 | "_model_name": "HTMLModel", 1221 | "_view_count": null, 1222 | "_view_module": "@jupyter-widgets/controls", 1223 | "_view_module_version": "1.5.0", 1224 | "_view_name": "HTMLView", 1225 | "description": "", 1226 | "description_tooltip": null, 1227 | "layout": "IPY_MODEL_b8728a6fcdc54b72a10c8bfdcf794fad", 1228 | "placeholder": "​", 1229 | 
"style": "IPY_MODEL_c1db6876e9b04293b4077f13006b4a19", 1230 | "value": " 232k/232k [00:00<00:00, 584kB/s]" 1231 | } 1232 | }, 1233 | "e39121739a4c4bffb16155e9a61a58cc": { 1234 | "model_module": "@jupyter-widgets/base", 1235 | "model_name": "LayoutModel", 1236 | "state": { 1237 | "_model_module": "@jupyter-widgets/base", 1238 | "_model_module_version": "1.2.0", 1239 | "_model_name": "LayoutModel", 1240 | "_view_count": null, 1241 | "_view_module": "@jupyter-widgets/base", 1242 | "_view_module_version": "1.2.0", 1243 | "_view_name": "LayoutView", 1244 | "align_content": null, 1245 | "align_items": null, 1246 | "align_self": null, 1247 | "border": null, 1248 | "bottom": null, 1249 | "display": null, 1250 | "flex": null, 1251 | "flex_flow": null, 1252 | "grid_area": null, 1253 | "grid_auto_columns": null, 1254 | "grid_auto_flow": null, 1255 | "grid_auto_rows": null, 1256 | "grid_column": null, 1257 | "grid_gap": null, 1258 | "grid_row": null, 1259 | "grid_template_areas": null, 1260 | "grid_template_columns": null, 1261 | "grid_template_rows": null, 1262 | "height": null, 1263 | "justify_content": null, 1264 | "justify_items": null, 1265 | "left": null, 1266 | "margin": null, 1267 | "max_height": null, 1268 | "max_width": null, 1269 | "min_height": null, 1270 | "min_width": null, 1271 | "object_fit": null, 1272 | "object_position": null, 1273 | "order": null, 1274 | "overflow": null, 1275 | "overflow_x": null, 1276 | "overflow_y": null, 1277 | "padding": null, 1278 | "right": null, 1279 | "top": null, 1280 | "visibility": null, 1281 | "width": null 1282 | } 1283 | }, 1284 | "f9dd503454b2450e957a96bff807c8d9": { 1285 | "model_module": "@jupyter-widgets/controls", 1286 | "model_name": "ProgressStyleModel", 1287 | "state": { 1288 | "_model_module": "@jupyter-widgets/controls", 1289 | "_model_module_version": "1.5.0", 1290 | "_model_name": "ProgressStyleModel", 1291 | "_view_count": null, 1292 | "_view_module": "@jupyter-widgets/base", 1293 | "_view_module_version": "1.2.0", 1294 | "_view_name": "StyleView", 1295 | "bar_color": null, 1296 | "description_width": "initial" 1297 | } 1298 | }, 1299 | "fd6f40d5854c49f79056cebd16f86626": { 1300 | "model_module": "@jupyter-widgets/controls", 1301 | "model_name": "HBoxModel", 1302 | "state": { 1303 | "_dom_classes": [], 1304 | "_model_module": "@jupyter-widgets/controls", 1305 | "_model_module_version": "1.5.0", 1306 | "_model_name": "HBoxModel", 1307 | "_view_count": null, 1308 | "_view_module": "@jupyter-widgets/controls", 1309 | "_view_module_version": "1.5.0", 1310 | "_view_name": "HBoxView", 1311 | "box_style": "", 1312 | "children": [ 1313 | "IPY_MODEL_b13c341a69c14c059edc40e6b438ee80", 1314 | "IPY_MODEL_dd7699e7813d4b9f9e80990498a39539" 1315 | ], 1316 | "layout": "IPY_MODEL_cde06f09879848a3bfc5892390a51d36" 1317 | } 1318 | } 1319 | } 1320 | } 1321 | }, 1322 | "nbformat": 4, 1323 | "nbformat_minor": 1 1324 | } 1325 | -------------------------------------------------------------------------------- /transformers_ner.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "rFvcIe4qz_2t" 8 | }, 9 | "source": [ 10 | "# Fine Tuning Transformer for Named Entity Recognition" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "colab_type": "text", 17 | "id": "Zt90X_Dw0B_T" 18 | }, 19 | "source": [ 20 | "### Introduction\n", 21 | "\n", 22 | "In this tutorial we will be fine tuning a transformer 
model for the **Named Entity Recognition** problem. \n", 23 | "This is one of the most common business problems where a given piece of text/sentence/document different entites need to be identified such as: Name, Location, Number, Entity etc.\n", 24 | "\n", 25 | "#### Flow of the notebook\n", 26 | "\n", 27 | "The notebook will be divided into seperate sections to provide a organized walk through for the process used. This process can be modified for individual use cases. The sections are:\n", 28 | "\n", 29 | "1. [Installing packages for preparing the system](#section00)\n", 30 | "2. [Importing Python Libraries and preparing the environment](#section01)\n", 31 | "3. [Importing and Pre-Processing the domain data](#section02)\n", 32 | "4. [Preparing the Dataset and Dataloader](#section03)\n", 33 | "5. [Creating the Neural Network for Fine Tuning](#section04)\n", 34 | "6. [Fine Tuning the Model](#section05)\n", 35 | "7. [Validating the Model Performance](#section06)\n", 36 | "\n", 37 | "#### Technical Details\n", 38 | "\n", 39 | "This script leverages on multiple tools designed by other teams. Details of the tools used below. Please ensure that these elements are present in your setup to successfully implement this script.\n", 40 | "\n", 41 | " - Data:\n", 42 | "\t- We are working from a dataset available on [Kaggle](https://www.kaggle.com/)\n", 43 | " - This NER annotated dataset is available at the following [link](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus)\n", 44 | " - We will be working with the file `ner.csv` from the dataset. \n", 45 | " - In the given file we will be looking at the following columns for the purpose of this fine tuning:\n", 46 | " - `sentence_idx` : This is the identifier that the word in the row is part of the same sentence\n", 47 | " - `word` : Word in the sentence\n", 48 | " - `tag` : This is the identifier that is used to identify the entity in the dataset. \n", 49 | " - The various entites tagged in this dataset are as per below:\n", 50 | " - geo = Geographical Entity\n", 51 | " - org = Organization\n", 52 | " - per = Person\n", 53 | " - gpe = Geopolitical Entity\n", 54 | " - tim = Time indicator\n", 55 | " - art = Artifact\n", 56 | " - eve = Event\n", 57 | " - nat = Natural Phenomenon\n", 58 | "\n", 59 | "\n", 60 | " - Language Model Used:\n", 61 | "\t - We are using BERT for this project. Hugging face team has created a customized model for token classification, called **BertForTokenClassification**. We will be using it in our custommodel class for training. \n", 62 | "\t - [Blog-Post](https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html)\n", 63 | " - [Documentation for python](https://huggingface.co/transformers/model_doc/bert.html#bertfortokenclassification)\n", 64 | "\n", 65 | "\n", 66 | " - Hardware Requirements:\n", 67 | "\t - Python 3.6 and above\n", 68 | "\t - Pytorch, Transformers and All the stock Python ML Libraries\n", 69 | "\t - TPU enabled setup. This can also be executed over GPU but the code base will need some changes. \n", 70 | "\n", 71 | "\n", 72 | " - Script Objective:\n", 73 | "\t - The objective of this script is to fine tune **BertForTokenClassification**` to be able to identify the entites as per the given test dataset. 
The entities labled in the given dataset are as follows:" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "\n", 81 | "### Installing packages for preparing the system\n", 82 | "\n", 83 | "We are installing 2 packages for the purposes of TPU execution and f1 metric score calculation respectively\n", 84 | "*You can skip this step if you already have these libraries installed in your environment*" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 2, 90 | "metadata": { 91 | "colab": { 92 | "base_uri": "https://localhost:8080/", 93 | "height": 773 94 | }, 95 | "colab_type": "code", 96 | "id": "pWbkd8Ld8MwL", 97 | "outputId": "b44f7ea3-2c0a-4e7c-f7ed-19f43d62de28" 98 | }, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | " % Total % Received % Xferd Average Speed Time Time Time Current\n", 105 | " Dload Upload Total Spent Left Speed\n", 106 | "\r", 107 | " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\r", 108 | "100 3727 100 3727 0 0 41876 0 --:--:-- --:--:-- --:--:-- 41876\n", 109 | "Updating TPU and VM. This may take around 2 minutes.\n", 110 | "Updating TPU runtime to pytorch-dev20200325 ...\n", 111 | "Done updating TPU runtime: \n", 112 | "Uninstalling torch-1.5.0a0+d6149a7:\n", 113 | " Successfully uninstalled torch-1.5.0a0+d6149a7\n", 114 | "Uninstalling torchvision-0.6.0a0+3c254fb:\n", 115 | " Successfully uninstalled torchvision-0.6.0a0+3c254fb\n", 116 | "Copying gs://tpu-pytorch/wheels/torch-nightly+20200325-cp36-cp36m-linux_x86_64.whl...\n", 117 | "- [1 files][ 83.4 MiB/ 83.4 MiB] \n", 118 | "Operation completed over 1 objects/83.4 MiB. \n", 119 | "Copying gs://tpu-pytorch/wheels/torch_xla-nightly+20200325-cp36-cp36m-linux_x86_64.whl...\n", 120 | "\\ [1 files][114.5 MiB/114.5 MiB] \n", 121 | "Operation completed over 1 objects/114.5 MiB. \n", 122 | "Copying gs://tpu-pytorch/wheels/torchvision-nightly+20200325-cp36-cp36m-linux_x86_64.whl...\n", 123 | "/ [1 files][ 2.5 MiB/ 2.5 MiB] \n", 124 | "Operation completed over 1 objects/2.5 MiB. 
\n", 125 | "Processing ./torch-nightly+20200325-cp36-cp36m-linux_x86_64.whl\n", 126 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torch==nightly+20200325) (1.18.3)\n", 127 | "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch==nightly+20200325) (0.16.0)\n", 128 | "\u001b[31mERROR: fastai 1.0.61 requires torchvision, which is not installed.\u001b[0m\n", 129 | "Installing collected packages: torch\n", 130 | "Successfully installed torch-1.5.0a0+d6149a7\n", 131 | "Processing ./torch_xla-nightly+20200325-cp36-cp36m-linux_x86_64.whl\n", 132 | "Installing collected packages: torch-xla\n", 133 | " Found existing installation: torch-xla 1.6+e788e5b\n", 134 | " Uninstalling torch-xla-1.6+e788e5b:\n", 135 | " Successfully uninstalled torch-xla-1.6+e788e5b\n", 136 | "Successfully installed torch-xla-1.6+e788e5b\n", 137 | "Processing ./torchvision-nightly+20200325-cp36-cp36m-linux_x86_64.whl\n", 138 | "Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (from torchvision==nightly+20200325) (1.5.0a0+d6149a7)\n", 139 | "Requirement already satisfied: pillow>=4.1.1 in /usr/local/lib/python3.6/dist-packages (from torchvision==nightly+20200325) (7.0.0)\n", 140 | "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from torchvision==nightly+20200325) (1.12.0)\n", 141 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torchvision==nightly+20200325) (1.18.3)\n", 142 | "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch->torchvision==nightly+20200325) (0.16.0)\n", 143 | "Installing collected packages: torchvision\n", 144 | "Successfully installed torchvision-0.6.0a0+3c254fb\n", 145 | "Reading package lists... Done\n", 146 | "Building dependency tree \n", 147 | "Reading state information... Done\n", 148 | "libomp5 is already the newest version (5.0.1-1).\n", 149 | "libopenblas-dev is already the newest version (0.2.20+ds-4).\n", 150 | "0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "!curl -q https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py\n", 156 | "!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev\n", 157 | "!pip -q install seqeval" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": { 163 | "colab_type": "text", 164 | "id": "6ts5noAi0a6K" 165 | }, 166 | "source": [ 167 | "\n", 168 | "### Importing Python Libraries and preparing the environment\n", 169 | "\n", 170 | "At this step we will be importing the libraries and modules needed to run our script. Libraries are:\n", 171 | "* Pandas\n", 172 | "* Pytorch\n", 173 | "* Pytorch Utils for Dataset and Dataloader\n", 174 | "* Transformers\n", 175 | "* BERT Model and Tokenizer\n", 176 | "\n", 177 | "Followed by that we will preapre the device for TPU execeution. This configuration is needed if you want to leverage on onboard TPU. 
" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 0, 183 | "metadata": { 184 | "colab": {}, 185 | "colab_type": "code", 186 | "id": "NYUqKiOZdR1H" 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "# Importing pytorch and the library for TPU execution\n", 191 | "\n", 192 | "import torch\n", 193 | "import torch_xla\n", 194 | "import torch_xla.core.xla_model as xm" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 0, 200 | "metadata": { 201 | "colab": {}, 202 | "colab_type": "code", 203 | "id": "y3jTWir2cBlN" 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "# Importing stock ml libraries\n", 208 | "\n", 209 | "import numpy as np\n", 210 | "import pandas as pd\n", 211 | "import transformers\n", 212 | "from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler\n", 213 | "from transformers import BertForTokenClassification, BertTokenizer, BertConfig, BertModel\n", 214 | "\n", 215 | "# Preparing for TPU usage\n", 216 | "dev = xm.xla_device()" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "\n", 224 | "### Importing and Pre-Processing the domain data\n", 225 | "\n", 226 | "We will be working with the data and preparing for fine tuning purposes. \n", 227 | "*Assuming that the `ner.csv` is already downloaded in your `data` folder*\n", 228 | "\n", 229 | "* Import the file in a dataframe and give it the headers as per the documentation.\n", 230 | "* Cleaning the file to remove the unwanted columns.\n", 231 | "* We will create a class `SentenceGetter` that will pull the words from the columns and create them into sentences\n", 232 | "* Followed by that we will create some additional lists and dict to keep the data that will be used for future processing" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 5, 238 | "metadata": { 239 | "colab": { 240 | "base_uri": "https://localhost:8080/", 241 | "height": 212 242 | }, 243 | "colab_type": "code", 244 | "id": "81kDZbz2cDn7", 245 | "outputId": "1312c879-5c24-4964-c6ab-ba1b9b189964" 246 | }, 247 | "outputs": [ 248 | { 249 | "name": "stderr", 250 | "output_type": "stream", 251 | "text": [ 252 | "b'Skipping line 281837: expected 25 fields, saw 34\\n'\n" 253 | ] 254 | }, 255 | { 256 | "data": { 257 | "text/html": [ 258 | "

\n", 259 | "\n", 272 | "\n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | "
   pos  sentence_idx  word           tag
0  NNS  1.0           Thousands      O
1  IN   1.0           of             O
2  NNS  1.0           demonstrators  O
3  VBP  1.0           have           O
4  VBN  1.0           marched        O
\n", 320 | "
" 321 | ], 322 | "text/plain": [ 323 | " pos sentence_idx word tag\n", 324 | "0 NNS 1.0 Thousands O\n", 325 | "1 IN 1.0 of O\n", 326 | "2 NNS 1.0 demonstrators O\n", 327 | "3 VBP 1.0 have O\n", 328 | "4 VBN 1.0 marched O" 329 | ] 330 | }, 331 | "execution_count": 5, 332 | "metadata": { 333 | "tags": [] 334 | }, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "df = pd.read_csv(\"./data/ner.csv\", encoding = \"ISO-8859-1\", error_bad_lines=False)\n", 340 | "dataset=df.drop(['Unnamed: 0', 'lemma', 'next-lemma', 'next-next-lemma', 'next-next-pos',\n", 341 | " 'next-next-shape', 'next-next-word', 'next-pos', 'next-shape',\n", 342 | " 'next-word', 'prev-iob', 'prev-lemma', 'prev-pos',\n", 343 | " 'prev-prev-iob', 'prev-prev-lemma', 'prev-prev-pos', 'prev-prev-shape',\n", 344 | " 'prev-prev-word', 'prev-shape', 'prev-word','shape'],axis=1)\n", 345 | "dataset.head()" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 0, 351 | "metadata": { 352 | "colab": {}, 353 | "colab_type": "code", 354 | "id": "sdqhHeAqcLnO" 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "# Creating a class to pull the words from the columns and create them into sentences\n", 359 | "\n", 360 | "class SentenceGetter(object):\n", 361 | " \n", 362 | " def __init__(self, dataset):\n", 363 | " self.n_sent = 1\n", 364 | " self.dataset = dataset\n", 365 | " self.empty = False\n", 366 | " agg_func = lambda s: [(w,p, t) for w,p, t in zip(s[\"word\"].values.tolist(),\n", 367 | " s['pos'].values.tolist(),\n", 368 | " s[\"tag\"].values.tolist())]\n", 369 | " self.grouped = self.dataset.groupby(\"sentence_idx\").apply(agg_func)\n", 370 | " self.sentences = [s for s in self.grouped]\n", 371 | " \n", 372 | " def get_next(self):\n", 373 | " try:\n", 374 | " s = self.grouped[\"Sentence: {}\".format(self.n_sent)]\n", 375 | " self.n_sent += 1\n", 376 | " return s\n", 377 | " except:\n", 378 | " return None\n", 379 | "\n", 380 | "getter = SentenceGetter(dataset)" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 0, 386 | "metadata": { 387 | "colab": {}, 388 | "colab_type": "code", 389 | "id": "Cqln3QJecNsJ" 390 | }, 391 | "outputs": [], 392 | "source": [ 393 | "# Creating new lists and dicts that will be used at a later stage for reference and processing\n", 394 | "\n", 395 | "tags_vals = list(set(dataset[\"tag\"].values))\n", 396 | "tag2idx = {t: i for i, t in enumerate(tags_vals)}\n", 397 | "sentences = [' '.join([s[0] for s in sent]) for sent in getter.sentences]\n", 398 | "labels = [[s[2] for s in sent] for sent in getter.sentences]\n", 399 | "labels = [[tag2idx.get(l) for l in lab] for lab in labels]" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "\n", 407 | "### Preparing the Dataset and Dataloader\n", 408 | "\n", 409 | "We will start with defining few key variables that will be used later during the training/fine tuning stage.\n", 410 | "Followed by creation of Dataset class - This defines how the text is pre-processed before sending it to the neural network. We will also define the Dataloader that will feed the data in batches to the neural network for suitable training and processing. \n", 411 | "Dataset and Dataloader are constructs of the PyTorch library for defining and controlling the data pre-processing and its passage to neural network. 
For further reading into Dataset and Dataloader read the [docs at PyTorch](https://pytorch.org/docs/stable/data.html)\n", 412 | "\n", 413 | "#### *CustomDataset* Dataset Class\n", 414 | "- This class is defined to accept the `tokenizer`, `sentences` and `labels` as input and generate tokenized output and tags that is used by the BERT model for training. \n", 415 | "- We are using the BERT tokenizer to tokenize the data in the `sentences` list for encoding. \n", 416 | "- The tokenizer uses the `encode_plus` method to perform tokenization and generate the necessary outputs, namely: `ids`, `attention_mask`\n", 417 | "- To read further into the tokenizer, [refer to this document](https://huggingface.co/transformers/model_doc/bert.html#berttokenizer)\n", 418 | "- `tags` is the encoded entity from the annonated dataset. \n", 419 | "- The *CustomDataset* class is used to create 2 datasets, for training and for validation.\n", 420 | "- *Training Dataset* is used to fine tune the model: **80% of the original data**\n", 421 | "- *Validation Dataset* is used to evaluate the performance of the model. The model has not seen this data during training. \n", 422 | "\n", 423 | "#### Dataloader\n", 424 | "- Dataloader is used to for creating training and validation dataloader that load data to the neural network in a defined manner. This is needed because all the data from the dataset cannot be loaded to the memory at once, hence the amount of dataloaded to the memory and then passed to the neural network needs to be controlled.\n", 425 | "- This control is achieved using the parameters such as `batch_size` and `max_len`.\n", 426 | "- Training and Validation dataloaders are used in the training and validation part of the flow respectively" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 0, 432 | "metadata": { 433 | "colab": {}, 434 | "colab_type": "code", 435 | "id": "kL0b1VIQcTAC" 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "# Defining some key variables that will be used later on in the training\n", 440 | "\n", 441 | "MAX_LEN = 200\n", 442 | "TRAIN_BATCH_SIZE = 32\n", 443 | "VALID_BATCH_SIZE = 16\n", 444 | "EPOCHS = 5\n", 445 | "LEARNING_RATE = 2e-05\n", 446 | "tokenizer = BertTokenizer.from_pretrained('bert-base-cased')" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 0, 452 | "metadata": { 453 | "colab": {}, 454 | "colab_type": "code", 455 | "id": "IV72GFgq_ZYb" 456 | }, 457 | "outputs": [], 458 | "source": [ 459 | "class CustomDataset(Dataset):\n", 460 | " def __init__(self, tokenizer, sentences, labels, max_len):\n", 461 | " self.len = len(sentences)\n", 462 | " self.sentences = sentences\n", 463 | " self.labels = labels\n", 464 | " self.tokenizer = tokenizer\n", 465 | " self.max_len = max_len\n", 466 | " \n", 467 | " def __getitem__(self, index):\n", 468 | " sentence = str(self.sentences[index])\n", 469 | " inputs = self.tokenizer.encode_plus(\n", 470 | " sentence,\n", 471 | " None,\n", 472 | " add_special_tokens=True,\n", 473 | " max_length=self.max_len,\n", 474 | " pad_to_max_length=True,\n", 475 | " return_token_type_ids=True\n", 476 | " )\n", 477 | " ids = inputs['input_ids']\n", 478 | " mask = inputs['attention_mask']\n", 479 | " label = self.labels[index]\n", 480 | " label.extend([4]*200)\n", 481 | " label=label[:200]\n", 482 | "\n", 483 | " return {\n", 484 | " 'ids': torch.tensor(ids, dtype=torch.long),\n", 485 | " 'mask': torch.tensor(mask, dtype=torch.long),\n", 486 | " 'tags': torch.tensor(label, dtype=torch.long)\n", 487 
| " } \n", 488 | " \n", 489 | " def __len__(self):\n", 490 | " return self.len" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 12, 496 | "metadata": { 497 | "colab": { 498 | "base_uri": "https://localhost:8080/", 499 | "height": 67 500 | }, 501 | "colab_type": "code", 502 | "id": "7BvniiyvX-rB", 503 | "outputId": "61fdc431-4544-4a19-a176-46f19ca58f0f" 504 | }, 505 | "outputs": [ 506 | { 507 | "name": "stdout", 508 | "output_type": "stream", 509 | "text": [ 510 | "FULL Dataset: 35177\n", 511 | "TRAIN Dataset: 28141\n", 512 | "TEST Dataset: 7036\n" 513 | ] 514 | } 515 | ], 516 | "source": [ 517 | "# Creating the dataset and dataloader for the neural network\n", 518 | "\n", 519 | "train_percent = 0.8\n", 520 | "train_size = int(train_percent*len(sentences))\n", 521 | "# train_dataset=df.sample(frac=train_size,random_state=200).reset_index(drop=True)\n", 522 | "# test_dataset=df.drop(train_dataset.index).reset_index(drop=True)\n", 523 | "train_sentences = sentences[0:train_size]\n", 524 | "train_labels = labels[0:train_size]\n", 525 | "\n", 526 | "test_sentences = sentences[train_size:]\n", 527 | "test_labels = labels[train_size:]\n", 528 | "\n", 529 | "print(\"FULL Dataset: {}\".format(len(sentences)))\n", 530 | "print(\"TRAIN Dataset: {}\".format(len(train_sentences)))\n", 531 | "print(\"TEST Dataset: {}\".format(len(test_sentences)))\n", 532 | "\n", 533 | "training_set = CustomDataset(tokenizer, train_sentences, train_labels, MAX_LEN)\n", 534 | "testing_set = CustomDataset(tokenizer, test_sentences, test_labels, MAX_LEN)" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": 0, 540 | "metadata": { 541 | "colab": {}, 542 | "colab_type": "code", 543 | "id": "F05bou5zZYtV" 544 | }, 545 | "outputs": [], 546 | "source": [ 547 | "train_params = {'batch_size': TRAIN_BATCH_SIZE,\n", 548 | " 'shuffle': True,\n", 549 | " 'num_workers': 0\n", 550 | " }\n", 551 | "\n", 552 | "test_params = {'batch_size': VALID_BATCH_SIZE,\n", 553 | " 'shuffle': True,\n", 554 | " 'num_workers': 0\n", 555 | " }\n", 556 | "\n", 557 | "training_loader = DataLoader(training_set, **train_params)\n", 558 | "testing_loader = DataLoader(testing_set, **test_params)" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "\n", 566 | "### Creating the Neural Network for Fine Tuning\n", 567 | "\n", 568 | "#### Neural Network\n", 569 | " - We will be creating a neural network with the `BERTClass`. \n", 570 | " - This network will have the `BertForTokenClassification` model. \n", 571 | " - The data will be fed to the `BertForTokenClassification` as defined in the dataset. \n", 572 | " - Final layer outputs is what will be used to calcuate the loss and to determine the accuracy of models prediction. \n", 573 | " - We will initiate an instance of the network called `model`. This instance will be used for training and then to save the final trained model for future inference. \n", 574 | " \n", 575 | "#### Loss Function and Optimizer\n", 576 | " - `Optimizer` is defined in the next cell.\n", 577 | " - We do not define any `Loss function` since the specified model already outputs `Loss` for a given input. 
\n", 578 | " - `Optimizer` is used to update the weights of the neural network to improve its performance.\n", 579 | " \n", 580 | "#### Further Reading\n", 581 | "- You can refer to my [Pytorch Tutorials](https://github.com/abhimishra91/pytorch-tutorials) to get an intuition of Loss Function and Optimizer.\n", 582 | "- [Pytorch Documentation for Loss Function](https://pytorch.org/docs/stable/nn.html#loss-functions)\n", 583 | "- [Pytorch Documentation for Optimizer](https://pytorch.org/docs/stable/optim.html)\n", 584 | "- Refer to the links provided on the top of the notebook to read more about `BertForTokenClassification`. " 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 0, 590 | "metadata": { 591 | "colab": {}, 592 | "colab_type": "code", 593 | "id": "9vuIJrvSZble" 594 | }, 595 | "outputs": [], 596 | "source": [ 597 | "# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. \n", 598 | "\n", 599 | "class BERTClass(torch.nn.Module):\n", 600 | " def __init__(self):\n", 601 | " super(BERTClass, self).__init__()\n", 602 | " self.l1 = transformers.BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=18)\n", 603 | " # self.l2 = torch.nn.Dropout(0.3)\n", 604 | " # self.l3 = torch.nn.Linear(768, 200)\n", 605 | " \n", 606 | " def forward(self, ids, mask, labels):\n", 607 | " output_1= self.l1(ids, mask, labels = labels)\n", 608 | " # output_2 = self.l2(output_1[0])\n", 609 | " # output = self.l3(output_2)\n", 610 | " return output_1" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 16, 616 | "metadata": { 617 | "colab": { 618 | "base_uri": "https://localhost:8080/", 619 | "height": 1000 620 | }, 621 | "colab_type": "code", 622 | "collapsed": true, 623 | "id": "CflOeT2-ZoV6", 624 | "outputId": "8045068c-9aa0-48b8-cad1-8ac7c3b1e807" 625 | }, 626 | "outputs": [ 627 | { 628 | "data": { 629 | "text/plain": [ 630 | "BertForTokenClassification(\n", 631 | " (bert): BertModel(\n", 632 | " (embeddings): BertEmbeddings(\n", 633 | " (word_embeddings): Embedding(28996, 768, padding_idx=0)\n", 634 | " (position_embeddings): Embedding(512, 768)\n", 635 | " (token_type_embeddings): Embedding(2, 768)\n", 636 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 637 | " (dropout): Dropout(p=0.1, inplace=False)\n", 638 | " )\n", 639 | " (encoder): BertEncoder(\n", 640 | " (layer): ModuleList(\n", 641 | " (0): BertLayer(\n", 642 | " (attention): BertAttention(\n", 643 | " (self): BertSelfAttention(\n", 644 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 645 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 646 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 647 | " (dropout): Dropout(p=0.1, inplace=False)\n", 648 | " )\n", 649 | " (output): BertSelfOutput(\n", 650 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 651 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 652 | " (dropout): Dropout(p=0.1, inplace=False)\n", 653 | " )\n", 654 | " )\n", 655 | " (intermediate): BertIntermediate(\n", 656 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 657 | " )\n", 658 | " (output): BertOutput(\n", 659 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 660 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 661 | " (dropout): Dropout(p=0.1, inplace=False)\n", 662 | " )\n", 663 | " 
)\n", 664 | " (1): BertLayer(\n", 665 | " (attention): BertAttention(\n", 666 | " (self): BertSelfAttention(\n", 667 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 668 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 669 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 670 | " (dropout): Dropout(p=0.1, inplace=False)\n", 671 | " )\n", 672 | " (output): BertSelfOutput(\n", 673 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 674 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 675 | " (dropout): Dropout(p=0.1, inplace=False)\n", 676 | " )\n", 677 | " )\n", 678 | " (intermediate): BertIntermediate(\n", 679 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 680 | " )\n", 681 | " (output): BertOutput(\n", 682 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 683 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 684 | " (dropout): Dropout(p=0.1, inplace=False)\n", 685 | " )\n", 686 | " )\n", 687 | " (2): BertLayer(\n", 688 | " (attention): BertAttention(\n", 689 | " (self): BertSelfAttention(\n", 690 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 691 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 692 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 693 | " (dropout): Dropout(p=0.1, inplace=False)\n", 694 | " )\n", 695 | " (output): BertSelfOutput(\n", 696 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 697 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 698 | " (dropout): Dropout(p=0.1, inplace=False)\n", 699 | " )\n", 700 | " )\n", 701 | " (intermediate): BertIntermediate(\n", 702 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 703 | " )\n", 704 | " (output): BertOutput(\n", 705 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 706 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 707 | " (dropout): Dropout(p=0.1, inplace=False)\n", 708 | " )\n", 709 | " )\n", 710 | " (3): BertLayer(\n", 711 | " (attention): BertAttention(\n", 712 | " (self): BertSelfAttention(\n", 713 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 714 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 715 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 716 | " (dropout): Dropout(p=0.1, inplace=False)\n", 717 | " )\n", 718 | " (output): BertSelfOutput(\n", 719 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 720 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 721 | " (dropout): Dropout(p=0.1, inplace=False)\n", 722 | " )\n", 723 | " )\n", 724 | " (intermediate): BertIntermediate(\n", 725 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 726 | " )\n", 727 | " (output): BertOutput(\n", 728 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 729 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 730 | " (dropout): Dropout(p=0.1, inplace=False)\n", 731 | " )\n", 732 | " )\n", 733 | " (4): BertLayer(\n", 734 | " (attention): BertAttention(\n", 735 | " (self): BertSelfAttention(\n", 736 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 737 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 738 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 739 | " 
(dropout): Dropout(p=0.1, inplace=False)\n", 740 | " )\n", 741 | " (output): BertSelfOutput(\n", 742 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 743 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 744 | " (dropout): Dropout(p=0.1, inplace=False)\n", 745 | " )\n", 746 | " )\n", 747 | " (intermediate): BertIntermediate(\n", 748 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 749 | " )\n", 750 | " (output): BertOutput(\n", 751 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 752 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 753 | " (dropout): Dropout(p=0.1, inplace=False)\n", 754 | " )\n", 755 | " )\n", 756 | " (5): BertLayer(\n", 757 | " (attention): BertAttention(\n", 758 | " (self): BertSelfAttention(\n", 759 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 760 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 761 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 762 | " (dropout): Dropout(p=0.1, inplace=False)\n", 763 | " )\n", 764 | " (output): BertSelfOutput(\n", 765 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 766 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 767 | " (dropout): Dropout(p=0.1, inplace=False)\n", 768 | " )\n", 769 | " )\n", 770 | " (intermediate): BertIntermediate(\n", 771 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 772 | " )\n", 773 | " (output): BertOutput(\n", 774 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 775 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 776 | " (dropout): Dropout(p=0.1, inplace=False)\n", 777 | " )\n", 778 | " )\n", 779 | " (6): BertLayer(\n", 780 | " (attention): BertAttention(\n", 781 | " (self): BertSelfAttention(\n", 782 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 783 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 784 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 785 | " (dropout): Dropout(p=0.1, inplace=False)\n", 786 | " )\n", 787 | " (output): BertSelfOutput(\n", 788 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 789 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 790 | " (dropout): Dropout(p=0.1, inplace=False)\n", 791 | " )\n", 792 | " )\n", 793 | " (intermediate): BertIntermediate(\n", 794 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 795 | " )\n", 796 | " (output): BertOutput(\n", 797 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 798 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 799 | " (dropout): Dropout(p=0.1, inplace=False)\n", 800 | " )\n", 801 | " )\n", 802 | " (7): BertLayer(\n", 803 | " (attention): BertAttention(\n", 804 | " (self): BertSelfAttention(\n", 805 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 806 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 807 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 808 | " (dropout): Dropout(p=0.1, inplace=False)\n", 809 | " )\n", 810 | " (output): BertSelfOutput(\n", 811 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 812 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 813 | " (dropout): Dropout(p=0.1, inplace=False)\n", 814 | " )\n", 815 | " )\n", 816 | " 
(intermediate): BertIntermediate(\n", 817 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 818 | " )\n", 819 | " (output): BertOutput(\n", 820 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 821 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 822 | " (dropout): Dropout(p=0.1, inplace=False)\n", 823 | " )\n", 824 | " )\n", 825 | " (8): BertLayer(\n", 826 | " (attention): BertAttention(\n", 827 | " (self): BertSelfAttention(\n", 828 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 829 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 830 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 831 | " (dropout): Dropout(p=0.1, inplace=False)\n", 832 | " )\n", 833 | " (output): BertSelfOutput(\n", 834 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 835 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 836 | " (dropout): Dropout(p=0.1, inplace=False)\n", 837 | " )\n", 838 | " )\n", 839 | " (intermediate): BertIntermediate(\n", 840 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 841 | " )\n", 842 | " (output): BertOutput(\n", 843 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 844 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 845 | " (dropout): Dropout(p=0.1, inplace=False)\n", 846 | " )\n", 847 | " )\n", 848 | " (9): BertLayer(\n", 849 | " (attention): BertAttention(\n", 850 | " (self): BertSelfAttention(\n", 851 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 852 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 853 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 854 | " (dropout): Dropout(p=0.1, inplace=False)\n", 855 | " )\n", 856 | " (output): BertSelfOutput(\n", 857 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 858 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 859 | " (dropout): Dropout(p=0.1, inplace=False)\n", 860 | " )\n", 861 | " )\n", 862 | " (intermediate): BertIntermediate(\n", 863 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 864 | " )\n", 865 | " (output): BertOutput(\n", 866 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 867 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 868 | " (dropout): Dropout(p=0.1, inplace=False)\n", 869 | " )\n", 870 | " )\n", 871 | " (10): BertLayer(\n", 872 | " (attention): BertAttention(\n", 873 | " (self): BertSelfAttention(\n", 874 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 875 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 876 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 877 | " (dropout): Dropout(p=0.1, inplace=False)\n", 878 | " )\n", 879 | " (output): BertSelfOutput(\n", 880 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 881 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 882 | " (dropout): Dropout(p=0.1, inplace=False)\n", 883 | " )\n", 884 | " )\n", 885 | " (intermediate): BertIntermediate(\n", 886 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 887 | " )\n", 888 | " (output): BertOutput(\n", 889 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 890 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 891 | " (dropout): 
Dropout(p=0.1, inplace=False)\n", 892 | " )\n", 893 | " )\n", 894 | " (11): BertLayer(\n", 895 | " (attention): BertAttention(\n", 896 | " (self): BertSelfAttention(\n", 897 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 898 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 899 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 900 | " (dropout): Dropout(p=0.1, inplace=False)\n", 901 | " )\n", 902 | " (output): BertSelfOutput(\n", 903 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 904 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 905 | " (dropout): Dropout(p=0.1, inplace=False)\n", 906 | " )\n", 907 | " )\n", 908 | " (intermediate): BertIntermediate(\n", 909 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 910 | " )\n", 911 | " (output): BertOutput(\n", 912 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 913 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 914 | " (dropout): Dropout(p=0.1, inplace=False)\n", 915 | " )\n", 916 | " )\n", 917 | " )\n", 918 | " )\n", 919 | " (pooler): BertPooler(\n", 920 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 921 | " (activation): Tanh()\n", 922 | " )\n", 923 | " )\n", 924 | " (dropout): Dropout(p=0.1, inplace=False)\n", 925 | " (classifier): Linear(in_features=768, out_features=18, bias=True)\n", 926 | ")" 927 | ] 928 | }, 929 | "execution_count": 16, 930 | "metadata": { 931 | "tags": [] 932 | }, 933 | "output_type": "execute_result" 934 | } 935 | ], 936 | "source": [ 937 | "model = BERTClass()\n", 938 | "model.to(dev)" 939 | ] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "execution_count": 0, 944 | "metadata": { 945 | "colab": {}, 946 | "colab_type": "code", 947 | "id": "DN_u9NC5aaa_" 948 | }, 949 | "outputs": [], 950 | "source": [ 951 | "optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)" 952 | ] 953 | }, 954 | { 955 | "cell_type": "markdown", 956 | "metadata": {}, 957 | "source": [ 958 | "\n", 959 | "### Fine Tuning the Model\n", 960 | "\n", 961 | "After all the effort of loading and preparing the data and datasets, creating the model and defining its loss and optimizer. This is probably the easier steps in the process. \n", 962 | "\n", 963 | "Here we define a training function that trains the model on the training dataset created above, specified number of times (EPOCH), An epoch defines how many times the complete data will be passed through the network. \n", 964 | "\n", 965 | "Following events happen in this function to fine tune the neural network:\n", 966 | "- The dataloader passes data to the model based on the batch size. \n", 967 | "- Subsequent output from the model and the actual category are compared to calculate the loss. \n", 968 | "- Loss value is used to optimize the weights of the neurons in the network.\n", 969 | "- After every 500 steps the loss value is printed in the console.\n", 970 | "\n", 971 | "As you can see just in 1 epoch by the final step the model was working with a miniscule loss of 0.08503091335296631 i.e. the output is extremely close to the actual output." 
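(Aside, not in the original notebook.) The model instance was described earlier as something to train "and then to save the final trained model for future inference", but the notebook never shows that save step. Once the epoch loop below has finished, a minimal sketch of persisting the fine-tuned weights could look like the following, assuming the `BERTClass` instance `model` (which wraps `BertForTokenClassification` as `model.l1`) and the `tokenizer` defined earlier; the output directory name is purely illustrative.

```python
# Hedged sketch: save the fine-tuned Hugging Face model and tokenizer after training.
# `model.l1` is the underlying BertForTokenClassification; the path is an assumption.
import os

output_dir = "./models/bert_ner"
os.makedirs(output_dir, exist_ok=True)

model.l1.save_pretrained(output_dir)   # writes the model config and weights
tokenizer.save_pretrained(output_dir)  # writes the tokenizer vocabulary files
```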
972 | ] 973 | }, 974 | { 975 | "cell_type": "code", 976 | "execution_count": 0, 977 | "metadata": { 978 | "colab": {}, 979 | "colab_type": "code", 980 | "id": "8aQ6WCk2a-Vd" 981 | }, 982 | "outputs": [], 983 | "source": [ 984 | "def train(epoch):\n", 985 | " model.train()\n", 986 | " for _,data in enumerate(training_loader, 0):\n", 987 | " ids = data['ids'].to(dev, dtype = torch.long)\n", 988 | " mask = data['mask'].to(dev, dtype = torch.long)\n", 989 | " targets = data['tags'].to(dev, dtype = torch.long)\n", 990 | "\n", 991 | " loss = model(ids, mask, labels = targets)[0]\n", 992 | "\n", 993 | " # optimizer.zero_grad()\n", 994 | " if _%500==0:\n", 995 | " print(f'Epoch: {epoch}, Loss: {loss.item()}')\n", 996 | " \n", 997 | " optimizer.zero_grad()\n", 998 | " loss.backward()\n", 999 | " xm.optimizer_step(optimizer)\n", 1000 | " xm.mark_step() " 1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "code", 1005 | "execution_count": 31, 1006 | "metadata": { 1007 | "colab": { 1008 | "base_uri": "https://localhost:8080/", 1009 | "height": 689 1010 | }, 1011 | "colab_type": "code", 1012 | "collapsed": true, 1013 | "id": "50oMGTe0bvl0", 1014 | "outputId": "292ede16-6da3-460f-d174-f7ee10c729e0" 1015 | }, 1016 | "outputs": [ 1017 | { 1018 | "name": "stdout", 1019 | "output_type": "stream", 1020 | "text": [ 1021 | "Epoch: 0, Loss: 0.21416641771793365\n", 1022 | "Epoch: 0, Loss: 0.08791390806436539\n", 1023 | "Epoch: 0, Loss: 0.1277497559785843\n", 1024 | "Epoch: 0, Loss: 0.25511449575424194\n", 1025 | "Epoch: 0, Loss: 0.11072967946529388\n", 1026 | "Epoch: 0, Loss: 0.1202322468161583\n", 1027 | "Epoch: 0, Loss: 0.16198261082172394\n", 1028 | "Epoch: 0, Loss: 0.31682807207107544\n", 1029 | "Epoch: 1, Loss: 0.09211093187332153\n", 1030 | "Epoch: 1, Loss: 0.15079179406166077\n", 1031 | "Epoch: 1, Loss: 0.1959223747253418\n", 1032 | "Epoch: 1, Loss: 0.09143798053264618\n", 1033 | "Epoch: 1, Loss: 0.29411888122558594\n", 1034 | "Epoch: 1, Loss: 0.11708520352840424\n", 1035 | "Epoch: 1, Loss: 0.11245028674602509\n", 1036 | "Epoch: 1, Loss: 0.14728033542633057\n", 1037 | "Epoch: 2, Loss: 0.1607980579137802\n", 1038 | "Epoch: 2, Loss: 0.08060580492019653\n", 1039 | "Epoch: 2, Loss: 0.14363577961921692\n", 1040 | "Epoch: 2, Loss: 0.12225533276796341\n", 1041 | "Epoch: 2, Loss: 0.10335233807563782\n", 1042 | "Epoch: 2, Loss: 0.04923604056239128\n", 1043 | "Epoch: 2, Loss: 0.09237729012966156\n", 1044 | "Epoch: 2, Loss: 0.12473192811012268\n", 1045 | "Epoch: 3, Loss: 0.09085617959499359\n", 1046 | "Epoch: 3, Loss: 0.09351193159818649\n", 1047 | "Epoch: 3, Loss: 0.06728512048721313\n", 1048 | "Epoch: 3, Loss: 0.1666068434715271\n", 1049 | "Epoch: 3, Loss: 0.19255675375461578\n", 1050 | "Epoch: 3, Loss: 0.16131675243377686\n", 1051 | "Epoch: 3, Loss: 0.15462705492973328\n", 1052 | "Epoch: 3, Loss: 0.18679684400558472\n", 1053 | "Epoch: 4, Loss: 0.11378277838230133\n", 1054 | "Epoch: 4, Loss: 0.025372153148055077\n", 1055 | "Epoch: 4, Loss: 0.08231651782989502\n", 1056 | "Epoch: 4, Loss: 0.2682102620601654\n", 1057 | "Epoch: 4, Loss: 0.05264609679579735\n", 1058 | "Epoch: 4, Loss: 0.056522976607084274\n", 1059 | "Epoch: 4, Loss: 0.15710100531578064\n", 1060 | "Epoch: 4, Loss: 0.08503091335296631\n" 1061 | ] 1062 | } 1063 | ], 1064 | "source": [ 1065 | "for epoch in range(5):\n", 1066 | " train(epoch)" 1067 | ] 1068 | }, 1069 | { 1070 | "cell_type": "markdown", 1071 | "metadata": {}, 1072 | "source": [ 1073 | "\n", 1074 | "### Validating the Model\n", 1075 | "\n", 1076 | "During the validation stage we pass the unseen 
data(Testing Dataset) to the model. This step determines how good the model performs on the unseen data. \n", 1077 | "\n", 1078 | "This unseen data is the 30% of `ner.csv` which was seperated during the Dataset creation stage. \n", 1079 | "During the validation stage the weights of the model are not updated. Only the final output is compared to the actual value. This comparison is then used to calcuate the accuracy of the model. \n", 1080 | "\n", 1081 | "The metric used for measuring the performance of model for these problem statements is called F1 score. We will create a helper function for helping us with f1 score calcuation and also import a library for the same. `seqeval`" 1082 | ] 1083 | }, 1084 | { 1085 | "cell_type": "code", 1086 | "execution_count": 27, 1087 | "metadata": { 1088 | "colab": { 1089 | "base_uri": "https://localhost:8080/", 1090 | "height": 34 1091 | }, 1092 | "colab_type": "code", 1093 | "id": "6OckC0XNkWWm", 1094 | "outputId": "d27682ee-c34f-4811-d791-0c922afa8b05" 1095 | }, 1096 | "outputs": [ 1097 | { 1098 | "name": "stdout", 1099 | "output_type": "stream", 1100 | "text": [ 1101 | " Building wheel for seqeval (setup.py) ... \u001b[?25l\u001b[?25hdone\n" 1102 | ] 1103 | } 1104 | ], 1105 | "source": [ 1106 | "from seqeval.metrics import f1_score\n", 1107 | "\n", 1108 | "def flat_accuracy(preds, labels):\n", 1109 | " flat_preds = np.argmax(preds, axis=2).flatten()\n", 1110 | " flat_labels = labels.flatten()\n", 1111 | " return np.sum(flat_preds == flat_labels)/len(flat_labels)" 1112 | ] 1113 | }, 1114 | { 1115 | "cell_type": "code", 1116 | "execution_count": 0, 1117 | "metadata": { 1118 | "colab": {}, 1119 | "colab_type": "code", 1120 | "id": "9zwFzzBriLMO" 1121 | }, 1122 | "outputs": [], 1123 | "source": [ 1124 | "def valid(model, testing_loader):\n", 1125 | " model.eval()\n", 1126 | " eval_loss = 0; eval_accuracy = 0\n", 1127 | " n_correct = 0; n_wrong = 0; total = 0\n", 1128 | " predictions , true_labels = [], []\n", 1129 | " nb_eval_steps, nb_eval_examples = 0, 0\n", 1130 | " with torch.no_grad():\n", 1131 | " for _, data in enumerate(testing_loader, 0):\n", 1132 | " ids = data['ids'].to(dev, dtype = torch.long)\n", 1133 | " mask = data['mask'].to(dev, dtype = torch.long)\n", 1134 | " targets = data['tags'].to(dev, dtype = torch.long)\n", 1135 | "\n", 1136 | " output = model(ids, mask, labels=targets)\n", 1137 | " loss, logits = output[:2]\n", 1138 | " logits = logits.detach().cpu().numpy()\n", 1139 | " label_ids = targets.to('cpu').numpy()\n", 1140 | " predictions.extend([list(p) for p in np.argmax(logits, axis=2)])\n", 1141 | " true_labels.append(label_ids)\n", 1142 | " accuracy = flat_accuracy(logits, label_ids)\n", 1143 | " eval_loss += loss.mean().item()\n", 1144 | " eval_accuracy += accuracy\n", 1145 | " nb_eval_examples += ids.size(0)\n", 1146 | " nb_eval_steps += 1\n", 1147 | " eval_loss = eval_loss/nb_eval_steps\n", 1148 | " print(\"Validation loss: {}\".format(eval_loss))\n", 1149 | " print(\"Validation Accuracy: {}\".format(eval_accuracy/nb_eval_steps))\n", 1150 | " pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]\n", 1151 | " valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]\n", 1152 | " print(\"F1-Score: {}\".format(f1_score(pred_tags, valid_tags)))" 1153 | ] 1154 | }, 1155 | { 1156 | "cell_type": "code", 1157 | "execution_count": null, 1158 | "metadata": {}, 1159 | "outputs": [], 1160 | "source": [ 1161 | "# To get the results on the validation set. 
This data is not seen by the model\n", 1162 | "\n", 1163 | "valid(model, testing_loader)" 1164 | ] 1165 | } 1166 | ], 1167 | "metadata": { 1168 | "accelerator": "TPU", 1169 | "colab": { 1170 | "collapsed_sections": [], 1171 | "name": "transformers_ner.ipynb", 1172 | "provenance": [] 1173 | }, 1174 | "kernelspec": { 1175 | "display_name": "Python 3", 1176 | "language": "python", 1177 | "name": "python3" 1178 | }, 1179 | "language_info": { 1180 | "codemirror_mode": { 1181 | "name": "ipython", 1182 | "version": 3 1183 | }, 1184 | "file_extension": ".py", 1185 | "mimetype": "text/x-python", 1186 | "name": "python", 1187 | "nbconvert_exporter": "python", 1188 | "pygments_lexer": "ipython3", 1189 | "version": "3.7.6" 1190 | } 1191 | }, 1192 | "nbformat": 4, 1193 | "nbformat_minor": 1 1194 | } -------------------------------------------------------------------------------- /transformers_sentiment_wandb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab": {}, 7 | "colab_type": "code", 8 | "id": "UNiYBVL25rQK" 9 | }, 10 | "source": [ 11 | "# Fine Tuning Transformer for Sentiment Classification" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "### Introduction\n", 19 | "\n", 20 | "In this tutorial we will be fine tuning a transformer model for the **Sentiment classification** problem. **Sentiment classification** is a special case of **Multiclass Classification**. In this case the classes represent the sentiment represented by the text.\n", 21 | "The number of classes are generally lesser than a standard multiclass classification proboem where the classes represent the polarity, in form of `postive`, `negative` and in some cases and additional `neutral` polarity.\n", 22 | "\n", 23 | "This is one of the most common business problems when trying to ascertain the sentiment of a statement made by your customer in a business setup.\n", 24 | "\n", 25 | "#### Flow of the notebook\n", 26 | "\n", 27 | "* As with all the tutorials previously, this notebook also follows a easy to follow steps. Making the process of fine tuning and training a Transformers model a straight forward task.\n", 28 | "* However, unlike the other notebooks, in the tutorial, most of the sections have been created into functions, and they are called from the `main()` in the end of the notebook. \n", 29 | "* This is done to leverage the [Weights and Biases Service](https://www.wandb.com/) WandB in short.\n", 30 | "* It is a experiment tracking, parameter optimization and artifact management service. That can be very easily integrated to any of the Deep learning or Machine learning frameworks. \n", 31 | "\n", 32 | "The notebook will be divided into separate sections to provide a organized walk through for the process used. This process can be modified for individual use cases. The sections are:\n", 33 | "\n", 34 | "1. [Preparing Environment and Importing Libraries](#section01)\n", 35 | "2. [Pre-Processing and Preparing the Dataset for data processing: Class](#section02)\n", 36 | "3. [Defining a Model/Network](#section07)\n", 37 | "4. [Fine Tuning the Model: Function](#section03)\n", 38 | "5. [Validating the Model Performance: Function](#section04)\n", 39 | "6. 
[Main Function](#section05)\n", 40 | " * [Initializing WandB](#section501)\n", 41 | " * [Importing and Pre-Processing the domain data](#section502)\n", 42 | " * [Creation of Dataset and Dataloader](#section503)\n", 43 | " * [Neural Network and Optimizer](#section504)\n", 44 | " * [Training Model and Logging to WandB](#section505)\n", 45 | " * [Validation and generation of Summary](#section506)\n", 46 | "\n", 47 | "\n", 48 | "#### Technical Details\n", 49 | "\n", 50 | "This script leverages on multiple tools designed by other teams. Details of the tools used below. Please ensure that these elements are present in your setup to successfully implement this script.\n", 51 | "\n", 52 | "- **Data**:\n", 53 | "\t- We are using the **IMDB Dataset** available at [Kaggle](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)\n", 54 | "\t- This dataset is a collection of moview reviews obtained from IMDB website, the reviews are labled with a positive or negative sentiment. \n", 55 | "\t- There are approx. `50000` rows of data. Where each row has the following data-point:\n", 56 | "\t\t- **review** : Review of a movie\n", 57 | "\t\t- **sentiment** : positive or negative\n", 58 | "\n", 59 | "\n", 60 | "- **Language Model Used**: \n", 61 | " - This notebook uses ***RoBERTa*** as its base transformer model. [Research Paper](https://arxiv.org/abs/1907.11692) \n", 62 | " - ***RoBERTa*** was an incremental improvement in the ***BERT*** architecture with multiple tweaks in different domains.\n", 63 | " - Some of the changes in RoBERTa were: Bigger training data, Dymanic Masking, Different Self Supervised training objective.\n", 64 | " - You can have a detailed read of these changes at the following [link](https://medium.com/towards-artificial-intelligence/a-robustly-optimized-bert-pretraining-approach-f6b6e537e6a6). \n", 65 | " - We will be leveraging on the ***RoBERTa*** implementation from the HuggingFace team. \n", 66 | " - [Documentation for python](https://huggingface.co/transformers/model_doc/t5.html)\n", 67 | "\n", 68 | "\n", 69 | "- **Hardware Requirements**: \n", 70 | "\t- Python 3.6 and above\n", 71 | "\t- Pytorch, Transformers and\n", 72 | "\t- All the stock Python ML Library\n", 73 | "\t- GPU/TPU enabled setup \n", 74 | " \n", 75 | "\n", 76 | "- **Script Objective**:\n", 77 | "\t- The objective of this script is to fine tune ***RoBERTa*** to be able to classify wether the sentiment of a given text is positive or negative.\n", 78 | "\n", 79 | "---\n", 80 | "NOTE: \n", 81 | "We are using the Weights and Biases Tool-set in this tutorial. The different components will be explained as we go through the article. This is an incremental work done in the summarization notebook.\n", 82 | "\n", 83 | "[Link](https://app.wandb.ai/abhimishra-91/transformers_tutorials_sentiment?workspace=user-abhimishra-91) to the Project on WandB" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "\n", 91 | "### Preparing Environment and Importing Libraries\n", 92 | "\n", 93 | "At this step we will be installing the necessary libraries followed by importing the libraries and modules needed to run our script. 
\n", 94 | "We will be installing:\n", 95 | "* transformers\n", 96 | "* wandb\n", 97 | "* packages to support tpu for pytorch\n", 98 | "\n", 99 | "Libraries imported are:\n", 100 | "* Pandas\n", 101 | "* Pytorch\n", 102 | "* Pytorch Utils for Dataset and Dataloader\n", 103 | "* Transformers\n", 104 | "* Roberta Model and Tokenizer\n", 105 | "* wandb\n", 106 | "\n", 107 | "Followed by that we will preapre the device to support TPU execution for training.\n", 108 | "\n", 109 | "Finally, we will be logging into the [wandb](https://www.wandb.com/) serice using the login command" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "colab": { 117 | "base_uri": "https://localhost:8080/", 118 | "height": 844 119 | }, 120 | "colab_type": "code", 121 | "id": "4k86Li6zL9Jz", 122 | "outputId": "6fb4449b-12cc-4917-84f0-6e17ff56b9e7" 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "# Installing NLP-Transformers library\n", 127 | "!pip install -q transformers\n", 128 | "\n", 129 | "# Installing wandb library for experiment tracking and hyper parameter optimization\n", 130 | "!pip install -q wandb\n", 131 | "\n", 132 | "# Code for TPU packages install\n", 133 | "!curl -q https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py\n", 134 | "!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 3, 140 | "metadata": { 141 | "colab": {}, 142 | "colab_type": "code", 143 | "id": "hCOWCYgQvgkP" 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "# Importing stock libraries\n", 148 | "import numpy as np\n", 149 | "import pandas as pd\n", 150 | "import torch\n", 151 | "import torch.nn.functional as F\n", 152 | "from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler\n", 153 | "\n", 154 | "# Importing lackages from our NLP-Hugging Package\n", 155 | "from transformers import RobertaConfig, RobertaModel, RobertaTokenizerFast, RobertaForSequenceClassification\n", 156 | "\n", 157 | "# Importing wand for logging and hyper-parameter tuning\n", 158 | "import wandb" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 4, 164 | "metadata": { 165 | "colab": {}, 166 | "colab_type": "code", 167 | "id": "i7NMSGVKDwue" 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "# Setting up the accelerators\n", 172 | "\n", 173 | "# # GPU\n", 174 | "# from torch import cuda\n", 175 | "# device = 'cuda' if cuda.is_available() else 'cpu'\n", 176 | "\n", 177 | "# TPU\n", 178 | "import torch_xla\n", 179 | "import torch_xla.core.xla_model as xm\n", 180 | "device = xm.xla_device()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 5, 186 | "metadata": { 187 | "colab": { 188 | "base_uri": "https://localhost:8080/", 189 | "height": 87 190 | }, 191 | "colab_type": "code", 192 | "id": "X9L1AxSQDugC", 193 | "outputId": "76ceca60-14c8-4baf-b410-ec2c5c179bc9" 194 | }, 195 | "outputs": [ 196 | { 197 | "name": "stdout", 198 | "output_type": "stream", 199 | "text": [ 200 | "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://app.wandb.ai/authorize\n", 201 | "\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter: \n", 202 | "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n", 203 | "\u001b[32mSuccessfully logged in to Weights & Biases!\u001b[0m\n" 
204 | ] 205 | } 206 | ], 207 | "source": [ 208 | "# login to wandb\n", 209 | "!wandb login" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "\n", 217 | "### Pre-Processing and Preparing the Dataset for data processing: Class\n", 218 | "\n", 219 | "* We will start with creation of Preprocess class - This defines how the text is pre-processed before working on the tokenization, dataset and dataloader aspects of the workflow. In this class the dataframe is loaded and then the `sentiment` column is used to create a new column in the dataframe called `encoded_polarity` such that if:\n", 220 | " * `sentiment = positive` then `encoded_polarity = 0`\n", 221 | " * `sentiment = negative` then `encoded_polarity = 1`\n", 222 | "\n", 223 | "* Followed by this, the `sentiment` column is removed from the dataframe.\n", 224 | "* The `dataframe` and `encoded_polarity` dictionary are returned. \n", 225 | "* This method is called in the `run()` function.\n", 226 | "\n", 227 | "* After this we will work on the Dataset class - This defines how the text is pre-processed before sending it to the neural network. This dataset will be used the the Dataloader method that will feed the data in batches to the neural network for suitable training and processing. \n", 228 | "* The Dataloader and Dataset will be used inside the `run()`.\n", 229 | "* Dataset and Dataloader are constructs of the PyTorch library for defining and controlling the data pre-processing and its passage to neural network. For further reading into Dataset and Dataloader read the [docs at PyTorch](https://pytorch.org/docs/stable/data.html)\n", 230 | "\n", 231 | "#### *CustomDataset* Dataset Class\n", 232 | "- This class is defined to accept the Dataframe as input and generate tokenized output that is used by the Roberta model for training. \n", 233 | "- We are using the Roberta tokenizer to tokenize the data in the `review` column of the dataframe. \n", 234 | "- The tokenizer uses the `encode_plus` method to perform tokenization and generate the necessary outputs, namely: `ids`, `attention_mask`\n", 235 | "- To read further into the tokenizer, [refer to this document](https://huggingface.co/transformers/model_doc/roberta.html#robertatokenizer)\n", 236 | "- `encoded_polarity` transformed into the `targets` tensor. \n", 237 | "- The *CustomDataset* class is used to create 2 datasets, for training and for validation.\n", 238 | "- *Training Dataset* is used to fine tune the model: **70% of the original data**\n", 239 | "- *Validation Dataset* is used to evaluate the performance of the model. The model has not seen this data during training. \n", 240 | "\n", 241 | "#### return_dataloader: Called inside the `run()`\n", 242 | "- `return_dataloader` function is used to for creating training and validation dataloader that load data to the neural network in a defined manner. This is needed because all the data from the dataset cannot be loaded to the memory at once, hence the amount of data loaded to the memory and then passed to the neural network needs to be controlled.\n", 243 | "- Internally the `return_dataloader` function calls the pytorch Dataloader class and the CustomDataset class to create the dataloaders for training and validation. 
\n", 244 | "- This control is achieved using the parameters such as `batch_size` and `max_len`.\n", 245 | "- Training and Validation dataloaders are used in the training and validation part of the flow respectively" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 6, 251 | "metadata": { 252 | "colab": {}, 253 | "colab_type": "code", 254 | "id": "yDe5ntrM9Kud" 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "# Preprocess class defines how the dataframe will be processed to generate and removal of features based on thier usage.\n", 259 | "# A new encoded_polarity column is added that adds 0 and 1 to the column based on the positive and negative\n", 260 | "# The processing method will return both the dictionary, and the updated dataframe for further usage.\n", 261 | "\n", 262 | "class Preprocess:\n", 263 | " def __init__(self, df):\n", 264 | " \"\"\"\n", 265 | " Constructor for the class\n", 266 | " :param df: Input Dataframe to be pre-processed\n", 267 | " \"\"\"\n", 268 | " self.df = df\n", 269 | " self.encoded_dict = dict()\n", 270 | "\n", 271 | " def encoding(self, x):\n", 272 | " if x not in self.encoded_dict.keys():\n", 273 | " self.encoded_dict[x] = len(self.encoded_dict)\n", 274 | " return self.encoded_dict[x]\n", 275 | "\n", 276 | " def processing(self):\n", 277 | " self.df['encoded_polarity'] = self.df['sentiment'].apply(lambda x: self.encoding(x))\n", 278 | " self.df.drop(['sentiment'], axis=1, inplace=True)\n", 279 | " return self.encoded_dict, self.df" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 7, 285 | "metadata": { 286 | "colab": {}, 287 | "colab_type": "code", 288 | "id": "l00E-IDDdg0s" 289 | }, 290 | "outputs": [], 291 | "source": [ 292 | "# Creating a CustomDataset class that is used to read the updated dataframe and tokenize the text. \n", 293 | "# The class is used in the return_dataloader function\n", 294 | "\n", 295 | "class CustomDataset(Dataset):\n", 296 | " def __init__(self, dataframe, tokenizer, max_len):\n", 297 | " self.len = len(dataframe)\n", 298 | " self.data = dataframe\n", 299 | " self.tokenizer = tokenizer\n", 300 | " self.max_len = max_len\n", 301 | " \n", 302 | " def __getitem__(self, index):\n", 303 | " text = str(self.data.review[index])\n", 304 | " text = \" \".join(text.split())\n", 305 | " inputs = self.tokenizer.encode_plus(\n", 306 | " text,\n", 307 | " None,\n", 308 | " add_special_tokens=True,\n", 309 | " max_length=self.max_len,\n", 310 | " pad_to_max_length=True,\n", 311 | " return_token_type_ids=True\n", 312 | " )\n", 313 | " ids = inputs['input_ids']\n", 314 | " mask = inputs['attention_mask']\n", 315 | "\n", 316 | " return {\n", 317 | " 'ids': torch.tensor(ids, dtype=torch.long),\n", 318 | " 'mask': torch.tensor(mask, dtype=torch.long),\n", 319 | " 'targets': torch.tensor(self.data.encoded_polarity[index], dtype=torch.float)\n", 320 | " } \n", 321 | " \n", 322 | " def __len__(self):\n", 323 | " return self.len" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 8, 329 | "metadata": { 330 | "colab": {}, 331 | "colab_type": "code", 332 | "id": "Oj5MC7xPhMoq" 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "# Creating a function that returns the dataloader based on the dataframe and the specified train and validation batch size. 
\n", 337 | "\n", 338 | "def return_dataloader(df, tokenizer, train_batch_size, validation_batch_size, MAX_LEN, train_size=0.7):\n", 339 | " train_size = 0.7\n", 340 | " train_dataset=df.sample(frac=train_size,random_state=200)\n", 341 | " val_dataset=df.drop(train_dataset.index).reset_index(drop=True)\n", 342 | " train_dataset = train_dataset.reset_index(drop=True)\n", 343 | "\n", 344 | " print(\"FULL Dataset: {}\".format(df.shape))\n", 345 | " print(\"TRAIN Dataset: {}\".format(train_dataset.shape))\n", 346 | " print(\"VAL Dataset: {}\".format(val_dataset.shape))\n", 347 | "\n", 348 | " training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)\n", 349 | " validation_set = CustomDataset(val_dataset, tokenizer, MAX_LEN)\n", 350 | "\n", 351 | " train_params = {'batch_size': train_batch_size,\n", 352 | " 'shuffle': True,\n", 353 | " 'num_workers': 1\n", 354 | " }\n", 355 | "\n", 356 | " val_params = {'batch_size': validation_batch_size,\n", 357 | " 'shuffle': True,\n", 358 | " 'num_workers': 1\n", 359 | " }\n", 360 | "\n", 361 | " training_loader = DataLoader(training_set, **train_params)\n", 362 | " validation_loader = DataLoader(validation_set, **val_params)\n", 363 | " \n", 364 | " return training_loader, validation_loader" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "\n", 372 | "### Defining a Model/Network\n", 373 | "\n", 374 | "#### Neural Network\n", 375 | " - We will be creating a neural network with the `ModelClass`. \n", 376 | " - This network will have the Roberta Language model and a few by a `dropout` and `Linear` layer to obtain the final outputs. \n", 377 | " - The data will be fed to the Roberta Language model as defined in the dataset. \n", 378 | " - Final layer outputs is what will be compared to the `encoded_polarity` to determine the accuracy of models prediction. \n", 379 | " - We will initiate an instance of the network called `model`. This instance will be used for training and then to save the final trained model for future inference. \n", 380 | " - The `return_model` function is used in the `run()` to instantiate the model and set it up for TPU execution." 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 9, 386 | "metadata": { 387 | "colab": {}, 388 | "colab_type": "code", 389 | "id": "MxaCC_XGh0ev" 390 | }, 391 | "outputs": [], 392 | "source": [ 393 | "# Creating the customized model, by adding a drop out and a dense layer on top of roberta to get the final output for the model. 
\n", 394 | "\n", 395 | "class ModelClass(torch.nn.Module):\n", 396 | " def __init__(self):\n", 397 | " super(ModelClass, self).__init__()\n", 398 | " self.model_layer = RobertaModel.from_pretrained(\"roberta-base\")\n", 399 | " self.pre_classifier = torch.nn.Linear(768, 768)\n", 400 | " self.dropout = torch.nn.Dropout(0.3)\n", 401 | " self.classifier = torch.nn.Linear(768, 2)\n", 402 | "\n", 403 | " def forward(self, input_ids, attention_mask):\n", 404 | " output_1 = self.model_layer(input_ids=input_ids, attention_mask=attention_mask)\n", 405 | " hidden_state = output_1[0]\n", 406 | " pooler = hidden_state[:, 0]\n", 407 | " pooler = self.pre_classifier(pooler)\n", 408 | " pooler = torch.nn.ReLU()(pooler)\n", 409 | " pooler = self.dropout(pooler)\n", 410 | " output = self.classifier(pooler)\n", 411 | " return output\n" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 10, 417 | "metadata": { 418 | "colab": {}, 419 | "colab_type": "code", 420 | "id": "tr7ueXG-B3Vz" 421 | }, 422 | "outputs": [], 423 | "source": [ 424 | "# Function to return model based on the defination of Model Class\n", 425 | "\n", 426 | "def return_model(device):\n", 427 | " model = ModelClass()\n", 428 | " model = model.to(device)\n", 429 | " return model" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 11, 435 | "metadata": { 436 | "colab": {}, 437 | "colab_type": "code", 438 | "id": "rbY5DHfbDSuZ" 439 | }, 440 | "outputs": [], 441 | "source": [ 442 | "# Function to calcuate the accuracy of the model\n", 443 | "\n", 444 | "def calcuate_accu(big_idx, targets):\n", 445 | " n_correct = (big_idx==targets).sum().item()\n", 446 | " return n_correct" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "\n", 454 | "### Fine Tuning the Model: Function\n", 455 | "\n", 456 | "Here we define a training function that trains the model on the training dataset created above, specified number of times (EPOCH), An epoch defines how many times the complete data will be passed through the network. \n", 457 | "\n", 458 | "This function is called in the `run()`\n", 459 | "\n", 460 | "Following events happen in this function to fine tune the neural network:\n", 461 | "- The `epoch`, `model`, `device` details, `testing_ dataloader`, `optimizer` and `loss_function` are passed to the `train ()` when its called from the `run()`\n", 462 | "- The dataloader passes data to the model based on the batch size.\n", 463 | "- The output from the neural network: `outputs` is compared to the `targets` tensor and loss is calcuated using `loss_function()`\n", 464 | "- Loss value is used to optimize the weights of the neurons in the network.\n", 465 | "- After every 100 steps the loss value and accuracy is logged in the wandb service. This log is then used to generate graphs for analysis. Such as [these](https://app.wandb.ai/abhimishra-91/transformers_tutorials_sentiment?workspace=user-abhimishra-91)\n", 466 | "- After every epoch the loss and accuracy value is printed in the console. Also, logged into the wandb service." 
467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 12, 472 | "metadata": { 473 | "colab": {}, 474 | "colab_type": "code", 475 | "id": "7GE89LJnkid7" 476 | }, 477 | "outputs": [], 478 | "source": [ 479 | "# Function to fine tune the model based on the epochs, model, tokenizer and other arguments\n", 480 | "\n", 481 | "def train(epoch, model, device, training_loader, optimizer, loss_function):\n", 482 | " n_correct = 0\n", 483 | " nb_tr_examples, nb_tr_steps = 0, 0\n", 484 | " tr_loss = 0\n", 485 | " model.train()\n", 486 | " for _,data in enumerate(training_loader, 0):\n", 487 | " ids = data['ids'].to(device, dtype = torch.long)\n", 488 | " mask = data['mask'].to(device, dtype = torch.long)\n", 489 | " targets = data['targets'].to(device, dtype = torch.long)\n", 490 | "\n", 491 | " outputs = model(ids, mask).squeeze()\n", 492 | " optimizer.zero_grad()\n", 493 | " loss = loss_function(outputs, targets)\n", 494 | " tr_loss += loss.item()\n", 495 | " big_val, big_idx = torch.max(outputs.data, dim=1)\n", 496 | " n_correct += calcuate_accu(big_idx, targets)\n", 497 | "\n", 498 | " nb_tr_steps += 1\n", 499 | " nb_tr_examples+=targets.size(0)\n", 500 | " \n", 501 | " if _%100==0:\n", 502 | " loss_step = tr_loss/nb_tr_steps\n", 503 | " accu_step = (n_correct*100)/nb_tr_examples \n", 504 | " wandb.log({\"Training Loss per 100 steps\": loss_step})\n", 505 | " wandb.log({\"Training Accuracy per 100 steps\": accu_step})\n", 506 | "\n", 507 | " optimizer.zero_grad()\n", 508 | " loss.backward()\n", 509 | " \n", 510 | " # # When using GPU or GPU\n", 511 | " # optimizer.step()\n", 512 | " \n", 513 | " # When using TPU\n", 514 | " xm.optimizer_step(optimizer)\n", 515 | " xm.mark_step()\n", 516 | "\n", 517 | " print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')\n", 518 | " epoch_loss = tr_loss/nb_tr_steps\n", 519 | " epoch_accu = (n_correct*100)/nb_tr_examples\n", 520 | " wandb.log({\"Training Loss Epoch\": epoch_loss})\n", 521 | " wandb.log({\"Training Accuracy Epoch\": epoch_accu})" 522 | ] 523 | }, 524 | { 525 | "cell_type": "markdown", 526 | "metadata": {}, 527 | "source": [ 528 | "\n", 529 | "### Validating the Model Performance: Function\n", 530 | "\n", 531 | "During the validation stage we pass the unseen data(Validation Dataset), trained model, and device details to the function to perform the validation run. This step generates new encoded_sentiment value for dataset that it has not seen during the training session. \n", 532 | "\n", 533 | "This is then compared to the actual encoded_sentiment, to give us the Validation Accuracy and Loss.\n", 534 | "\n", 535 | "This function is called in the `run()`\n", 536 | "\n", 537 | "This unseen data is the 30% of `IMBD Dataset` which was seperated during the Dataset creation stage. \n", 538 | "During the validation stage the weights of the model are not updated. We use the generate method for generating new text for the summary. \n", 539 | "\n", 540 | "The generated validation accuracy and loss are logged to wandb for every 100th step and per epoch. " 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 13, 546 | "metadata": { 547 | "colab": {}, 548 | "colab_type": "code", 549 | "id": "2_1Hw4k-0M4r" 550 | }, 551 | "outputs": [], 552 | "source": [ 553 | "# Function to run the validation dataloader to validate the performance of the fine tuned model. 
\n", 554 | "\n", 555 | "def valid(epoch, model, device, validation_loader, loss_function):\n", 556 | " n_correct = 0; total = 0\n", 557 | " nb_tr_examples, nb_tr_steps = 0, 0\n", 558 | " tr_loss = 0\n", 559 | " model.eval()\n", 560 | " with torch.no_grad():\n", 561 | " for _,data in enumerate(validation_loader, 0):\n", 562 | " ids = data['ids'].to(device, dtype = torch.long)\n", 563 | " mask = data['mask'].to(device, dtype = torch.long)\n", 564 | " targets = data['targets'].to(device, dtype = torch.long)\n", 565 | "\n", 566 | " outputs = model(ids, mask).squeeze()\n", 567 | " loss = loss_function(outputs, targets)\n", 568 | " tr_loss += loss.item()\n", 569 | " big_val, big_idx = torch.max(outputs.data, dim=1)\n", 570 | " n_correct += calcuate_accu(big_idx, targets)\n", 571 | "\n", 572 | " nb_tr_steps += 1\n", 573 | " nb_tr_examples+=targets.size(0)\n", 574 | " \n", 575 | " if _%100==0:\n", 576 | " loss_step = tr_loss/nb_tr_steps\n", 577 | " accu_step = (n_correct*100)/nb_tr_examples \n", 578 | " wandb.log({\"Validation Loss per 100 steps\": loss_step})\n", 579 | " wandb.log({\"Validation Accuracy per 100 steps\": accu_step})\n", 580 | " \n", 581 | " epoch_loss = tr_loss/nb_tr_steps\n", 582 | " epoch_accu = (n_correct*100)/nb_tr_examples\n", 583 | " wandb.log({\"Validation Loss Epoch\": epoch_loss})\n", 584 | " wandb.log({\"Validation Accuracy Epoch\": epoch_accu})\n", 585 | " print(f'The Validation Accuracy: {(n_correct*100)/nb_tr_examples}')" 586 | ] 587 | }, 588 | { 589 | "cell_type": "markdown", 590 | "metadata": {}, 591 | "source": [ 592 | "\n", 593 | "### Run Function\n", 594 | "\n", 595 | "The `run()` as the name suggests is the central location to run all the functions/flows created above in the notebook. The following steps are executed in the `run()`:\n", 596 | "\n", 597 | "\n", 598 | "\n", 599 | "#### Initializing WandB \n", 600 | "\n", 601 | "* The `run()` begins with initializing WandB run under a specific project. This command initiates a new run for each execution of this command. \n", 602 | "\n", 603 | "* We have seend wandb in action in one of the previous notebooks. Leveraging this notebook to log some additional metrics. \n", 604 | "\n", 605 | "* This particular tutorial is logged in the project: **[transformers_tutorials_sentiment](https://app.wandb.ai/abhimishra-91/transformers_tutorials_sentiment?workspace=user-abhimishra-91)**\n", 606 | "\n", 607 | "**One of the dadshboard from the project**\n", 608 | "![](meta/wandb-sentiment.jpg)\n", 609 | "\n", 610 | "* Visit the project page to see the details of different runs and what information is logged by the service. \n", 611 | "\n", 612 | "* Following the initialization of the WandB service we define configuration parameters that will be used across the tutorial such as `batch_size`, `epoch`, `learning_rate` etc.\n", 613 | "\n", 614 | "* These parameters are also passed to the WandB config. The config construct with all the parameters can be optimized using the Sweep service from WandB. Currently, that is outof scope of this tutorial. \n", 615 | "\n", 616 | "\n", 617 | "\n", 618 | "#### Importing and Pre-Processing the domain data\n", 619 | "\n", 620 | "We will be working with the data and preparing it for fine tuning purposes. 
\n", 621 | "*Assuming that the `IMDB Dataset.csv` is already downloaded in your `data` folder*\n", 622 | "\n", 623 | "* The file is imported as a dataframe and give it the headers as per the documentation.\n", 624 | "* Cleaning the file to remove the unwanted columns.\n", 625 | "* All these steps are done using the `Preprocess Class` defined above\n", 626 | "* The final Dataframe will be something like this:\n", 627 | "\n", 628 | "|review|encoded_polarity|\n", 629 | "|--|--|\n", 630 | "|summary-1|0|\n", 631 | "|summary-2|1|\n", 632 | "|summary-3|1|\n", 633 | "\n", 634 | "\n", 635 | "\n", 636 | "#### Creation of Dataset and Dataloader\n", 637 | "\n", 638 | "* The updated dataframe is divided into 70-20 ratio for test and validation. \n", 639 | "* Both the data-frames are passed to the `CustomerDataset` class for tokenization of the review and its sentiment.\n", 640 | "* The tokenization is done using roberta tokenizer.\n", 641 | "* Train and Validation parameters are defined and passed to the `pytorch Dataloader contstruct` to create `train` and `validation` data loaders.\n", 642 | "* These dataloaders will be passed to `train()` and `validate()` respectively for training and validation action.\n", 643 | "* The shape of datasets is printed in the console.\n", 644 | "* All these actions are performed using the `return_dataloader()` and `CustomDataset class` defined above.\n", 645 | "\n", 646 | "\n", 647 | "\n", 648 | "#### Neural Network and Optimizer\n", 649 | "\n", 650 | "* In this stage we define the model and optimizer that will be used for training and to update the weights of the network. \n", 651 | "* We are using the `roberta-base-uncased` transformer model for our project. You can read about the `RoBERTa model` and its features above. \n", 652 | "* The model is returned and instiated using the `return_model()` and `ModelClass`.\n", 653 | "* We are using the `Adam` optimizer for our project. This has been a standard for all our tutorials and is something that can be changed updated to see how different optimizer perform with different learning rates. \n", 654 | "* There is also a scope for doing more with Optimizer such a decay, momentum to dynamically update the Learning rate and other parameters. All those concepts have been kept out of scope for these tutorials. \n", 655 | "\n", 656 | "\n", 657 | "\n", 658 | "#### Training Model and Logging to WandB\n", 659 | "\n", 660 | "* Followed by that we call the `train()` with all the necessary parameters.\n", 661 | "* Loss and accuracy at every 100th step is logged to the WandB service. \n", 662 | "* Accuracy and end of every epoch is logged in WandB and also printed in the console.\n", 663 | "\n", 664 | "\n", 665 | "\n", 666 | "#### Validation\n", 667 | "\n", 668 | "* After the training is completed, the validation step is initiated.\n", 669 | "* As defined in the validation function, the model weights are not updated. We use the fine tuned model to generate encoded sentiment.\n", 670 | "* An output is printed on the console giving the accuracy at the end of Validation. 
" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": 14, 676 | "metadata": { 677 | "colab": {}, 678 | "colab_type": "code", 679 | "id": "zgZVNTmu8w0J" 680 | }, 681 | "outputs": [], 682 | "source": [ 683 | "def run():\n", 684 | " \n", 685 | " # WandB – Initialize a new run\n", 686 | " wandb.init(project=\"transformers_tutorials_sentiment\")\n", 687 | " \n", 688 | " # Defining some key variables that will be used later on in the training\n", 689 | " config = wandb.config \n", 690 | " config.MAX_LEN = 512\n", 691 | " config.TRAIN_BATCH_SIZE = 4\n", 692 | " config.VALID_BATCH_SIZE = 2\n", 693 | " config.EPOCHS = 2\n", 694 | " config.LEARNING_RATE = 1e-05\n", 695 | " tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')\n", 696 | "\n", 697 | " # Reading the dataset and pre-processing it for usage\n", 698 | " df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/IMDB Dataset.csv', encoding='latin-1')\n", 699 | " pre = Preprocess(df)\n", 700 | " encoding_dict, df = pre.processing()\n", 701 | "\n", 702 | " # Creating the training and validation dataloader using the functions defined above\n", 703 | " training_loader, validation_loader = return_dataloader(df, tokenizer, config.TRAIN_BATCH_SIZE, config.VALID_BATCH_SIZE, config.MAX_LEN)\n", 704 | "\n", 705 | " # Defining the model based on the function and ModelClass defined above\n", 706 | " model = return_model(device)\n", 707 | "\n", 708 | " # Creating the loss function and optimizer\n", 709 | " loss_function = torch.nn.CrossEntropyLoss()\n", 710 | " optimizer = torch.optim.Adam(params = model.parameters(), lr=config.LEARNING_RATE)\n", 711 | "\n", 712 | " # Fine tuning the model using the train function:\n", 713 | " for epoch in range(config.EPOCHS):\n", 714 | " train(epoch, model, device, training_loader, optimizer, loss_function)\n", 715 | "\n", 716 | " # Running the validation function to validate the performance of the trained model\n", 717 | " valid(epoch, model, device, validation_loader, loss_function)" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": 15, 723 | "metadata": { 724 | "colab": { 725 | "base_uri": "https://localhost:8080/", 726 | "height": 155 727 | }, 728 | "colab_type": "code", 729 | "id": "MzQHSMj7IDIz", 730 | "outputId": "005fedb8-9be9-4809-d2e4-661823b98021", 731 | "scrolled": true 732 | }, 733 | "outputs": [ 734 | { 735 | "data": { 736 | "text/html": [ 737 | "\n", 738 | " Logging results to Weights & Biases (Documentation).
\n", 739 | " Project page: https://app.wandb.ai/abhimishra-91/transformers_tutorials_sentiment
\n", 740 | " Run page: https://app.wandb.ai/abhimishra-91/transformers_tutorials_sentiment/runs/1zwn4gbg
\n", 741 | " " 742 | ], 743 | "text/plain": [ 744 | "" 745 | ] 746 | }, 747 | "metadata": { 748 | "tags": [] 749 | }, 750 | "output_type": "display_data" 751 | }, 752 | { 753 | "name": "stdout", 754 | "output_type": "stream", 755 | "text": [ 756 | "FULL Dataset: (50000, 2)\n", 757 | "TRAIN Dataset: (35000, 2)\n", 758 | "VAL Dataset: (15000, 2)\n", 759 | "The Total Accuracy for Epoch 0: 91.74285714285715\n", 760 | "The Total Accuracy for Epoch 1: 95.54\n", 761 | "The Validation Accuracy: 94.68\n" 762 | ] 763 | } 764 | ], 765 | "source": [ 766 | "run()" 767 | ] 768 | } 769 | ], 770 | "metadata": { 771 | "accelerator": "TPU", 772 | "colab": { 773 | "collapsed_sections": [], 774 | "name": "transformers_sentiment_imdb.ipynb", 775 | "provenance": [] 776 | }, 777 | "kernelspec": { 778 | "display_name": "Python 3", 779 | "language": "python", 780 | "name": "python3" 781 | }, 782 | "varInspector": { 783 | "cols": { 784 | "lenName": 16, 785 | "lenType": 16, 786 | "lenVar": 40 787 | }, 788 | "kernels_config": { 789 | "python": { 790 | "delete_cmd_postfix": "", 791 | "delete_cmd_prefix": "del ", 792 | "library": "var_list.py", 793 | "varRefreshCmd": "print(var_dic_list())" 794 | }, 795 | "r": { 796 | "delete_cmd_postfix": ") ", 797 | "delete_cmd_prefix": "rm(", 798 | "library": "var_list.r", 799 | "varRefreshCmd": "cat(var_dic_list()) " 800 | } 801 | }, 802 | "types_to_exclude": [ 803 | "module", 804 | "function", 805 | "builtin_function_or_method", 806 | "instance", 807 | "_Feature" 808 | ], 809 | "window_display": false 810 | } 811 | }, 812 | "nbformat": 4, 813 | "nbformat_minor": 1 814 | } --------------------------------------------------------------------------------