├── .gitignore ├── LICENSE ├── README.md ├── data ├── eng │ ├── task_a_tiny.zip │ ├── task_b_tiny.zip │ └── task_c_tiny.zip └── test_data │ ├── readme-offenseval-testsetA-english.txt │ ├── readme-offenseval-testsetB-english.txt │ ├── readme-offenseval-testsetC-english.txt │ ├── test_a_tweets.tsv │ ├── test_b_tweets.tsv │ └── test_c_tweets.tsv ├── notebooks ├── Eng Task A - Ensemble DistilGPT2.ipynb ├── Eng Task B - Ensemble Roberta.ipynb └── Eng Task C - Ensemble DistilRoberta AttnMask Dropout.ipynb └── src ├── __init__.py ├── lookahead ├── LICENSE ├── __init__.py └── optimizer.py ├── radam ├── LICENSE.txt ├── __init__.py └── radam.py └── utils ├── __init__.py ├── activations.py ├── general.py ├── offenseval2020.py └── transformer ├── __init.py__ ├── data.py ├── general.py └── roberta.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # macOS 132 | .DS_Store 133 | .DocumentRevisions-V100 134 | .fseventsd 135 | .Spotlight-V100 136 | .TemporaryItems 137 | .Trashes 138 | .VolumeIcon.icns 139 | .com.apple.timemachine.donotpresent 140 | .AppleDB 141 | .AppleDesktop 142 | Network Trash Folder 143 | Temporary Items 144 | .apdisk 145 | .AppleDouble 146 | .LSOverride 147 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Kaushik Amar Das 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OffensEval2020_submission 2 | [OffensEval 2020](https://sites.google.com/site/offensevalsharedtask/home) Model code for Team KAFK 3 | 4 | Paper Link: https://www.aclweb.org/anthology/2020.semeval-1.267.pdf 5 | 6 | Please find the notebooks for the system code used for each task in the `notebooks` directory. 7 | They should work out of the box in Google Colab. However, to fully replicate our work you will need the exact hyperparameters 8 | from the original paper and the full dataset, which might not be possible in Colab. 9 | 10 | We have provided a small subset of the dataset for each task in the `data` folder to use with the above-mentioned notebooks. Please cite the dataset authors if you use the data in your work; the citation is provided below. Also, if you want to use the full dataset, kindly create DataFrames from it in the same manner as used in the notebooks (a rough sketch is given below). 
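For reference, here is a minimal, hypothetical sketch of how such a DataFrame could be put together. The `text`, `label`, and `split` column names and the integer label encoding (the sub-task B notebook uses `TIN = 0`, `UNT = 1`) are taken from the notebooks; the input file names and raw column names below are assumptions, so inspect the provided tiny sets (e.g. `pd.read_csv('data/eng/task_b_tiny.zip', compression='zip')`) to confirm the exact layout before building the full version.

```python
import pandas as pd

# Hypothetical input files -- substitute the full OffensEval 2020 files you obtained.
# Raw columns are assumed to be: id, tweet, label (with string labels such as TIN/UNT).
train_raw = pd.read_csv("full_task_b_train.tsv", sep="\t")
dev_raw = pd.read_csv("full_task_b_dev.tsv", sep="\t")

label_map = {"TIN": 0, "UNT": 1}  # encoding used in the Task B notebook

data_df = pd.concat(
    [
        pd.DataFrame({"text": train_raw["tweet"], "label": train_raw["label"].map(label_map), "split": "train"}),
        pd.DataFrame({"text": dev_raw["tweet"], "label": dev_raw["label"].map(label_map), "split": "val"}),
    ],
    ignore_index=True,
)
# The notebooks also look up rows with split == 'test' for the unlabeled test tweets.

data_df.to_csv("task_b_full.csv", index=False)  # zip this file, or drop compression='zip' from the notebook's read_csv call
```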
11 | 12 | 13 | Credits: 14 | 15 | - RAdam : https://github.com/LiyuanLucasLiu/RAdam 16 | - LookAhead: https://github.com/lonePatient/lookahead_pytorch 17 | - Transformers: https://github.com/huggingface/transformers 18 | 19 | 20 | If you found our paper/scripts useful cite: 21 | ``` 22 | @inproceedings{das2020kafk, 23 | title={KAFK at SemEval-2020 Task 12: Checkpoint Ensemble of Transformers for Hate Speech Classification}, 24 | author={Das, Kaushik Amar and Baruah, Arup and Barbhuiya, Ferdous Ahmed and Dey, Kuntal}, 25 | booktitle={Proceedings of the Fourteenth Workshop on Semantic Evaluation}, 26 | pages={2023--2029}, 27 | year={2020} 28 | } 29 | ``` 30 | 31 | If you used the data please cite 32 | ``` 33 | @inproceedings{rosenthal2020, 34 | title={{A Large-Scale Semi-Supervised Dataset for Offensive Language Identification}}, 35 | author={Rosenthal, Sara and Atanasova, Pepa and Karadzhov, Georgi and Zampieri, Marcos and Nakov, Preslav}, 36 | year={2020}, 37 | booktitle={arxiv} 38 | } 39 | ``` 40 | -------------------------------------------------------------------------------- /data/eng/task_a_tiny.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cozek/OffensEval2020-code/874b86f14112326c9ba09965174142bdb0acdfb0/data/eng/task_a_tiny.zip -------------------------------------------------------------------------------- /data/eng/task_b_tiny.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cozek/OffensEval2020-code/874b86f14112326c9ba09965174142bdb0acdfb0/data/eng/task_b_tiny.zip -------------------------------------------------------------------------------- /data/eng/task_c_tiny.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cozek/OffensEval2020-code/874b86f14112326c9ba09965174142bdb0acdfb0/data/eng/task_c_tiny.zip -------------------------------------------------------------------------------- /data/test_data/readme-offenseval-testsetA-english.txt: -------------------------------------------------------------------------------- 1 | ======================== 2 | 3 | OffensEval 2020: Multilingual Offensive Language Identification in Social Media (SemEval 2020 - Task 12) 4 | Test data - Sub-task A 5 | v 1.0: February 26 2020 6 | https://sites.google.com/site/offensevalsharedtask/home 7 | 8 | ======================== 9 | 10 | 1) DESCRIPTION 11 | 12 | The file testset-taska.tsv contains 3887 unlabeled tweets. 13 | 14 | You are required to upload your sub-task A predictions for each of the 3887 instances to CodaLab by no later than 4 March 2020 (23:59 GMT). 15 | 16 | You will find ALL the necessary information regarding data format, dates, number of submissions, etc. at CodaLab. 17 | 18 | 2) FORMAT 19 | 20 | Instances are included in TSV format as follows: 21 | 22 | ID INSTANCE 23 | 24 | The column names in the file are the following: 25 | 26 | id tweet 27 | 28 | 3) TASK AND LABELS 29 | 30 | (A) Sub-task A: Offensive language identification 31 | 32 | - (NOT) Not Offensive - This post does not contain offense or profanity. 33 | - (OFF) Offensive - This post contains offensive language or a targeted (veiled or direct) offense. 34 | 35 | In our annotation, we label a post as offensive (OFF) if it contains any form of non-acceptable language (profanity) or a targeted offense, which can be veiled or direct. 
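A minimal sketch, assuming pandas, the test_a_tweets.tsv file from this folder, and a list of predicted NOT/OFF labels (one per tweet, in file order), of how the submission file described in section 4) SUBMISSION below could be produced; the index=False/header=False choices mirror what the repository's notebooks use when writing their label files:

    import pandas as pd

    test_df = pd.read_csv("test_a_tweets.tsv", sep="\t")   # columns: id, tweet
    predicted_labels = ["NOT"] * len(test_df)               # placeholder predictions

    submission = pd.DataFrame({"id": test_df["id"], "label": predicted_labels})
    submission.to_csv("task_a_submission.csv", index=False, header=False)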
36 | 37 | 4) SUBMISSION 38 | 39 | When you have your model predictions, you should upload a CSV file on CodaLab in the following format. 40 | 41 | ID, LABEL 42 | 43 | 4) CREDITS 44 | 45 | Task Organizers 46 | 47 | Marcos Zampieri - Rochester Institute of Technology, USA 48 | Preslav Nakov - Qatar Computing Research Institute, Qatar 49 | Sara Rosenthal - IBM Research, USA 50 | Pepa Atanasova - University of Copenhagen, Denmark 51 | Georgi Karadzhov - University of Cambridge, UK 52 | Hamdy Mubarak - Qatar Computing Research Institute, Qatar 53 | Leon Derczynski - IT University Copenhagen, Denmark 54 | Zeses Pitenis - University of Wolverhampton, UK 55 | Çağrı Çöltekin - University of Tübingen, Germany 56 | 57 | 5) Contact 58 | 59 | Organizers: semeval-2020-task-12-organizers@googlegroups.com 60 | All participants: semeval-2020-task-12-all@googlegroups.com -------------------------------------------------------------------------------- /data/test_data/readme-offenseval-testsetB-english.txt: -------------------------------------------------------------------------------- 1 | ======================== 2 | 3 | OffensEval 2020: Multilingual Offensive Language Identification in Social Media (SemEval 2020 - Task 12) 4 | Test data - Sub-task B 5 | v 1.0: February 26 2020 6 | https://sites.google.com/site/offensevalsharedtask/home 7 | 8 | ======================== 9 | 10 | 1) DESCRIPTION 11 | 12 | The file test_b_tweets.tsv contains 1422 unlabeled tweets. This file contains ONLY tweets which are offensive. 13 | 14 | You are required to upload your sub-task B predictions for each of the 1422 instances to CodaLab by no later than 4 Mar 2020 (23:59 GMT). 15 | 16 | You will find ALL the necessary information regarding data format, dates, number of submissions, etc. at CodaLab. Please read it carefully. 17 | 18 | 2) FORMAT 19 | 20 | Instances are included in TSV format as follows: 21 | 22 | id tweet 23 | 24 | 3) TASK AND LABELS 25 | 26 | (B) Sub-task B: Automatic categorization of offense types 27 | 28 | - (TIN) Targeted Insult and Threats - A post containing an insult or threat to an individual, a group, or others (see categories in sub-task C). 29 | - (UNT) Untargeted - A post containing non-targeted profanity and swearing. 30 | 31 | Posts containing general profanity are not targeted but they contain non-acceptable language. 32 | 33 | 4) SUBMISSION 34 | 35 | When you have your model predictions, you should upload a CSV file on CodaLab in the following format. 
36 | 37 | ID, LABEL 38 | 39 | 5) CREDITS 40 | 41 | Task Organizers 42 | 43 | Marcos Zampieri - Rochester Institute of Technology, USA 44 | Preslav Nakov - Qatar Computing Research Institute, Qatar 45 | Sara Rosenthal - IBM Research, USA 46 | Pepa Atanasova - University of Copenhagen, Denmark 47 | Georgi Karadzhov - University of Cambridge, UK 48 | Hamdy Mubarak - Qatar Computing Research Institute, Qatar 49 | Leon Derczynski - IT University Copenhagen, Denmark 50 | Zeses Pitenis - University of Wolverhampton, UK 51 | Çağrı Çöltekin - University of Tübingen, Germany 52 | 53 | 6) Contact 54 | 55 | Organizers: semeval-2020-task-12-organizers@googlegroups.com 56 | All participants: semeval-2020-task-12-all@googlegroups.com -------------------------------------------------------------------------------- /data/test_data/readme-offenseval-testsetC-english.txt: -------------------------------------------------------------------------------- 1 | ======================== 2 | 3 | OffensEval 2020: Multilingual Offensive Language Identification in Social Media (SemEval 2020 - Task 12) 4 | Test data - Sub-task C 5 | v 1.0: March 5 2020 6 | https://sites.google.com/site/offensevalsharedtask/home 7 | 8 | ======================== 9 | 10 | 1) DESCRIPTION 11 | 12 | The file test_c_tweets.tsv contains 850 unlabeled tweets. This file contains ONLY tweets which are offensive AND targeted. 13 | 14 | You are required to upload your sub-task C predictions for each of the 850 instances to CodaLab by no later than 11 Mar 2020 (23:59 GMT). 15 | 16 | You will find ALL the necessary information regarding data format, dates, number of submissions, etc. at CodaLab. Please read it carefully. 17 | 18 | 2) FORMAT 19 | 20 | Instances are included in TSV format as follows: 21 | 22 | id tweet 23 | 24 | 3) TASK AND LABELS 25 | 26 | (C) Sub-task C: Offense target identification 27 | 28 | - (IND) Individual - The target of the offensive post is an individual: a famous person, a named individual or an unnamed person interacting in the conversation. 29 | - (GRP) Group - The target of the offensive post is a group of people considered as a unity due to the same ethnicity, gender or sexual orientation, political affiliation, religious belief, or something else. 30 | - (OTH) Other – The target of the offensive post does not belong to any of the previous two categories (e.g., an organization, a situation, an event, or an issue) 31 | 32 | 4) SUBMISSION 33 | 34 | When you have your model predictions, you should upload a ZIP file containing a CSV file on CodaLab in the following format. 
35 | 36 | ID, LABEL 37 | 38 | 5) CREDITS 39 | 40 | Task Organizers 41 | 42 | Marcos Zampieri - Rochester Institute of Technology, USA 43 | Preslav Nakov - Qatar Computing Research Institute, Qatar 44 | Sara Rosenthal - IBM Research, USA 45 | Pepa Atanasova - University of Copenhagen, Denmark 46 | Georgi Karadzhov - University of Cambridge, UK 47 | Hamdy Mubarak - Qatar Computing Research Institute, Qatar 48 | Leon Derczynski - IT University Copenhagen, Denmark 49 | Zeses Pitenis - University of Wolverhampton, UK 50 | Çağrı Çöltekin - University of Tübingen, Germany 51 | 52 | 6) Contact 53 | 54 | Organizers: semeval-2020-task-12-organizers@googlegroups.com 55 | All participants: semeval-2020-task-12-all@googlegroups.com -------------------------------------------------------------------------------- /notebooks/Eng Task B - Ensemble Roberta.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.7.0" 21 | }, 22 | "colab": { 23 | "name": "Eng Task B - Ensemble Roberta.ipynb", 24 | "provenance": [] 25 | }, 26 | "accelerator": "GPU" 27 | }, 28 | "cells": [ 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "D_dUF2evouow", 33 | "colab_type": "text" 34 | }, 35 | "source": [ 36 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cozek/OffensEval2020-code/blob/master/notebooks/Eng%20Task%20B%20-%20Ensemble%20Roberta.ipynb)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "u7Uo50Chouox", 43 | "colab_type": "text" 44 | }, 45 | "source": [ 46 | "# Import Libraries\n", 47 | "\n", 48 | "At the time of our work, we used the following library versions\n", 49 | "- numpy 1.18.1\n", 50 | "- pandas 1.0.1\n", 51 | "- torch 1.2.0\n", 52 | "- Cuda 10.0\n", 53 | "- python 3.7.0\n", 54 | "- sklearn 0.22.1\n", 55 | "- tqdm 4.42.1\n", 56 | "- nltk 3.4.5" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "metadata": { 62 | "id": "vFPGTXG3ouox", 63 | "colab_type": "code", 64 | "colab": {} 65 | }, 66 | "source": [ 67 | "!git clone https://github.com/cozek/OffensEval2020-code/" 68 | ], 69 | "execution_count": 0, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "FpGHYSkhouo0", 76 | "colab_type": "code", 77 | "colab": {} 78 | }, 79 | "source": [ 80 | "!git clone https://github.com/huggingface/transformers\n", 81 | "!pip install /content/transformers/" 82 | ], 83 | "execution_count": 0, 84 | "outputs": [] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "metadata": { 89 | "id": "6om7VnoNouo3", 90 | "colab_type": "code", 91 | "colab": {} 92 | }, 93 | "source": [ 94 | "import sys\n", 95 | "sys.path.append('/content/OffensEval2020-code/src/')\n", 96 | "import collections\n", 97 | "from typing import Callable\n", 98 | "import numpy as np\n", 99 | "np.random.seed(42)\n", 100 | "import pandas as pd\n", 101 | "from tqdm import notebook\n", 102 | "import importlib\n", 103 | "import pprint\n", 104 | "import nltk\n", 105 | "import datetime\n", 106 | "import os\n", 107 | "from argparse import 
Namespace\n", 108 | "\n", 109 | "from collections import Counter" 110 | ], 111 | "execution_count": 0, 112 | "outputs": [] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "metadata": { 117 | "id": "iyUPjzykouo5", 118 | "colab_type": "code", 119 | "colab": {} 120 | }, 121 | "source": [ 122 | "import utils.general as general_utils\n", 123 | "import utils.transformer.data as transformer_data_utils\n", 124 | "import utils.transformer.general as transformer_general_utils\n", 125 | "general_utils.set_seed_everywhere()" 126 | ], 127 | "execution_count": 0, 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "metadata": { 133 | "id": "MoJmxSDPouo9", 134 | "colab_type": "code", 135 | "colab": {} 136 | }, 137 | "source": [ 138 | "import logging\n", 139 | "logging.basicConfig(level=logging.INFO) " 140 | ], 141 | "execution_count": 0, 142 | "outputs": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "metadata": { 147 | "id": "2ehv7SLoouo_", 148 | "colab_type": "code", 149 | "colab": {} 150 | }, 151 | "source": [ 152 | "import torch\n", 153 | "import torch.nn as nn\n", 154 | "import torch.nn.functional as F\n", 155 | "import torch.optim as optim\n", 156 | "from torch.utils.data import Dataset, DataLoader\n", 157 | "torch.__version__ # we used version 1.2.0\n" 158 | ], 159 | "execution_count": 0, 160 | "outputs": [] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "metadata": { 165 | "id": "TkS9WQy2oupC", 166 | "colab_type": "code", 167 | "colab": {} 168 | }, 169 | "source": [ 170 | "# Import RAdam and Lookahead\n", 171 | "from radam.radam import RAdam\n", 172 | "from lookahead.optimizer import Lookahead\n" 173 | ], 174 | "execution_count": 0, 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "71wul4V7oupF", 181 | "colab_type": "code", 182 | "colab": {} 183 | }, 184 | "source": [ 185 | "from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification" 186 | ], 187 | "execution_count": 0, 188 | "outputs": [] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "metadata": { 193 | "id": "eoAlBoFYoupH", 194 | "colab_type": "code", 195 | "colab": {} 196 | }, 197 | "source": [ 198 | " args = Namespace(\n", 199 | " #use cuda by default\n", 200 | " device = 'cuda' if torch.cuda.is_available() else 'cpu',\n", 201 | " \n", 202 | " #set batch size and number of epochs\n", 203 | " batch_size = 32,\n", 204 | " num_epochs = 20,\n", 205 | " \n", 206 | " #set the learning rate\n", 207 | " learning_rate = 0.0001,\n", 208 | "\n", 209 | " #location of the train, dev and test csv\n", 210 | " train_val_csv = '/content/OffensEval2020-code/data/eng/task_b_tiny.zip',\n", 211 | " test_csv = '/content/OffensEval2020-code/data/test_data/test_b_tweets.tsv',\n", 212 | " \n", 213 | " #directory to save our models at\n", 214 | " directory = './models/', \n", 215 | " model_name = 'roberta.pt',\n", 216 | " \n", 217 | " date = datetime.datetime.now().strftime(\"%a_%d_%b_%Y/\"),\n", 218 | ")" 219 | ], 220 | "execution_count": 0, 221 | "outputs": [] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "id": "Tt5X-gWsoupL", 227 | "colab_type": "text" 228 | }, 229 | "source": [ 230 | "## Model save location" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "metadata": { 236 | "id": "xMOYDefpoupM", 237 | "colab_type": "code", 238 | "colab": {} 239 | }, 240 | "source": [ 241 | "directory = args.directory + args.date\n", 242 | "if not os.path.exists(directory):\n", 243 | " os.makedirs(directory)\n", 244 | "args.directory 
= directory\n", 245 | "print(args.directory)" 246 | ], 247 | "execution_count": 0, 248 | "outputs": [] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "id": "thSqAbT3oupP", 254 | "colab_type": "text" 255 | }, 256 | "source": [ 257 | "## Load presplit dataset portion\n", 258 | "```\n", 259 | "Labelled as\n", 260 | "\n", 261 | "'UNT': 1\n", 262 | "'TIN': 0\n", 263 | "```" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "metadata": { 269 | "id": "2WazWg4zoupP", 270 | "colab_type": "code", 271 | "colab": {} 272 | }, 273 | "source": [ 274 | "data_df_task_b = pd.read_csv(args.train_val_csv, compression='zip')\n", 275 | "print(data_df_task_b.label.value_counts())\n", 276 | "print(data_df_task_b.split.value_counts())" 277 | ], 278 | "execution_count": 0, 279 | "outputs": [] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "metadata": { 284 | "id": "_dV3P48EphCH", 285 | "colab_type": "code", 286 | "colab": {} 287 | }, 288 | "source": [ 289 | "data_df_task_b.columns" 290 | ], 291 | "execution_count": 0, 292 | "outputs": [] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "metadata": { 297 | "id": "LkUZ5O46oupS", 298 | "colab_type": "code", 299 | "colab": {} 300 | }, 301 | "source": [ 302 | "with pd.option_context('display.max_colwidth', -1): \n", 303 | " print(data_df_task_b[['text','label']].sample(5))" 304 | ], 305 | "execution_count": 0, 306 | "outputs": [] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": { 311 | "id": "NZrYLTrxoupU", 312 | "colab_type": "text" 313 | }, 314 | "source": [ 315 | "## Importing the Roberta Tokeniker and Punkt sentence tokenizer" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "metadata": { 321 | "id": "PtFADHnToupV", 322 | "colab_type": "code", 323 | "colab": {} 324 | }, 325 | "source": [ 326 | "class RobertaPreprocessor():\n", 327 | " def __init__(self,transformer_tokenizer,sentence_detector):\n", 328 | " self.transformer_tokenizer = transformer_tokenizer\n", 329 | " self.sentence_detector = sentence_detector\n", 330 | " self.bos_token = transformer_tokenizer.bos_token\n", 331 | " self.sep_token = ' ' + transformer_tokenizer.sep_token + ' '\n", 332 | " def add_special_tokens(self, text):\n", 333 | " sentences = self.sentence_detector.tokenize(text)\n", 334 | " eos_added_text = self.sep_token.join(sentences) \n", 335 | " return self.bos_token +' '+ eos_added_text + ' ' + self.transformer_tokenizer.sep_token" 336 | ], 337 | "execution_count": 0, 338 | "outputs": [] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "metadata": { 343 | "id": "ar4EbV4BoupX", 344 | "colab_type": "code", 345 | "colab": {} 346 | }, 347 | "source": [ 348 | "!python -c 'import nltk; nltk.download(\"punkt\")'" 349 | ], 350 | "execution_count": 0, 351 | "outputs": [] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "metadata": { 356 | "scrolled": true, 357 | "id": "6RlaXDBtoupY", 358 | "colab_type": "code", 359 | "colab": {} 360 | }, 361 | "source": [ 362 | "roberta_tokenizer = tokenizer = RobertaTokenizer.from_pretrained('roberta-base')\n", 363 | "punkt_sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')" 364 | ], 365 | "execution_count": 0, 366 | "outputs": [] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "metadata": { 371 | "id": "RKFM-0bqoupb", 372 | "colab_type": "code", 373 | "colab": {} 374 | }, 375 | "source": [ 376 | "roberta_preproc = RobertaPreprocessor(roberta_tokenizer, punkt_sentence_detector)" 377 | ], 378 | "execution_count": 0, 379 | "outputs": [] 380 | }, 381 | { 382 | "cell_type": "code", 
383 | "metadata": { 384 | "id": "jDq61UIRoupf", 385 | "colab_type": "code", 386 | "colab": {} 387 | }, 388 | "source": [ 389 | "#apply the preprocessor on the exploded dataframe\n", 390 | "data_df_task_b['text'] = data_df_task_b['text'].map(roberta_preproc.add_special_tokens)\n" 391 | ], 392 | "execution_count": 0, 393 | "outputs": [] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "metadata": { 398 | "id": "kocu45Xtoupj", 399 | "colab_type": "code", 400 | "colab": {} 401 | }, 402 | "source": [ 403 | "with pd.option_context('display.max_colwidth', -1): \n", 404 | " print(data_df_task_b[['text','label']].sample(5))" 405 | ], 406 | "execution_count": 0, 407 | "outputs": [] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": { 412 | "id": "0NOwcyecoupm", 413 | "colab_type": "text" 414 | }, 415 | "source": [ 416 | "### Here we create the dataset" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "metadata": { 422 | "id": "KyQMayxOoupn", 423 | "colab_type": "code", 424 | "colab": {} 425 | }, 426 | "source": [ 427 | "class SimpleVectorizer():\n", 428 | " def __init__(self,tokenizer: Callable, max_seq_len: int):\n", 429 | " \"\"\"\n", 430 | " Args:\n", 431 | " tokenizer (Callable): transformer tokenizer\n", 432 | " max_seq_len (int): Maximum sequence lenght \n", 433 | " \"\"\"\n", 434 | " self.tokenizer = tokenizer\n", 435 | " self._max_seq_len = max_seq_len\n", 436 | "\n", 437 | " def vectorize(self,text :str):\n", 438 | " \n", 439 | " encoded = self.tokenizer.encode_plus(\n", 440 | " text,\n", 441 | " add_special_tokens=False, #already added by preproc\n", 442 | " max_length = self._max_seq_len,\n", 443 | " pad_to_max_length = True,\n", 444 | " )\n", 445 | " ids = np.array(encoded['input_ids'], dtype=np.int64)\n", 446 | " attn = np.array(encoded['attention_mask'], dtype=np.int64)\n", 447 | " \n", 448 | " return ids, attn" 449 | ], 450 | "execution_count": 0, 451 | "outputs": [] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "metadata": { 456 | "id": "JI0SzK1Woupw", 457 | "colab_type": "code", 458 | "colab": {} 459 | }, 460 | "source": [ 461 | "class HateDataset(Dataset):\n", 462 | " def __init__(\n", 463 | " self,\n", 464 | " data_df: pd.DataFrame,\n", 465 | " tokenizer: Callable,\n", 466 | " max_seq_length:int = None,\n", 467 | " ):\n", 468 | " \"\"\"\n", 469 | " Args:\n", 470 | " data_df (pandas.DataFrame): df containing the labels and text\n", 471 | " tokenizer (tokenizer module for the transformer)\n", 472 | " \"\"\"\n", 473 | " self.data_df = data_df\n", 474 | " self.tokenizer = tokenizer\n", 475 | "\n", 476 | " if max_seq_length is None:\n", 477 | " self._max_seq_length = self._get_max_len(data_df,tokenizer)\n", 478 | " else:\n", 479 | " self._max_seq_length = max_seq_length\n", 480 | "\n", 481 | " self.train_df = self.data_df[self.data_df.split == 'train']\n", 482 | " self.train_size = len(self.train_df)\n", 483 | "\n", 484 | " self.val_df = self.data_df[self.data_df.split == 'val']\n", 485 | " self.val_size = len(self.val_df)\n", 486 | "\n", 487 | " self.test_df = self.data_df[self.data_df.split == 'test']\n", 488 | " self.test_size = len(self.test_df)\n", 489 | " \n", 490 | " self._simple_vectorizer = SimpleVectorizer(tokenizer, self._max_seq_length)\n", 491 | " \n", 492 | " self._lookup_dict = {\n", 493 | " 'train': (self.train_df, self.train_size),\n", 494 | " 'val': (self.val_df, self.val_size),\n", 495 | " 'test': (self.test_df, self.test_size)\n", 496 | " }\n", 497 | "\n", 498 | " self.set_split('train')\n", 499 | "\n", 500 | " class_counts = 
data_df.label.value_counts().to_dict()\n", 501 | " #sorted on the basis of class label,eg, 0,1,2..\n", 502 | " cts = sorted([(lbl,cts) for lbl,cts in class_counts.items()], key=lambda x: x[0])\n", 503 | " freq = [ x[1] for x in cts ]\n", 504 | " # print(freq,cts)\n", 505 | " self.class_weights = 1.0/ torch.tensor(freq, dtype=torch.float32)\n", 506 | " \n", 507 | " \n", 508 | " def _get_max_len(self,data_df: pd.DataFrame, tokenizer: Callable):\n", 509 | " len_func = lambda x: len(self.tokenizer.encode_plus(x)['input_ids'])\n", 510 | " max_len = data_df.text.map(len_func).max() \n", 511 | " return max_len\n", 512 | "\n", 513 | " def set_split(self, split=\"train\"):\n", 514 | " \"\"\" selects the splits in the dataset using a column in the dataframe \"\"\"\n", 515 | " self._target_split = split\n", 516 | " self._target_df, self._target_size = self._lookup_dict[split]\n", 517 | " \n", 518 | " def __len__(self):\n", 519 | " return self._target_size\n", 520 | " \n", 521 | " def __getitem__(self, index):\n", 522 | " \"\"\"the primary entry point method for PyTorch datasets\n", 523 | " \n", 524 | " Args:\n", 525 | " index (int): the index to the data point \n", 526 | " Returns:\n", 527 | " a dictionary holding the data point's features (x_data) and label (y_target)\n", 528 | " \"\"\"\n", 529 | " row = self._target_df.iloc[index]\n", 530 | " \n", 531 | " indices, attention_masks = self._simple_vectorizer.vectorize(row.text)\n", 532 | "\n", 533 | " label = row.label\n", 534 | " return {'x_data': indices,\n", 535 | " 'x_attn_mask': attention_masks,\n", 536 | " 'x_index': index,\n", 537 | " 'y_target': label}\n", 538 | " \n", 539 | " def get_num_batches(self, batch_size):\n", 540 | " \"\"\"Given a batch size, return the number of batches in the dataset\n", 541 | " \n", 542 | " Args:\n", 543 | " batch_size (int)\n", 544 | " Returns:\n", 545 | " number of batches in the dataset\n", 546 | " \"\"\"\n", 547 | " return len(self) // batch_size" 548 | ], 549 | "execution_count": 0, 550 | "outputs": [] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "metadata": { 555 | "id": "PGvvsTq9oupy", 556 | "colab_type": "code", 557 | "colab": {} 558 | }, 559 | "source": [ 560 | "def generate_batches(dataset, batch_size, shuffle=True,\n", 561 | " drop_last=False, device=\"cpu\", pinned_memory = False, n_workers = 0): \n", 562 | " \"\"\"\n", 563 | " A generator function which wraps the PyTorch DataLoader. 
It will \n", 564 | " ensure each tensor is on the write device location.\n", 565 | " \"\"\"\n", 566 | " dataloader = DataLoader(dataset=dataset, batch_size=batch_size,\n", 567 | " shuffle=shuffle, drop_last=drop_last,\n", 568 | " pin_memory= pinned_memory,\n", 569 | " num_workers = n_workers,\n", 570 | " )\n", 571 | " \n", 572 | " for data_dict in dataloader:\n", 573 | " out_data_dict = {}\n", 574 | " out_data_dict['x_data'] = data_dict['x_data'].to(\n", 575 | " device, non_blocking= (True if pinned_memory else False) \n", 576 | " )\n", 577 | " out_data_dict['x_attn_mask'] = data_dict['x_attn_mask'].to(\n", 578 | " device, non_blocking= (True if pinned_memory else False) \n", 579 | " )\n", 580 | " out_data_dict['x_index'] = data_dict['x_index']\n", 581 | " out_data_dict['y_target'] = data_dict['y_target'].to(\n", 582 | " device, non_blocking= (True if pinned_memory else False) \n", 583 | " )\n", 584 | " yield out_data_dict" 585 | ], 586 | "execution_count": 0, 587 | "outputs": [] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "metadata": { 592 | "id": "dqjxEQKZoup0", 593 | "colab_type": "code", 594 | "colab": {} 595 | }, 596 | "source": [ 597 | "dataset = HateDataset(\n", 598 | " data_df = data_df_task_b,\n", 599 | " tokenizer = roberta_tokenizer\n", 600 | ")" 601 | ], 602 | "execution_count": 0, 603 | "outputs": [] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "metadata": { 608 | "scrolled": true, 609 | "id": "r1S0e8djoup3", 610 | "colab_type": "code", 611 | "colab": {} 612 | }, 613 | "source": [ 614 | "assert dataset._max_seq_length <= 1024" 615 | ], 616 | "execution_count": 0, 617 | "outputs": [] 618 | }, 619 | { 620 | "cell_type": "markdown", 621 | "metadata": { 622 | "id": "faAM7TdDoup5", 623 | "colab_type": "text" 624 | }, 625 | "source": [ 626 | "# Initialize the Roberta model" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "metadata": { 632 | "scrolled": false, 633 | "id": "Sdjpj_fvoup6", 634 | "colab_type": "code", 635 | "colab": {} 636 | }, 637 | "source": [ 638 | "model = RobertaForSequenceClassification.from_pretrained(\n", 639 | " 'roberta-base',\n", 640 | " num_labels=len(set(data_df_task_b.label)),\n", 641 | ")" 642 | ], 643 | "execution_count": 0, 644 | "outputs": [] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "metadata": { 649 | "scrolled": true, 650 | "id": "7VAugC07oup8", 651 | "colab_type": "code", 652 | "colab": {} 653 | }, 654 | "source": [ 655 | "model.to(args.device)" 656 | ], 657 | "execution_count": 0, 658 | "outputs": [] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "metadata": { 663 | "id": "2nf9iX9Eoup_", 664 | "colab_type": "code", 665 | "colab": {} 666 | }, 667 | "source": [ 668 | "early_stopping = transformer_general_utils.EarlyStopping(patience=4)" 669 | ], 670 | "execution_count": 0, 671 | "outputs": [] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "metadata": { 676 | "id": "BjtphpU6s4LV", 677 | "colab_type": "code", 678 | "colab": {} 679 | }, 680 | "source": [ 681 | "!nvidia-smi" 682 | ], 683 | "execution_count": 0, 684 | "outputs": [] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "metadata": { 689 | "id": "ZjFXYTcEouqB", 690 | "colab_type": "code", 691 | "colab": {} 692 | }, 693 | "source": [ 694 | "args.num_epochs = 20\n", 695 | "args.batch_size = 16 #set according to GPU capacity" 696 | ], 697 | "execution_count": 0, 698 | "outputs": [] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "metadata": { 703 | "scrolled": false, 704 | "id": "OYfdxscNouqE", 705 | "colab_type": "code", 706 | "colab": {} 707 | }, 708 | 
"source": [ 709 | "loss_func = nn.CrossEntropyLoss()\n", 710 | "\n", 711 | "print(f'Using LR:{args.learning_rate}')\n", 712 | "base_optimizer = RAdam(model.parameters(), lr = args.learning_rate)\n", 713 | "optimizer = Lookahead(optimizer = base_optimizer, k = 5, alpha=0.5 )\n", 714 | "scheduler = optim.lr_scheduler.ReduceLROnPlateau(\n", 715 | " optimizer=optimizer.optimizer, factor =0.1 ,mode='max',\n", 716 | ")" 717 | ], 718 | "execution_count": 0, 719 | "outputs": [] 720 | }, 721 | { 722 | "cell_type": "markdown", 723 | "metadata": { 724 | "id": "HvhB0DIPouqH", 725 | "colab_type": "text" 726 | }, 727 | "source": [ 728 | "# Begin Training" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "metadata": { 734 | "id": "ta4xhZcdouqH", 735 | "colab_type": "code", 736 | "colab": {} 737 | }, 738 | "source": [ 739 | "train_state = general_utils.make_train_state()\n", 740 | "train_state.keys()\n" 741 | ], 742 | "execution_count": 0, 743 | "outputs": [] 744 | }, 745 | { 746 | "cell_type": "code", 747 | "metadata": { 748 | "scrolled": false, 749 | "id": "Hyp2Q5ReouqK", 750 | "colab_type": "code", 751 | "colab": {} 752 | }, 753 | "source": [ 754 | "epoch_bar = notebook.tqdm(\n", 755 | " desc = 'training_routine',\n", 756 | " total = args.num_epochs,\n", 757 | " position=0,\n", 758 | " leave = True,\n", 759 | ")\n", 760 | "dataset.set_split('train')\n", 761 | "train_bar = notebook.tqdm(\n", 762 | " desc = 'split=train ',\n", 763 | " total=dataset.get_num_batches(args.batch_size),\n", 764 | " position=0,\n", 765 | " leave=True,\n", 766 | ")\n", 767 | "dataset.set_split('val')\n", 768 | "eval_bar = notebook.tqdm(\n", 769 | " desc = 'split=eval',\n", 770 | " total=dataset.get_num_batches(args.batch_size),\n", 771 | " position=0,\n", 772 | " leave=True,\n", 773 | ")\n", 774 | "\n", 775 | "old_val_acc = 0\n", 776 | "old_f1 = 0\n", 777 | "model_state = None\n", 778 | "for epoch_index in range(args.num_epochs):\n", 779 | " train_state['epoch_in'] = epoch_index\n", 780 | "\n", 781 | " dataset.set_split('train')\n", 782 | "\n", 783 | " batch_generator = generate_batches(\n", 784 | " dataset= dataset, batch_size= args.batch_size, shuffle=True,\n", 785 | " device = args.device, drop_last=False,\n", 786 | " pinned_memory = True, n_workers = 3, \n", 787 | " )\n", 788 | "\n", 789 | " running_loss = 0.0\n", 790 | " running_acc = 0.0\n", 791 | " running_f1 = 0.0\n", 792 | " model.train()\n", 793 | "\n", 794 | " train_bar.reset(\n", 795 | " total=dataset.get_num_batches(args.batch_size),\n", 796 | " )\n", 797 | "\n", 798 | " for batch_index, batch_dict in enumerate(batch_generator):\n", 799 | " optimizer.zero_grad()\n", 800 | " \n", 801 | " loss,y_pred = model(\n", 802 | " input_ids = batch_dict['x_data'],\n", 803 | " attention_mask = batch_dict['x_attn_mask'],\n", 804 | " labels= batch_dict['y_target'].unsqueeze(1),\n", 805 | " )[:2]\n", 806 | " \n", 807 | " y_pred = y_pred.view(-1, len(set(dataset.data_df.label)))\n", 808 | " \n", 809 | " loss.backward()\n", 810 | " optimizer.step()\n", 811 | " \n", 812 | " loss_t = loss.item()\n", 813 | " running_loss += (loss_t - running_loss) / (batch_index + 1)\n", 814 | " \n", 815 | " y_pred = y_pred.detach().cpu()\n", 816 | " batch_dict['y_target'] = batch_dict['y_target'].cpu()\n", 817 | " \n", 818 | " acc_t = transformer_general_utils \\\n", 819 | " .compute_accuracy(y_pred, batch_dict['y_target'])\n", 820 | " \n", 821 | " f1_t = transformer_general_utils \\\n", 822 | " .compute_macro_f1(y_pred, batch_dict['y_target'])\n", 823 | "\n", 824 | " 
train_state['batch_preds'].append(y_pred)\n", 825 | " train_state['batch_targets'].append(batch_dict['y_target'])\n", 826 | " train_state['batch_indexes'].append(batch_dict['x_index'])\n", 827 | "\n", 828 | " running_acc += (acc_t - running_acc) / (batch_index + 1)\n", 829 | " running_f1 += (f1_t - running_f1) / (batch_index + 1)\n", 830 | "\n", 831 | " train_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,\n", 832 | " epoch=epoch_index)\n", 833 | "\n", 834 | " train_bar.update()\n", 835 | "\n", 836 | " if torch.cuda.is_available():\n", 837 | " torch.cuda.empty_cache()\n", 838 | " \n", 839 | " train_state['train_accuracies'].append(running_acc)\n", 840 | " train_state['train_losses'].append(running_loss)\n", 841 | " \n", 842 | " train_state['train_preds'].append(\n", 843 | " torch.cat(train_state['batch_preds']).cpu()\n", 844 | " )\n", 845 | " train_state['train_targets'].append(\n", 846 | " torch.cat(train_state['batch_targets']).cpu()\n", 847 | " )\n", 848 | " train_state['train_indexes'].append(\n", 849 | " torch.cat(train_state['batch_indexes']).cpu()\n", 850 | " )\n", 851 | " train_f1 = transformer_general_utils \\\n", 852 | " .compute_macro_f1(train_state['train_preds'][-1],\n", 853 | " train_state['train_targets'][-1],\n", 854 | " )\n", 855 | " \n", 856 | " train_state['train_f1s'].append(train_f1)\n", 857 | " \n", 858 | " train_state['batch_preds'] = []\n", 859 | " train_state['batch_targets'] = []\n", 860 | " train_state['batch_indexes'] = []\n", 861 | " \n", 862 | " \n", 863 | " dataset.set_split('val')\n", 864 | " batch_generator = generate_batches(\n", 865 | " dataset= dataset, batch_size= args.batch_size, shuffle=True,\n", 866 | " device = args.device, drop_last=False,\n", 867 | " pinned_memory = True, n_workers = 2, \n", 868 | " )\n", 869 | " eval_bar.reset(\n", 870 | " total=dataset.get_num_batches(args.batch_size),\n", 871 | " )\n", 872 | " running_loss = 0.0\n", 873 | " running_acc = 0.0\n", 874 | " running_f1 = 0.0\n", 875 | " \n", 876 | " model.eval()\n", 877 | " with torch.no_grad():\n", 878 | " optimizer._backup_and_load_cache()\n", 879 | " for batch_index, batch_dict in enumerate(batch_generator):\n", 880 | " loss, y_pred = model(\n", 881 | " input_ids = batch_dict['x_data'],\n", 882 | " attention_mask = batch_dict['x_attn_mask'],\n", 883 | " labels= batch_dict['y_target'].unsqueeze(1),\n", 884 | " )[:2]\n", 885 | " y_pred = y_pred.view(-1, len(set(dataset.data_df.label)))\n", 886 | " \n", 887 | " loss_t = loss.item()\n", 888 | " running_loss += (loss_t - running_loss) / (batch_index + 1)\n", 889 | "\n", 890 | " y_pred = y_pred.detach()\n", 891 | " batch_dict['y_target'] = batch_dict['y_target'].cpu()\n", 892 | " \n", 893 | " acc_t = transformer_general_utils\\\n", 894 | " .compute_accuracy(y_pred, batch_dict['y_target'])\n", 895 | " f1_t = transformer_general_utils \\\n", 896 | " .compute_macro_f1(y_pred, batch_dict['y_target'])\n", 897 | "\n", 898 | " train_state['batch_preds'].append(y_pred.cpu())\n", 899 | " train_state['batch_targets'].append(batch_dict['y_target'].cpu())\n", 900 | " train_state['batch_indexes'].append(batch_dict['x_index'].cpu())\n", 901 | "\n", 902 | " running_acc += (acc_t - running_acc) / (batch_index + 1)\n", 903 | " running_f1 += (f1_t - running_f1) / (batch_index + 1)\n", 904 | " \n", 905 | "\n", 906 | " eval_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,\n", 907 | " epoch=epoch_index)\n", 908 | " eval_bar.update()\n", 909 | " \n", 910 | " train_state['val_accuracies'].append(running_acc)\n", 
911 | " train_state['val_losses'].append(running_loss)\n", 912 | " \n", 913 | " \n", 914 | " train_state['val_preds'].append(\n", 915 | " torch.cat(train_state['batch_preds']).cpu()\n", 916 | " )\n", 917 | "\n", 918 | " train_state['val_targets'].append(\n", 919 | " torch.cat(train_state['batch_targets']).cpu()\n", 920 | " )\n", 921 | " train_state['val_indexes'].append(\n", 922 | " torch.cat(train_state['batch_indexes']).cpu()\n", 923 | " )\n", 924 | " val_f1 = transformer_general_utils \\\n", 925 | " .compute_macro_f1(train_state['val_preds'][-1],\n", 926 | " train_state['val_targets'][-1],\n", 927 | " )\n", 928 | " \n", 929 | " train_state['val_f1s'].append(val_f1)\n", 930 | " \n", 931 | " train_state['batch_preds'] = []\n", 932 | " train_state['batch_targets'] = []\n", 933 | " train_state['batch_indexes'] = []\n", 934 | " \n", 935 | " torch.save(\n", 936 | " {\n", 937 | " 'model':model.state_dict(),\n", 938 | " },\n", 939 | " args.directory + f'_epoc_{epoch_index}_' + args.model_name,\n", 940 | " )\n", 941 | " \n", 942 | " scheduler.step(val_f1)\n", 943 | " early_stopping(val_f1, model)\n", 944 | " optimizer._clear_and_load_backup()\n", 945 | " epoch_bar.set_postfix( best_f1 = early_stopping.best_score, current = val_f1)\n", 946 | " epoch_bar.update() \n", 947 | " \n", 948 | " if early_stopping.early_stop:\n", 949 | " print(\"Early stopping\")\n", 950 | " break\n" 951 | ], 952 | "execution_count": 0, 953 | "outputs": [] 954 | }, 955 | { 956 | "cell_type": "code", 957 | "metadata": { 958 | "id": "JvhJbjv3ouqM", 959 | "colab_type": "code", 960 | "colab": {} 961 | }, 962 | "source": [ 963 | "epoch_index" 964 | ], 965 | "execution_count": 0, 966 | "outputs": [] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "metadata": { 971 | "scrolled": true, 972 | "id": "UZ_tiTQsouqQ", 973 | "colab_type": "code", 974 | "colab": {} 975 | }, 976 | "source": [ 977 | "print(train_state['val_f1s'])" 978 | ], 979 | "execution_count": 0, 980 | "outputs": [] 981 | }, 982 | { 983 | "cell_type": "code", 984 | "metadata": { 985 | "id": "pGTvvqJOouqS", 986 | "colab_type": "code", 987 | "colab": {} 988 | }, 989 | "source": [ 990 | "from sklearn.metrics import classification_report\n", 991 | "from sklearn.metrics import confusion_matrix\n", 992 | "from sklearn.metrics import accuracy_score\n", 993 | "from sklearn.metrics import f1_score" 994 | ], 995 | "execution_count": 0, 996 | "outputs": [] 997 | }, 998 | { 999 | "cell_type": "code", 1000 | "metadata": { 1001 | "id": "zmsdTS5XouqU", 1002 | "colab_type": "code", 1003 | "colab": {} 1004 | }, 1005 | "source": [ 1006 | "\n", 1007 | "print('Train:',classification_report(\n", 1008 | " y_pred=(torch.argmax(train_state['train_preds'][-1],dim=1) ).cpu().long().numpy(),\n", 1009 | " y_true= train_state['train_targets'][-1].cpu().numpy(), \n", 1010 | " digits=4)\n", 1011 | ")\n", 1012 | "print('Dev:',classification_report(\n", 1013 | " y_pred=(torch.argmax(train_state['val_preds'][-1],dim=1) ).cpu().long().numpy(),\n", 1014 | " y_true= train_state['val_targets'][-1].cpu().numpy(), \n", 1015 | " digits=4)\n", 1016 | ")\n" 1017 | ], 1018 | "execution_count": 0, 1019 | "outputs": [] 1020 | }, 1021 | { 1022 | "cell_type": "code", 1023 | "metadata": { 1024 | "id": "dBvhO_3MouqX", 1025 | "colab_type": "code", 1026 | "colab": {} 1027 | }, 1028 | "source": [ 1029 | "best_run_index = train_state['val_f1s'].index(max(train_state['val_f1s']))\n", 1030 | "print('Train:',classification_report(\n", 1031 | " y_pred=(torch.argmax(train_state['train_preds'][best_run_index],dim=1) 
).cpu().long().numpy(),\n", 1032 | " y_true= train_state['train_targets'][best_run_index].cpu().numpy(), \n", 1033 | " digits=4)\n", 1034 | ")\n", 1035 | "print('Dev:',classification_report(\n", 1036 | " y_pred=(torch.argmax(train_state['val_preds'][best_run_index],dim=1) ).cpu().long().numpy(),\n", 1037 | " y_true= train_state['val_targets'][best_run_index].cpu().numpy(), \n", 1038 | " digits=4)\n", 1039 | ")" 1040 | ], 1041 | "execution_count": 0, 1042 | "outputs": [] 1043 | }, 1044 | { 1045 | "cell_type": "code", 1046 | "metadata": { 1047 | "id": "ZyJlr1Ucouqa", 1048 | "colab_type": "code", 1049 | "colab": {} 1050 | }, 1051 | "source": [ 1052 | "def sort_preds(indexes, preds):\n", 1053 | " \"\"\"Sorts the predictions in order, to reverse the effects of shuffle\n", 1054 | " done by dataloader\"\"\"\n", 1055 | " indexes = indexes.cpu().numpy().reshape(-1,1)\n", 1056 | " preds = preds.cpu().numpy()\n", 1057 | " arr_concat = np.hstack((indexes,preds)) #concat the preds and their indexes\n", 1058 | " sort_arr = arr_concat[ arr_concat[:,0].argsort()] #sort based on the indexes\n", 1059 | " sorted_preds = np.delete(sort_arr,0,axis=1)\n", 1060 | " return sorted_preds" 1061 | ], 1062 | "execution_count": 0, 1063 | "outputs": [] 1064 | }, 1065 | { 1066 | "cell_type": "code", 1067 | "metadata": { 1068 | "id": "PEKLcktCourg", 1069 | "colab_type": "code", 1070 | "colab": {} 1071 | }, 1072 | "source": [ 1073 | "def get_optimal_models_v2(train_state, split):\n", 1074 | " l = zip(train_state[f'{split}_f1s'], range(len(train_state[f'{split}_f1s'])))\n", 1075 | " sorted_vals = sorted(l, key = lambda x:x[0], reverse=True)\n", 1076 | " model_idxes = [i[1] for i in sorted_vals]\n", 1077 | " \n", 1078 | " trgts= sort_preds(train_state[f'{split}_indexes'][-1],train_state[f'{split}_targets'][-1].reshape(-1,1))\n", 1079 | " total_preds = len(train_state[f'{split}_indexes'])\n", 1080 | " init = np.zeros(train_state[f'{split}_preds'][-1].shape)\n", 1081 | " max_f1 = 0\n", 1082 | " idxes = []\n", 1083 | " for i in model_idxes:\n", 1084 | " temp = sort_preds(train_state[f'{split}_indexes'][i],train_state[f'{split}_preds'][i])\n", 1085 | " temp2 = init+temp\n", 1086 | " f1 = f1_score(\n", 1087 | " y_pred=temp2.argmax(axis=1),\n", 1088 | " y_true= trgts, average ='macro'\n", 1089 | " )\n", 1090 | " if f1 > max_f1:\n", 1091 | " max_f1 = f1\n", 1092 | " init = init+temp\n", 1093 | " idxes.append(i)\n", 1094 | " print(f'Taking preds from {idxes} | Dev f1:{f1}')\n", 1095 | " return idxes" 1096 | ], 1097 | "execution_count": 0, 1098 | "outputs": [] 1099 | }, 1100 | { 1101 | "cell_type": "code", 1102 | "metadata": { 1103 | "scrolled": true, 1104 | "id": "RI0eIVAtourj", 1105 | "colab_type": "code", 1106 | "colab": {} 1107 | }, 1108 | "source": [ 1109 | "final_optimal_models = get_optimal_models_v2(train_state, 'val')\n", 1110 | "final_optimal_models" 1111 | ], 1112 | "execution_count": 0, 1113 | "outputs": [] 1114 | }, 1115 | { 1116 | "cell_type": "markdown", 1117 | "metadata": { 1118 | "id": "siZdY82mours", 1119 | "colab_type": "text" 1120 | }, 1121 | "source": [ 1122 | "# Making preds on the given test set" 1123 | ] 1124 | }, 1125 | { 1126 | "cell_type": "code", 1127 | "metadata": { 1128 | "id": "Zcztc0lGourz", 1129 | "colab_type": "code", 1130 | "colab": {} 1131 | }, 1132 | "source": [ 1133 | "test_df = data_df_task_b" 1134 | ], 1135 | "execution_count": 0, 1136 | "outputs": [] 1137 | }, 1138 | { 1139 | "cell_type": "code", 1140 | "metadata": { 1141 | "id": "5hkU-POXour5", 1142 | "colab_type": "code", 1143 | "colab": {} 
1144 | }, 1145 | "source": [ 1146 | "test_dataset = dataset\n", 1147 | "test_dataset.set_split('test')" 1148 | ], 1149 | "execution_count": 0, 1150 | "outputs": [] 1151 | }, 1152 | { 1153 | "cell_type": "code", 1154 | "metadata": { 1155 | "id": "Iaq8vD0Xour7", 1156 | "colab_type": "code", 1157 | "colab": {} 1158 | }, 1159 | "source": [ 1160 | "test_dataset._target_df.sample(5)" 1161 | ], 1162 | "execution_count": 0, 1163 | "outputs": [] 1164 | }, 1165 | { 1166 | "cell_type": "code", 1167 | "metadata": { 1168 | "id": "M2PQOwWzousA", 1169 | "colab_type": "code", 1170 | "colab": {} 1171 | }, 1172 | "source": [ 1173 | "print(len(test_df))\n", 1174 | "print(test_dataset._target_df.split.value_counts())" 1175 | ], 1176 | "execution_count": 0, 1177 | "outputs": [] 1178 | }, 1179 | { 1180 | "cell_type": "code", 1181 | "metadata": { 1182 | "id": "uRfZ2GLHousC", 1183 | "colab_type": "code", 1184 | "colab": {} 1185 | }, 1186 | "source": [ 1187 | "def evaluate_testset(model, state, dataset, split,args):\n", 1188 | " \"\"\"Returns the final layer output of our transformer model\n", 1189 | " Puts them in the '{split}_*' keys in the state dict\n", 1190 | " Args:\n", 1191 | " model: A pytorch transformers model\n", 1192 | " state: dict to store outputs\n", 1193 | " dataset: A pytorch Dataset\n", 1194 | " split: The split on which to evaluate the model on\n", 1195 | " args: Arguments from namespace, etc\n", 1196 | " Returns:\n", 1197 | " state: all evaluated output stored in the \"test\" key\n", 1198 | " \"\"\"\n", 1199 | " eval_bar = notebook.tqdm(\n", 1200 | " desc = 'evaluation progress: ',\n", 1201 | " total=dataset.get_num_batches(args.batch_size),\n", 1202 | " position=0,\n", 1203 | " leave=False,\n", 1204 | " )\n", 1205 | " dataset.set_split(split)\n", 1206 | " batch_generator = generate_batches(\n", 1207 | " dataset= dataset, batch_size= args.batch_size, shuffle=False,\n", 1208 | " device = args.device, drop_last=False,\n", 1209 | " pinned_memory = True, n_workers = 2, \n", 1210 | " )\n", 1211 | " eval_bar.reset(\n", 1212 | " total=dataset.get_num_batches(args.batch_size),\n", 1213 | " )\n", 1214 | " model.eval()\n", 1215 | " with torch.no_grad():\n", 1216 | " for batch_index, batch_dict in enumerate(batch_generator):\n", 1217 | " y_pred = model(\n", 1218 | " input_ids = batch_dict['x_data'],\n", 1219 | " attention_mask = batch_dict['x_attn_mask'],\n", 1220 | " )[0]\n", 1221 | " y_pred = y_pred.view(-1, 3)\n", 1222 | "\n", 1223 | " y_pred = y_pred.detach()\n", 1224 | " \n", 1225 | " state['batch_preds'].append(y_pred.cpu())\n", 1226 | " state['batch_indexes'].append(batch_dict['x_index'].cpu())\n", 1227 | " \n", 1228 | " eval_bar.update()\n", 1229 | " \n", 1230 | " if torch.cuda.is_available():\n", 1231 | " torch.cuda.empty_cache()\n", 1232 | " \n", 1233 | " state[f'{split}_preds'].append(\n", 1234 | " torch.cat(state['batch_preds']).cpu()\n", 1235 | " )\n", 1236 | " state[f'{split}_indexes'].append(\n", 1237 | " torch.cat(state['batch_indexes']).cpu()\n", 1238 | " )\n", 1239 | " \n", 1240 | " state['batch_preds'] = []\n", 1241 | " state['batch_indexes'] = []\n", 1242 | " \n", 1243 | " eval_bar.close()\n", 1244 | " return state" 1245 | ], 1246 | "execution_count": 0, 1247 | "outputs": [] 1248 | }, 1249 | { 1250 | "cell_type": "code", 1251 | "metadata": { 1252 | "id": "UDLAcVoOousD", 1253 | "colab_type": "code", 1254 | "colab": {} 1255 | }, 1256 | "source": [ 1257 | "chosen_models = [all_model_paths[i] for i in final_optimal_models]" 1258 | ], 1259 | "execution_count": 0, 1260 | "outputs": [] 1261 | 
}, 1262 | { 1263 | "cell_type": "code", 1264 | "metadata": { 1265 | "id": "ZRhTG0jJousG", 1266 | "colab_type": "code", 1267 | "colab": {} 1268 | }, 1269 | "source": [ 1270 | "test_state = general_utils.make_train_state()\n", 1271 | "for model_path in notebook.tqdm(chosen_models, total=len(chosen_models)):\n", 1272 | " model.load_state_dict(torch.load(model_path)['model'])\n", 1273 | " test_state = evaluate_testset(model, test_state, test_dataset, 'test',args)" 1274 | ], 1275 | "execution_count": 0, 1276 | "outputs": [] 1277 | }, 1278 | { 1279 | "cell_type": "code", 1280 | "metadata": { 1281 | "id": "NyPLbSx_ousH", 1282 | "colab_type": "code", 1283 | "colab": {} 1284 | }, 1285 | "source": [ 1286 | "test_state['test_preds'][-1].shape" 1287 | ], 1288 | "execution_count": 0, 1289 | "outputs": [] 1290 | }, 1291 | { 1292 | "cell_type": "code", 1293 | "metadata": { 1294 | "id": "x9zIQn2PousJ", 1295 | "colab_type": "code", 1296 | "colab": {} 1297 | }, 1298 | "source": [ 1299 | "[test_state['test_preds'][i].size() for i in range(len(test_state['test_preds']))]" 1300 | ], 1301 | "execution_count": 0, 1302 | "outputs": [] 1303 | }, 1304 | { 1305 | "cell_type": "code", 1306 | "metadata": { 1307 | "id": "9k7efmwXousM", 1308 | "colab_type": "code", 1309 | "colab": {} 1310 | }, 1311 | "source": [ 1312 | "len(test_dataset._target_df)" 1313 | ], 1314 | "execution_count": 0, 1315 | "outputs": [] 1316 | }, 1317 | { 1318 | "cell_type": "code", 1319 | "metadata": { 1320 | "id": "1-mAcbFFousO", 1321 | "colab_type": "code", 1322 | "colab": {} 1323 | }, 1324 | "source": [ 1325 | "torch.zeros_like(test_state['test_preds'][0]).size()" 1326 | ], 1327 | "execution_count": 0, 1328 | "outputs": [] 1329 | }, 1330 | { 1331 | "cell_type": "code", 1332 | "metadata": { 1333 | "id": "Zw23sdIAousQ", 1334 | "colab_type": "code", 1335 | "colab": {} 1336 | }, 1337 | "source": [ 1338 | "ensemble_pred = torch.zeros_like(test_state['test_preds'][0])\n", 1339 | "for i in test_state['test_preds']:\n", 1340 | " ensemble_pred += i" 1341 | ], 1342 | "execution_count": 0, 1343 | "outputs": [] 1344 | }, 1345 | { 1346 | "cell_type": "code", 1347 | "metadata": { 1348 | "id": "f7l7scgnousU", 1349 | "colab_type": "code", 1350 | "colab": {} 1351 | }, 1352 | "source": [ 1353 | "int_to_label = {0: 'TIN', 1:'UNT'}\n", 1354 | "# {'UNT': 1, 'TIN': 0}" 1355 | ], 1356 | "execution_count": 0, 1357 | "outputs": [] 1358 | }, 1359 | { 1360 | "cell_type": "code", 1361 | "metadata": { 1362 | "id": "qIG3GtyDousW", 1363 | "colab_type": "code", 1364 | "colab": {} 1365 | }, 1366 | "source": [ 1367 | "t = []\n", 1368 | "for i in torch.argmax(ensemble_pred, dim=1):\n", 1369 | " t.append(int_to_label[i.item()])\n", 1370 | "\n", 1371 | "collections.Counter(t)" 1372 | ], 1373 | "execution_count": 0, 1374 | "outputs": [] 1375 | }, 1376 | { 1377 | "cell_type": "code", 1378 | "metadata": { 1379 | "id": "N43KI4P4ousY", 1380 | "colab_type": "code", 1381 | "colab": {} 1382 | }, 1383 | "source": [ 1384 | "assert len(t) == len(test_df)" 1385 | ], 1386 | "execution_count": 0, 1387 | "outputs": [] 1388 | }, 1389 | { 1390 | "cell_type": "code", 1391 | "metadata": { 1392 | "id": "ANkfSvUSousa", 1393 | "colab_type": "code", 1394 | "colab": {} 1395 | }, 1396 | "source": [ 1397 | "offeval_task_b_pred_analysis_df = pd.DataFrame(\n", 1398 | " data={\n", 1399 | " 'id':test_df.id,\n", 1400 | " 'text':test_df.tweet,\n", 1401 | " 'label':t,\n", 1402 | " }\n", 1403 | ")" 1404 | ], 1405 | "execution_count": 0, 1406 | "outputs": [] 1407 | }, 1408 | { 1409 | "cell_type": "code", 1410 | 
"metadata": { 1411 | "id": "8tYSn6VNousb", 1412 | "colab_type": "code", 1413 | "colab": {} 1414 | }, 1415 | "source": [ 1416 | "offeval_task_b_label_df = pd.DataFrame(\n", 1417 | " data={\n", 1418 | " 'id':test_df.id,\n", 1419 | " 'label':t,\n", 1420 | " }\n", 1421 | ")" 1422 | ], 1423 | "execution_count": 0, 1424 | "outputs": [] 1425 | }, 1426 | { 1427 | "cell_type": "code", 1428 | "metadata": { 1429 | "id": "ompoxc6Nousc", 1430 | "colab_type": "code", 1431 | "colab": {} 1432 | }, 1433 | "source": [ 1434 | "offeval_task_b_pred_analysis_df.to_csv(\n", 1435 | " 'offeval_task_b_pred_analysis_df.csv',index=False,\n", 1436 | ")" 1437 | ], 1438 | "execution_count": 0, 1439 | "outputs": [] 1440 | }, 1441 | { 1442 | "cell_type": "code", 1443 | "metadata": { 1444 | "id": "BHiGB2Q-ouse", 1445 | "colab_type": "code", 1446 | "colab": {} 1447 | }, 1448 | "source": [ 1449 | "offeval_task_b_pred_label_df.to_csv(\n", 1450 | " 'offeval_task_b_pred_label_df.csv', index=False, header=False,\n", 1451 | ")" 1452 | ], 1453 | "execution_count": 0, 1454 | "outputs": [] 1455 | }, 1456 | { 1457 | "cell_type": "code", 1458 | "metadata": { 1459 | "id": "8IVzPSmwousf", 1460 | "colab_type": "code", 1461 | "colab": {} 1462 | }, 1463 | "source": [ 1464 | "offeval_task_b_pred_label_df.label.value_counts()\n" 1465 | ], 1466 | "execution_count": 0, 1467 | "outputs": [] 1468 | }, 1469 | { 1470 | "cell_type": "code", 1471 | "metadata": { 1472 | "id": "mutV5hWkoush", 1473 | "colab_type": "code", 1474 | "colab": {} 1475 | }, 1476 | "source": [ 1477 | "offeval_task_b_pred_analysis_df.label.value_counts()" 1478 | ], 1479 | "execution_count": 0, 1480 | "outputs": [] 1481 | } 1482 | ] 1483 | } -------------------------------------------------------------------------------- /notebooks/Eng Task C - Ensemble DistilRoberta AttnMask Dropout.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.7.0" 21 | }, 22 | "colab": { 23 | "name": "Eng Task C - Ensemble DistilRoberta AttnMask Dropout.ipynb", 24 | "provenance": [] 25 | }, 26 | "accelerator": "GPU" 27 | }, 28 | "cells": [ 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "D_dUF2evouow", 33 | "colab_type": "text" 34 | }, 35 | "source": [ 36 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cozek/OffensEval2020-code/blob/master/notebooks/Eng%20Task%20C%20-%20Ensemble%20DistilRoberta%20AttnMask%20Dropout.ipynb)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "u7Uo50Chouox", 43 | "colab_type": "text" 44 | }, 45 | "source": [ 46 | "# Import Libraries\n", 47 | "\n", 48 | "At the time of our work, we used the following library versions\n", 49 | "- numpy 1.18.1\n", 50 | "- pandas 1.0.1\n", 51 | "- torch 1.2.0\n", 52 | "- Cuda 10.0\n", 53 | "- python 3.7.0\n", 54 | "- sklearn 0.22.1\n", 55 | "- tqdm 4.42.1\n", 56 | "- nltk 3.4.5" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "metadata": { 62 | "id": "vFPGTXG3ouox", 63 | "colab_type": "code", 64 | "colab": {} 65 | }, 66 | "source": [ 67 | 
"!git clone https://github.com/cozek/OffensEval2020-code/" 68 | ], 69 | "execution_count": 0, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "FpGHYSkhouo0", 76 | "colab_type": "code", 77 | "colab": {} 78 | }, 79 | "source": [ 80 | "!git clone https://github.com/huggingface/transformers\n", 81 | "!pip install /content/transformers/" 82 | ], 83 | "execution_count": 0, 84 | "outputs": [] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "metadata": { 89 | "id": "6om7VnoNouo3", 90 | "colab_type": "code", 91 | "colab": {} 92 | }, 93 | "source": [ 94 | "import sys\n", 95 | "sys.path.append('/content/OffensEval2020-code/src/')\n", 96 | "import collections\n", 97 | "from typing import Callable\n", 98 | "import numpy as np\n", 99 | "np.random.seed(42)\n", 100 | "import pandas as pd\n", 101 | "from tqdm import notebook\n", 102 | "import importlib\n", 103 | "import pprint\n", 104 | "import nltk\n", 105 | "import datetime\n", 106 | "import os\n", 107 | "from argparse import Namespace\n", 108 | "\n", 109 | "from collections import Counter" 110 | ], 111 | "execution_count": 0, 112 | "outputs": [] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "metadata": { 117 | "id": "iyUPjzykouo5", 118 | "colab_type": "code", 119 | "colab": {} 120 | }, 121 | "source": [ 122 | "import utils.general as general_utils\n", 123 | "import utils.transformer.data as transformer_data_utils\n", 124 | "import utils.transformer.general as transformer_general_utils\n", 125 | "general_utils.set_seed_everywhere()" 126 | ], 127 | "execution_count": 0, 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "metadata": { 133 | "id": "MoJmxSDPouo9", 134 | "colab_type": "code", 135 | "colab": {} 136 | }, 137 | "source": [ 138 | "import logging\n", 139 | "logging.basicConfig(level=logging.INFO) " 140 | ], 141 | "execution_count": 0, 142 | "outputs": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "metadata": { 147 | "id": "2ehv7SLoouo_", 148 | "colab_type": "code", 149 | "colab": {} 150 | }, 151 | "source": [ 152 | "import torch\n", 153 | "import torch.nn as nn\n", 154 | "import torch.nn.functional as F\n", 155 | "import torch.optim as optim\n", 156 | "from torch.utils.data import Dataset, DataLoader\n", 157 | "torch.__version__ # we used version 1.2.0\n" 158 | ], 159 | "execution_count": 0, 160 | "outputs": [] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "metadata": { 165 | "id": "TkS9WQy2oupC", 166 | "colab_type": "code", 167 | "colab": {} 168 | }, 169 | "source": [ 170 | "# Import RAdam and Lookahead\n", 171 | "from radam.radam import RAdam\n", 172 | "from lookahead.optimizer import Lookahead\n" 173 | ], 174 | "execution_count": 0, 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "71wul4V7oupF", 181 | "colab_type": "code", 182 | "colab": {} 183 | }, 184 | "source": [ 185 | "from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification" 186 | ], 187 | "execution_count": 0, 188 | "outputs": [] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "metadata": { 193 | "id": "eoAlBoFYoupH", 194 | "colab_type": "code", 195 | "colab": {} 196 | }, 197 | "source": [ 198 | " args = Namespace(\n", 199 | " #use cuda by default\n", 200 | " device = 'cuda' if torch.cuda.is_available() else 'cpu',\n", 201 | " \n", 202 | " #set batch size and number of epochs\n", 203 | " batch_size = 32,\n", 204 | " num_epochs = 20,\n", 205 | " \n", 206 | " #set the learning rate\n", 207 | " learning_rate = 0.0001,\n", 208 | "\n", 
209 | " #location of the train, dev and test csv\n", 210 | " train_val_csv = '/content/OffensEval2020-code/data/eng/task_c_tiny.zip',\n", 211 | " test_csv = '/content/OffensEval2020-code/data/test_data/test_a_tweets.tsv',\n", 212 | " \n", 213 | " #directory to save our models at\n", 214 | " directory = './models/', \n", 215 | " model_name = 'roberta_attn_trac_task_a.pt',\n", 216 | " \n", 217 | " date = datetime.datetime.now().strftime(\"%a_%d_%b_%Y/\"),\n", 218 | ")" 219 | ], 220 | "execution_count": 0, 221 | "outputs": [] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "id": "Tt5X-gWsoupL", 227 | "colab_type": "text" 228 | }, 229 | "source": [ 230 | "## Model save location" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "metadata": { 236 | "id": "xMOYDefpoupM", 237 | "colab_type": "code", 238 | "colab": {} 239 | }, 240 | "source": [ 241 | "directory = args.directory + args.date\n", 242 | "if not os.path.exists(directory):\n", 243 | " os.makedirs(directory)\n", 244 | "args.directory = directory\n", 245 | "print(args.directory)" 246 | ], 247 | "execution_count": 0, 248 | "outputs": [] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "id": "thSqAbT3oupP", 254 | "colab_type": "text" 255 | }, 256 | "source": [ 257 | "## Load presplit dataset portion\n", 258 | "```\n", 259 | "Labelled as\n", 260 | "\n", 261 | "IND = 0\n", 262 | "GRP = 1\n", 263 | "OTH = 2\n", 264 | "```" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "metadata": { 270 | "id": "2WazWg4zoupP", 271 | "colab_type": "code", 272 | "colab": {} 273 | }, 274 | "source": [ 275 | "data_df_task_c = pd.read_csv(args.train_val_csv, compression='zip')\n", 276 | "print(data_df_task_c.label.value_counts())\n", 277 | "print(data_df_task_c.split.value_counts())" 278 | ], 279 | "execution_count": 0, 280 | "outputs": [] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "metadata": { 285 | "id": "_dV3P48EphCH", 286 | "colab_type": "code", 287 | "colab": {} 288 | }, 289 | "source": [ 290 | "data_df_task_c.columns" 291 | ], 292 | "execution_count": 0, 293 | "outputs": [] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "metadata": { 298 | "id": "LkUZ5O46oupS", 299 | "colab_type": "code", 300 | "colab": {} 301 | }, 302 | "source": [ 303 | "with pd.option_context('display.max_colwidth', -1): \n", 304 | " print(data_df_task_c[['text','label']].sample(5))" 305 | ], 306 | "execution_count": 0, 307 | "outputs": [] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": { 312 | "id": "NZrYLTrxoupU", 313 | "colab_type": "text" 314 | }, 315 | "source": [ 316 | "## Importing the Roberta Tokeniker and Punkt sentence tokenizer" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "metadata": { 322 | "id": "PtFADHnToupV", 323 | "colab_type": "code", 324 | "colab": {} 325 | }, 326 | "source": [ 327 | "class RobertaPreprocessor():\n", 328 | " def __init__(self,transformer_tokenizer,sentence_detector):\n", 329 | " self.transformer_tokenizer = transformer_tokenizer\n", 330 | " self.sentence_detector = sentence_detector\n", 331 | " self.bos_token = transformer_tokenizer.bos_token\n", 332 | " self.sep_token = ' ' + transformer_tokenizer.sep_token + ' '\n", 333 | " def add_special_tokens(self, text):\n", 334 | " sentences = self.sentence_detector.tokenize(text)\n", 335 | " eos_added_text = self.sep_token.join(sentences) \n", 336 | " return self.bos_token +' '+ eos_added_text + ' ' + self.transformer_tokenizer.sep_token" 337 | ], 338 | "execution_count": 0, 339 | "outputs": [] 340 | 
}, 341 | { 342 | "cell_type": "code", 343 | "metadata": { 344 | "id": "ar4EbV4BoupX", 345 | "colab_type": "code", 346 | "colab": {} 347 | }, 348 | "source": [ 349 | "!python -c 'import nltk; nltk.download(\"punkt\")'" 350 | ], 351 | "execution_count": 0, 352 | "outputs": [] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "metadata": { 357 | "scrolled": true, 358 | "id": "6RlaXDBtoupY", 359 | "colab_type": "code", 360 | "colab": {} 361 | }, 362 | "source": [ 363 | "roberta_tokenizer = tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')\n", 364 | "punkt_sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')" 365 | ], 366 | "execution_count": 0, 367 | "outputs": [] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "metadata": { 372 | "id": "RKFM-0bqoupb", 373 | "colab_type": "code", 374 | "colab": {} 375 | }, 376 | "source": [ 377 | "roberta_preproc = RobertaPreprocessor(roberta_tokenizer, punkt_sentence_detector)" 378 | ], 379 | "execution_count": 0, 380 | "outputs": [] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "metadata": { 385 | "id": "jDq61UIRoupf", 386 | "colab_type": "code", 387 | "colab": {} 388 | }, 389 | "source": [ 390 | "#apply the preprocessor on the exploded dataframe\n", 391 | "data_df_task_c['text'] = data_df_task_c['text'].map(roberta_preproc.add_special_tokens)\n" 392 | ], 393 | "execution_count": 0, 394 | "outputs": [] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "metadata": { 399 | "id": "kocu45Xtoupj", 400 | "colab_type": "code", 401 | "colab": {} 402 | }, 403 | "source": [ 404 | "with pd.option_context('display.max_colwidth', -1): \n", 405 | " print(data_df_task_c[['text','label']].sample(5))" 406 | ], 407 | "execution_count": 0, 408 | "outputs": [] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": { 413 | "id": "0NOwcyecoupm", 414 | "colab_type": "text" 415 | }, 416 | "source": [ 417 | "### Implement Attention Mask Dropout in the vectorizer" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "metadata": { 423 | "id": "KyQMayxOoupn", 424 | "colab_type": "code", 425 | "colab": {} 426 | }, 427 | "source": [ 428 | "class SimpleVectorizer():\n", 429 | " def __init__(self,tokenizer: Callable, max_seq_len: int):\n", 430 | " \"\"\"\n", 431 | " Args:\n", 432 | " tokenizer (Callable): transformer tokenizer\n", 433 | " max_seq_len (int): Maximum sequence lenght \n", 434 | " \"\"\"\n", 435 | " self.tokenizer = tokenizer\n", 436 | " self._max_seq_len = max_seq_len\n", 437 | "\n", 438 | " def vectorize(self,text :str):\n", 439 | " \n", 440 | " encoded = self.tokenizer.encode_plus(\n", 441 | " text,\n", 442 | " add_special_tokens=False, #already added by preproc\n", 443 | " max_length = self._max_seq_len,\n", 444 | " pad_to_max_length = True,\n", 445 | " )\n", 446 | " ids = np.array(encoded['input_ids'], dtype=np.int64)\n", 447 | " attn = np.array(encoded['attention_mask'], dtype=np.int64)\n", 448 | " \n", 449 | " return ids, attn\n", 450 | "\n", 451 | "class Vectorizer():\n", 452 | " \"\"\"Vectorizer with Attention Mask Dropout\"\"\"\n", 453 | " def __init__(self,tokenizer: Callable, max_seq_len: int ):\n", 454 | " \"\"\"\n", 455 | " Args:\n", 456 | " tokenizer (Callable): transformer tokenizer\n", 457 | " max_seq_len (int): Maximum sequence lenght \n", 458 | " \"\"\"\n", 459 | " self.tokenizer = tokenizer\n", 460 | " self._max_seq_len = max_seq_len\n", 461 | "\n", 462 | " def vectorize(self,text :str, mask_prob: float = 0.50, mask_amount:float=0.30):\n", 463 | " \"\"\"Implements Attention Mask Dropout\n", 464 | " 
\n", 465 | " Args:\n", 466 | " text (str): The string to vectorize\n", 467 | " mask_prob (float): Probability of the attention mask \n", 468 | " dropout being applied\n", 469 | " mask_amount (float): Percentage of tokens to mask\n", 470 | "\n", 471 | " Returns:\n", 472 | " ids (np.array) : Array to token ids of the text\n", 473 | " attn (np.array) : 0-1 Array of attention masks\n", 474 | " \"\"\"\n", 475 | "\n", 476 | " encoded = self.tokenizer.encode_plus(\n", 477 | " text,\n", 478 | " add_special_tokens=False, #already added by preproc\n", 479 | " max_length = self._max_seq_len,\n", 480 | " pad_to_max_length = True,\n", 481 | " )\n", 482 | " ids = np.array(encoded['input_ids'], dtype=np.int64)\n", 483 | " attn = np.array(encoded['attention_mask'], dtype=np.int64)\n", 484 | " prob = np.random.rand(1)[0]\n", 485 | " if prob <= mask_prob:\n", 486 | " len_of_sent = np.where(ids==tokenizer.pad_token_id)[0][0]\n", 487 | " amount_to_mask = max(int(len_of_sent * mask_amount ) , 1)\n", 488 | " ids_to_not_attend = [np.random.randint(low=0, high=len_of_sent )\n", 489 | " for i in range(amount_to_mask)]\n", 490 | " attn[ids_to_not_attend]=0\n", 491 | " ids[ids_to_not_attend] = tokenizer.mask_token_id\n", 492 | " return ids, attn" 493 | ], 494 | "execution_count": 0, 495 | "outputs": [] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": { 500 | "id": "m0pI_W73rkHt", 501 | "colab_type": "text" 502 | }, 503 | "source": [ 504 | "Attention Mask Dropout Example" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "metadata": { 510 | "id": "1ZTtbNz7oupp", 511 | "colab_type": "code", 512 | "colab": {} 513 | }, 514 | "source": [ 515 | "v = Vectorizer(roberta_tokenizer, 15) #attention maskdropout vectorizer\n", 516 | "sv = SimpleVectorizer(roberta_tokenizer, 15) #simple vectorizer" 517 | ], 518 | "execution_count": 0, 519 | "outputs": [] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "metadata": { 524 | "id": "sx4WAmFuoups", 525 | "colab_type": "code", 526 | "colab": {} 527 | }, 528 | "source": [ 529 | "sent = \"I am alright bro, dont worry about me\"\n", 530 | "_, attn_masks_dropped = v.vectorize(sent)\n", 531 | "attn_masks_dropped" 532 | ], 533 | "execution_count": 0, 534 | "outputs": [] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "metadata": { 539 | "id": "BGb2aQcTrWxv", 540 | "colab_type": "code", 541 | "colab": {} 542 | }, 543 | "source": [ 544 | "_, attn_masks = sv.vectorize(sent)\n", 545 | "attn_masks" 546 | ], 547 | "execution_count": 0, 548 | "outputs": [] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": { 553 | "id": "EfXDJER9wnGn", 554 | "colab_type": "text" 555 | }, 556 | "source": [ 557 | "### Create the dataset class" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "metadata": { 563 | "id": "JI0SzK1Woupw", 564 | "colab_type": "code", 565 | "colab": {} 566 | }, 567 | "source": [ 568 | "class HateDataset(Dataset):\n", 569 | " def __init__(\n", 570 | " self,\n", 571 | " data_df: pd.DataFrame,\n", 572 | " tokenizer: Callable,\n", 573 | " max_seq_length:int = None,\n", 574 | " ):\n", 575 | " \"\"\"\n", 576 | " Args:\n", 577 | " data_df (pandas.DataFrame): df containing the labels and text\n", 578 | " tokenizer (tokenizer module for the transformer)\n", 579 | " \"\"\"\n", 580 | " self.data_df = data_df\n", 581 | " self.tokenizer = tokenizer\n", 582 | "\n", 583 | " if max_seq_length is None:\n", 584 | " self._max_seq_length = self._get_max_len(data_df,tokenizer)\n", 585 | " else:\n", 586 | " self._max_seq_length = max_seq_length\n", 587 | "\n", 
588 | " self.train_df = self.data_df[self.data_df.split == 'train']\n", 589 | " self.train_size = len(self.train_df)\n", 590 | "\n", 591 | " self.val_df = self.data_df[self.data_df.split == 'val']\n", 592 | " self.val_size = len(self.val_df)\n", 593 | "\n", 594 | " self.test_df = self.data_df[self.data_df.split == 'test']\n", 595 | " self.test_size = len(self.test_df)\n", 596 | " \n", 597 | " self.simple_vectorize = False,\n", 598 | " self._simple_vectorizer = SimpleVectorizer(tokenizer, self._max_seq_length)\n", 599 | " self._vectorizer = Vectorizer(tokenizer, self._max_seq_length)\n", 600 | " \n", 601 | " self._lookup_dict = {\n", 602 | " 'train': (self.train_df, self.train_size),\n", 603 | " 'val': (self.val_df, self.val_size),\n", 604 | " 'test': (self.test_df, self.test_size)\n", 605 | " }\n", 606 | "\n", 607 | " self.set_split('train')\n", 608 | "\n", 609 | " class_counts = data_df.label.value_counts().to_dict()\n", 610 | " #sorted on the basis of class label,eg, 0,1,2..\n", 611 | " cts = sorted([(lbl,cts) for lbl,cts in class_counts.items()], key=lambda x: x[0])\n", 612 | " freq = [ x[1] for x in cts ]\n", 613 | " # print(freq,cts)\n", 614 | " self.class_weights = 1.0/ torch.tensor(freq, dtype=torch.float32)\n", 615 | " \n", 616 | " def flip_simple_vectorizer(self) :\n", 617 | " if self.simple_vectorize:\n", 618 | " self.simple_vectorize=False\n", 619 | " else:\n", 620 | " self.simple_vectorize= True\n", 621 | " \n", 622 | " def _get_max_len(self,data_df: pd.DataFrame, tokenizer: Callable):\n", 623 | " len_func = lambda x: len(self.tokenizer.encode_plus(x)['input_ids'])\n", 624 | " max_len = data_df.text.map(len_func).max() \n", 625 | " return max_len\n", 626 | "\n", 627 | " def set_split(self, split=\"train\"):\n", 628 | " \"\"\" selects the splits in the dataset using a column in the dataframe \"\"\"\n", 629 | " self._target_split = split\n", 630 | " self._target_df, self._target_size = self._lookup_dict[split]\n", 631 | " \n", 632 | " def __len__(self):\n", 633 | " return self._target_size\n", 634 | " \n", 635 | " def __getitem__(self, index):\n", 636 | " \"\"\"the primary entry point method for PyTorch datasets\n", 637 | " \n", 638 | " Args:\n", 639 | " index (int): the index to the data point \n", 640 | " Returns:\n", 641 | " a dictionary holding the data point's features (x_data) and label (y_target)\n", 642 | " \"\"\"\n", 643 | " row = self._target_df.iloc[index]\n", 644 | " \n", 645 | " if self._target_split == 'train':\n", 646 | " indices, attention_masks = self._vectorizer.vectorize(row.text)\n", 647 | " else:\n", 648 | " indices, attention_masks = self._simple_vectorizer.vectorize(row.text)\n", 649 | "\n", 650 | " label = row.label\n", 651 | " return {'x_data': indices,\n", 652 | " 'x_attn_mask': attention_masks,\n", 653 | " 'x_index': index,\n", 654 | " 'y_target': label}\n", 655 | " \n", 656 | " def get_num_batches(self, batch_size):\n", 657 | " \"\"\"Given a batch size, return the number of batches in the dataset\n", 658 | " \n", 659 | " Args:\n", 660 | " batch_size (int)\n", 661 | " Returns:\n", 662 | " number of batches in the dataset\n", 663 | " \"\"\"\n", 664 | " return len(self) // batch_size" 665 | ], 666 | "execution_count": 0, 667 | "outputs": [] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "metadata": { 672 | "id": "PGvvsTq9oupy", 673 | "colab_type": "code", 674 | "colab": {} 675 | }, 676 | "source": [ 677 | "def generate_batches(dataset, batch_size, shuffle=True,\n", 678 | " drop_last=False, device=\"cpu\", pinned_memory = False, n_workers = 0): \n", 679 
| " \"\"\"\n", 680 | " A generator function which wraps the PyTorch DataLoader. It will \n", 681 | " ensure each tensor is on the write device location.\n", 682 | " \"\"\"\n", 683 | " dataloader = DataLoader(dataset=dataset, batch_size=batch_size,\n", 684 | " shuffle=shuffle, drop_last=drop_last,\n", 685 | " pin_memory= pinned_memory,\n", 686 | " num_workers = n_workers,\n", 687 | " )\n", 688 | " \n", 689 | " for data_dict in dataloader:\n", 690 | " out_data_dict = {}\n", 691 | " out_data_dict['x_data'] = data_dict['x_data'].to(\n", 692 | " device, non_blocking= (True if pinned_memory else False) \n", 693 | " )\n", 694 | " out_data_dict['x_attn_mask'] = data_dict['x_attn_mask'].to(\n", 695 | " device, non_blocking= (True if pinned_memory else False) \n", 696 | " )\n", 697 | " out_data_dict['x_index'] = data_dict['x_index']\n", 698 | " out_data_dict['y_target'] = data_dict['y_target'].to(\n", 699 | " device, non_blocking= (True if pinned_memory else False) \n", 700 | " )\n", 701 | " yield out_data_dict" 702 | ], 703 | "execution_count": 0, 704 | "outputs": [] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "metadata": { 709 | "id": "dqjxEQKZoup0", 710 | "colab_type": "code", 711 | "colab": {} 712 | }, 713 | "source": [ 714 | "dataset = HateDataset(\n", 715 | " data_df = data_df_task_c,\n", 716 | " tokenizer = roberta_tokenizer\n", 717 | ")" 718 | ], 719 | "execution_count": 0, 720 | "outputs": [] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "metadata": { 725 | "scrolled": true, 726 | "id": "r1S0e8djoup3", 727 | "colab_type": "code", 728 | "colab": {} 729 | }, 730 | "source": [ 731 | "assert dataset._max_seq_length <= 512" 732 | ], 733 | "execution_count": 0, 734 | "outputs": [] 735 | }, 736 | { 737 | "cell_type": "markdown", 738 | "metadata": { 739 | "id": "faAM7TdDoup5", 740 | "colab_type": "text" 741 | }, 742 | "source": [ 743 | "# Initialize the Roberta model\n", 744 | "\n", 745 | "\n" 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "metadata": { 751 | "scrolled": false, 752 | "id": "Sdjpj_fvoup6", 753 | "colab_type": "code", 754 | "colab": {} 755 | }, 756 | "source": [ 757 | "model = RobertaForSequenceClassification.from_pretrained(\n", 758 | " 'distilroberta-base',\n", 759 | " num_labels=len(set(data_df_task_c.label)),\n", 760 | ")" 761 | ], 762 | "execution_count": 0, 763 | "outputs": [] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "metadata": { 768 | "scrolled": true, 769 | "id": "7VAugC07oup8", 770 | "colab_type": "code", 771 | "colab": {} 772 | }, 773 | "source": [ 774 | "model.to(args.device)" 775 | ], 776 | "execution_count": 0, 777 | "outputs": [] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "metadata": { 782 | "id": "2nf9iX9Eoup_", 783 | "colab_type": "code", 784 | "colab": {} 785 | }, 786 | "source": [ 787 | "early_stopping = transformer_general_utils.EarlyStopping(patience=4)" 788 | ], 789 | "execution_count": 0, 790 | "outputs": [] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "metadata": { 795 | "id": "BjtphpU6s4LV", 796 | "colab_type": "code", 797 | "colab": {} 798 | }, 799 | "source": [ 800 | "!nvidia-smi" 801 | ], 802 | "execution_count": 0, 803 | "outputs": [] 804 | }, 805 | { 806 | "cell_type": "code", 807 | "metadata": { 808 | "id": "ZjFXYTcEouqB", 809 | "colab_type": "code", 810 | "colab": {} 811 | }, 812 | "source": [ 813 | "args.num_epochs = 20\n", 814 | "args.batch_size = 70" 815 | ], 816 | "execution_count": 0, 817 | "outputs": [] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "metadata": { 822 | "scrolled": false, 823 | "id": 
"OYfdxscNouqE", 824 | "colab_type": "code", 825 | "colab": {} 826 | }, 827 | "source": [ 828 | "loss_func = nn.CrossEntropyLoss()\n", 829 | "\n", 830 | "print(f'Using LR:{args.learning_rate}')\n", 831 | "base_optimizer = RAdam(model.parameters(), lr = args.learning_rate)\n", 832 | "optimizer = Lookahead(optimizer = base_optimizer, k = 5, alpha=0.5 )\n", 833 | "scheduler = optim.lr_scheduler.ReduceLROnPlateau(\n", 834 | " optimizer=optimizer.optimizer, factor =0.1 ,mode='max',\n", 835 | ")" 836 | ], 837 | "execution_count": 0, 838 | "outputs": [] 839 | }, 840 | { 841 | "cell_type": "markdown", 842 | "metadata": { 843 | "id": "HvhB0DIPouqH", 844 | "colab_type": "text" 845 | }, 846 | "source": [ 847 | "# Begin Training" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "metadata": { 853 | "id": "ta4xhZcdouqH", 854 | "colab_type": "code", 855 | "colab": {} 856 | }, 857 | "source": [ 858 | "train_state = general_utils.make_train_state()\n", 859 | "train_state.keys()\n" 860 | ], 861 | "execution_count": 0, 862 | "outputs": [] 863 | }, 864 | { 865 | "cell_type": "code", 866 | "metadata": { 867 | "scrolled": false, 868 | "id": "Hyp2Q5ReouqK", 869 | "colab_type": "code", 870 | "colab": {} 871 | }, 872 | "source": [ 873 | "epoch_bar = notebook.tqdm(\n", 874 | " desc = 'training_routine',\n", 875 | " total = args.num_epochs,\n", 876 | " position=0,\n", 877 | " leave = True,\n", 878 | ")\n", 879 | "dataset.set_split('train')\n", 880 | "train_bar = notebook.tqdm(\n", 881 | " desc = 'split=train ',\n", 882 | " total=dataset.get_num_batches(args.batch_size),\n", 883 | " position=0,\n", 884 | " leave=True,\n", 885 | ")\n", 886 | "dataset.set_split('val')\n", 887 | "eval_bar = notebook.tqdm(\n", 888 | " desc = 'split=eval',\n", 889 | " total=dataset.get_num_batches(args.batch_size),\n", 890 | " position=0,\n", 891 | " leave=True,\n", 892 | ")\n", 893 | "\n", 894 | "old_val_acc = 0\n", 895 | "old_f1 = 0\n", 896 | "model_state = None\n", 897 | "for epoch_index in range(args.num_epochs):\n", 898 | " train_state['epoch_in'] = epoch_index\n", 899 | "\n", 900 | " dataset.set_split('train')\n", 901 | "\n", 902 | " batch_generator = generate_batches(\n", 903 | " dataset= dataset, batch_size= args.batch_size, shuffle=True,\n", 904 | " device = args.device, drop_last=False,\n", 905 | " pinned_memory = True, n_workers = 3, \n", 906 | " )\n", 907 | "\n", 908 | " running_loss = 0.0\n", 909 | " running_acc = 0.0\n", 910 | " running_f1 = 0.0\n", 911 | " model.train()\n", 912 | "\n", 913 | " train_bar.reset(\n", 914 | " total=dataset.get_num_batches(args.batch_size),\n", 915 | " )\n", 916 | "\n", 917 | " for batch_index, batch_dict in enumerate(batch_generator):\n", 918 | " optimizer.zero_grad()\n", 919 | " \n", 920 | " loss,y_pred = model(\n", 921 | " input_ids = batch_dict['x_data'],\n", 922 | " attention_mask = batch_dict['x_attn_mask'],\n", 923 | " labels= batch_dict['y_target'].unsqueeze(1),\n", 924 | " )[:2]\n", 925 | " \n", 926 | " y_pred = y_pred.view(-1, len(set(dataset.data_df.label)))\n", 927 | " \n", 928 | "# scheduler.step()\n", 929 | " loss.backward()\n", 930 | " optimizer.step()\n", 931 | " \n", 932 | " loss_t = loss.item()\n", 933 | " running_loss += (loss_t - running_loss) / (batch_index + 1)\n", 934 | " \n", 935 | " y_pred = y_pred.detach().cpu()\n", 936 | " batch_dict['y_target'] = batch_dict['y_target'].cpu()\n", 937 | " \n", 938 | " acc_t = transformer_general_utils \\\n", 939 | " .compute_accuracy(y_pred, batch_dict['y_target'])\n", 940 | " \n", 941 | " f1_t = transformer_general_utils \\\n", 
942 | " .compute_macro_f1(y_pred, batch_dict['y_target'])\n", 943 | "\n", 944 | " train_state['batch_preds'].append(y_pred)\n", 945 | " train_state['batch_targets'].append(batch_dict['y_target'])\n", 946 | " train_state['batch_indexes'].append(batch_dict['x_index'])\n", 947 | "\n", 948 | " running_acc += (acc_t - running_acc) / (batch_index + 1)\n", 949 | " running_f1 += (f1_t - running_f1) / (batch_index + 1)\n", 950 | "\n", 951 | " train_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,\n", 952 | " epoch=epoch_index)\n", 953 | "\n", 954 | " train_bar.update()\n", 955 | "\n", 956 | " if torch.cuda.is_available():\n", 957 | " torch.cuda.empty_cache()\n", 958 | " \n", 959 | " train_state['train_accuracies'].append(running_acc)\n", 960 | " train_state['train_losses'].append(running_loss)\n", 961 | " \n", 962 | " train_state['train_preds'].append(\n", 963 | " torch.cat(train_state['batch_preds']).cpu()\n", 964 | " )\n", 965 | " train_state['train_targets'].append(\n", 966 | " torch.cat(train_state['batch_targets']).cpu()\n", 967 | " )\n", 968 | " train_state['train_indexes'].append(\n", 969 | " torch.cat(train_state['batch_indexes']).cpu()\n", 970 | " )\n", 971 | " train_f1 = transformer_general_utils \\\n", 972 | " .compute_macro_f1(train_state['train_preds'][-1],\n", 973 | " train_state['train_targets'][-1],\n", 974 | " )\n", 975 | " \n", 976 | " train_state['train_f1s'].append(train_f1)\n", 977 | " \n", 978 | " train_state['batch_preds'] = []\n", 979 | " train_state['batch_targets'] = []\n", 980 | " train_state['batch_indexes'] = []\n", 981 | " \n", 982 | " \n", 983 | " dataset.set_split('val')\n", 984 | " batch_generator = generate_batches(\n", 985 | " dataset= dataset, batch_size= args.batch_size, shuffle=True,\n", 986 | " device = args.device, drop_last=False,\n", 987 | " pinned_memory = True, n_workers = 2, \n", 988 | " )\n", 989 | " eval_bar.reset(\n", 990 | " total=dataset.get_num_batches(args.batch_size),\n", 991 | " )\n", 992 | " running_loss = 0.0\n", 993 | " running_acc = 0.0\n", 994 | " running_f1 = 0.0\n", 995 | " \n", 996 | " model.eval()\n", 997 | " with torch.no_grad():\n", 998 | " optimizer._backup_and_load_cache()\n", 999 | " for batch_index, batch_dict in enumerate(batch_generator):\n", 1000 | " loss, y_pred = model(\n", 1001 | " input_ids = batch_dict['x_data'],\n", 1002 | " attention_mask = batch_dict['x_attn_mask'],\n", 1003 | " labels= batch_dict['y_target'].unsqueeze(1),\n", 1004 | " )[:2]\n", 1005 | " y_pred = y_pred.view(-1, len(set(dataset.data_df.label)))\n", 1006 | " \n", 1007 | " loss_t = loss.item()\n", 1008 | " running_loss += (loss_t - running_loss) / (batch_index + 1)\n", 1009 | "\n", 1010 | " y_pred = y_pred.detach()\n", 1011 | " batch_dict['y_target'] = batch_dict['y_target'].cpu()\n", 1012 | " \n", 1013 | " acc_t = transformer_general_utils\\\n", 1014 | " .compute_accuracy(y_pred, batch_dict['y_target'])\n", 1015 | " f1_t = transformer_general_utils \\\n", 1016 | " .compute_macro_f1(y_pred, batch_dict['y_target'])\n", 1017 | "\n", 1018 | " train_state['batch_preds'].append(y_pred.cpu())\n", 1019 | " train_state['batch_targets'].append(batch_dict['y_target'].cpu())\n", 1020 | " train_state['batch_indexes'].append(batch_dict['x_index'].cpu())\n", 1021 | "\n", 1022 | " running_acc += (acc_t - running_acc) / (batch_index + 1)\n", 1023 | " running_f1 += (f1_t - running_f1) / (batch_index + 1)\n", 1024 | " \n", 1025 | "\n", 1026 | " eval_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,\n", 1027 | " 
epoch=epoch_index)\n", 1028 | " eval_bar.update()\n", 1029 | " \n", 1030 | " train_state['val_accuracies'].append(running_acc)\n", 1031 | " train_state['val_losses'].append(running_loss)\n", 1032 | " \n", 1033 | " \n", 1034 | " train_state['val_preds'].append(\n", 1035 | " torch.cat(train_state['batch_preds']).cpu()\n", 1036 | " )\n", 1037 | "\n", 1038 | " train_state['val_targets'].append(\n", 1039 | " torch.cat(train_state['batch_targets']).cpu()\n", 1040 | " )\n", 1041 | " train_state['val_indexes'].append(\n", 1042 | " torch.cat(train_state['batch_indexes']).cpu()\n", 1043 | " )\n", 1044 | " val_f1 = transformer_general_utils \\\n", 1045 | " .compute_macro_f1(train_state['val_preds'][-1],\n", 1046 | " train_state['val_targets'][-1],\n", 1047 | " )\n", 1048 | " \n", 1049 | " train_state['val_f1s'].append(val_f1)\n", 1050 | " \n", 1051 | " train_state['batch_preds'] = []\n", 1052 | " train_state['batch_targets'] = []\n", 1053 | " train_state['batch_indexes'] = []\n", 1054 | " \n", 1055 | " torch.save(\n", 1056 | " {\n", 1057 | " 'model':model.state_dict(),\n", 1058 | " },\n", 1059 | " args.directory + f'_epoc_{epoch_index}_' + args.model_name,\n", 1060 | " )\n", 1061 | " \n", 1062 | " scheduler.step(val_f1)\n", 1063 | " early_stopping(val_f1, model)\n", 1064 | " optimizer._clear_and_load_backup()\n", 1065 | " epoch_bar.set_postfix( best_f1 = early_stopping.best_score, current = val_f1)\n", 1066 | " epoch_bar.update() \n", 1067 | " \n", 1068 | " if early_stopping.early_stop:\n", 1069 | " print(\"Early stopping\")\n", 1070 | " break\n" 1071 | ], 1072 | "execution_count": 0, 1073 | "outputs": [] 1074 | }, 1075 | { 1076 | "cell_type": "code", 1077 | "metadata": { 1078 | "id": "JvhJbjv3ouqM", 1079 | "colab_type": "code", 1080 | "colab": {} 1081 | }, 1082 | "source": [ 1083 | "epoch_index" 1084 | ], 1085 | "execution_count": 0, 1086 | "outputs": [] 1087 | }, 1088 | { 1089 | "cell_type": "code", 1090 | "metadata": { 1091 | "scrolled": true, 1092 | "id": "UZ_tiTQsouqQ", 1093 | "colab_type": "code", 1094 | "colab": {} 1095 | }, 1096 | "source": [ 1097 | "print(train_state['val_f1s'])" 1098 | ], 1099 | "execution_count": 0, 1100 | "outputs": [] 1101 | }, 1102 | { 1103 | "cell_type": "code", 1104 | "metadata": { 1105 | "id": "pGTvvqJOouqS", 1106 | "colab_type": "code", 1107 | "colab": {} 1108 | }, 1109 | "source": [ 1110 | "from sklearn.metrics import classification_report\n", 1111 | "from sklearn.metrics import confusion_matrix\n", 1112 | "from sklearn.metrics import accuracy_score\n", 1113 | "from sklearn.metrics import f1_score" 1114 | ], 1115 | "execution_count": 0, 1116 | "outputs": [] 1117 | }, 1118 | { 1119 | "cell_type": "code", 1120 | "metadata": { 1121 | "id": "zmsdTS5XouqU", 1122 | "colab_type": "code", 1123 | "colab": {} 1124 | }, 1125 | "source": [ 1126 | "\n", 1127 | "print('Train:',classification_report(\n", 1128 | " y_pred=(torch.argmax(train_state['train_preds'][-1],dim=1) ).cpu().long().numpy(),\n", 1129 | " y_true= train_state['train_targets'][-1].cpu().numpy(), \n", 1130 | " digits=4)\n", 1131 | ")\n", 1132 | "print('Dev:',classification_report(\n", 1133 | " y_pred=(torch.argmax(train_state['val_preds'][-1],dim=1) ).cpu().long().numpy(),\n", 1134 | " y_true= train_state['val_targets'][-1].cpu().numpy(), \n", 1135 | " digits=4)\n", 1136 | ")\n" 1137 | ], 1138 | "execution_count": 0, 1139 | "outputs": [] 1140 | }, 1141 | { 1142 | "cell_type": "code", 1143 | "metadata": { 1144 | "id": "dBvhO_3MouqX", 1145 | "colab_type": "code", 1146 | "colab": {} 1147 | }, 1148 | "source": [ 1149 
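One detail worth noting in the loop above: validation runs on Lookahead's slow weights. `optimizer._backup_and_load_cache()` swaps them in before the eval pass, and since the per-epoch `torch.save` call happens before `optimizer._clear_and_load_backup()`, the saved checkpoints also hold the slow weights. Stripped of the metric bookkeeping, the evaluation pattern is just the following sketch (reusing `model`, `optimizer`, `dataset`, `generate_batches` and `args` as defined earlier in the notebook):

```python
# Minimal sketch of the slow-weight evaluation pattern used in the training loop above.
model.eval()
dataset.set_split("val")
with torch.no_grad():
    optimizer._backup_and_load_cache()   # fast weights -> backup, cached slow weights -> model
    for batch in generate_batches(dataset, args.batch_size, device=args.device):
        logits = model(
            input_ids=batch["x_data"],
            attention_mask=batch["x_attn_mask"],
        )[0]
        # ... accumulate loss / accuracy / macro-F1 on `logits` here ...
    optimizer._clear_and_load_backup()   # restore the fast weights before training resumes
```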
| "best_run_index = train_state['val_f1s'].index(max(train_state['val_f1s']))\n", 1150 | "print('Train:',classification_report(\n", 1151 | " y_pred=(torch.argmax(train_state['train_preds'][best_run_index],dim=1) ).cpu().long().numpy(),\n", 1152 | " y_true= train_state['train_targets'][best_run_index].cpu().numpy(), \n", 1153 | " digits=4)\n", 1154 | ")\n", 1155 | "print('Dev:',classification_report(\n", 1156 | " y_pred=(torch.argmax(train_state['val_preds'][best_run_index],dim=1) ).cpu().long().numpy(),\n", 1157 | " y_true= train_state['val_targets'][best_run_index].cpu().numpy(), \n", 1158 | " digits=4)\n", 1159 | ")" 1160 | ], 1161 | "execution_count": 0, 1162 | "outputs": [] 1163 | }, 1164 | { 1165 | "cell_type": "code", 1166 | "metadata": { 1167 | "id": "ZyJlr1Ucouqa", 1168 | "colab_type": "code", 1169 | "colab": {} 1170 | }, 1171 | "source": [ 1172 | "def sort_preds(indexes, preds):\n", 1173 | " \"\"\"Sorts the predictions in order, to reverse the effects of shuffle\n", 1174 | " done by dataloader\"\"\"\n", 1175 | " indexes = indexes.cpu().numpy().reshape(-1,1)\n", 1176 | " preds = preds.cpu().numpy()\n", 1177 | " arr_concat = np.hstack((indexes,preds)) #concat the preds and their indexes\n", 1178 | " sort_arr = arr_concat[ arr_concat[:,0].argsort()] #sort based on the indexes\n", 1179 | " sorted_preds = np.delete(sort_arr,0,axis=1)\n", 1180 | " return sorted_preds" 1181 | ], 1182 | "execution_count": 0, 1183 | "outputs": [] 1184 | }, 1185 | { 1186 | "cell_type": "code", 1187 | "metadata": { 1188 | "id": "PEKLcktCourg", 1189 | "colab_type": "code", 1190 | "colab": {} 1191 | }, 1192 | "source": [ 1193 | "def get_optimal_models_v2(train_state, split):\n", 1194 | " l = zip(train_state[f'{split}_f1s'], range(len(train_state[f'{split}_f1s'])))\n", 1195 | " sorted_vals = sorted(l, key = lambda x:x[0], reverse=True)\n", 1196 | " model_idxes = [i[1] for i in sorted_vals]\n", 1197 | " \n", 1198 | " trgts= sort_preds(train_state[f'{split}_indexes'][-1],train_state[f'{split}_targets'][-1].reshape(-1,1))\n", 1199 | " total_preds = len(train_state[f'{split}_indexes'])\n", 1200 | " init = np.zeros(train_state[f'{split}_preds'][-1].shape)\n", 1201 | " max_f1 = 0\n", 1202 | " idxes = []\n", 1203 | " for i in model_idxes:\n", 1204 | " temp = sort_preds(train_state[f'{split}_indexes'][i],train_state[f'{split}_preds'][i])\n", 1205 | " temp2 = init+temp\n", 1206 | " f1 = f1_score(\n", 1207 | " y_pred=temp2.argmax(axis=1),\n", 1208 | " y_true= trgts, average ='macro'\n", 1209 | " )\n", 1210 | " if f1 > max_f1:\n", 1211 | " max_f1 = f1\n", 1212 | " init = init+temp\n", 1213 | " idxes.append(i)\n", 1214 | " print(f'Taking preds from {idxes} | Dev f1:{f1}')\n", 1215 | " return idxes" 1216 | ], 1217 | "execution_count": 0, 1218 | "outputs": [] 1219 | }, 1220 | { 1221 | "cell_type": "code", 1222 | "metadata": { 1223 | "scrolled": true, 1224 | "id": "RI0eIVAtourj", 1225 | "colab_type": "code", 1226 | "colab": {} 1227 | }, 1228 | "source": [ 1229 | "final_optimal_models = get_optimal_models_v2(train_state, 'val')\n", 1230 | "final_optimal_models" 1231 | ], 1232 | "execution_count": 0, 1233 | "outputs": [] 1234 | }, 1235 | { 1236 | "cell_type": "markdown", 1237 | "metadata": { 1238 | "id": "siZdY82mours", 1239 | "colab_type": "text" 1240 | }, 1241 | "source": [ 1242 | "# Making preds on the given test set" 1243 | ] 1244 | }, 1245 | { 1246 | "cell_type": "code", 1247 | "metadata": { 1248 | "id": "Zcztc0lGourz", 1249 | "colab_type": "code", 1250 | "colab": {} 1251 | }, 1252 | "source": [ 1253 | "test_df = 
data_df_task_c" 1254 | ], 1255 | "execution_count": 0, 1256 | "outputs": [] 1257 | }, 1258 | { 1259 | "cell_type": "code", 1260 | "metadata": { 1261 | "id": "5hkU-POXour5", 1262 | "colab_type": "code", 1263 | "colab": {} 1264 | }, 1265 | "source": [ 1266 | "test_dataset = dataset\n", 1267 | "test_dataset.set_split('test')" 1268 | ], 1269 | "execution_count": 0, 1270 | "outputs": [] 1271 | }, 1272 | { 1273 | "cell_type": "code", 1274 | "metadata": { 1275 | "id": "Iaq8vD0Xour7", 1276 | "colab_type": "code", 1277 | "colab": {} 1278 | }, 1279 | "source": [ 1280 | "test_dataset._target_df.sample(5)" 1281 | ], 1282 | "execution_count": 0, 1283 | "outputs": [] 1284 | }, 1285 | { 1286 | "cell_type": "code", 1287 | "metadata": { 1288 | "id": "M2PQOwWzousA", 1289 | "colab_type": "code", 1290 | "colab": {} 1291 | }, 1292 | "source": [ 1293 | "print(len(test_df))\n", 1294 | "print(test_dataset._target_df.split.value_counts())" 1295 | ], 1296 | "execution_count": 0, 1297 | "outputs": [] 1298 | }, 1299 | { 1300 | "cell_type": "code", 1301 | "metadata": { 1302 | "id": "uRfZ2GLHousC", 1303 | "colab_type": "code", 1304 | "colab": {} 1305 | }, 1306 | "source": [ 1307 | "def evaluate_testset(model, state, dataset, split,args):\n", 1308 | " \"\"\"Returns the final layer output of our transformer model\n", 1309 | " Puts them in the '{split}_*' keys in the state dict\n", 1310 | " Args:\n", 1311 | " model: A pytorch transformers model\n", 1312 | " state: dict to store outputs\n", 1313 | " dataset: A pytorch Dataset\n", 1314 | " split: The split on which to evaluate the model on\n", 1315 | " args: Arguments from namespace, etc\n", 1316 | " Returns:\n", 1317 | " state: all evaluated output stored in the \"test\" key\n", 1318 | " \"\"\"\n", 1319 | " eval_bar = notebook.tqdm(\n", 1320 | " desc = 'evaluation progress: ',\n", 1321 | " total=dataset.get_num_batches(args.batch_size),\n", 1322 | " position=0,\n", 1323 | " leave=False,\n", 1324 | " )\n", 1325 | " dataset.set_split(split)\n", 1326 | " batch_generator = generate_batches(\n", 1327 | " dataset= dataset, batch_size= args.batch_size, shuffle=False,\n", 1328 | " device = args.device, drop_last=False,\n", 1329 | " pinned_memory = True, n_workers = 2, \n", 1330 | " )\n", 1331 | " eval_bar.reset(\n", 1332 | " total=dataset.get_num_batches(args.batch_size),\n", 1333 | " )\n", 1334 | " model.eval()\n", 1335 | " with torch.no_grad():\n", 1336 | " for batch_index, batch_dict in enumerate(batch_generator):\n", 1337 | " y_pred = model(\n", 1338 | " input_ids = batch_dict['x_data'],\n", 1339 | " attention_mask = batch_dict['x_attn_mask'],\n", 1340 | " )[0]\n", 1341 | " y_pred = y_pred.view(-1, 3)\n", 1342 | "\n", 1343 | " y_pred = y_pred.detach()\n", 1344 | " \n", 1345 | " state['batch_preds'].append(y_pred.cpu())\n", 1346 | " state['batch_indexes'].append(batch_dict['x_index'].cpu())\n", 1347 | " \n", 1348 | " eval_bar.update()\n", 1349 | " \n", 1350 | " if torch.cuda.is_available():\n", 1351 | " torch.cuda.empty_cache()\n", 1352 | " \n", 1353 | " state[f'{split}_preds'].append(\n", 1354 | " torch.cat(state['batch_preds']).cpu()\n", 1355 | " )\n", 1356 | " state[f'{split}_indexes'].append(\n", 1357 | " torch.cat(state['batch_indexes']).cpu()\n", 1358 | " )\n", 1359 | " \n", 1360 | " state['batch_preds'] = []\n", 1361 | " state['batch_indexes'] = []\n", 1362 | " \n", 1363 | " eval_bar.close()\n", 1364 | " return state" 1365 | ], 1366 | "execution_count": 0, 1367 | "outputs": [] 1368 | }, 1369 | { 1370 | "cell_type": "code", 1371 | "metadata": { 1372 | "id": 
"UDLAcVoOousD", 1373 | "colab_type": "code", 1374 | "colab": {} 1375 | }, 1376 | "source": [ 1377 | "chosen_models = [all_model_paths[i] for i in final_optimal_models]" 1378 | ], 1379 | "execution_count": 0, 1380 | "outputs": [] 1381 | }, 1382 | { 1383 | "cell_type": "code", 1384 | "metadata": { 1385 | "id": "ZRhTG0jJousG", 1386 | "colab_type": "code", 1387 | "colab": {} 1388 | }, 1389 | "source": [ 1390 | "test_state = general_utils.make_train_state()\n", 1391 | "for model_path in notebook.tqdm(chosen_models, total=len(chosen_models)):\n", 1392 | " model.load_state_dict(torch.load(model_path)['model'])\n", 1393 | " test_state = evaluate_testset(model, test_state, test_dataset, 'test',args)" 1394 | ], 1395 | "execution_count": 0, 1396 | "outputs": [] 1397 | }, 1398 | { 1399 | "cell_type": "code", 1400 | "metadata": { 1401 | "id": "NyPLbSx_ousH", 1402 | "colab_type": "code", 1403 | "colab": {} 1404 | }, 1405 | "source": [ 1406 | "test_state['test_preds'][-1].shape" 1407 | ], 1408 | "execution_count": 0, 1409 | "outputs": [] 1410 | }, 1411 | { 1412 | "cell_type": "code", 1413 | "metadata": { 1414 | "id": "x9zIQn2PousJ", 1415 | "colab_type": "code", 1416 | "colab": {} 1417 | }, 1418 | "source": [ 1419 | "[test_state['test_preds'][i].size() for i in range(len(test_state['test_preds']))]" 1420 | ], 1421 | "execution_count": 0, 1422 | "outputs": [] 1423 | }, 1424 | { 1425 | "cell_type": "code", 1426 | "metadata": { 1427 | "id": "9k7efmwXousM", 1428 | "colab_type": "code", 1429 | "colab": {} 1430 | }, 1431 | "source": [ 1432 | "len(test_dataset._target_df)" 1433 | ], 1434 | "execution_count": 0, 1435 | "outputs": [] 1436 | }, 1437 | { 1438 | "cell_type": "code", 1439 | "metadata": { 1440 | "id": "1-mAcbFFousO", 1441 | "colab_type": "code", 1442 | "colab": {} 1443 | }, 1444 | "source": [ 1445 | "torch.zeros_like(test_state['test_preds'][0]).size()" 1446 | ], 1447 | "execution_count": 0, 1448 | "outputs": [] 1449 | }, 1450 | { 1451 | "cell_type": "code", 1452 | "metadata": { 1453 | "id": "Zw23sdIAousQ", 1454 | "colab_type": "code", 1455 | "colab": {} 1456 | }, 1457 | "source": [ 1458 | "ensemble_pred = torch.zeros_like(test_state['test_preds'][0])\n", 1459 | "for i in test_state['test_preds']:\n", 1460 | " ensemble_pred += i" 1461 | ], 1462 | "execution_count": 0, 1463 | "outputs": [] 1464 | }, 1465 | { 1466 | "cell_type": "code", 1467 | "metadata": { 1468 | "id": "W1tEtoDvousS", 1469 | "colab_type": "code", 1470 | "colab": {} 1471 | }, 1472 | "source": [ 1473 | "# label_dict[\"IND\"] = 0\n", 1474 | "# label_dict[\"GRP\"] = 1\n", 1475 | "# label_dict[\"OTH\"] = 2\n", 1476 | "#ref utils/offeval2020.py" 1477 | ], 1478 | "execution_count": 0, 1479 | "outputs": [] 1480 | }, 1481 | { 1482 | "cell_type": "code", 1483 | "metadata": { 1484 | "id": "f7l7scgnousU", 1485 | "colab_type": "code", 1486 | "colab": {} 1487 | }, 1488 | "source": [ 1489 | "int_to_label = { 0: 'IND', 1:'GRP', 2:'OTH'}" 1490 | ], 1491 | "execution_count": 0, 1492 | "outputs": [] 1493 | }, 1494 | { 1495 | "cell_type": "code", 1496 | "metadata": { 1497 | "id": "qIG3GtyDousW", 1498 | "colab_type": "code", 1499 | "colab": {} 1500 | }, 1501 | "source": [ 1502 | "t = []\n", 1503 | "for i in torch.argmax(ensemble_pred, dim=1):\n", 1504 | " t.append(int_to_label[i.item()])\n", 1505 | "\n", 1506 | "collections.Counter(t)" 1507 | ], 1508 | "execution_count": 0, 1509 | "outputs": [] 1510 | }, 1511 | { 1512 | "cell_type": "code", 1513 | "metadata": { 1514 | "id": "N43KI4P4ousY", 1515 | "colab_type": "code", 1516 | "colab": {} 1517 | }, 1518 | 
"source": [ 1519 | "assert len(t) == len(test_df)" 1520 | ], 1521 | "execution_count": 0, 1522 | "outputs": [] 1523 | }, 1524 | { 1525 | "cell_type": "code", 1526 | "metadata": { 1527 | "id": "ANkfSvUSousa", 1528 | "colab_type": "code", 1529 | "colab": {} 1530 | }, 1531 | "source": [ 1532 | "offeval_task_c_pred_analysis_df = pd.DataFrame(\n", 1533 | " data={\n", 1534 | " 'id':test_df.id,\n", 1535 | " 'text':test_df.tweet,\n", 1536 | " 'label':t,\n", 1537 | " }\n", 1538 | ")" 1539 | ], 1540 | "execution_count": 0, 1541 | "outputs": [] 1542 | }, 1543 | { 1544 | "cell_type": "code", 1545 | "metadata": { 1546 | "id": "8tYSn6VNousb", 1547 | "colab_type": "code", 1548 | "colab": {} 1549 | }, 1550 | "source": [ 1551 | "offeval_task_c_pred_label_df = pd.DataFrame(\n", 1552 | " data={\n", 1553 | " 'id':test_df.id,\n", 1554 | " 'label':t,\n", 1555 | " }\n", 1556 | ")" 1557 | ], 1558 | "execution_count": 0, 1559 | "outputs": [] 1560 | }, 1561 | { 1562 | "cell_type": "code", 1563 | "metadata": { 1564 | "id": "ompoxc6Nousc", 1565 | "colab_type": "code", 1566 | "colab": {} 1567 | }, 1568 | "source": [ 1569 | "offeval_task_c_pred_analysis_df.to_csv(\n", 1570 | " 'offeval_task_c_pred_analysis.csv',index=False,\n", 1571 | ")" 1572 | ], 1573 | "execution_count": 0, 1574 | "outputs": [] 1575 | }, 1576 | { 1577 | "cell_type": "code", 1578 | "metadata": { 1579 | "id": "BHiGB2Q-ouse", 1580 | "colab_type": "code", 1581 | "colab": {} 1582 | }, 1583 | "source": [ 1584 | "offeval_task_c_pred_label_df.to_csv(\n", 1585 | " 'offeval_task_c_pred_label.csv', index=False, header=False,\n", 1586 | ")" 1587 | ], 1588 | "execution_count": 0, 1589 | "outputs": [] 1590 | }, 1591 | { 1592 | "cell_type": "code", 1593 | "metadata": { 1594 | "id": "8IVzPSmwousf", 1595 | "colab_type": "code", 1596 | "colab": {} 1597 | }, 1598 | "source": [ 1599 | "offeval_task_c_pred_label_df.label.value_counts()\n" 1600 | ], 1601 | "execution_count": 0, 1602 | "outputs": [] 1603 | }, 1604 | { 1605 | "cell_type": "code", 1606 | "metadata": { 1607 | "id": "mutV5hWkoush", 1608 | "colab_type": "code", 1609 | "colab": {} 1610 | }, 1611 | "source": [ 1612 | "offeval_task_c_pred_label_df.label.value_counts()" 1613 | ], 1614 | "execution_count": 0, 1615 | "outputs": [] 1616 | }, 1617 | { 1618 | "cell_type": "code", 1619 | "metadata": { 1620 | "id": "jr707IBvousi", 1621 | "colab_type": "code", 1622 | "colab": {} 1623 | }, 1624 | "source": [ 1625 | "" 1626 | ], 1627 | "execution_count": 0, 1628 | "outputs": [] 1629 | } 1630 | ] 1631 | } -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cozek/OffensEval2020-code/874b86f14112326c9ba09965174142bdb0acdfb0/src/__init__.py -------------------------------------------------------------------------------- /src/lookahead/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 lonePatinet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice 
and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/lookahead/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cozek/OffensEval2020-code/874b86f14112326c9ba09965174142bdb0acdfb0/src/lookahead/__init__.py -------------------------------------------------------------------------------- /src/lookahead/optimizer.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import itertools as it 4 | from torch.optim import Optimizer 5 | from collections import defaultdict 6 | 7 | class Lookahead(Optimizer): 8 | ''' 9 | PyTorch implementation of the lookahead wrapper. 10 | Lookahead Optimizer: https://arxiv.org/abs/1907.08610 11 | ''' 12 | def __init__(self, optimizer,alpha=0.5, k=6,pullback_momentum="none"): 13 | ''' 14 | :param optimizer:inner optimizer 15 | :param k (int): number of lookahead steps 16 | :param alpha(float): linear interpolation factor. 1.0 recovers the inner optimizer. 17 | :param pullback_momentum (str): change to inner optimizer momentum on interpolation update 18 | ''' 19 | if not 0.0 <= alpha <= 1.0: 20 | raise ValueError(f'Invalid slow update rate: {alpha}') 21 | if not 1 <= k: 22 | raise ValueError(f'Invalid lookahead steps: {k}') 23 | self.optimizer = optimizer 24 | self.param_groups = self.optimizer.param_groups 25 | self.alpha = alpha 26 | self.k = k 27 | self.step_counter = 0 28 | assert pullback_momentum in ["reset", "pullback", "none"] 29 | self.pullback_momentum = pullback_momentum 30 | self.state = defaultdict(dict) 31 | 32 | # Cache the current optimizer parameters 33 | for group in self.optimizer.param_groups: 34 | for p in group['params']: 35 | param_state = self.state[p] 36 | param_state['cached_params'] = torch.zeros_like(p.data) 37 | param_state['cached_params'].copy_(p.data) 38 | 39 | def __getstate__(self): 40 | return { 41 | 'state': self.state, 42 | 'optimizer': self.optimizer, 43 | 'alpha': self.alpha, 44 | 'step_counter': self.step_counter, 45 | 'k':self.k, 46 | 'pullback_momentum': self.pullback_momentum 47 | } 48 | 49 | def zero_grad(self): 50 | self.optimizer.zero_grad() 51 | 52 | def state_dict(self): 53 | return self.optimizer.state_dict() 54 | 55 | def load_state_dict(self, state_dict): 56 | self.optimizer.load_state_dict(state_dict) 57 | 58 | def _backup_and_load_cache(self): 59 | """Useful for performing evaluation on the slow weights (which typically generalize better) 60 | """ 61 | for group in self.optimizer.param_groups: 62 | for p in group['params']: 63 | param_state = self.state[p] 64 | param_state['backup_params'] = torch.zeros_like(p.data) 65 | param_state['backup_params'].copy_(p.data) 66 | p.data.copy_(param_state['cached_params']) 67 | 68 | def _clear_and_load_backup(self): 69 | for group in self.optimizer.param_groups: 70 | for p in group['params']: 71 | 
param_state = self.state[p] 72 | p.data.copy_(param_state['backup_params']) 73 | del param_state['backup_params'] 74 | 75 | def step(self, closure=None): 76 | """Performs a single Lookahead optimization step. 77 | Arguments: 78 | closure (callable, optional): A closure that reevaluates the model 79 | and returns the loss. 80 | """ 81 | loss = self.optimizer.step(closure) 82 | self.step_counter += 1 83 | 84 | if self.step_counter >= self.k: 85 | self.step_counter = 0 86 | # Lookahead and cache the current optimizer parameters 87 | for group in self.optimizer.param_groups: 88 | for p in group['params']: 89 | param_state = self.state[p] 90 | p.data.mul_(self.alpha).add_(1.0 - self.alpha, param_state['cached_params']) # crucial line 91 | param_state['cached_params'].copy_(p.data) 92 | if self.pullback_momentum == "pullback": 93 | internal_momentum = self.optimizer.state[p]["momentum_buffer"] 94 | self.optimizer.state[p]["momentum_buffer"] = internal_momentum.mul_(self.alpha).add_( 95 | 1.0 - self.alpha, param_state["cached_mom"]) 96 | param_state["cached_mom"] = self.optimizer.state[p]["momentum_buffer"] 97 | elif self.pullback_momentum == "reset": 98 | self.optimizer.state[p]["momentum_buffer"] = torch.zeros_like(p.data) 99 | 100 | return loss 101 | -------------------------------------------------------------------------------- /src/radam/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2019] [Liyuan Liu] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /src/radam/__init__.py: -------------------------------------------------------------------------------- 1 | from .radam import RAdam, PlainRAdam, AdamW 2 | -------------------------------------------------------------------------------- /src/radam/radam.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.optim.optimizer import Optimizer, required 4 | 5 | class RAdam(Optimizer): 6 | 7 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): 8 | if not 0.0 <= lr: 9 | raise ValueError("Invalid learning rate: {}".format(lr)) 10 | if not 0.0 <= eps: 11 | raise ValueError("Invalid epsilon value: {}".format(eps)) 12 | if not 0.0 <= betas[0] < 1.0: 13 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 14 | if not 0.0 <= betas[1] < 1.0: 15 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 16 | 17 | self.degenerated_to_sgd = degenerated_to_sgd 18 | if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): 19 | for param in params: 20 | if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): 21 | param['buffer'] = [[None, None, None] for _ in range(10)] 22 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)]) 23 | super(RAdam, self).__init__(params, defaults) 24 | 25 | def __setstate__(self, state): 26 | super(RAdam, self).__setstate__(state) 27 | 28 | def step(self, closure=None): 29 | 30 | loss = None 31 | if closure is not None: 32 | loss = closure() 33 | 34 | for group in self.param_groups: 35 | 36 | for p in group['params']: 37 | if p.grad is None: 38 | continue 39 | grad = p.grad.data.float() 40 | if grad.is_sparse: 41 | raise RuntimeError('RAdam does not support sparse gradients') 42 | 43 | p_data_fp32 = p.data.float() 44 | 45 | state = self.state[p] 46 | 47 | if len(state) == 0: 48 | state['step'] = 0 49 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 50 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 51 | else: 52 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 53 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 54 | 55 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 56 | beta1, beta2 = group['betas'] 57 | 58 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 59 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 60 | 61 | state['step'] += 1 62 | buffered = group['buffer'][int(state['step'] % 10)] 63 | if state['step'] == buffered[0]: 64 | N_sma, step_size = buffered[1], buffered[2] 65 | else: 66 | buffered[0] = state['step'] 67 | beta2_t = beta2 ** state['step'] 68 | N_sma_max = 2 / (1 - beta2) - 1 69 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 70 | buffered[1] = N_sma 71 | 72 | # more conservative since it's an approximated value 73 | if N_sma >= 5: 74 | step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) 75 | elif self.degenerated_to_sgd: 76 | step_size = 1.0 / (1 - beta1 ** state['step']) 77 | else: 78 | step_size = -1 79 | buffered[2] = step_size 80 | 81 | # more conservative since it's an approximated value 82 | if N_sma >= 5: 83 | if group['weight_decay'] != 0: 84 | p_data_fp32.add_(-group['weight_decay'] * 
group['lr'], p_data_fp32) 85 | denom = exp_avg_sq.sqrt().add_(group['eps']) 86 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 87 | p.data.copy_(p_data_fp32) 88 | elif step_size > 0: 89 | if group['weight_decay'] != 0: 90 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 91 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 92 | p.data.copy_(p_data_fp32) 93 | 94 | return loss 95 | 96 | class PlainRAdam(Optimizer): 97 | 98 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): 99 | if not 0.0 <= lr: 100 | raise ValueError("Invalid learning rate: {}".format(lr)) 101 | if not 0.0 <= eps: 102 | raise ValueError("Invalid epsilon value: {}".format(eps)) 103 | if not 0.0 <= betas[0] < 1.0: 104 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 105 | if not 0.0 <= betas[1] < 1.0: 106 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 107 | 108 | self.degenerated_to_sgd = degenerated_to_sgd 109 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 110 | 111 | super(PlainRAdam, self).__init__(params, defaults) 112 | 113 | def __setstate__(self, state): 114 | super(PlainRAdam, self).__setstate__(state) 115 | 116 | def step(self, closure=None): 117 | 118 | loss = None 119 | if closure is not None: 120 | loss = closure() 121 | 122 | for group in self.param_groups: 123 | 124 | for p in group['params']: 125 | if p.grad is None: 126 | continue 127 | grad = p.grad.data.float() 128 | if grad.is_sparse: 129 | raise RuntimeError('RAdam does not support sparse gradients') 130 | 131 | p_data_fp32 = p.data.float() 132 | 133 | state = self.state[p] 134 | 135 | if len(state) == 0: 136 | state['step'] = 0 137 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 138 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 139 | else: 140 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 141 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 142 | 143 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 144 | beta1, beta2 = group['betas'] 145 | 146 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 147 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 148 | 149 | state['step'] += 1 150 | beta2_t = beta2 ** state['step'] 151 | N_sma_max = 2 / (1 - beta2) - 1 152 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 153 | 154 | 155 | # more conservative since it's an approximated value 156 | if N_sma >= 5: 157 | if group['weight_decay'] != 0: 158 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 159 | step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) 160 | denom = exp_avg_sq.sqrt().add_(group['eps']) 161 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 162 | p.data.copy_(p_data_fp32) 163 | elif self.degenerated_to_sgd: 164 | if group['weight_decay'] != 0: 165 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 166 | step_size = group['lr'] / (1 - beta1 ** state['step']) 167 | p_data_fp32.add_(-step_size, exp_avg) 168 | p.data.copy_(p_data_fp32) 169 | 170 | return loss 171 | 172 | 173 | class AdamW(Optimizer): 174 | 175 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup = 0): 176 | if not 0.0 <= lr: 177 | raise ValueError("Invalid learning rate: {}".format(lr)) 178 | if not 0.0 <= eps: 179 | raise ValueError("Invalid epsilon 
value: {}".format(eps)) 180 | if not 0.0 <= betas[0] < 1.0: 181 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 182 | if not 0.0 <= betas[1] < 1.0: 183 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 184 | 185 | defaults = dict(lr=lr, betas=betas, eps=eps, 186 | weight_decay=weight_decay, warmup = warmup) 187 | super(AdamW, self).__init__(params, defaults) 188 | 189 | def __setstate__(self, state): 190 | super(AdamW, self).__setstate__(state) 191 | 192 | def step(self, closure=None): 193 | loss = None 194 | if closure is not None: 195 | loss = closure() 196 | 197 | for group in self.param_groups: 198 | 199 | for p in group['params']: 200 | if p.grad is None: 201 | continue 202 | grad = p.grad.data.float() 203 | if grad.is_sparse: 204 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 205 | 206 | p_data_fp32 = p.data.float() 207 | 208 | state = self.state[p] 209 | 210 | if len(state) == 0: 211 | state['step'] = 0 212 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 213 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 214 | else: 215 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 216 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 217 | 218 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 219 | beta1, beta2 = group['betas'] 220 | 221 | state['step'] += 1 222 | 223 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 224 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 225 | 226 | denom = exp_avg_sq.sqrt().add_(group['eps']) 227 | bias_correction1 = 1 - beta1 ** state['step'] 228 | bias_correction2 = 1 - beta2 ** state['step'] 229 | 230 | if group['warmup'] > state['step']: 231 | scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup'] 232 | else: 233 | scheduled_lr = group['lr'] 234 | 235 | step_size = scheduled_lr * math.sqrt(bias_correction2) / bias_correction1 236 | 237 | if group['weight_decay'] != 0: 238 | p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32) 239 | 240 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 241 | 242 | p.data.copy_(p_data_fp32) 243 | 244 | return loss 245 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cozek/OffensEval2020-code/874b86f14112326c9ba09965174142bdb0acdfb0/src/utils/__init__.py -------------------------------------------------------------------------------- /src/utils/activations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def swish(x): 6 | """ 7 | Simple implementation of Swish activation function 8 | https://arxiv.org/pdf/1710.05941.pdf 9 | """ 10 | return x * torch.sigmoid(x) 11 | 12 | def mish(x): 13 | """ 14 | Simple implementation of Mish activation Function 15 | https://arxiv.org/abs/1908.08681 16 | """ 17 | tanh = nn.Tanh() 18 | softplus = nn.Softplus() 19 | return x * tanh( softplus(x)) 20 | 21 | def penalized_tanh(x): 22 | """ 23 | http://aclweb.org/anthology/D18-1472 24 | """ 25 | alpha = 0.25 26 | return torch.max(torch.tanh(x), alpha*torch.tanh(x)) -------------------------------------------------------------------------------- /src/utils/general.py: -------------------------------------------------------------------------------- 1 | """ 2 | General Utilities 3 | """ 4 | import os 5 | import 
io 6 | import mmap 7 | import torch 8 | import random 9 | import numpy as np 10 | import pandas as pd # type: ignore 11 | from tqdm import tqdm 12 | from sklearn.metrics import classification_report 13 | from sklearn.metrics import confusion_matrix 14 | from sklearn.metrics import accuracy_score 15 | from argparse import Namespace 16 | import matplotlib.pyplot as plt 17 | import seaborn as sns 18 | 19 | 20 | def alert(): 21 | from IPython.display import Audio 22 | 23 | wave = np.sin(2 * np.pi * 400 * np.arange(10000 * 0.35) / 10000) 24 | Audio(wave, rate=10000, autoplay=True) 25 | 26 | 27 | def plot_train_state(train_state): 28 | """Plot the train state 29 | Args: 30 | train_state (dict): Dict containing train state information 31 | """ 32 | 33 | sns.set(style="darkgrid") 34 | 35 | plot_df = pd.DataFrame( 36 | { 37 | "train_acc": train_state["train_accuracies"], 38 | "val_acc": train_state["val_accuracies"], 39 | } 40 | ) 41 | plot_df.index += 1 42 | num_epochs = len(plot_df) 43 | 44 | fig, ax = plt.subplots(figsize=(10, 7)) 45 | 46 | start, end = ax.get_xlim() 47 | ax.xaxis.set_ticks(np.arange(0, num_epochs + 1, 1)) 48 | plt.ylabel("accuracy") 49 | plt.xlabel("epoch") 50 | axp = sns.lineplot(ax=ax, data=plot_df, legend="full") 51 | for epoch, train_acc, val_acc in zip( 52 | range(1, num_epochs + 1), plot_df["train_acc"], plot_df["val_acc"] 53 | ): 54 | plt.annotate( 55 | f"{train_acc:.3f}", 56 | xy=(epoch, train_acc), 57 | xytext=(0, 30), 58 | textcoords="offset points", 59 | ha="center", 60 | va="top", 61 | bbox=dict(boxstyle="square,pad=0.2", alpha=0.5), 62 | # arrowprops=dict(arrowstyle = 'simple', connectionstyle='arc3,rad=0'), 63 | ) 64 | plt.annotate( 65 | f"{val_acc:.3f}", 66 | xy=(epoch, val_acc), 67 | xytext=(0, -30), 68 | textcoords="offset points", 69 | ha="center", 70 | va="bottom", 71 | bbox=dict(boxstyle="square,pad=0.2", fc="orange", alpha=0.5), 72 | # arrowprops=dict(arrowstyle = 'simple', connectionstyle='arc3,rad=0'), 73 | ) 74 | 75 | 76 | def get_misclassified_examples(torch_dataset, split_type, train_state, threshold=0.5): 77 | torch_dataset.set_split(split_type) 78 | new_df = torch_dataset._target_df.iloc[ 79 | train_state[f"{split_type}_indexes"][-1].cpu().numpy() 80 | ] 81 | new_df.reset_index(drop=True, inplace=True) 82 | y_pred = ( 83 | (torch.sigmoid(train_state[f"{split_type}_preds"][-1]) > threshold).cpu().long() 84 | ) 85 | new_df = new_df.assign(pred=pd.Series(y_pred)) 86 | new_df = new_df[new_df.label != new_df.pred][["text", "label", "pred"]] 87 | 88 | return new_df 89 | 90 | 91 | def analyse_preds(y_pred, y_target, threshold=0.5): 92 | y_pred = (torch.sigmoid(y_pred) > threshold).cpu().long().numpy() 93 | y_target = y_target.cpu().numpy() 94 | 95 | conmat = confusion_matrix(y_pred=y_pred, y_true=y_target) 96 | confusion = pd.DataFrame( 97 | conmat, index=["NOT", "HS"], columns=["predicted_NOT", "predicted_HS"] 98 | ) 99 | print("acc = ", accuracy_score(y_pred=y_pred, y_true=y_target)) 100 | print(classification_report(y_pred=y_pred, y_true=y_target, digits=4)) 101 | print(confusion) 102 | 103 | 104 | def make_train_state(): 105 | d = { 106 | "train_preds": [], 107 | "train_indexes": [], 108 | "train_targets": [], 109 | "train_accuracies": [], 110 | "train_f1s": [], 111 | "train_losses": [], 112 | "val_preds": [], 113 | "val_indexes": [], 114 | "val_targets": [], 115 | "val_accuracies": [], 116 | "val_f1s": [], 117 | "val_losses": [], 118 | "test_preds": [], 119 | "test_indexes": [], 120 | "test_targets": [], 121 | "test_accuracies": [], 122 | "test_f1s": 
[], 123 | "test_losses": [], 124 | "batch_preds": [], 125 | "batch_targets": [], 126 | "batch_indexes": [], 127 | "epoch_index": 0, 128 | # "save_path": '' 129 | } 130 | return dict(d) 131 | 132 | 133 | def compute_accuracy(y_pred, y_target): 134 | y_target = y_target.cpu() 135 | y_pred_indices = (torch.sigmoid(y_pred) > 0.5).cpu().long() 136 | n_correct = torch.eq(y_pred_indices, y_target).sum().item() 137 | return n_correct / len(y_pred_indices) * 100 138 | 139 | 140 | def set_seed_everywhere(seed=42): 141 | np.random.seed(seed) 142 | random.seed(seed) 143 | torch.manual_seed(seed) 144 | if torch.cuda.is_available(): 145 | torch.cuda.manual_seed_all(seed) 146 | 147 | 148 | def describe_tensor(x): 149 | """ 150 | Prints information about a given tensor 151 | """ 152 | print("Type: {}".format(x.type())) 153 | print("Shape/size: {}".format(x.shape)) 154 | print("Values: \n{}".format(x)) 155 | 156 | 157 | class DefaultFilePaths: 158 | """ 159 | Helper class that stores the location of datafiles, embeddings, etc. 160 | Must be set up for your local machine. Default configuration is for the maintainer's 161 | personal machine. 162 | """ 163 | 164 | def __init__(self, location="local"): 165 | if location == "local": 166 | self.PREFIX = "/Users/cozek/Documents/MTech/4th Sem/OffensEval/data" 167 | self.glove = "/Users/cozek/Documents/MTech/3rd Sem/Project/glove.twitter.27B/glove.twitter.27B.200d.txt" 168 | self.fasttext_bin = ( 169 | "/Users/cozek/Documents/MTech/3rd Sem/Project/cc.en.300.bin" 170 | ) 171 | self.bert_uncased_large = ( 172 | "/Users/cozek/Documents/MTech/4th Sem/wwm_uncased_L-24_H-1024_A-16/" 173 | ) 174 | self.gpt_2 = "/Users/cozek/Documents/MTech/4th Sem/gpt_2/" 175 | self.offeval_data = { 176 | "en": { 177 | "task_a": self.PREFIX 178 | + "/OffenseEval2020Data/English/task_a_distant.tsv", 179 | "task_b": self.PREFIX 180 | + "/OffenseEval2020Data/English/task_b_distant.tsv", 181 | "task_c": self.PREFIX 182 | + "/OffenseEval2020Data/English/task_c_distant.tsv", 183 | }, 184 | "en_presplit": self.PREFIX 185 | + "/OffenseEval2020Data/English/task_a_split.csv", 186 | "en_presplit_lite": self.PREFIX 187 | + "/OffenseEval2020Data/English/task_a_split_lite.csv", 188 | "en_presplit_tiny": self.PREFIX 189 | + "/OffenseEval2020Data/English/task_a_split_tiny.csv", 190 | } 191 | self.hasoc_data = { 192 | "en": { 193 | "train": self.PREFIX + "/hasoc_data/en/english_dataset.tsv", 194 | "test": self.PREFIX + "/hasoc_data/gold/hasoc2019_en_test-2919.tsv", 195 | }, 196 | "en_presplit_task_a": self.PREFIX 197 | + "/hasoc_data/en/en_presplit_task_a.csv", 198 | "en_presplit_task_a_lite": self.PREFIX 199 | + "/hasoc_data/en/en_presplit_task_a_tiny.csv", 200 | } 201 | elif location == "server": 202 | self.PREFIX = "/home/kaushik.das/OffensEval2020/data" 203 | self.glove = "/home/kaushik.das/embeddings/glove.twitter.27B.200d.txt" 204 | self.fasttext_bin = "/home/kaushik.das/embeddings/crawl-300d-2M-subword.bin" 205 | self.bert_uncased_large = ( 206 | "/home/kaushik.das/pytorch_transformers/bert_uncased/" 207 | ) 208 | self.memotion = { 209 | 'loc' : self.PREFIX + '/memotion_dataset_7k/', 210 | 'task_a_advprop_df': self.PREFIX + 'memotion_dataset_7k/images_advprop_df_task_a.pickle', 211 | 'task_a_simple_df': self.PREFIX + 'memotion_dataset_7k/images_simple_df_task_a.pickle', 212 | 213 | } 214 | self.gpt_2 = "/home/kaushik.das/pytorch_transformers/gpt2/" 215 | self.distilgpt2 = "/home/kaushik.das/pytorch_transformers/distilgpt2/" 216 | self.model_storage = "/home/kaushik.das/OffensEval2020/saved_models/" 
217 | self.trac_data = { 218 | "en_dev": self.PREFIX + "/TRAC/eng/trac2_eng_dev.csv", 219 | "en_train": self.PREFIX + "/TRAC/eng/trac2_eng_train.csv", 220 | "en_task_a_dataframe": self.PREFIX + "/TRAC/eng/trac2_eng_task_a_df.csv", 221 | "en_task_b_dataframe": self.PREFIX + "/TRAC/eng/trac2_eng_task_b_df.csv", 222 | 223 | 224 | "hin_dev": self.PREFIX + "/TRAC/hin/trac2_hin_dev.csv", 225 | "hin_train": self.PREFIX + "/TRAC/hin/trac2_hin_train.csv", 226 | "iben_dev": self.PREFIX + "/TRAC/iben/trac2_iben_dev.csv", 227 | "iben_train": self.PREFIX + "/TRAC/iben/trac2_iben_train.csv", 228 | } 229 | self.offeval_data = { 230 | "en": { 231 | "task_a": self.PREFIX 232 | + "/OffenseEval2020Data/English/task_a_distant.tsv", 233 | "task_b": self.PREFIX 234 | + "/OffenseEval2020Data/English/task_b_distant.tsv", 235 | "task_c": self.PREFIX 236 | + "/OffenseEval2020Data/English/task_c_distant_ann.tsv", 237 | }, 238 | # TASK C 239 | "en_task_c_presplit_final": self.PREFIX 240 | + "/OffenseEval2020Data/English/offeval2020_task_c_en_presplit.csv", 241 | "en_task_c_presplit_lite": self.PREFIX 242 | + "/OffenseEval2020Data/English/en_task_c_presplit_lite.csv", 243 | "en_task_c_presplit_full": self.PREFIX 244 | + "/OffenseEval2020Data/English/en_task_c_presplit_full.csv", 245 | # TASK B 246 | "en_task_b_presplit_lite": self.PREFIX 247 | + "/OffenseEval2020Data/English/en_task_b_presplit_lite.csv", 248 | "en_task_b_presplit_full": self.PREFIX 249 | + "/OffenseEval2020Data/English/en_task_b_presplit_full.csv", 250 | "en_public_test_b": self.PREFIX # testset 251 | + "/OffenseEval2020Data/English/task_b_test/test_b_tweets.tsv", 252 | # TASK A 253 | "en_public_test_a": self.PREFIX # testset 254 | + "/OffenseEval2020Data/English/public_data_A/test_a_tweets.tsv", 255 | "en_presplit_full": self.PREFIX 256 | + "/OffenseEval2020Data/English/task_a_split_full.csv", 257 | "en_presplit_lite": self.PREFIX 258 | + "/OffenseEval2020Data/English/task_a_split_lite.csv", 259 | "en_presplit_tiny": self.PREFIX 260 | + "/OffenseEval2020Data/English/task_a_split_tiny.csv", 261 | "en_presplit_tiny_fixed": self.PREFIX 262 | + "/OffenseEval2020Data/English/task_a_split_tiny_fixed.csv", 263 | "en_presplit_lite_fixed": self.PREFIX 264 | + "/OffenseEval2020Data/English/task_a_split_lite_fixed.csv", 265 | "en_presplit_full_fixed": self.PREFIX 266 | + "/OffenseEval2020Data/English/task_a_split_full_fixed.csv", 267 | # std <= 0.3 268 | "en_safe_presplit_full": self.PREFIX 269 | + "/OffenseEval2020Data/English/task_a_split_full_safe.csv", 270 | # std <= 0.2 271 | "en_verysafe_presplit_tiny": self.PREFIX 272 | + "/OffenseEval2020Data/English/task_a_split_tiny_verysafe.csv", 273 | "en_verysafe_presplit_full": self.PREFIX 274 | + "/OffenseEval2020Data/English/task_a_split_full_verysafe.csv", 275 | } 276 | 277 | self.hasoc_data = { 278 | "en": { 279 | "train": self.PREFIX + "/hasoc_data/en/english_dataset.tsv", 280 | "test": self.PREFIX + "/hasoc_data/gold/hasoc2019_en_test-2919.tsv", 281 | }, 282 | "en_presplit_task_a": self.PREFIX 283 | + "/hasoc_data/en/en_presplit_task_a.csv", 284 | "en_presplit_task_a_lite": self.PREFIX 285 | + "/hasoc_data/en/en_presplit_task_a_tiny.csv", 286 | } 287 | 288 | 289 | if __name__ == "__main__": 290 | d = DefaultFilePaths() 291 | -------------------------------------------------------------------------------- /src/utils/offenseval2020.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utils for OffensEval 2020 dataset 3 | """ 4 | from argparse import Namespace 5 | 
from tqdm import tqdm 6 | import pandas as pd # type: ignore 7 | import numpy as np 8 | import random 9 | import torch 10 | import mmap 11 | import io 12 | import os 13 | 14 | 15 | def load_dataset(path: str): 16 | """Loads OffensEval 2020 dataset 17 | Args: 18 | path (str): full path of a task_*_distant.tsv file 19 | provided by the organiser 20 | Returns: 21 | pandas.DataFrame containing the data 22 | """ 23 | data_df = pd.read_csv(path, sep="\t", quoting=3) 24 | # remove erroneous space characters in column names in some files 25 | data_df.columns = list(map(lambda c: c.strip(), data_df.columns)) 26 | 27 | return data_df 28 | 29 | 30 | def labeller(df: pd.DataFrame, threshold: float, task: str, drop_cols: bool): 31 | """Adds a label to the samples in the given DataFrame 32 | Args: 33 | df (pd.DataFrame): A dataframe containing the samples and their confidence scores 34 | as df.text and df.average respectively 35 | threshold (float): Minimum confidence required to label a sample as positive 36 | task: one of 'a','b','c' 37 | drop_cols: if True, drops columns that are not needed downstream 38 | Returns: 39 | df (pd.DataFrame): with an added 'label' column holding the label of each sample 40 | label_dict (dict): The labels and their corresponding integer values 41 | """ 42 | task = task.lower() 43 | 44 | assert isinstance(df, pd.DataFrame) 45 | assert 0.0 <= threshold <= 1.0 46 | assert task in ["a", "b", "c"] 47 | assert isinstance(drop_cols, bool) 48 | 49 | if task in ["a", "b"]: 50 | df["label"] = df.average >= threshold # label as positive using the caller-supplied threshold 51 | df["label"] = df["label"].astype(int) 52 | elif task == "c": 53 | cols = {"average_ind": 0, "average_grp": 1, "average_oth": 2} 54 | df["label"] = df[list(cols.keys())].idxmax(axis=1) 55 | df["label"] = df["label"].apply(lambda x: cols[x]) 56 | 57 | if drop_cols: 58 | df = df[["id", "text", "label"]] 59 | 60 | label_dict = {} 61 | if task == "a": 62 | label_dict["OFF"] = 1 63 | label_dict["NOT"] = 0 64 | elif task == "b": #bug 65 | label_dict["UNT"] = 1 66 | label_dict["TIN"] = 0 67 | elif task == "c": 68 | label_dict["IND"] = 0 69 | label_dict["GRP"] = 1 70 | label_dict["OTH"] = 2 71 | 72 | return df, label_dict 73 | -------------------------------------------------------------------------------- /src/utils/transformer/__init.py__: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cozek/OffensEval2020-code/874b86f14112326c9ba09965174142bdb0acdfb0/src/utils/transformer/__init.py__ -------------------------------------------------------------------------------- /src/utils/transformer/data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for creating the datasets for the transformer models 3 | 4 | """ 5 | from typing import Callable 6 | from torch.utils.data import Dataset, DataLoader 7 | from argparse import Namespace 8 | import collections 9 | import pandas as pd 10 | import numpy as np 11 | import string 12 | import torch 13 | import nltk 14 | 15 | class GPT2Preprocessor(): 16 | def __init__(self,transformer_tokenizer,sentence_detector): 17 | self.transformer_tokenizer = transformer_tokenizer 18 | self.sentence_detector = sentence_detector 19 | 20 | def add_eos_tokens(self, text): 21 | eos_token = ' ' + self.transformer_tokenizer.eos_token + ' ' 22 | sentences = self.sentence_detector.tokenize(text) 23 | eos_added_text = eos_token.join(sentences) + ' ' + self.transformer_tokenizer.eos_token 24 | return eos_added_text 25 | 26 | class Vectorizer(): 27 | def __init__(self,tokenizer: Callable, max_seq_len: int):
28 | """ 29 | Args: 30 | tokenizer (Callable): transformer tokenizer 31 | max_seq_len (int): Maximum sequence lenght 32 | """ 33 | self.tokenizer = tokenizer 34 | self._max_seq_len = max_seq_len 35 | 36 | def vectorize(self,text :str): 37 | sequence = \ 38 | self.tokenizer.prepare_for_tokenization(text,add_prefix_space=True) 39 | indices = self.tokenizer.encode(sequence) 40 | 41 | out_vector = np.zeros(self._max_seq_len, dtype=np.int64) 42 | out_vector[: len(indices)] = indices 43 | # max len is restricted to 1024 44 | return out_vector[:min(self._max_seq_len,1024)] 45 | 46 | class HateDataset(Dataset): 47 | def __init__(self, data_df: pd.DataFrame, tokenizer: Callable, max_len:int=None): 48 | """ 49 | Args: 50 | data_df (pandas.DataFrame): df containing the labels and text 51 | tokenizer (tokenizer module for the transformer) 52 | """ 53 | self.data_df = data_df 54 | self.tokenizer = tokenizer 55 | 56 | # measure_len = lambda context: len(context.split(" ")) 57 | # self._max_seq_length = max(map(measure_len, data_df.text)) + 2 58 | if max_len == None: 59 | self._max_seq_length = self._get_max_len(data_df,tokenizer) 60 | else: 61 | self._max_seq_length = max_len 62 | 63 | self.train_df = self.data_df[self.data_df.split == 'train'] 64 | self.train_size = len(self.train_df) 65 | 66 | self.val_df = self.data_df[self.data_df.split == 'val'] 67 | self.val_size = len(self.val_df) 68 | 69 | self.test_df = self.data_df[self.data_df.split == 'test'] 70 | self.test_size = len(self.test_df) 71 | 72 | 73 | self._vectorizer = Vectorizer(tokenizer, self._max_seq_length) 74 | 75 | 76 | self._lookup_dict = { 77 | 'train': (self.train_df, self.train_size), 78 | 'val': (self.val_df, self.val_size), 79 | 'test': (self.test_df, self.test_size) 80 | } 81 | 82 | self.set_split('train') 83 | 84 | class_counts = data_df.label.value_counts().to_dict() 85 | #sorted on the basis of class label,eg, 0,1,2.. 
86 | cts = sorted([(lbl,cts) for lbl,cts in class_counts.items()], key=lambda x: x[0]) 87 | freq = [ x[1] for x in cts ] 88 | # print(freq,cts) 89 | self.class_weights = 1.0/ torch.tensor(freq, dtype=torch.float32) 90 | 91 | def _get_max_len(self,data_df: pd.DataFrame, tokenizer: Callable): 92 | prep_func = lambda x: self.tokenizer.prepare_for_tokenization(x,add_prefix_space=True) 93 | len_func = lambda x: len(prep_func(x)) 94 | max_len = data_df.text.map(len_func).max() 95 | return max_len 96 | 97 | # max_len = 0 98 | # for seq in data_df['text']: 99 | # temp = tokenizer.prepare_for_tokenization(seq,add_prefix_space=True) 100 | # tokenized_seq = tokenizer.tokenize(temp) 101 | # if len(tokenized_seq) > max_len: 102 | # max_len = len(tokenized_seq) 103 | # return max_len 104 | 105 | 106 | 107 | def set_split(self, split="train"): 108 | """ selects the splits in the dataset using a column in the dataframe """ 109 | self._target_split = split 110 | self._target_df, self._target_size = self._lookup_dict[split] 111 | 112 | def __len__(self): 113 | return self._target_size 114 | 115 | def __getitem__(self, index): 116 | """the primary entry point method for PyTorch datasets 117 | 118 | Args: 119 | index (int): the index to the data point 120 | Returns: 121 | a dictionary holding the data point's features (x_data) and label (y_target) 122 | """ 123 | row = self._target_df.iloc[index] 124 | 125 | sequence = self._vectorizer.vectorize(row.text) 126 | 127 | label = row.label 128 | return {'x_data': sequence, 129 | 'x_index': index, 130 | 'y_target': label} 131 | 132 | def get_num_batches(self, batch_size): 133 | """Given a batch size, return the number of batches in the dataset 134 | 135 | Args: 136 | batch_size (int) 137 | Returns: 138 | number of batches in the dataset 139 | """ 140 | return len(self) // batch_size 141 | 142 | class TracDataset(Dataset): 143 | def __init__(self, data_df: pd.DataFrame, tokenizer: Callable): 144 | """ 145 | Args: 146 | data_df (pandas.DataFrame): df containing the labels and text 147 | tokenizer (tokenizer module for the transformer) 148 | """ 149 | self.data_df = data_df 150 | self.tokenizer = tokenizer 151 | 152 | # measure_len = lambda context: len(context.split(" ")) 153 | # self._max_seq_length = max(map(measure_len, data_df.text)) + 2 154 | self._max_seq_length = self._get_max_len(data_df,tokenizer) 155 | 156 | self.train_df = self.data_df[self.data_df.split == 'train'] 157 | self.train_size = len(self.train_df) 158 | 159 | self.val_df = self.data_df[self.data_df.split == 'dev'] 160 | self.val_size = len(self.val_df) 161 | 162 | self.test_df = self.data_df[self.data_df.split == 'test'] 163 | self.test_size = len(self.test_df) 164 | 165 | 166 | self._vectorizer = Vectorizer(tokenizer, self._max_seq_length) 167 | 168 | 169 | self._lookup_dict = { 170 | 'train': (self.train_df, self.train_size), 171 | 'val': (self.val_df, self.val_size), 172 | 'test': (self.test_df, self.test_size) 173 | } 174 | 175 | self.set_split('train') 176 | 177 | class_counts = data_df.label.value_counts().to_dict() 178 | #sorted on the basis of class label,eg, 0,1,2.. 
179 | cts = sorted([(lbl,cts) for lbl,cts in class_counts.items()], key=lambda x: x[0]) 180 | freq = [ x[1] for x in cts ] 181 | # print(freq,cts) 182 | self.class_weights = 1.0/ torch.tensor(freq, dtype=torch.float32) 183 | 184 | def _get_max_len(self,data_df: pd.DataFrame, tokenizer: Callable): 185 | max_len = 0 186 | for seq in data_df['text']: 187 | temp = tokenizer.prepare_for_tokenization(seq,add_prefix_space=True) 188 | tokenized_seq = tokenizer.tokenize(temp) 189 | if len(tokenized_seq) > max_len: 190 | max_len = len(tokenized_seq) 191 | return max_len 192 | 193 | 194 | 195 | def set_split(self, split="train"): 196 | """ selects the splits in the dataset using a column in the dataframe """ 197 | self._target_split = split 198 | self._target_df, self._target_size = self._lookup_dict[split] 199 | 200 | def __len__(self): 201 | return self._target_size 202 | 203 | def __getitem__(self, index): 204 | """the primary entry point method for PyTorch datasets 205 | 206 | Args: 207 | index (int): the index to the data point 208 | Returns: 209 | a dictionary holding the data point's features (x_data) and label (y_target) 210 | """ 211 | row = self._target_df.iloc[index] 212 | 213 | sequence = self._vectorizer.vectorize(row.text) 214 | 215 | label = row.label 216 | return {'x_data': sequence, 217 | 'x_index': index, 218 | 'y_target': label} 219 | 220 | def get_num_batches(self, batch_size): 221 | """Given a batch size, return the number of batches in the dataset 222 | 223 | Args: 224 | batch_size (int) 225 | Returns: 226 | number of batches in the dataset 227 | """ 228 | return len(self) // batch_size 229 | 230 | def generate_batches(dataset, batch_size, shuffle=True, 231 | drop_last=False, device="cpu", pinned_memory = False, n_workers = 0): 232 | """ 233 | A generator function which wraps the PyTorch DataLoader. It will 234 | ensure each tensor is on the write device location. 235 | """ 236 | dataloader = DataLoader(dataset=dataset, batch_size=batch_size, 237 | shuffle=shuffle, drop_last=drop_last, 238 | pin_memory= pinned_memory, 239 | num_workers = n_workers, 240 | ) 241 | 242 | for data_dict in dataloader: 243 | out_data_dict = {} 244 | # print(data_dict.items()) 245 | for name, tensor in data_dict.items(): 246 | out_data_dict[name] = data_dict[name].to(device, non_blocking= (True if pinned_memory else False) ) 247 | yield out_data_dict -------------------------------------------------------------------------------- /src/utils/transformer/general.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import classification_report 2 | from sklearn.metrics import confusion_matrix 3 | from sklearn.metrics import accuracy_score 4 | from sklearn.metrics import f1_score 5 | import torch 6 | import pandas as pd 7 | import numpy as np 8 | import torch.nn as nn 9 | 10 | class EarlyStopping: 11 | """Early stops the training if validation loss doesn't improve after a given patience.""" 12 | def __init__(self, patience=7, verbose=False, delta=0): 13 | """ 14 | Args: 15 | patience (int): How long to wait after last time validation loss improved. 16 | Default: 7 17 | verbose (bool): If True, prints a message for each validation loss improvement. 18 | Default: False 19 | delta (float): Minimum change in the monitored quantity to qualify as an improvement. 
20 | Default: 0 21 | """ 22 | self.patience = patience 23 | self.verbose = verbose 24 | self.counter = 0 25 | self.best_score = None 26 | self.early_stop = False 27 | self.val_loss_min = np.Inf 28 | self.delta = delta 29 | 30 | def __call__(self, val_loss, model): 31 | 32 | score = -val_loss # negate the loss so that a lower validation loss gives a higher score 33 | 34 | if self.best_score is None: 35 | self.best_score = score 36 | self.save_checkpoint(val_loss, model) 37 | elif score < self.best_score + self.delta: 38 | self.counter += 1 39 | print(f'EarlyStopping counter: {self.counter} out of {self.patience}') 40 | if self.counter >= self.patience: 41 | self.early_stop = True 42 | else: 43 | self.best_score = score 44 | self.save_checkpoint(val_loss, model) 45 | self.counter = 0 46 | 47 | def save_checkpoint(self, val_loss, model): 48 | '''Saves model when validation loss decreases.''' 49 | if self.verbose: 50 | print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...') 51 | torch.save(model.state_dict(), 'checkpoint.pt') 52 | self.val_loss_min = val_loss 53 | 54 | def compute_accuracy(y_pred, y_target): 55 | y_pred = y_pred.cpu() 56 | y_target = y_target.cpu() 57 | return torch.eq(torch.argmax(y_pred,dim=1),y_target).sum().item() / len(y_pred) 58 | 59 | def compute_macro_f1(y_pred, y_target, average = 'macro'): 60 | y_pred = (torch.argmax(y_pred,dim=1)).cpu().long().numpy() 61 | y_target = y_target.cpu().numpy() 62 | 63 | return f1_score(y_true = y_target, y_pred=y_pred , average=average) 64 | 65 | 66 | def analyse_preds(y_pred, y_target, threshold=0.5): 67 | y_pred = (torch.argmax(y_pred,dim=1) > threshold).cpu().long().numpy() 68 | # y_pred = (torch.argmax(y_pred > threshold,dim=1)).cpu().long().numpy() 69 | y_target = y_target.cpu().numpy() 70 | 71 | conmat = confusion_matrix(y_pred=y_pred, y_true=y_target) 72 | confusion = pd.DataFrame( 73 | conmat, index=["NOT", "HS"], columns=["predicted_NOT", "predicted_HS"] 74 | ) 75 | print("acc = ", accuracy_score(y_pred=y_pred, y_true=y_target)) 76 | print(classification_report(y_pred=y_pred, y_true=y_target, digits=4)) 77 | print(confusion) 78 | 79 | def analyse_preds2(y_pred, y_target, threshold=0.5): 80 | # y_pred = (torch.argmax(y_pred,dim=1) > threshold).cpu().long().numpy() 81 | y_pred = torch.argmax(nn.Sigmoid()(y_pred) > threshold,dim=1).cpu().long().numpy() 82 | y_target = y_target.cpu().numpy() 83 | 84 | conmat = confusion_matrix(y_pred=y_pred, y_true=y_target) 85 | confusion = pd.DataFrame( 86 | conmat, index=["NOT", "HS"], columns=["predicted_NOT", "predicted_HS"] 87 | ) 88 | print("acc = ", accuracy_score(y_pred=y_pred, y_true=y_target)) 89 | print(classification_report(y_pred=y_pred, y_true=y_target, digits=4)) 90 | print(confusion) 91 | -------------------------------------------------------------------------------- /src/utils/transformer/roberta.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import RobertaModel # needed by RobertaModel.from_pretrained in the classes below 4 | 5 | class RobertaAttention(nn.Module): 6 | """Implements Attention Head Classifier 7 | on Pretrained Roberta Transformer representations.
8 | Attention Head Implementation based on: https://www.aclweb.org/anthology/P16-2034/ 9 | """ 10 | def penalized_tanh(self,x): 11 | """ 12 | http://aclweb.org/anthology/D18-1472 13 | """ 14 | alpha = 0.25 15 | return torch.max(torch.tanh(x), alpha*torch.tanh(x)) 16 | def swish(self, x): 17 | """ 18 | Simple implementation of Swish activation function 19 | https://arxiv.org/pdf/1710.05941.pdf 20 | """ 21 | return x * torch.sigmoid(x) 22 | 23 | def mish(self, x): 24 | """ 25 | Simple implementation of Mish activation Function 26 | https://arxiv.org/abs/1908.08681 27 | """ 28 | tanh = nn.Tanh() 29 | softplus = nn.Softplus() 30 | return x * tanh( softplus(x)) 31 | 32 | def __init__(self, model_name, num_labels): 33 | """ 34 | Args: 35 | model_name: model name, eg, roberta-base' 36 | """ 37 | super().__init__() 38 | self.w = nn.Linear(768,1, bias=False) 39 | self.roberta = RobertaModel.from_pretrained(model_name) 40 | self.prediction_layer = nn.Linear(768, num_labels) 41 | 42 | self.init_weights() 43 | 44 | def init_weights(self): 45 | for name, param in self.prediction_layer.named_parameters(): 46 | if 'bias' in name: 47 | nn.init.constant_(param, 0.0) 48 | elif 'weight' in name: 49 | nn.init.xavier_uniform_(param) 50 | for name, param in self.w.named_parameters(): 51 | if 'bias' in name: 52 | nn.init.constant_(param, 0.0) 53 | elif 'weight' in name: 54 | nn.init.xavier_uniform_(param) 55 | 56 | def forward(self, input_ids,attention_mask): 57 | """ 58 | Args: 59 | input_ids: sent encoded into indices 60 | attention_mask: their respective attention masks, 61 | """ 62 | #elmo layer takes care of padding 63 | embeddings = self.roberta(input_ids = input_ids, 64 | attention_mask = attention_mask) 65 | H = embeddings[0] #final hidden layer outputs 66 | # print(H.shape) 67 | M = self.penalized_tanh(H) 68 | alpha = torch.softmax(self.w(M), dim=1) 69 | r = torch.bmm(H.permute(0,2,1),alpha) 70 | h_star = self.penalized_tanh(r) 71 | preds = self.prediction_layer(h_star.permute(0,2,1)) 72 | return preds 73 | 74 | class RobertaAttentionReg(nn.Module): 75 | """Implements Attention Head Classifier 76 | on Pretrained Roberta Transformer representations. 
77 | Attention Head Implementation based on: https://www.aclweb.org/anthology/P16-2034/ 78 | """ 79 | def swish(self, x): 80 | """ 81 | Simple implementation of Swish activation function 82 | https://arxiv.org/pdf/1710.05941.pdf 83 | """ 84 | return x * torch.sigmoid(x) 85 | 86 | def mish(self, x): 87 | """ 88 | Simple implementation of Mish activation Function 89 | https://arxiv.org/abs/1908.08681 90 | """ 91 | tanh = nn.Tanh() 92 | softplus = nn.Softplus() 93 | return x * tanh( softplus(x)) 94 | 95 | def __init__(self, model_name, num_labels): 96 | """ 97 | Args: 98 | model_name: model name, eg, roberta-base' 99 | """ 100 | super().__init__() 101 | self.w = nn.Linear(768,1, bias=False) 102 | self.roberta = RobertaModel.from_pretrained(model_name) 103 | self.prediction_layer = nn.Linear(768, num_labels) 104 | self.dropout = nn.Dropout(p=0.1) 105 | self.init_weights() 106 | 107 | def init_weights(self): 108 | for name, param in self.prediction_layer.named_parameters(): 109 | if 'bias' in name: 110 | nn.init.constant_(param, 0.0) 111 | elif 'weight' in name: 112 | nn.init.kaiming_normal_(param) 113 | for name, param in self.w.named_parameters(): 114 | if 'bias' in name: 115 | nn.init.constant_(param, 0.0) 116 | elif 'weight' in name: 117 | nn.init.kaiming_normal_(param) 118 | 119 | def forward(self, input_ids,attention_mask): 120 | """ 121 | Args: 122 | input_ids: sent encoded into indices 123 | attention_mask: their respective attention masks, 124 | """ 125 | #elmo layer takes care of padding 126 | embeddings = self.roberta(input_ids = input_ids, 127 | attention_mask = attention_mask) 128 | 129 | H = embeddings[0] #final hidden layer outputs 130 | # print(H.shape) 131 | M = self.mish(H) 132 | alpha = torch.softmax(self.w(M), dim=1) 133 | alpha = self.dropout(alpha) 134 | 135 | r = torch.bmm(H.permute(0,2,1),alpha) 136 | 137 | h_star = self.mish(r) 138 | h_star = self.dropout(h_star) 139 | 140 | preds = self.prediction_layer(h_star.permute(0,2,1)) 141 | return preds 142 | 143 | class RobertaAttentionNorm(nn.Module): 144 | """Implements Attention Head Classifier 145 | on Pretrained Roberta Transformer representations. 
146 | Attention Head Implementation based on: https://www.aclweb.org/anthology/P16-2034/ 147 | """ 148 | def swish(self, x): 149 | """ 150 | Simple implementation of Swish activation function 151 | https://arxiv.org/pdf/1710.05941.pdf 152 | """ 153 | return x * torch.sigmoid(x) 154 | 155 | def mish(self, x): 156 | """ 157 | Simple implementation of Mish activation Function 158 | https://arxiv.org/abs/1908.08681 159 | """ 160 | tanh = nn.Tanh() 161 | softplus = nn.Softplus() 162 | return x * tanh( softplus(x)) 163 | 164 | def __init__(self, model_name, num_labels, max_seq_len): 165 | """ 166 | Args: 167 | model_name: model name, eg, roberta-base' 168 | """ 169 | super().__init__() 170 | self.w = nn.Linear(768,1, bias=False) 171 | self.roberta = RobertaModel.from_pretrained(model_name) 172 | self.prediction_layer = nn.Linear(768, num_labels) 173 | self.dropout = nn.Dropout(p=0.1) 174 | self.batchnorm = nn.BatchNorm1d(max_seq_len) 175 | self.init_weights() 176 | 177 | def init_weights(self): 178 | for name, param in self.prediction_layer.named_parameters(): 179 | if 'bias' in name: 180 | nn.init.constant_(param, 0.0) 181 | elif 'weight' in name: 182 | nn.init.kaiming_normal_(param) 183 | for name, param in self.w.named_parameters(): 184 | if 'bias' in name: 185 | nn.init.constant_(param, 0.0) 186 | elif 'weight' in name: 187 | nn.init.kaiming_normal_(param) 188 | 189 | def forward(self, input_ids,attention_mask): 190 | """ 191 | Args: 192 | input_ids: sent encoded into indices 193 | attention_mask: their respective attention masks, 194 | """ 195 | #elmo layer takes care of padding 196 | embeddings = self.roberta(input_ids = input_ids, 197 | attention_mask = attention_mask) 198 | 199 | H = embeddings[0] #final hidden layer outputs 200 | 201 | H = self.batchnorm(H) 202 | 203 | M = self.swish(H) 204 | alpha = torch.softmax(self.w(M), dim=1) 205 | alpha = self.dropout(alpha) 206 | 207 | r = torch.bmm(H.permute(0,2,1),alpha) 208 | 209 | h_star = self.swish(r) 210 | h_star = self.dropout(h_star) 211 | 212 | preds = self.prediction_layer(h_star.permute(0,2,1)) 213 | 214 | return preds --------------------------------------------------------------------------------
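A minimal sketch of how the RAdam optimizer above might be dropped into a training loop, assuming the repository root is on the Python path and a PyTorch release contemporary with this code (radam.py uses the older add_/addcmul_ call signatures); the toy model and hyperparameters below are placeholders, not the settings from the paper.

import torch
import torch.nn as nn
from src.radam import RAdam

# toy classifier standing in for the transformer models used in the notebooks
model = nn.Sequential(nn.Linear(768, 256), nn.ReLU(), nn.Linear(256, 2))
criterion = nn.CrossEntropyLoss()

# constructor arguments mirror the defaults defined in src/radam/radam.py
optimizer = RAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999),
                  eps=1e-8, weight_decay=0, degenerated_to_sgd=True)

x = torch.randn(8, 768)        # fake batch of pooled features
y = torch.randint(0, 2, (8,))  # fake labels

optimizer.zero_grad()
loss = criterion(model(x), y)
loss.backward()
optimizer.step()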
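A similar sketch for the data utilities in src/utils/offenseval2020.py; the file path is hypothetical (any organiser-provided task_*_distant.tsv with id, text and average columns), and threshold=0.5 is an illustrative choice.

from src.utils.offenseval2020 import load_dataset, labeller

# hypothetical location of one of the distant-supervision files from the organisers
df = load_dataset("data/eng/task_a_distant.tsv")

# binarise the crowd confidence scores and keep only the id/text/label columns
df, label_dict = labeller(df, threshold=0.5, task="a", drop_cols=True)
print(label_dict)  # {'OFF': 1, 'NOT': 0}
print(df.head())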
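Finally, a sketch of how the transformer utilities might be combined for task A, continuing from the previous sketch. It assumes a DataFrame with text, label and split columns (the columns HateDataset expects), a Hugging Face RobertaTokenizer, a transformers release contemporary with this code (Vectorizer relies on prepare_for_tokenization accepting add_prefix_space), and that padding positions are the zeros written by Vectorizer; none of this reproduces the exact notebook configuration.

import torch
from transformers import RobertaTokenizer
from src.utils.transformer.data import HateDataset, generate_batches
from src.utils.transformer.roberta import RobertaAttention

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# illustrative: put every sample in the training split so HateDataset finds a 'split' column
df["split"] = "train"
dataset = HateDataset(data_df=df, tokenizer=tokenizer)
dataset.set_split("train")

model = RobertaAttention(model_name="roberta-base", num_labels=2)
model.eval()

with torch.no_grad():
    for batch in generate_batches(dataset, batch_size=8, device="cpu"):
        input_ids = batch["x_data"]
        attention_mask = (input_ids != 0).long()  # assumes the zero padding produced by Vectorizer
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        print(logits.shape)  # (batch, 1, num_labels)
        break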