├── .gitignore ├── LICENSE ├── README.md ├── data ├── eng │ ├── task_a_tiny.zip │ ├── task_b_tiny.zip │ └── task_c_tiny.zip └── test_data │ ├── readme-offenseval-testsetA-english.txt │ ├── readme-offenseval-testsetB-english.txt │ ├── readme-offenseval-testsetC-english.txt │ ├── test_a_tweets.tsv │ ├── test_b_tweets.tsv │ └── test_c_tweets.tsv ├── notebooks ├── Eng Task A - Ensemble DistilGPT2.ipynb ├── Eng Task B - Ensemble Roberta.ipynb └── Eng Task C - Ensemble DistilRoberta AttnMask Dropout.ipynb └── src ├── __init__.py ├── lookahead ├── LICENSE ├── __init__.py └── optimizer.py ├── radam ├── LICENSE.txt ├── __init__.py └── radam.py └── utils ├── __init__.py ├── activations.py ├── general.py ├── offenseval2020.py └── transformer ├── __init.py__ ├── data.py ├── general.py └── roberta.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # macOS 132 | .DS_Store 133 | .DocumentRevisions-V100 134 | .fseventsd 135 | .Spotlight-V100 136 | .TemporaryItems 137 | .Trashes 138 | .VolumeIcon.icns 139 | .com.apple.timemachine.donotpresent 140 | .AppleDB 141 | .AppleDesktop 142 | Network Trash Folder 143 | Temporary Items 144 | .apdisk 145 | .AppleDouble 146 | .LSOverride 147 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Kaushik Amar Das 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OffensEval2020_submission 2 | [OffensEval 2020](https://sites.google.com/site/offensevalsharedtask/home) Model code for Team KAFK 3 | 4 | Paper Link: https://www.aclweb.org/anthology/2020.semeval-1.267.pdf 5 | 6 | Please find the notebooks for the system code used for each task in the `notebooks` directory. 7 | They should work out of the box in Google Colab. However, to fully replicate our work you will need the exact hyperparameters 8 | from the original paper and the full dataset, which might not be possible in Colab. 9 | 10 | We have provided a small subset of the dataset for each task in the `data` folder to use with the above-mentioned notebooks. Please cite the dataset authors if you use the data in your work; the citation is provided below. Also, if you want to use the full dataset, kindly create DataFrames from it in the same manner as used in the notebooks (a rough sketch is given below). 
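For reference, here is a minimal, hypothetical sketch of how such a DataFrame could be put together. The `text`, `label`, and `split` column names and the integer label encoding (the sub-task B notebook uses `TIN = 0`, `UNT = 1`) are taken from the notebooks; the input file names and raw column names below are assumptions, so inspect the provided tiny sets (e.g. `pd.read_csv('data/eng/task_b_tiny.zip', compression='zip')`) to confirm the exact layout before building the full version.

```python
import pandas as pd

# Hypothetical input files -- substitute the full OffensEval 2020 files you obtained.
# Raw columns are assumed to be: id, tweet, label (with string labels such as TIN/UNT).
train_raw = pd.read_csv("full_task_b_train.tsv", sep="\t")
dev_raw = pd.read_csv("full_task_b_dev.tsv", sep="\t")

label_map = {"TIN": 0, "UNT": 1}  # encoding used in the Task B notebook

data_df = pd.concat(
    [
        pd.DataFrame({"text": train_raw["tweet"], "label": train_raw["label"].map(label_map), "split": "train"}),
        pd.DataFrame({"text": dev_raw["tweet"], "label": dev_raw["label"].map(label_map), "split": "val"}),
    ],
    ignore_index=True,
)
# The notebooks also look up rows with split == 'test' for the unlabeled test tweets.

data_df.to_csv("task_b_full.csv", index=False)  # zip this file, or drop compression='zip' from the notebook's read_csv call
```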
11 | 12 | 13 | Credits: 14 | 15 | - RAdam : https://github.com/LiyuanLucasLiu/RAdam 16 | - LookAhead: https://github.com/lonePatient/lookahead_pytorch 17 | - Transformers: https://github.com/huggingface/transformers 18 | 19 | 20 | If you found our paper/scripts useful cite: 21 | ``` 22 | @inproceedings{das2020kafk, 23 | title={KAFK at SemEval-2020 Task 12: Checkpoint Ensemble of Transformers for Hate Speech Classification}, 24 | author={Das, Kaushik Amar and Baruah, Arup and Barbhuiya, Ferdous Ahmed and Dey, Kuntal}, 25 | booktitle={Proceedings of the Fourteenth Workshop on Semantic Evaluation}, 26 | pages={2023--2029}, 27 | year={2020} 28 | } 29 | ``` 30 | 31 | If you used the data please cite 32 | ``` 33 | @inproceedings{rosenthal2020, 34 | title={{A Large-Scale Semi-Supervised Dataset for Offensive Language Identification}}, 35 | author={Rosenthal, Sara and Atanasova, Pepa and Karadzhov, Georgi and Zampieri, Marcos and Nakov, Preslav}, 36 | year={2020}, 37 | booktitle={arxiv} 38 | } 39 | ``` 40 | -------------------------------------------------------------------------------- /data/eng/task_a_tiny.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cozek/OffensEval2020-code/874b86f14112326c9ba09965174142bdb0acdfb0/data/eng/task_a_tiny.zip -------------------------------------------------------------------------------- /data/eng/task_b_tiny.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cozek/OffensEval2020-code/874b86f14112326c9ba09965174142bdb0acdfb0/data/eng/task_b_tiny.zip -------------------------------------------------------------------------------- /data/eng/task_c_tiny.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cozek/OffensEval2020-code/874b86f14112326c9ba09965174142bdb0acdfb0/data/eng/task_c_tiny.zip -------------------------------------------------------------------------------- /data/test_data/readme-offenseval-testsetA-english.txt: -------------------------------------------------------------------------------- 1 | ======================== 2 | 3 | OffensEval 2020: Multilingual Offensive Language Identification in Social Media (SemEval 2020 - Task 12) 4 | Test data - Sub-task A 5 | v 1.0: February 26 2020 6 | https://sites.google.com/site/offensevalsharedtask/home 7 | 8 | ======================== 9 | 10 | 1) DESCRIPTION 11 | 12 | The file testset-taska.tsv contains 3887 unlabeled tweets. 13 | 14 | You are required to upload your sub-task A predictions for each of the 3887 instances to CodaLab by no later than 4 March 2020 (23:59 GMT). 15 | 16 | You will find ALL the necessary information regarding data format, dates, number of submissions, etc. at CodaLab. 17 | 18 | 2) FORMAT 19 | 20 | Instances are included in TSV format as follows: 21 | 22 | ID INSTANCE 23 | 24 | The column names in the file are the following: 25 | 26 | id tweet 27 | 28 | 3) TASK AND LABELS 29 | 30 | (A) Sub-task A: Offensive language identification 31 | 32 | - (NOT) Not Offensive - This post does not contain offense or profanity. 33 | - (OFF) Offensive - This post contains offensive language or a targeted (veiled or direct) offense. 34 | 35 | In our annotation, we label a post as offensive (OFF) if it contains any form of non-acceptable language (profanity) or a targeted offense, which can be veiled or direct. 
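A minimal sketch, assuming pandas, the test_a_tweets.tsv file from this folder, and a list of predicted NOT/OFF labels (one per tweet, in file order), of how the submission file described in section 4) SUBMISSION below could be produced; the index=False/header=False choices mirror what the repository's notebooks use when writing their label files:

    import pandas as pd

    test_df = pd.read_csv("test_a_tweets.tsv", sep="\t")   # columns: id, tweet
    predicted_labels = ["NOT"] * len(test_df)               # placeholder predictions

    submission = pd.DataFrame({"id": test_df["id"], "label": predicted_labels})
    submission.to_csv("task_a_submission.csv", index=False, header=False)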
36 | 37 | 4) SUBMISSION 38 | 39 | When you have your model predictions, you should upload a CSV file on CodaLab in the following format. 40 | 41 | ID, LABEL 42 | 43 | 4) CREDITS 44 | 45 | Task Organizers 46 | 47 | Marcos Zampieri - Rochester Institute of Technology, USA 48 | Preslav Nakov - Qatar Computing Research Institute, Qatar 49 | Sara Rosenthal - IBM Research, USA 50 | Pepa Atanasova - University of Copenhagen, Denmark 51 | Georgi Karadzhov - University of Cambridge, UK 52 | Hamdy Mubarak - Qatar Computing Research Institute, Qatar 53 | Leon Derczynski - IT University Copenhagen, Denmark 54 | Zeses Pitenis - University of Wolverhampton, UK 55 | Çağrı Çöltekin - University of Tübingen, Germany 56 | 57 | 5) Contact 58 | 59 | Organizers: semeval-2020-task-12-organizers@googlegroups.com 60 | All participants: semeval-2020-task-12-all@googlegroups.com -------------------------------------------------------------------------------- /data/test_data/readme-offenseval-testsetB-english.txt: -------------------------------------------------------------------------------- 1 | ======================== 2 | 3 | OffensEval 2020: Multilingual Offensive Language Identification in Social Media (SemEval 2020 - Task 12) 4 | Test data - Sub-task B 5 | v 1.0: February 26 2020 6 | https://sites.google.com/site/offensevalsharedtask/home 7 | 8 | ======================== 9 | 10 | 1) DESCRIPTION 11 | 12 | The file test_b_tweets.tsv contains 1422 unlabeled tweets. This file contains ONLY tweets which are offensive. 13 | 14 | You are required to upload your sub-task B predictions for each of the 1422 instances to CodaLab by no later than 4 Mar 2020 (23:59 GMT). 15 | 16 | You will find ALL the necessary information regarding data format, dates, number of submissions, etc. at CodaLab. Please read it carefully. 17 | 18 | 2) FORMAT 19 | 20 | Instances are included in TSV format as follows: 21 | 22 | id tweet 23 | 24 | 3) TASK AND LABELS 25 | 26 | (B) Sub-task B: Automatic categorization of offense types 27 | 28 | - (TIN) Targeted Insult and Threats - A post containing an insult or threat to an individual, a group, or others (see categories in sub-task C). 29 | - (UNT) Untargeted - A post containing non-targeted profanity and swearing. 30 | 31 | Posts containing general profanity are not targeted but they contain non-acceptable language. 32 | 33 | 4) SUBMISSION 34 | 35 | When you have your model predictions, you should upload a CSV file on CodaLab in the following format. 
36 | 37 | ID, LABEL 38 | 39 | 5) CREDITS 40 | 41 | Task Organizers 42 | 43 | Marcos Zampieri - Rochester Institute of Technology, USA 44 | Preslav Nakov - Qatar Computing Research Institute, Qatar 45 | Sara Rosenthal - IBM Research, USA 46 | Pepa Atanasova - University of Copenhagen, Denmark 47 | Georgi Karadzhov - University of Cambridge, UK 48 | Hamdy Mubarak - Qatar Computing Research Institute, Qatar 49 | Leon Derczynski - IT University Copenhagen, Denmark 50 | Zeses Pitenis - University of Wolverhampton, UK 51 | Çağrı Çöltekin - University of Tübingen, Germany 52 | 53 | 6) Contact 54 | 55 | Organizers: semeval-2020-task-12-organizers@googlegroups.com 56 | All participants: semeval-2020-task-12-all@googlegroups.com -------------------------------------------------------------------------------- /data/test_data/readme-offenseval-testsetC-english.txt: -------------------------------------------------------------------------------- 1 | ======================== 2 | 3 | OffensEval 2020: Multilingual Offensive Language Identification in Social Media (SemEval 2020 - Task 12) 4 | Test data - Sub-task C 5 | v 1.0: March 5 2020 6 | https://sites.google.com/site/offensevalsharedtask/home 7 | 8 | ======================== 9 | 10 | 1) DESCRIPTION 11 | 12 | The file test_c_tweets.tsv contains 850 unlabeled tweets. This file contains ONLY tweets which are offensive AND targeted. 13 | 14 | You are required to upload your sub-task C predictions for each of the 850 instances to CodaLab by no later than 11 Mar 2020 (23:59 GMT). 15 | 16 | You will find ALL the necessary information regarding data format, dates, number of submissions, etc. at CodaLab. Please read it carefully. 17 | 18 | 2) FORMAT 19 | 20 | Instances are included in TSV format as follows: 21 | 22 | id tweet 23 | 24 | 3) TASK AND LABELS 25 | 26 | (C) Sub-task C: Offense target identification 27 | 28 | - (IND) Individual - The target of the offensive post is an individual: a famous person, a named individual or an unnamed person interacting in the conversation. 29 | - (GRP) Group - The target of the offensive post is a group of people considered as a unity due to the same ethnicity, gender or sexual orientation, political affiliation, religious belief, or something else. 30 | - (OTH) Other – The target of the offensive post does not belong to any of the previous two categories (e.g., an organization, a situation, an event, or an issue) 31 | 32 | 4) SUBMISSION 33 | 34 | When you have your model predictions, you should upload a ZIP file containing a CSV file on CodaLab in the following format. 
35 | 36 | ID, LABEL 37 | 38 | 5) CREDITS 39 | 40 | Task Organizers 41 | 42 | Marcos Zampieri - Rochester Institute of Technology, USA 43 | Preslav Nakov - Qatar Computing Research Institute, Qatar 44 | Sara Rosenthal - IBM Research, USA 45 | Pepa Atanasova - University of Copenhagen, Denmark 46 | Georgi Karadzhov - University of Cambridge, UK 47 | Hamdy Mubarak - Qatar Computing Research Institute, Qatar 48 | Leon Derczynski - IT University Copenhagen, Denmark 49 | Zeses Pitenis - University of Wolverhampton, UK 50 | Çağrı Çöltekin - University of Tübingen, Germany 51 | 52 | 6) Contact 53 | 54 | Organizers: semeval-2020-task-12-organizers@googlegroups.com 55 | All participants: semeval-2020-task-12-all@googlegroups.com -------------------------------------------------------------------------------- /notebooks/Eng Task B - Ensemble Roberta.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.7.0" 21 | }, 22 | "colab": { 23 | "name": "Eng Task B - Ensemble Roberta.ipynb", 24 | "provenance": [] 25 | }, 26 | "accelerator": "GPU" 27 | }, 28 | "cells": [ 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "D_dUF2evouow", 33 | "colab_type": "text" 34 | }, 35 | "source": [ 36 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cozek/OffensEval2020-code/blob/master/notebooks/Eng%20Task%20B%20-%20Ensemble%20Roberta.ipynb)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "u7Uo50Chouox", 43 | "colab_type": "text" 44 | }, 45 | "source": [ 46 | "# Import Libraries\n", 47 | "\n", 48 | "At the time of our work, we used the following library versions\n", 49 | "- numpy 1.18.1\n", 50 | "- pandas 1.0.1\n", 51 | "- torch 1.2.0\n", 52 | "- Cuda 10.0\n", 53 | "- python 3.7.0\n", 54 | "- sklearn 0.22.1\n", 55 | "- tqdm 4.42.1\n", 56 | "- nltk 3.4.5" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "metadata": { 62 | "id": "vFPGTXG3ouox", 63 | "colab_type": "code", 64 | "colab": {} 65 | }, 66 | "source": [ 67 | "!git clone https://github.com/cozek/OffensEval2020-code/" 68 | ], 69 | "execution_count": 0, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "FpGHYSkhouo0", 76 | "colab_type": "code", 77 | "colab": {} 78 | }, 79 | "source": [ 80 | "!git clone https://github.com/huggingface/transformers\n", 81 | "!pip install /content/transformers/" 82 | ], 83 | "execution_count": 0, 84 | "outputs": [] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "metadata": { 89 | "id": "6om7VnoNouo3", 90 | "colab_type": "code", 91 | "colab": {} 92 | }, 93 | "source": [ 94 | "import sys\n", 95 | "sys.path.append('/content/OffensEval2020-code/src/')\n", 96 | "import collections\n", 97 | "from typing import Callable\n", 98 | "import numpy as np\n", 99 | "np.random.seed(42)\n", 100 | "import pandas as pd\n", 101 | "from tqdm import notebook\n", 102 | "import importlib\n", 103 | "import pprint\n", 104 | "import nltk\n", 105 | "import datetime\n", 106 | "import os\n", 107 | "from argparse import 
Namespace\n", 108 | "\n", 109 | "from collections import Counter" 110 | ], 111 | "execution_count": 0, 112 | "outputs": [] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "metadata": { 117 | "id": "iyUPjzykouo5", 118 | "colab_type": "code", 119 | "colab": {} 120 | }, 121 | "source": [ 122 | "import utils.general as general_utils\n", 123 | "import utils.transformer.data as transformer_data_utils\n", 124 | "import utils.transformer.general as transformer_general_utils\n", 125 | "general_utils.set_seed_everywhere()" 126 | ], 127 | "execution_count": 0, 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "metadata": { 133 | "id": "MoJmxSDPouo9", 134 | "colab_type": "code", 135 | "colab": {} 136 | }, 137 | "source": [ 138 | "import logging\n", 139 | "logging.basicConfig(level=logging.INFO) " 140 | ], 141 | "execution_count": 0, 142 | "outputs": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "metadata": { 147 | "id": "2ehv7SLoouo_", 148 | "colab_type": "code", 149 | "colab": {} 150 | }, 151 | "source": [ 152 | "import torch\n", 153 | "import torch.nn as nn\n", 154 | "import torch.nn.functional as F\n", 155 | "import torch.optim as optim\n", 156 | "from torch.utils.data import Dataset, DataLoader\n", 157 | "torch.__version__ # we used version 1.2.0\n" 158 | ], 159 | "execution_count": 0, 160 | "outputs": [] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "metadata": { 165 | "id": "TkS9WQy2oupC", 166 | "colab_type": "code", 167 | "colab": {} 168 | }, 169 | "source": [ 170 | "# Import RAdam and Lookahead\n", 171 | "from radam.radam import RAdam\n", 172 | "from lookahead.optimizer import Lookahead\n" 173 | ], 174 | "execution_count": 0, 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "71wul4V7oupF", 181 | "colab_type": "code", 182 | "colab": {} 183 | }, 184 | "source": [ 185 | "from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification" 186 | ], 187 | "execution_count": 0, 188 | "outputs": [] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "metadata": { 193 | "id": "eoAlBoFYoupH", 194 | "colab_type": "code", 195 | "colab": {} 196 | }, 197 | "source": [ 198 | " args = Namespace(\n", 199 | " #use cuda by default\n", 200 | " device = 'cuda' if torch.cuda.is_available() else 'cpu',\n", 201 | " \n", 202 | " #set batch size and number of epochs\n", 203 | " batch_size = 32,\n", 204 | " num_epochs = 20,\n", 205 | " \n", 206 | " #set the learning rate\n", 207 | " learning_rate = 0.0001,\n", 208 | "\n", 209 | " #location of the train, dev and test csv\n", 210 | " train_val_csv = '/content/OffensEval2020-code/data/eng/task_b_tiny.zip',\n", 211 | " test_csv = '/content/OffensEval2020-code/data/test_data/test_b_tweets.tsv',\n", 212 | " \n", 213 | " #directory to save our models at\n", 214 | " directory = './models/', \n", 215 | " model_name = 'roberta.pt',\n", 216 | " \n", 217 | " date = datetime.datetime.now().strftime(\"%a_%d_%b_%Y/\"),\n", 218 | ")" 219 | ], 220 | "execution_count": 0, 221 | "outputs": [] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "id": "Tt5X-gWsoupL", 227 | "colab_type": "text" 228 | }, 229 | "source": [ 230 | "## Model save location" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "metadata": { 236 | "id": "xMOYDefpoupM", 237 | "colab_type": "code", 238 | "colab": {} 239 | }, 240 | "source": [ 241 | "directory = args.directory + args.date\n", 242 | "if not os.path.exists(directory):\n", 243 | " os.makedirs(directory)\n", 244 | "args.directory 
= directory\n", 245 | "print(args.directory)" 246 | ], 247 | "execution_count": 0, 248 | "outputs": [] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "id": "thSqAbT3oupP", 254 | "colab_type": "text" 255 | }, 256 | "source": [ 257 | "## Load presplit dataset portion\n", 258 | "```\n", 259 | "Labelled as\n", 260 | "\n", 261 | "'UNT': 1\n", 262 | "'TIN': 0\n", 263 | "```" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "metadata": { 269 | "id": "2WazWg4zoupP", 270 | "colab_type": "code", 271 | "colab": {} 272 | }, 273 | "source": [ 274 | "data_df_task_b = pd.read_csv(args.train_val_csv, compression='zip')\n", 275 | "print(data_df_task_b.label.value_counts())\n", 276 | "print(data_df_task_b.split.value_counts())" 277 | ], 278 | "execution_count": 0, 279 | "outputs": [] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "metadata": { 284 | "id": "_dV3P48EphCH", 285 | "colab_type": "code", 286 | "colab": {} 287 | }, 288 | "source": [ 289 | "data_df_task_b.columns" 290 | ], 291 | "execution_count": 0, 292 | "outputs": [] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "metadata": { 297 | "id": "LkUZ5O46oupS", 298 | "colab_type": "code", 299 | "colab": {} 300 | }, 301 | "source": [ 302 | "with pd.option_context('display.max_colwidth', -1): \n", 303 | " print(data_df_task_b[['text','label']].sample(5))" 304 | ], 305 | "execution_count": 0, 306 | "outputs": [] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": { 311 | "id": "NZrYLTrxoupU", 312 | "colab_type": "text" 313 | }, 314 | "source": [ 315 | "## Importing the Roberta Tokeniker and Punkt sentence tokenizer" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "metadata": { 321 | "id": "PtFADHnToupV", 322 | "colab_type": "code", 323 | "colab": {} 324 | }, 325 | "source": [ 326 | "class RobertaPreprocessor():\n", 327 | " def __init__(self,transformer_tokenizer,sentence_detector):\n", 328 | " self.transformer_tokenizer = transformer_tokenizer\n", 329 | " self.sentence_detector = sentence_detector\n", 330 | " self.bos_token = transformer_tokenizer.bos_token\n", 331 | " self.sep_token = ' ' + transformer_tokenizer.sep_token + ' '\n", 332 | " def add_special_tokens(self, text):\n", 333 | " sentences = self.sentence_detector.tokenize(text)\n", 334 | " eos_added_text = self.sep_token.join(sentences) \n", 335 | " return self.bos_token +' '+ eos_added_text + ' ' + self.transformer_tokenizer.sep_token" 336 | ], 337 | "execution_count": 0, 338 | "outputs": [] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "metadata": { 343 | "id": "ar4EbV4BoupX", 344 | "colab_type": "code", 345 | "colab": {} 346 | }, 347 | "source": [ 348 | "!python -c 'import nltk; nltk.download(\"punkt\")'" 349 | ], 350 | "execution_count": 0, 351 | "outputs": [] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "metadata": { 356 | "scrolled": true, 357 | "id": "6RlaXDBtoupY", 358 | "colab_type": "code", 359 | "colab": {} 360 | }, 361 | "source": [ 362 | "roberta_tokenizer = tokenizer = RobertaTokenizer.from_pretrained('roberta-base')\n", 363 | "punkt_sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')" 364 | ], 365 | "execution_count": 0, 366 | "outputs": [] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "metadata": { 371 | "id": "RKFM-0bqoupb", 372 | "colab_type": "code", 373 | "colab": {} 374 | }, 375 | "source": [ 376 | "roberta_preproc = RobertaPreprocessor(roberta_tokenizer, punkt_sentence_detector)" 377 | ], 378 | "execution_count": 0, 379 | "outputs": [] 380 | }, 381 | { 382 | "cell_type": "code", 
383 | "metadata": { 384 | "id": "jDq61UIRoupf", 385 | "colab_type": "code", 386 | "colab": {} 387 | }, 388 | "source": [ 389 | "#apply the preprocessor on the exploded dataframe\n", 390 | "data_df_task_b['text'] = data_df_task_b['text'].map(roberta_preproc.add_special_tokens)\n" 391 | ], 392 | "execution_count": 0, 393 | "outputs": [] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "metadata": { 398 | "id": "kocu45Xtoupj", 399 | "colab_type": "code", 400 | "colab": {} 401 | }, 402 | "source": [ 403 | "with pd.option_context('display.max_colwidth', -1): \n", 404 | " print(data_df_task_b[['text','label']].sample(5))" 405 | ], 406 | "execution_count": 0, 407 | "outputs": [] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": { 412 | "id": "0NOwcyecoupm", 413 | "colab_type": "text" 414 | }, 415 | "source": [ 416 | "### Here we create the dataset" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "metadata": { 422 | "id": "KyQMayxOoupn", 423 | "colab_type": "code", 424 | "colab": {} 425 | }, 426 | "source": [ 427 | "class SimpleVectorizer():\n", 428 | " def __init__(self,tokenizer: Callable, max_seq_len: int):\n", 429 | " \"\"\"\n", 430 | " Args:\n", 431 | " tokenizer (Callable): transformer tokenizer\n", 432 | " max_seq_len (int): Maximum sequence lenght \n", 433 | " \"\"\"\n", 434 | " self.tokenizer = tokenizer\n", 435 | " self._max_seq_len = max_seq_len\n", 436 | "\n", 437 | " def vectorize(self,text :str):\n", 438 | " \n", 439 | " encoded = self.tokenizer.encode_plus(\n", 440 | " text,\n", 441 | " add_special_tokens=False, #already added by preproc\n", 442 | " max_length = self._max_seq_len,\n", 443 | " pad_to_max_length = True,\n", 444 | " )\n", 445 | " ids = np.array(encoded['input_ids'], dtype=np.int64)\n", 446 | " attn = np.array(encoded['attention_mask'], dtype=np.int64)\n", 447 | " \n", 448 | " return ids, attn" 449 | ], 450 | "execution_count": 0, 451 | "outputs": [] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "metadata": { 456 | "id": "JI0SzK1Woupw", 457 | "colab_type": "code", 458 | "colab": {} 459 | }, 460 | "source": [ 461 | "class HateDataset(Dataset):\n", 462 | " def __init__(\n", 463 | " self,\n", 464 | " data_df: pd.DataFrame,\n", 465 | " tokenizer: Callable,\n", 466 | " max_seq_length:int = None,\n", 467 | " ):\n", 468 | " \"\"\"\n", 469 | " Args:\n", 470 | " data_df (pandas.DataFrame): df containing the labels and text\n", 471 | " tokenizer (tokenizer module for the transformer)\n", 472 | " \"\"\"\n", 473 | " self.data_df = data_df\n", 474 | " self.tokenizer = tokenizer\n", 475 | "\n", 476 | " if max_seq_length is None:\n", 477 | " self._max_seq_length = self._get_max_len(data_df,tokenizer)\n", 478 | " else:\n", 479 | " self._max_seq_length = max_seq_length\n", 480 | "\n", 481 | " self.train_df = self.data_df[self.data_df.split == 'train']\n", 482 | " self.train_size = len(self.train_df)\n", 483 | "\n", 484 | " self.val_df = self.data_df[self.data_df.split == 'val']\n", 485 | " self.val_size = len(self.val_df)\n", 486 | "\n", 487 | " self.test_df = self.data_df[self.data_df.split == 'test']\n", 488 | " self.test_size = len(self.test_df)\n", 489 | " \n", 490 | " self._simple_vectorizer = SimpleVectorizer(tokenizer, self._max_seq_length)\n", 491 | " \n", 492 | " self._lookup_dict = {\n", 493 | " 'train': (self.train_df, self.train_size),\n", 494 | " 'val': (self.val_df, self.val_size),\n", 495 | " 'test': (self.test_df, self.test_size)\n", 496 | " }\n", 497 | "\n", 498 | " self.set_split('train')\n", 499 | "\n", 500 | " class_counts = 
data_df.label.value_counts().to_dict()\n", 501 | " #sorted on the basis of class label,eg, 0,1,2..\n", 502 | " cts = sorted([(lbl,cts) for lbl,cts in class_counts.items()], key=lambda x: x[0])\n", 503 | " freq = [ x[1] for x in cts ]\n", 504 | " # print(freq,cts)\n", 505 | " self.class_weights = 1.0/ torch.tensor(freq, dtype=torch.float32)\n", 506 | " \n", 507 | " \n", 508 | " def _get_max_len(self,data_df: pd.DataFrame, tokenizer: Callable):\n", 509 | " len_func = lambda x: len(self.tokenizer.encode_plus(x)['input_ids'])\n", 510 | " max_len = data_df.text.map(len_func).max() \n", 511 | " return max_len\n", 512 | "\n", 513 | " def set_split(self, split=\"train\"):\n", 514 | " \"\"\" selects the splits in the dataset using a column in the dataframe \"\"\"\n", 515 | " self._target_split = split\n", 516 | " self._target_df, self._target_size = self._lookup_dict[split]\n", 517 | " \n", 518 | " def __len__(self):\n", 519 | " return self._target_size\n", 520 | " \n", 521 | " def __getitem__(self, index):\n", 522 | " \"\"\"the primary entry point method for PyTorch datasets\n", 523 | " \n", 524 | " Args:\n", 525 | " index (int): the index to the data point \n", 526 | " Returns:\n", 527 | " a dictionary holding the data point's features (x_data) and label (y_target)\n", 528 | " \"\"\"\n", 529 | " row = self._target_df.iloc[index]\n", 530 | " \n", 531 | " indices, attention_masks = self._simple_vectorizer.vectorize(row.text)\n", 532 | "\n", 533 | " label = row.label\n", 534 | " return {'x_data': indices,\n", 535 | " 'x_attn_mask': attention_masks,\n", 536 | " 'x_index': index,\n", 537 | " 'y_target': label}\n", 538 | " \n", 539 | " def get_num_batches(self, batch_size):\n", 540 | " \"\"\"Given a batch size, return the number of batches in the dataset\n", 541 | " \n", 542 | " Args:\n", 543 | " batch_size (int)\n", 544 | " Returns:\n", 545 | " number of batches in the dataset\n", 546 | " \"\"\"\n", 547 | " return len(self) // batch_size" 548 | ], 549 | "execution_count": 0, 550 | "outputs": [] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "metadata": { 555 | "id": "PGvvsTq9oupy", 556 | "colab_type": "code", 557 | "colab": {} 558 | }, 559 | "source": [ 560 | "def generate_batches(dataset, batch_size, shuffle=True,\n", 561 | " drop_last=False, device=\"cpu\", pinned_memory = False, n_workers = 0): \n", 562 | " \"\"\"\n", 563 | " A generator function which wraps the PyTorch DataLoader. 
It will \n", 564 | " ensure each tensor is on the write device location.\n", 565 | " \"\"\"\n", 566 | " dataloader = DataLoader(dataset=dataset, batch_size=batch_size,\n", 567 | " shuffle=shuffle, drop_last=drop_last,\n", 568 | " pin_memory= pinned_memory,\n", 569 | " num_workers = n_workers,\n", 570 | " )\n", 571 | " \n", 572 | " for data_dict in dataloader:\n", 573 | " out_data_dict = {}\n", 574 | " out_data_dict['x_data'] = data_dict['x_data'].to(\n", 575 | " device, non_blocking= (True if pinned_memory else False) \n", 576 | " )\n", 577 | " out_data_dict['x_attn_mask'] = data_dict['x_attn_mask'].to(\n", 578 | " device, non_blocking= (True if pinned_memory else False) \n", 579 | " )\n", 580 | " out_data_dict['x_index'] = data_dict['x_index']\n", 581 | " out_data_dict['y_target'] = data_dict['y_target'].to(\n", 582 | " device, non_blocking= (True if pinned_memory else False) \n", 583 | " )\n", 584 | " yield out_data_dict" 585 | ], 586 | "execution_count": 0, 587 | "outputs": [] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "metadata": { 592 | "id": "dqjxEQKZoup0", 593 | "colab_type": "code", 594 | "colab": {} 595 | }, 596 | "source": [ 597 | "dataset = HateDataset(\n", 598 | " data_df = data_df_task_b,\n", 599 | " tokenizer = roberta_tokenizer\n", 600 | ")" 601 | ], 602 | "execution_count": 0, 603 | "outputs": [] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "metadata": { 608 | "scrolled": true, 609 | "id": "r1S0e8djoup3", 610 | "colab_type": "code", 611 | "colab": {} 612 | }, 613 | "source": [ 614 | "assert dataset._max_seq_length <= 1024" 615 | ], 616 | "execution_count": 0, 617 | "outputs": [] 618 | }, 619 | { 620 | "cell_type": "markdown", 621 | "metadata": { 622 | "id": "faAM7TdDoup5", 623 | "colab_type": "text" 624 | }, 625 | "source": [ 626 | "# Initialize the Roberta model" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "metadata": { 632 | "scrolled": false, 633 | "id": "Sdjpj_fvoup6", 634 | "colab_type": "code", 635 | "colab": {} 636 | }, 637 | "source": [ 638 | "model = RobertaForSequenceClassification.from_pretrained(\n", 639 | " 'roberta-base',\n", 640 | " num_labels=len(set(data_df_task_b.label)),\n", 641 | ")" 642 | ], 643 | "execution_count": 0, 644 | "outputs": [] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "metadata": { 649 | "scrolled": true, 650 | "id": "7VAugC07oup8", 651 | "colab_type": "code", 652 | "colab": {} 653 | }, 654 | "source": [ 655 | "model.to(args.device)" 656 | ], 657 | "execution_count": 0, 658 | "outputs": [] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "metadata": { 663 | "id": "2nf9iX9Eoup_", 664 | "colab_type": "code", 665 | "colab": {} 666 | }, 667 | "source": [ 668 | "early_stopping = transformer_general_utils.EarlyStopping(patience=4)" 669 | ], 670 | "execution_count": 0, 671 | "outputs": [] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "metadata": { 676 | "id": "BjtphpU6s4LV", 677 | "colab_type": "code", 678 | "colab": {} 679 | }, 680 | "source": [ 681 | "!nvidia-smi" 682 | ], 683 | "execution_count": 0, 684 | "outputs": [] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "metadata": { 689 | "id": "ZjFXYTcEouqB", 690 | "colab_type": "code", 691 | "colab": {} 692 | }, 693 | "source": [ 694 | "args.num_epochs = 20\n", 695 | "args.batch_size = 16 #set according to GPU capacity" 696 | ], 697 | "execution_count": 0, 698 | "outputs": [] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "metadata": { 703 | "scrolled": false, 704 | "id": "OYfdxscNouqE", 705 | "colab_type": "code", 706 | "colab": {} 707 | }, 708 | 
"source": [ 709 | "loss_func = nn.CrossEntropyLoss()\n", 710 | "\n", 711 | "print(f'Using LR:{args.learning_rate}')\n", 712 | "base_optimizer = RAdam(model.parameters(), lr = args.learning_rate)\n", 713 | "optimizer = Lookahead(optimizer = base_optimizer, k = 5, alpha=0.5 )\n", 714 | "scheduler = optim.lr_scheduler.ReduceLROnPlateau(\n", 715 | " optimizer=optimizer.optimizer, factor =0.1 ,mode='max',\n", 716 | ")" 717 | ], 718 | "execution_count": 0, 719 | "outputs": [] 720 | }, 721 | { 722 | "cell_type": "markdown", 723 | "metadata": { 724 | "id": "HvhB0DIPouqH", 725 | "colab_type": "text" 726 | }, 727 | "source": [ 728 | "# Begin Training" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "metadata": { 734 | "id": "ta4xhZcdouqH", 735 | "colab_type": "code", 736 | "colab": {} 737 | }, 738 | "source": [ 739 | "train_state = general_utils.make_train_state()\n", 740 | "train_state.keys()\n" 741 | ], 742 | "execution_count": 0, 743 | "outputs": [] 744 | }, 745 | { 746 | "cell_type": "code", 747 | "metadata": { 748 | "scrolled": false, 749 | "id": "Hyp2Q5ReouqK", 750 | "colab_type": "code", 751 | "colab": {} 752 | }, 753 | "source": [ 754 | "epoch_bar = notebook.tqdm(\n", 755 | " desc = 'training_routine',\n", 756 | " total = args.num_epochs,\n", 757 | " position=0,\n", 758 | " leave = True,\n", 759 | ")\n", 760 | "dataset.set_split('train')\n", 761 | "train_bar = notebook.tqdm(\n", 762 | " desc = 'split=train ',\n", 763 | " total=dataset.get_num_batches(args.batch_size),\n", 764 | " position=0,\n", 765 | " leave=True,\n", 766 | ")\n", 767 | "dataset.set_split('val')\n", 768 | "eval_bar = notebook.tqdm(\n", 769 | " desc = 'split=eval',\n", 770 | " total=dataset.get_num_batches(args.batch_size),\n", 771 | " position=0,\n", 772 | " leave=True,\n", 773 | ")\n", 774 | "\n", 775 | "old_val_acc = 0\n", 776 | "old_f1 = 0\n", 777 | "model_state = None\n", 778 | "for epoch_index in range(args.num_epochs):\n", 779 | " train_state['epoch_in'] = epoch_index\n", 780 | "\n", 781 | " dataset.set_split('train')\n", 782 | "\n", 783 | " batch_generator = generate_batches(\n", 784 | " dataset= dataset, batch_size= args.batch_size, shuffle=True,\n", 785 | " device = args.device, drop_last=False,\n", 786 | " pinned_memory = True, n_workers = 3, \n", 787 | " )\n", 788 | "\n", 789 | " running_loss = 0.0\n", 790 | " running_acc = 0.0\n", 791 | " running_f1 = 0.0\n", 792 | " model.train()\n", 793 | "\n", 794 | " train_bar.reset(\n", 795 | " total=dataset.get_num_batches(args.batch_size),\n", 796 | " )\n", 797 | "\n", 798 | " for batch_index, batch_dict in enumerate(batch_generator):\n", 799 | " optimizer.zero_grad()\n", 800 | " \n", 801 | " loss,y_pred = model(\n", 802 | " input_ids = batch_dict['x_data'],\n", 803 | " attention_mask = batch_dict['x_attn_mask'],\n", 804 | " labels= batch_dict['y_target'].unsqueeze(1),\n", 805 | " )[:2]\n", 806 | " \n", 807 | " y_pred = y_pred.view(-1, len(set(dataset.data_df.label)))\n", 808 | " \n", 809 | " loss.backward()\n", 810 | " optimizer.step()\n", 811 | " \n", 812 | " loss_t = loss.item()\n", 813 | " running_loss += (loss_t - running_loss) / (batch_index + 1)\n", 814 | " \n", 815 | " y_pred = y_pred.detach().cpu()\n", 816 | " batch_dict['y_target'] = batch_dict['y_target'].cpu()\n", 817 | " \n", 818 | " acc_t = transformer_general_utils \\\n", 819 | " .compute_accuracy(y_pred, batch_dict['y_target'])\n", 820 | " \n", 821 | " f1_t = transformer_general_utils \\\n", 822 | " .compute_macro_f1(y_pred, batch_dict['y_target'])\n", 823 | "\n", 824 | " 
train_state['batch_preds'].append(y_pred)\n", 825 | " train_state['batch_targets'].append(batch_dict['y_target'])\n", 826 | " train_state['batch_indexes'].append(batch_dict['x_index'])\n", 827 | "\n", 828 | " running_acc += (acc_t - running_acc) / (batch_index + 1)\n", 829 | " running_f1 += (f1_t - running_f1) / (batch_index + 1)\n", 830 | "\n", 831 | " train_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,\n", 832 | " epoch=epoch_index)\n", 833 | "\n", 834 | " train_bar.update()\n", 835 | "\n", 836 | " if torch.cuda.is_available():\n", 837 | " torch.cuda.empty_cache()\n", 838 | " \n", 839 | " train_state['train_accuracies'].append(running_acc)\n", 840 | " train_state['train_losses'].append(running_loss)\n", 841 | " \n", 842 | " train_state['train_preds'].append(\n", 843 | " torch.cat(train_state['batch_preds']).cpu()\n", 844 | " )\n", 845 | " train_state['train_targets'].append(\n", 846 | " torch.cat(train_state['batch_targets']).cpu()\n", 847 | " )\n", 848 | " train_state['train_indexes'].append(\n", 849 | " torch.cat(train_state['batch_indexes']).cpu()\n", 850 | " )\n", 851 | " train_f1 = transformer_general_utils \\\n", 852 | " .compute_macro_f1(train_state['train_preds'][-1],\n", 853 | " train_state['train_targets'][-1],\n", 854 | " )\n", 855 | " \n", 856 | " train_state['train_f1s'].append(train_f1)\n", 857 | " \n", 858 | " train_state['batch_preds'] = []\n", 859 | " train_state['batch_targets'] = []\n", 860 | " train_state['batch_indexes'] = []\n", 861 | " \n", 862 | " \n", 863 | " dataset.set_split('val')\n", 864 | " batch_generator = generate_batches(\n", 865 | " dataset= dataset, batch_size= args.batch_size, shuffle=True,\n", 866 | " device = args.device, drop_last=False,\n", 867 | " pinned_memory = True, n_workers = 2, \n", 868 | " )\n", 869 | " eval_bar.reset(\n", 870 | " total=dataset.get_num_batches(args.batch_size),\n", 871 | " )\n", 872 | " running_loss = 0.0\n", 873 | " running_acc = 0.0\n", 874 | " running_f1 = 0.0\n", 875 | " \n", 876 | " model.eval()\n", 877 | " with torch.no_grad():\n", 878 | " optimizer._backup_and_load_cache()\n", 879 | " for batch_index, batch_dict in enumerate(batch_generator):\n", 880 | " loss, y_pred = model(\n", 881 | " input_ids = batch_dict['x_data'],\n", 882 | " attention_mask = batch_dict['x_attn_mask'],\n", 883 | " labels= batch_dict['y_target'].unsqueeze(1),\n", 884 | " )[:2]\n", 885 | " y_pred = y_pred.view(-1, len(set(dataset.data_df.label)))\n", 886 | " \n", 887 | " loss_t = loss.item()\n", 888 | " running_loss += (loss_t - running_loss) / (batch_index + 1)\n", 889 | "\n", 890 | " y_pred = y_pred.detach()\n", 891 | " batch_dict['y_target'] = batch_dict['y_target'].cpu()\n", 892 | " \n", 893 | " acc_t = transformer_general_utils\\\n", 894 | " .compute_accuracy(y_pred, batch_dict['y_target'])\n", 895 | " f1_t = transformer_general_utils \\\n", 896 | " .compute_macro_f1(y_pred, batch_dict['y_target'])\n", 897 | "\n", 898 | " train_state['batch_preds'].append(y_pred.cpu())\n", 899 | " train_state['batch_targets'].append(batch_dict['y_target'].cpu())\n", 900 | " train_state['batch_indexes'].append(batch_dict['x_index'].cpu())\n", 901 | "\n", 902 | " running_acc += (acc_t - running_acc) / (batch_index + 1)\n", 903 | " running_f1 += (f1_t - running_f1) / (batch_index + 1)\n", 904 | " \n", 905 | "\n", 906 | " eval_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,\n", 907 | " epoch=epoch_index)\n", 908 | " eval_bar.update()\n", 909 | " \n", 910 | " train_state['val_accuracies'].append(running_acc)\n", 
911 | " train_state['val_losses'].append(running_loss)\n", 912 | " \n", 913 | " \n", 914 | " train_state['val_preds'].append(\n", 915 | " torch.cat(train_state['batch_preds']).cpu()\n", 916 | " )\n", 917 | "\n", 918 | " train_state['val_targets'].append(\n", 919 | " torch.cat(train_state['batch_targets']).cpu()\n", 920 | " )\n", 921 | " train_state['val_indexes'].append(\n", 922 | " torch.cat(train_state['batch_indexes']).cpu()\n", 923 | " )\n", 924 | " val_f1 = transformer_general_utils \\\n", 925 | " .compute_macro_f1(train_state['val_preds'][-1],\n", 926 | " train_state['val_targets'][-1],\n", 927 | " )\n", 928 | " \n", 929 | " train_state['val_f1s'].append(val_f1)\n", 930 | " \n", 931 | " train_state['batch_preds'] = []\n", 932 | " train_state['batch_targets'] = []\n", 933 | " train_state['batch_indexes'] = []\n", 934 | " \n", 935 | " torch.save(\n", 936 | " {\n", 937 | " 'model':model.state_dict(),\n", 938 | " },\n", 939 | " args.directory + f'_epoc_{epoch_index}_' + args.model_name,\n", 940 | " )\n", 941 | " \n", 942 | " scheduler.step(val_f1)\n", 943 | " early_stopping(val_f1, model)\n", 944 | " optimizer._clear_and_load_backup()\n", 945 | " epoch_bar.set_postfix( best_f1 = early_stopping.best_score, current = val_f1)\n", 946 | " epoch_bar.update() \n", 947 | " \n", 948 | " if early_stopping.early_stop:\n", 949 | " print(\"Early stopping\")\n", 950 | " break\n" 951 | ], 952 | "execution_count": 0, 953 | "outputs": [] 954 | }, 955 | { 956 | "cell_type": "code", 957 | "metadata": { 958 | "id": "JvhJbjv3ouqM", 959 | "colab_type": "code", 960 | "colab": {} 961 | }, 962 | "source": [ 963 | "epoch_index" 964 | ], 965 | "execution_count": 0, 966 | "outputs": [] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "metadata": { 971 | "scrolled": true, 972 | "id": "UZ_tiTQsouqQ", 973 | "colab_type": "code", 974 | "colab": {} 975 | }, 976 | "source": [ 977 | "print(train_state['val_f1s'])" 978 | ], 979 | "execution_count": 0, 980 | "outputs": [] 981 | }, 982 | { 983 | "cell_type": "code", 984 | "metadata": { 985 | "id": "pGTvvqJOouqS", 986 | "colab_type": "code", 987 | "colab": {} 988 | }, 989 | "source": [ 990 | "from sklearn.metrics import classification_report\n", 991 | "from sklearn.metrics import confusion_matrix\n", 992 | "from sklearn.metrics import accuracy_score\n", 993 | "from sklearn.metrics import f1_score" 994 | ], 995 | "execution_count": 0, 996 | "outputs": [] 997 | }, 998 | { 999 | "cell_type": "code", 1000 | "metadata": { 1001 | "id": "zmsdTS5XouqU", 1002 | "colab_type": "code", 1003 | "colab": {} 1004 | }, 1005 | "source": [ 1006 | "\n", 1007 | "print('Train:',classification_report(\n", 1008 | " y_pred=(torch.argmax(train_state['train_preds'][-1],dim=1) ).cpu().long().numpy(),\n", 1009 | " y_true= train_state['train_targets'][-1].cpu().numpy(), \n", 1010 | " digits=4)\n", 1011 | ")\n", 1012 | "print('Dev:',classification_report(\n", 1013 | " y_pred=(torch.argmax(train_state['val_preds'][-1],dim=1) ).cpu().long().numpy(),\n", 1014 | " y_true= train_state['val_targets'][-1].cpu().numpy(), \n", 1015 | " digits=4)\n", 1016 | ")\n" 1017 | ], 1018 | "execution_count": 0, 1019 | "outputs": [] 1020 | }, 1021 | { 1022 | "cell_type": "code", 1023 | "metadata": { 1024 | "id": "dBvhO_3MouqX", 1025 | "colab_type": "code", 1026 | "colab": {} 1027 | }, 1028 | "source": [ 1029 | "best_run_index = train_state['val_f1s'].index(max(train_state['val_f1s']))\n", 1030 | "print('Train:',classification_report(\n", 1031 | " y_pred=(torch.argmax(train_state['train_preds'][best_run_index],dim=1) 
).cpu().long().numpy(),\n", 1032 | " y_true= train_state['train_targets'][best_run_index].cpu().numpy(), \n", 1033 | " digits=4)\n", 1034 | ")\n", 1035 | "print('Dev:',classification_report(\n", 1036 | " y_pred=(torch.argmax(train_state['val_preds'][best_run_index],dim=1) ).cpu().long().numpy(),\n", 1037 | " y_true= train_state['val_targets'][best_run_index].cpu().numpy(), \n", 1038 | " digits=4)\n", 1039 | ")" 1040 | ], 1041 | "execution_count": 0, 1042 | "outputs": [] 1043 | }, 1044 | { 1045 | "cell_type": "code", 1046 | "metadata": { 1047 | "id": "ZyJlr1Ucouqa", 1048 | "colab_type": "code", 1049 | "colab": {} 1050 | }, 1051 | "source": [ 1052 | "def sort_preds(indexes, preds):\n", 1053 | " \"\"\"Sorts the predictions in order, to reverse the effects of shuffle\n", 1054 | " done by dataloader\"\"\"\n", 1055 | " indexes = indexes.cpu().numpy().reshape(-1,1)\n", 1056 | " preds = preds.cpu().numpy()\n", 1057 | " arr_concat = np.hstack((indexes,preds)) #concat the preds and their indexes\n", 1058 | " sort_arr = arr_concat[ arr_concat[:,0].argsort()] #sort based on the indexes\n", 1059 | " sorted_preds = np.delete(sort_arr,0,axis=1)\n", 1060 | " return sorted_preds" 1061 | ], 1062 | "execution_count": 0, 1063 | "outputs": [] 1064 | }, 1065 | { 1066 | "cell_type": "code", 1067 | "metadata": { 1068 | "id": "PEKLcktCourg", 1069 | "colab_type": "code", 1070 | "colab": {} 1071 | }, 1072 | "source": [ 1073 | "def get_optimal_models_v2(train_state, split):\n", 1074 | " l = zip(train_state[f'{split}_f1s'], range(len(train_state[f'{split}_f1s'])))\n", 1075 | " sorted_vals = sorted(l, key = lambda x:x[0], reverse=True)\n", 1076 | " model_idxes = [i[1] for i in sorted_vals]\n", 1077 | " \n", 1078 | " trgts= sort_preds(train_state[f'{split}_indexes'][-1],train_state[f'{split}_targets'][-1].reshape(-1,1))\n", 1079 | " total_preds = len(train_state[f'{split}_indexes'])\n", 1080 | " init = np.zeros(train_state[f'{split}_preds'][-1].shape)\n", 1081 | " max_f1 = 0\n", 1082 | " idxes = []\n", 1083 | " for i in model_idxes:\n", 1084 | " temp = sort_preds(train_state[f'{split}_indexes'][i],train_state[f'{split}_preds'][i])\n", 1085 | " temp2 = init+temp\n", 1086 | " f1 = f1_score(\n", 1087 | " y_pred=temp2.argmax(axis=1),\n", 1088 | " y_true= trgts, average ='macro'\n", 1089 | " )\n", 1090 | " if f1 > max_f1:\n", 1091 | " max_f1 = f1\n", 1092 | " init = init+temp\n", 1093 | " idxes.append(i)\n", 1094 | " print(f'Taking preds from {idxes} | Dev f1:{f1}')\n", 1095 | " return idxes" 1096 | ], 1097 | "execution_count": 0, 1098 | "outputs": [] 1099 | }, 1100 | { 1101 | "cell_type": "code", 1102 | "metadata": { 1103 | "scrolled": true, 1104 | "id": "RI0eIVAtourj", 1105 | "colab_type": "code", 1106 | "colab": {} 1107 | }, 1108 | "source": [ 1109 | "final_optimal_models = get_optimal_models_v2(train_state, 'val')\n", 1110 | "final_optimal_models" 1111 | ], 1112 | "execution_count": 0, 1113 | "outputs": [] 1114 | }, 1115 | { 1116 | "cell_type": "markdown", 1117 | "metadata": { 1118 | "id": "siZdY82mours", 1119 | "colab_type": "text" 1120 | }, 1121 | "source": [ 1122 | "# Making preds on the given test set" 1123 | ] 1124 | }, 1125 | { 1126 | "cell_type": "code", 1127 | "metadata": { 1128 | "id": "Zcztc0lGourz", 1129 | "colab_type": "code", 1130 | "colab": {} 1131 | }, 1132 | "source": [ 1133 | "test_df = data_df_task_b" 1134 | ], 1135 | "execution_count": 0, 1136 | "outputs": [] 1137 | }, 1138 | { 1139 | "cell_type": "code", 1140 | "metadata": { 1141 | "id": "5hkU-POXour5", 1142 | "colab_type": "code", 1143 | "colab": {} 
1144 | }, 1145 | "source": [ 1146 | "test_dataset = dataset\n", 1147 | "test_dataset.set_split('test')" 1148 | ], 1149 | "execution_count": 0, 1150 | "outputs": [] 1151 | }, 1152 | { 1153 | "cell_type": "code", 1154 | "metadata": { 1155 | "id": "Iaq8vD0Xour7", 1156 | "colab_type": "code", 1157 | "colab": {} 1158 | }, 1159 | "source": [ 1160 | "test_dataset._target_df.sample(5)" 1161 | ], 1162 | "execution_count": 0, 1163 | "outputs": [] 1164 | }, 1165 | { 1166 | "cell_type": "code", 1167 | "metadata": { 1168 | "id": "M2PQOwWzousA", 1169 | "colab_type": "code", 1170 | "colab": {} 1171 | }, 1172 | "source": [ 1173 | "print(len(test_df))\n", 1174 | "print(test_dataset._target_df.split.value_counts())" 1175 | ], 1176 | "execution_count": 0, 1177 | "outputs": [] 1178 | }, 1179 | { 1180 | "cell_type": "code", 1181 | "metadata": { 1182 | "id": "uRfZ2GLHousC", 1183 | "colab_type": "code", 1184 | "colab": {} 1185 | }, 1186 | "source": [ 1187 | "def evaluate_testset(model, state, dataset, split,args):\n", 1188 | " \"\"\"Returns the final layer output of our transformer model\n", 1189 | " Puts them in the '{split}_*' keys in the state dict\n", 1190 | " Args:\n", 1191 | " model: A pytorch transformers model\n", 1192 | " state: dict to store outputs\n", 1193 | " dataset: A pytorch Dataset\n", 1194 | " split: The split on which to evaluate the model on\n", 1195 | " args: Arguments from namespace, etc\n", 1196 | " Returns:\n", 1197 | " state: all evaluated output stored in the \"test\" key\n", 1198 | " \"\"\"\n", 1199 | " eval_bar = notebook.tqdm(\n", 1200 | " desc = 'evaluation progress: ',\n", 1201 | " total=dataset.get_num_batches(args.batch_size),\n", 1202 | " position=0,\n", 1203 | " leave=False,\n", 1204 | " )\n", 1205 | " dataset.set_split(split)\n", 1206 | " batch_generator = generate_batches(\n", 1207 | " dataset= dataset, batch_size= args.batch_size, shuffle=False,\n", 1208 | " device = args.device, drop_last=False,\n", 1209 | " pinned_memory = True, n_workers = 2, \n", 1210 | " )\n", 1211 | " eval_bar.reset(\n", 1212 | " total=dataset.get_num_batches(args.batch_size),\n", 1213 | " )\n", 1214 | " model.eval()\n", 1215 | " with torch.no_grad():\n", 1216 | " for batch_index, batch_dict in enumerate(batch_generator):\n", 1217 | " y_pred = model(\n", 1218 | " input_ids = batch_dict['x_data'],\n", 1219 | " attention_mask = batch_dict['x_attn_mask'],\n", 1220 | " )[0]\n", 1221 | " y_pred = y_pred.view(-1, 3)\n", 1222 | "\n", 1223 | " y_pred = y_pred.detach()\n", 1224 | " \n", 1225 | " state['batch_preds'].append(y_pred.cpu())\n", 1226 | " state['batch_indexes'].append(batch_dict['x_index'].cpu())\n", 1227 | " \n", 1228 | " eval_bar.update()\n", 1229 | " \n", 1230 | " if torch.cuda.is_available():\n", 1231 | " torch.cuda.empty_cache()\n", 1232 | " \n", 1233 | " state[f'{split}_preds'].append(\n", 1234 | " torch.cat(state['batch_preds']).cpu()\n", 1235 | " )\n", 1236 | " state[f'{split}_indexes'].append(\n", 1237 | " torch.cat(state['batch_indexes']).cpu()\n", 1238 | " )\n", 1239 | " \n", 1240 | " state['batch_preds'] = []\n", 1241 | " state['batch_indexes'] = []\n", 1242 | " \n", 1243 | " eval_bar.close()\n", 1244 | " return state" 1245 | ], 1246 | "execution_count": 0, 1247 | "outputs": [] 1248 | }, 1249 | { 1250 | "cell_type": "code", 1251 | "metadata": { 1252 | "id": "UDLAcVoOousD", 1253 | "colab_type": "code", 1254 | "colab": {} 1255 | }, 1256 | "source": [ 1257 | "chosen_models = [all_model_paths[i] for i in final_optimal_models]" 1258 | ], 1259 | "execution_count": 0, 1260 | "outputs": [] 1261 | 
}, 1262 | { 1263 | "cell_type": "code", 1264 | "metadata": { 1265 | "id": "ZRhTG0jJousG", 1266 | "colab_type": "code", 1267 | "colab": {} 1268 | }, 1269 | "source": [ 1270 | "test_state = general_utils.make_train_state()\n", 1271 | "for model_path in notebook.tqdm(chosen_models, total=len(chosen_models)):\n", 1272 | " model.load_state_dict(torch.load(model_path)['model'])\n", 1273 | " test_state = evaluate_testset(model, test_state, test_dataset, 'test',args)" 1274 | ], 1275 | "execution_count": 0, 1276 | "outputs": [] 1277 | }, 1278 | { 1279 | "cell_type": "code", 1280 | "metadata": { 1281 | "id": "NyPLbSx_ousH", 1282 | "colab_type": "code", 1283 | "colab": {} 1284 | }, 1285 | "source": [ 1286 | "test_state['test_preds'][-1].shape" 1287 | ], 1288 | "execution_count": 0, 1289 | "outputs": [] 1290 | }, 1291 | { 1292 | "cell_type": "code", 1293 | "metadata": { 1294 | "id": "x9zIQn2PousJ", 1295 | "colab_type": "code", 1296 | "colab": {} 1297 | }, 1298 | "source": [ 1299 | "[test_state['test_preds'][i].size() for i in range(len(test_state['test_preds']))]" 1300 | ], 1301 | "execution_count": 0, 1302 | "outputs": [] 1303 | }, 1304 | { 1305 | "cell_type": "code", 1306 | "metadata": { 1307 | "id": "9k7efmwXousM", 1308 | "colab_type": "code", 1309 | "colab": {} 1310 | }, 1311 | "source": [ 1312 | "len(test_dataset._target_df)" 1313 | ], 1314 | "execution_count": 0, 1315 | "outputs": [] 1316 | }, 1317 | { 1318 | "cell_type": "code", 1319 | "metadata": { 1320 | "id": "1-mAcbFFousO", 1321 | "colab_type": "code", 1322 | "colab": {} 1323 | }, 1324 | "source": [ 1325 | "torch.zeros_like(test_state['test_preds'][0]).size()" 1326 | ], 1327 | "execution_count": 0, 1328 | "outputs": [] 1329 | }, 1330 | { 1331 | "cell_type": "code", 1332 | "metadata": { 1333 | "id": "Zw23sdIAousQ", 1334 | "colab_type": "code", 1335 | "colab": {} 1336 | }, 1337 | "source": [ 1338 | "ensemble_pred = torch.zeros_like(test_state['test_preds'][0])\n", 1339 | "for i in test_state['test_preds']:\n", 1340 | " ensemble_pred += i" 1341 | ], 1342 | "execution_count": 0, 1343 | "outputs": [] 1344 | }, 1345 | { 1346 | "cell_type": "code", 1347 | "metadata": { 1348 | "id": "f7l7scgnousU", 1349 | "colab_type": "code", 1350 | "colab": {} 1351 | }, 1352 | "source": [ 1353 | "int_to_label = {0: 'TIN', 1:'UNT'}\n", 1354 | "# {'UNT': 1, 'TIN': 0}" 1355 | ], 1356 | "execution_count": 0, 1357 | "outputs": [] 1358 | }, 1359 | { 1360 | "cell_type": "code", 1361 | "metadata": { 1362 | "id": "qIG3GtyDousW", 1363 | "colab_type": "code", 1364 | "colab": {} 1365 | }, 1366 | "source": [ 1367 | "t = []\n", 1368 | "for i in torch.argmax(ensemble_pred, dim=1):\n", 1369 | " t.append(int_to_label[i.item()])\n", 1370 | "\n", 1371 | "collections.Counter(t)" 1372 | ], 1373 | "execution_count": 0, 1374 | "outputs": [] 1375 | }, 1376 | { 1377 | "cell_type": "code", 1378 | "metadata": { 1379 | "id": "N43KI4P4ousY", 1380 | "colab_type": "code", 1381 | "colab": {} 1382 | }, 1383 | "source": [ 1384 | "assert len(t) == len(test_df)" 1385 | ], 1386 | "execution_count": 0, 1387 | "outputs": [] 1388 | }, 1389 | { 1390 | "cell_type": "code", 1391 | "metadata": { 1392 | "id": "ANkfSvUSousa", 1393 | "colab_type": "code", 1394 | "colab": {} 1395 | }, 1396 | "source": [ 1397 | "offeval_task_b_pred_analysis_df = pd.DataFrame(\n", 1398 | " data={\n", 1399 | " 'id':test_df.id,\n", 1400 | " 'text':test_df.tweet,\n", 1401 | " 'label':t,\n", 1402 | " }\n", 1403 | ")" 1404 | ], 1405 | "execution_count": 0, 1406 | "outputs": [] 1407 | }, 1408 | { 1409 | "cell_type": "code", 1410 | 
"metadata": { 1411 | "id": "8tYSn6VNousb", 1412 | "colab_type": "code", 1413 | "colab": {} 1414 | }, 1415 | "source": [ 1416 | "offeval_task_b_label_df = pd.DataFrame(\n", 1417 | " data={\n", 1418 | " 'id':test_df.id,\n", 1419 | " 'label':t,\n", 1420 | " }\n", 1421 | ")" 1422 | ], 1423 | "execution_count": 0, 1424 | "outputs": [] 1425 | }, 1426 | { 1427 | "cell_type": "code", 1428 | "metadata": { 1429 | "id": "ompoxc6Nousc", 1430 | "colab_type": "code", 1431 | "colab": {} 1432 | }, 1433 | "source": [ 1434 | "offeval_task_b_pred_analysis_df.to_csv(\n", 1435 | " 'offeval_task_b_pred_analysis_df.csv',index=False,\n", 1436 | ")" 1437 | ], 1438 | "execution_count": 0, 1439 | "outputs": [] 1440 | }, 1441 | { 1442 | "cell_type": "code", 1443 | "metadata": { 1444 | "id": "BHiGB2Q-ouse", 1445 | "colab_type": "code", 1446 | "colab": {} 1447 | }, 1448 | "source": [ 1449 | "offeval_task_b_pred_label_df.to_csv(\n", 1450 | " 'offeval_task_b_pred_label_df.csv', index=False, header=False,\n", 1451 | ")" 1452 | ], 1453 | "execution_count": 0, 1454 | "outputs": [] 1455 | }, 1456 | { 1457 | "cell_type": "code", 1458 | "metadata": { 1459 | "id": "8IVzPSmwousf", 1460 | "colab_type": "code", 1461 | "colab": {} 1462 | }, 1463 | "source": [ 1464 | "offeval_task_b_pred_label_df.label.value_counts()\n" 1465 | ], 1466 | "execution_count": 0, 1467 | "outputs": [] 1468 | }, 1469 | { 1470 | "cell_type": "code", 1471 | "metadata": { 1472 | "id": "mutV5hWkoush", 1473 | "colab_type": "code", 1474 | "colab": {} 1475 | }, 1476 | "source": [ 1477 | "offeval_task_b_pred_analysis_df.label.value_counts()" 1478 | ], 1479 | "execution_count": 0, 1480 | "outputs": [] 1481 | } 1482 | ] 1483 | } -------------------------------------------------------------------------------- /notebooks/Eng Task C - Ensemble DistilRoberta AttnMask Dropout.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.7.0" 21 | }, 22 | "colab": { 23 | "name": "Eng Task C - Ensemble DistilRoberta AttnMask Dropout.ipynb", 24 | "provenance": [] 25 | }, 26 | "accelerator": "GPU" 27 | }, 28 | "cells": [ 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "D_dUF2evouow", 33 | "colab_type": "text" 34 | }, 35 | "source": [ 36 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cozek/OffensEval2020-code/blob/master/notebooks/Eng%20Task%20C%20-%20Ensemble%20DistilRoberta%20AttnMask%20Dropout.ipynb)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "u7Uo50Chouox", 43 | "colab_type": "text" 44 | }, 45 | "source": [ 46 | "# Import Libraries\n", 47 | "\n", 48 | "At the time of our work, we used the following library versions\n", 49 | "- numpy 1.18.1\n", 50 | "- pandas 1.0.1\n", 51 | "- torch 1.2.0\n", 52 | "- Cuda 10.0\n", 53 | "- python 3.7.0\n", 54 | "- sklearn 0.22.1\n", 55 | "- tqdm 4.42.1\n", 56 | "- nltk 3.4.5" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "metadata": { 62 | "id": "vFPGTXG3ouox", 63 | "colab_type": "code", 64 | "colab": {} 65 | }, 66 | "source": [ 67 | 
"!git clone https://github.com/cozek/OffensEval2020-code/" 68 | ], 69 | "execution_count": 0, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "FpGHYSkhouo0", 76 | "colab_type": "code", 77 | "colab": {} 78 | }, 79 | "source": [ 80 | "!git clone https://github.com/huggingface/transformers\n", 81 | "!pip install /content/transformers/" 82 | ], 83 | "execution_count": 0, 84 | "outputs": [] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "metadata": { 89 | "id": "6om7VnoNouo3", 90 | "colab_type": "code", 91 | "colab": {} 92 | }, 93 | "source": [ 94 | "import sys\n", 95 | "sys.path.append('/content/OffensEval2020-code/src/')\n", 96 | "import collections\n", 97 | "from typing import Callable\n", 98 | "import numpy as np\n", 99 | "np.random.seed(42)\n", 100 | "import pandas as pd\n", 101 | "from tqdm import notebook\n", 102 | "import importlib\n", 103 | "import pprint\n", 104 | "import nltk\n", 105 | "import datetime\n", 106 | "import os\n", 107 | "from argparse import Namespace\n", 108 | "\n", 109 | "from collections import Counter" 110 | ], 111 | "execution_count": 0, 112 | "outputs": [] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "metadata": { 117 | "id": "iyUPjzykouo5", 118 | "colab_type": "code", 119 | "colab": {} 120 | }, 121 | "source": [ 122 | "import utils.general as general_utils\n", 123 | "import utils.transformer.data as transformer_data_utils\n", 124 | "import utils.transformer.general as transformer_general_utils\n", 125 | "general_utils.set_seed_everywhere()" 126 | ], 127 | "execution_count": 0, 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "metadata": { 133 | "id": "MoJmxSDPouo9", 134 | "colab_type": "code", 135 | "colab": {} 136 | }, 137 | "source": [ 138 | "import logging\n", 139 | "logging.basicConfig(level=logging.INFO) " 140 | ], 141 | "execution_count": 0, 142 | "outputs": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "metadata": { 147 | "id": "2ehv7SLoouo_", 148 | "colab_type": "code", 149 | "colab": {} 150 | }, 151 | "source": [ 152 | "import torch\n", 153 | "import torch.nn as nn\n", 154 | "import torch.nn.functional as F\n", 155 | "import torch.optim as optim\n", 156 | "from torch.utils.data import Dataset, DataLoader\n", 157 | "torch.__version__ # we used version 1.2.0\n" 158 | ], 159 | "execution_count": 0, 160 | "outputs": [] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "metadata": { 165 | "id": "TkS9WQy2oupC", 166 | "colab_type": "code", 167 | "colab": {} 168 | }, 169 | "source": [ 170 | "# Import RAdam and Lookahead\n", 171 | "from radam.radam import RAdam\n", 172 | "from lookahead.optimizer import Lookahead\n" 173 | ], 174 | "execution_count": 0, 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "71wul4V7oupF", 181 | "colab_type": "code", 182 | "colab": {} 183 | }, 184 | "source": [ 185 | "from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification" 186 | ], 187 | "execution_count": 0, 188 | "outputs": [] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "metadata": { 193 | "id": "eoAlBoFYoupH", 194 | "colab_type": "code", 195 | "colab": {} 196 | }, 197 | "source": [ 198 | " args = Namespace(\n", 199 | " #use cuda by default\n", 200 | " device = 'cuda' if torch.cuda.is_available() else 'cpu',\n", 201 | " \n", 202 | " #set batch size and number of epochs\n", 203 | " batch_size = 32,\n", 204 | " num_epochs = 20,\n", 205 | " \n", 206 | " #set the learning rate\n", 207 | " learning_rate = 0.0001,\n", 208 | "\n", 
209 | " #location of the train, dev and test csv\n", 210 | " train_val_csv = '/content/OffensEval2020-code/data/eng/task_c_tiny.zip',\n", 211 | " test_csv = '/content/OffensEval2020-code/data/test_data/test_a_tweets.tsv',\n", 212 | " \n", 213 | " #directory to save our models at\n", 214 | " directory = './models/', \n", 215 | " model_name = 'roberta_attn_trac_task_a.pt',\n", 216 | " \n", 217 | " date = datetime.datetime.now().strftime(\"%a_%d_%b_%Y/\"),\n", 218 | ")" 219 | ], 220 | "execution_count": 0, 221 | "outputs": [] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "id": "Tt5X-gWsoupL", 227 | "colab_type": "text" 228 | }, 229 | "source": [ 230 | "## Model save location" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "metadata": { 236 | "id": "xMOYDefpoupM", 237 | "colab_type": "code", 238 | "colab": {} 239 | }, 240 | "source": [ 241 | "directory = args.directory + args.date\n", 242 | "if not os.path.exists(directory):\n", 243 | " os.makedirs(directory)\n", 244 | "args.directory = directory\n", 245 | "print(args.directory)" 246 | ], 247 | "execution_count": 0, 248 | "outputs": [] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "id": "thSqAbT3oupP", 254 | "colab_type": "text" 255 | }, 256 | "source": [ 257 | "## Load presplit dataset portion\n", 258 | "```\n", 259 | "Labelled as\n", 260 | "\n", 261 | "IND = 0\n", 262 | "GRP = 1\n", 263 | "OTH = 2\n", 264 | "```" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "metadata": { 270 | "id": "2WazWg4zoupP", 271 | "colab_type": "code", 272 | "colab": {} 273 | }, 274 | "source": [ 275 | "data_df_task_c = pd.read_csv(args.train_val_csv, compression='zip')\n", 276 | "print(data_df_task_c.label.value_counts())\n", 277 | "print(data_df_task_c.split.value_counts())" 278 | ], 279 | "execution_count": 0, 280 | "outputs": [] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "metadata": { 285 | "id": "_dV3P48EphCH", 286 | "colab_type": "code", 287 | "colab": {} 288 | }, 289 | "source": [ 290 | "data_df_task_c.columns" 291 | ], 292 | "execution_count": 0, 293 | "outputs": [] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "metadata": { 298 | "id": "LkUZ5O46oupS", 299 | "colab_type": "code", 300 | "colab": {} 301 | }, 302 | "source": [ 303 | "with pd.option_context('display.max_colwidth', -1): \n", 304 | " print(data_df_task_c[['text','label']].sample(5))" 305 | ], 306 | "execution_count": 0, 307 | "outputs": [] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": { 312 | "id": "NZrYLTrxoupU", 313 | "colab_type": "text" 314 | }, 315 | "source": [ 316 | "## Importing the Roberta Tokeniker and Punkt sentence tokenizer" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "metadata": { 322 | "id": "PtFADHnToupV", 323 | "colab_type": "code", 324 | "colab": {} 325 | }, 326 | "source": [ 327 | "class RobertaPreprocessor():\n", 328 | " def __init__(self,transformer_tokenizer,sentence_detector):\n", 329 | " self.transformer_tokenizer = transformer_tokenizer\n", 330 | " self.sentence_detector = sentence_detector\n", 331 | " self.bos_token = transformer_tokenizer.bos_token\n", 332 | " self.sep_token = ' ' + transformer_tokenizer.sep_token + ' '\n", 333 | " def add_special_tokens(self, text):\n", 334 | " sentences = self.sentence_detector.tokenize(text)\n", 335 | " eos_added_text = self.sep_token.join(sentences) \n", 336 | " return self.bos_token +' '+ eos_added_text + ' ' + self.transformer_tokenizer.sep_token" 337 | ], 338 | "execution_count": 0, 339 | "outputs": [] 340 | 
}, 341 | { 342 | "cell_type": "code", 343 | "metadata": { 344 | "id": "ar4EbV4BoupX", 345 | "colab_type": "code", 346 | "colab": {} 347 | }, 348 | "source": [ 349 | "!python -c 'import nltk; nltk.download(\"punkt\")'" 350 | ], 351 | "execution_count": 0, 352 | "outputs": [] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "metadata": { 357 | "scrolled": true, 358 | "id": "6RlaXDBtoupY", 359 | "colab_type": "code", 360 | "colab": {} 361 | }, 362 | "source": [ 363 | "roberta_tokenizer = tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')\n", 364 | "punkt_sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')" 365 | ], 366 | "execution_count": 0, 367 | "outputs": [] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "metadata": { 372 | "id": "RKFM-0bqoupb", 373 | "colab_type": "code", 374 | "colab": {} 375 | }, 376 | "source": [ 377 | "roberta_preproc = RobertaPreprocessor(roberta_tokenizer, punkt_sentence_detector)" 378 | ], 379 | "execution_count": 0, 380 | "outputs": [] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "metadata": { 385 | "id": "jDq61UIRoupf", 386 | "colab_type": "code", 387 | "colab": {} 388 | }, 389 | "source": [ 390 | "#apply the preprocessor on the exploded dataframe\n", 391 | "data_df_task_c['text'] = data_df_task_c['text'].map(roberta_preproc.add_special_tokens)\n" 392 | ], 393 | "execution_count": 0, 394 | "outputs": [] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "metadata": { 399 | "id": "kocu45Xtoupj", 400 | "colab_type": "code", 401 | "colab": {} 402 | }, 403 | "source": [ 404 | "with pd.option_context('display.max_colwidth', -1): \n", 405 | " print(data_df_task_c[['text','label']].sample(5))" 406 | ], 407 | "execution_count": 0, 408 | "outputs": [] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": { 413 | "id": "0NOwcyecoupm", 414 | "colab_type": "text" 415 | }, 416 | "source": [ 417 | "### Implement Attention Mask Dropout in the vectorizer" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "metadata": { 423 | "id": "KyQMayxOoupn", 424 | "colab_type": "code", 425 | "colab": {} 426 | }, 427 | "source": [ 428 | "class SimpleVectorizer():\n", 429 | " def __init__(self,tokenizer: Callable, max_seq_len: int):\n", 430 | " \"\"\"\n", 431 | " Args:\n", 432 | " tokenizer (Callable): transformer tokenizer\n", 433 | " max_seq_len (int): Maximum sequence lenght \n", 434 | " \"\"\"\n", 435 | " self.tokenizer = tokenizer\n", 436 | " self._max_seq_len = max_seq_len\n", 437 | "\n", 438 | " def vectorize(self,text :str):\n", 439 | " \n", 440 | " encoded = self.tokenizer.encode_plus(\n", 441 | " text,\n", 442 | " add_special_tokens=False, #already added by preproc\n", 443 | " max_length = self._max_seq_len,\n", 444 | " pad_to_max_length = True,\n", 445 | " )\n", 446 | " ids = np.array(encoded['input_ids'], dtype=np.int64)\n", 447 | " attn = np.array(encoded['attention_mask'], dtype=np.int64)\n", 448 | " \n", 449 | " return ids, attn\n", 450 | "\n", 451 | "class Vectorizer():\n", 452 | " \"\"\"Vectorizer with Attention Mask Dropout\"\"\"\n", 453 | " def __init__(self,tokenizer: Callable, max_seq_len: int ):\n", 454 | " \"\"\"\n", 455 | " Args:\n", 456 | " tokenizer (Callable): transformer tokenizer\n", 457 | " max_seq_len (int): Maximum sequence lenght \n", 458 | " \"\"\"\n", 459 | " self.tokenizer = tokenizer\n", 460 | " self._max_seq_len = max_seq_len\n", 461 | "\n", 462 | " def vectorize(self,text :str, mask_prob: float = 0.50, mask_amount:float=0.30):\n", 463 | " \"\"\"Implements Attention Mask Dropout\n", 464 | " 
\n", 465 | " Args:\n", 466 | " text (str): The string to vectorize\n", 467 | " mask_prob (float): Probability of the attention mask \n", 468 | " dropout being applied\n", 469 | " mask_amount (float): Percentage of tokens to mask\n", 470 | "\n", 471 | " Returns:\n", 472 | " ids (np.array) : Array to token ids of the text\n", 473 | " attn (np.array) : 0-1 Array of attention masks\n", 474 | " \"\"\"\n", 475 | "\n", 476 | " encoded = self.tokenizer.encode_plus(\n", 477 | " text,\n", 478 | " add_special_tokens=False, #already added by preproc\n", 479 | " max_length = self._max_seq_len,\n", 480 | " pad_to_max_length = True,\n", 481 | " )\n", 482 | " ids = np.array(encoded['input_ids'], dtype=np.int64)\n", 483 | " attn = np.array(encoded['attention_mask'], dtype=np.int64)\n", 484 | " prob = np.random.rand(1)[0]\n", 485 | " if prob <= mask_prob:\n", 486 | " len_of_sent = np.where(ids==tokenizer.pad_token_id)[0][0]\n", 487 | " amount_to_mask = max(int(len_of_sent * mask_amount ) , 1)\n", 488 | " ids_to_not_attend = [np.random.randint(low=0, high=len_of_sent )\n", 489 | " for i in range(amount_to_mask)]\n", 490 | " attn[ids_to_not_attend]=0\n", 491 | " ids[ids_to_not_attend] = tokenizer.mask_token_id\n", 492 | " return ids, attn" 493 | ], 494 | "execution_count": 0, 495 | "outputs": [] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": { 500 | "id": "m0pI_W73rkHt", 501 | "colab_type": "text" 502 | }, 503 | "source": [ 504 | "Attention Mask Dropout Example" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "metadata": { 510 | "id": "1ZTtbNz7oupp", 511 | "colab_type": "code", 512 | "colab": {} 513 | }, 514 | "source": [ 515 | "v = Vectorizer(roberta_tokenizer, 15) #attention maskdropout vectorizer\n", 516 | "sv = SimpleVectorizer(roberta_tokenizer, 15) #simple vectorizer" 517 | ], 518 | "execution_count": 0, 519 | "outputs": [] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "metadata": { 524 | "id": "sx4WAmFuoups", 525 | "colab_type": "code", 526 | "colab": {} 527 | }, 528 | "source": [ 529 | "sent = \"I am alright bro, dont worry about me\"\n", 530 | "_, attn_masks_dropped = v.vectorize(sent)\n", 531 | "attn_masks_dropped" 532 | ], 533 | "execution_count": 0, 534 | "outputs": [] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "metadata": { 539 | "id": "BGb2aQcTrWxv", 540 | "colab_type": "code", 541 | "colab": {} 542 | }, 543 | "source": [ 544 | "_, attn_masks = sv.vectorize(sent)\n", 545 | "attn_masks" 546 | ], 547 | "execution_count": 0, 548 | "outputs": [] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": { 553 | "id": "EfXDJER9wnGn", 554 | "colab_type": "text" 555 | }, 556 | "source": [ 557 | "### Create the dataset class" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "metadata": { 563 | "id": "JI0SzK1Woupw", 564 | "colab_type": "code", 565 | "colab": {} 566 | }, 567 | "source": [ 568 | "class HateDataset(Dataset):\n", 569 | " def __init__(\n", 570 | " self,\n", 571 | " data_df: pd.DataFrame,\n", 572 | " tokenizer: Callable,\n", 573 | " max_seq_length:int = None,\n", 574 | " ):\n", 575 | " \"\"\"\n", 576 | " Args:\n", 577 | " data_df (pandas.DataFrame): df containing the labels and text\n", 578 | " tokenizer (tokenizer module for the transformer)\n", 579 | " \"\"\"\n", 580 | " self.data_df = data_df\n", 581 | " self.tokenizer = tokenizer\n", 582 | "\n", 583 | " if max_seq_length is None:\n", 584 | " self._max_seq_length = self._get_max_len(data_df,tokenizer)\n", 585 | " else:\n", 586 | " self._max_seq_length = max_seq_length\n", 587 | "\n", 
588 | " self.train_df = self.data_df[self.data_df.split == 'train']\n", 589 | " self.train_size = len(self.train_df)\n", 590 | "\n", 591 | " self.val_df = self.data_df[self.data_df.split == 'val']\n", 592 | " self.val_size = len(self.val_df)\n", 593 | "\n", 594 | " self.test_df = self.data_df[self.data_df.split == 'test']\n", 595 | " self.test_size = len(self.test_df)\n", 596 | " \n", 597 | " self.simple_vectorize = False,\n", 598 | " self._simple_vectorizer = SimpleVectorizer(tokenizer, self._max_seq_length)\n", 599 | " self._vectorizer = Vectorizer(tokenizer, self._max_seq_length)\n", 600 | " \n", 601 | " self._lookup_dict = {\n", 602 | " 'train': (self.train_df, self.train_size),\n", 603 | " 'val': (self.val_df, self.val_size),\n", 604 | " 'test': (self.test_df, self.test_size)\n", 605 | " }\n", 606 | "\n", 607 | " self.set_split('train')\n", 608 | "\n", 609 | " class_counts = data_df.label.value_counts().to_dict()\n", 610 | " #sorted on the basis of class label,eg, 0,1,2..\n", 611 | " cts = sorted([(lbl,cts) for lbl,cts in class_counts.items()], key=lambda x: x[0])\n", 612 | " freq = [ x[1] for x in cts ]\n", 613 | " # print(freq,cts)\n", 614 | " self.class_weights = 1.0/ torch.tensor(freq, dtype=torch.float32)\n", 615 | " \n", 616 | " def flip_simple_vectorizer(self) :\n", 617 | " if self.simple_vectorize:\n", 618 | " self.simple_vectorize=False\n", 619 | " else:\n", 620 | " self.simple_vectorize= True\n", 621 | " \n", 622 | " def _get_max_len(self,data_df: pd.DataFrame, tokenizer: Callable):\n", 623 | " len_func = lambda x: len(self.tokenizer.encode_plus(x)['input_ids'])\n", 624 | " max_len = data_df.text.map(len_func).max() \n", 625 | " return max_len\n", 626 | "\n", 627 | " def set_split(self, split=\"train\"):\n", 628 | " \"\"\" selects the splits in the dataset using a column in the dataframe \"\"\"\n", 629 | " self._target_split = split\n", 630 | " self._target_df, self._target_size = self._lookup_dict[split]\n", 631 | " \n", 632 | " def __len__(self):\n", 633 | " return self._target_size\n", 634 | " \n", 635 | " def __getitem__(self, index):\n", 636 | " \"\"\"the primary entry point method for PyTorch datasets\n", 637 | " \n", 638 | " Args:\n", 639 | " index (int): the index to the data point \n", 640 | " Returns:\n", 641 | " a dictionary holding the data point's features (x_data) and label (y_target)\n", 642 | " \"\"\"\n", 643 | " row = self._target_df.iloc[index]\n", 644 | " \n", 645 | " if self._target_split == 'train':\n", 646 | " indices, attention_masks = self._vectorizer.vectorize(row.text)\n", 647 | " else:\n", 648 | " indices, attention_masks = self._simple_vectorizer.vectorize(row.text)\n", 649 | "\n", 650 | " label = row.label\n", 651 | " return {'x_data': indices,\n", 652 | " 'x_attn_mask': attention_masks,\n", 653 | " 'x_index': index,\n", 654 | " 'y_target': label}\n", 655 | " \n", 656 | " def get_num_batches(self, batch_size):\n", 657 | " \"\"\"Given a batch size, return the number of batches in the dataset\n", 658 | " \n", 659 | " Args:\n", 660 | " batch_size (int)\n", 661 | " Returns:\n", 662 | " number of batches in the dataset\n", 663 | " \"\"\"\n", 664 | " return len(self) // batch_size" 665 | ], 666 | "execution_count": 0, 667 | "outputs": [] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "metadata": { 672 | "id": "PGvvsTq9oupy", 673 | "colab_type": "code", 674 | "colab": {} 675 | }, 676 | "source": [ 677 | "def generate_batches(dataset, batch_size, shuffle=True,\n", 678 | " drop_last=False, device=\"cpu\", pinned_memory = False, n_workers = 0): \n", 679 
| " \"\"\"\n", 680 | " A generator function which wraps the PyTorch DataLoader. It will \n", 681 | " ensure each tensor is on the write device location.\n", 682 | " \"\"\"\n", 683 | " dataloader = DataLoader(dataset=dataset, batch_size=batch_size,\n", 684 | " shuffle=shuffle, drop_last=drop_last,\n", 685 | " pin_memory= pinned_memory,\n", 686 | " num_workers = n_workers,\n", 687 | " )\n", 688 | " \n", 689 | " for data_dict in dataloader:\n", 690 | " out_data_dict = {}\n", 691 | " out_data_dict['x_data'] = data_dict['x_data'].to(\n", 692 | " device, non_blocking= (True if pinned_memory else False) \n", 693 | " )\n", 694 | " out_data_dict['x_attn_mask'] = data_dict['x_attn_mask'].to(\n", 695 | " device, non_blocking= (True if pinned_memory else False) \n", 696 | " )\n", 697 | " out_data_dict['x_index'] = data_dict['x_index']\n", 698 | " out_data_dict['y_target'] = data_dict['y_target'].to(\n", 699 | " device, non_blocking= (True if pinned_memory else False) \n", 700 | " )\n", 701 | " yield out_data_dict" 702 | ], 703 | "execution_count": 0, 704 | "outputs": [] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "metadata": { 709 | "id": "dqjxEQKZoup0", 710 | "colab_type": "code", 711 | "colab": {} 712 | }, 713 | "source": [ 714 | "dataset = HateDataset(\n", 715 | " data_df = data_df_task_c,\n", 716 | " tokenizer = roberta_tokenizer\n", 717 | ")" 718 | ], 719 | "execution_count": 0, 720 | "outputs": [] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "metadata": { 725 | "scrolled": true, 726 | "id": "r1S0e8djoup3", 727 | "colab_type": "code", 728 | "colab": {} 729 | }, 730 | "source": [ 731 | "assert dataset._max_seq_length <= 512" 732 | ], 733 | "execution_count": 0, 734 | "outputs": [] 735 | }, 736 | { 737 | "cell_type": "markdown", 738 | "metadata": { 739 | "id": "faAM7TdDoup5", 740 | "colab_type": "text" 741 | }, 742 | "source": [ 743 | "# Initialize the Roberta model\n", 744 | "\n", 745 | "\n" 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "metadata": { 751 | "scrolled": false, 752 | "id": "Sdjpj_fvoup6", 753 | "colab_type": "code", 754 | "colab": {} 755 | }, 756 | "source": [ 757 | "model = RobertaForSequenceClassification.from_pretrained(\n", 758 | " 'distilroberta-base',\n", 759 | " num_labels=len(set(data_df_task_c.label)),\n", 760 | ")" 761 | ], 762 | "execution_count": 0, 763 | "outputs": [] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "metadata": { 768 | "scrolled": true, 769 | "id": "7VAugC07oup8", 770 | "colab_type": "code", 771 | "colab": {} 772 | }, 773 | "source": [ 774 | "model.to(args.device)" 775 | ], 776 | "execution_count": 0, 777 | "outputs": [] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "metadata": { 782 | "id": "2nf9iX9Eoup_", 783 | "colab_type": "code", 784 | "colab": {} 785 | }, 786 | "source": [ 787 | "early_stopping = transformer_general_utils.EarlyStopping(patience=4)" 788 | ], 789 | "execution_count": 0, 790 | "outputs": [] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "metadata": { 795 | "id": "BjtphpU6s4LV", 796 | "colab_type": "code", 797 | "colab": {} 798 | }, 799 | "source": [ 800 | "!nvidia-smi" 801 | ], 802 | "execution_count": 0, 803 | "outputs": [] 804 | }, 805 | { 806 | "cell_type": "code", 807 | "metadata": { 808 | "id": "ZjFXYTcEouqB", 809 | "colab_type": "code", 810 | "colab": {} 811 | }, 812 | "source": [ 813 | "args.num_epochs = 20\n", 814 | "args.batch_size = 70" 815 | ], 816 | "execution_count": 0, 817 | "outputs": [] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "metadata": { 822 | "scrolled": false, 823 | "id": 
"OYfdxscNouqE", 824 | "colab_type": "code", 825 | "colab": {} 826 | }, 827 | "source": [ 828 | "loss_func = nn.CrossEntropyLoss()\n", 829 | "\n", 830 | "print(f'Using LR:{args.learning_rate}')\n", 831 | "base_optimizer = RAdam(model.parameters(), lr = args.learning_rate)\n", 832 | "optimizer = Lookahead(optimizer = base_optimizer, k = 5, alpha=0.5 )\n", 833 | "scheduler = optim.lr_scheduler.ReduceLROnPlateau(\n", 834 | " optimizer=optimizer.optimizer, factor =0.1 ,mode='max',\n", 835 | ")" 836 | ], 837 | "execution_count": 0, 838 | "outputs": [] 839 | }, 840 | { 841 | "cell_type": "markdown", 842 | "metadata": { 843 | "id": "HvhB0DIPouqH", 844 | "colab_type": "text" 845 | }, 846 | "source": [ 847 | "# Begin Training" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "metadata": { 853 | "id": "ta4xhZcdouqH", 854 | "colab_type": "code", 855 | "colab": {} 856 | }, 857 | "source": [ 858 | "train_state = general_utils.make_train_state()\n", 859 | "train_state.keys()\n" 860 | ], 861 | "execution_count": 0, 862 | "outputs": [] 863 | }, 864 | { 865 | "cell_type": "code", 866 | "metadata": { 867 | "scrolled": false, 868 | "id": "Hyp2Q5ReouqK", 869 | "colab_type": "code", 870 | "colab": {} 871 | }, 872 | "source": [ 873 | "epoch_bar = notebook.tqdm(\n", 874 | " desc = 'training_routine',\n", 875 | " total = args.num_epochs,\n", 876 | " position=0,\n", 877 | " leave = True,\n", 878 | ")\n", 879 | "dataset.set_split('train')\n", 880 | "train_bar = notebook.tqdm(\n", 881 | " desc = 'split=train ',\n", 882 | " total=dataset.get_num_batches(args.batch_size),\n", 883 | " position=0,\n", 884 | " leave=True,\n", 885 | ")\n", 886 | "dataset.set_split('val')\n", 887 | "eval_bar = notebook.tqdm(\n", 888 | " desc = 'split=eval',\n", 889 | " total=dataset.get_num_batches(args.batch_size),\n", 890 | " position=0,\n", 891 | " leave=True,\n", 892 | ")\n", 893 | "\n", 894 | "old_val_acc = 0\n", 895 | "old_f1 = 0\n", 896 | "model_state = None\n", 897 | "for epoch_index in range(args.num_epochs):\n", 898 | " train_state['epoch_in'] = epoch_index\n", 899 | "\n", 900 | " dataset.set_split('train')\n", 901 | "\n", 902 | " batch_generator = generate_batches(\n", 903 | " dataset= dataset, batch_size= args.batch_size, shuffle=True,\n", 904 | " device = args.device, drop_last=False,\n", 905 | " pinned_memory = True, n_workers = 3, \n", 906 | " )\n", 907 | "\n", 908 | " running_loss = 0.0\n", 909 | " running_acc = 0.0\n", 910 | " running_f1 = 0.0\n", 911 | " model.train()\n", 912 | "\n", 913 | " train_bar.reset(\n", 914 | " total=dataset.get_num_batches(args.batch_size),\n", 915 | " )\n", 916 | "\n", 917 | " for batch_index, batch_dict in enumerate(batch_generator):\n", 918 | " optimizer.zero_grad()\n", 919 | " \n", 920 | " loss,y_pred = model(\n", 921 | " input_ids = batch_dict['x_data'],\n", 922 | " attention_mask = batch_dict['x_attn_mask'],\n", 923 | " labels= batch_dict['y_target'].unsqueeze(1),\n", 924 | " )[:2]\n", 925 | " \n", 926 | " y_pred = y_pred.view(-1, len(set(dataset.data_df.label)))\n", 927 | " \n", 928 | "# scheduler.step()\n", 929 | " loss.backward()\n", 930 | " optimizer.step()\n", 931 | " \n", 932 | " loss_t = loss.item()\n", 933 | " running_loss += (loss_t - running_loss) / (batch_index + 1)\n", 934 | " \n", 935 | " y_pred = y_pred.detach().cpu()\n", 936 | " batch_dict['y_target'] = batch_dict['y_target'].cpu()\n", 937 | " \n", 938 | " acc_t = transformer_general_utils \\\n", 939 | " .compute_accuracy(y_pred, batch_dict['y_target'])\n", 940 | " \n", 941 | " f1_t = transformer_general_utils \\\n", 
942 | " .compute_macro_f1(y_pred, batch_dict['y_target'])\n", 943 | "\n", 944 | " train_state['batch_preds'].append(y_pred)\n", 945 | " train_state['batch_targets'].append(batch_dict['y_target'])\n", 946 | " train_state['batch_indexes'].append(batch_dict['x_index'])\n", 947 | "\n", 948 | " running_acc += (acc_t - running_acc) / (batch_index + 1)\n", 949 | " running_f1 += (f1_t - running_f1) / (batch_index + 1)\n", 950 | "\n", 951 | " train_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,\n", 952 | " epoch=epoch_index)\n", 953 | "\n", 954 | " train_bar.update()\n", 955 | "\n", 956 | " if torch.cuda.is_available():\n", 957 | " torch.cuda.empty_cache()\n", 958 | " \n", 959 | " train_state['train_accuracies'].append(running_acc)\n", 960 | " train_state['train_losses'].append(running_loss)\n", 961 | " \n", 962 | " train_state['train_preds'].append(\n", 963 | " torch.cat(train_state['batch_preds']).cpu()\n", 964 | " )\n", 965 | " train_state['train_targets'].append(\n", 966 | " torch.cat(train_state['batch_targets']).cpu()\n", 967 | " )\n", 968 | " train_state['train_indexes'].append(\n", 969 | " torch.cat(train_state['batch_indexes']).cpu()\n", 970 | " )\n", 971 | " train_f1 = transformer_general_utils \\\n", 972 | " .compute_macro_f1(train_state['train_preds'][-1],\n", 973 | " train_state['train_targets'][-1],\n", 974 | " )\n", 975 | " \n", 976 | " train_state['train_f1s'].append(train_f1)\n", 977 | " \n", 978 | " train_state['batch_preds'] = []\n", 979 | " train_state['batch_targets'] = []\n", 980 | " train_state['batch_indexes'] = []\n", 981 | " \n", 982 | " \n", 983 | " dataset.set_split('val')\n", 984 | " batch_generator = generate_batches(\n", 985 | " dataset= dataset, batch_size= args.batch_size, shuffle=True,\n", 986 | " device = args.device, drop_last=False,\n", 987 | " pinned_memory = True, n_workers = 2, \n", 988 | " )\n", 989 | " eval_bar.reset(\n", 990 | " total=dataset.get_num_batches(args.batch_size),\n", 991 | " )\n", 992 | " running_loss = 0.0\n", 993 | " running_acc = 0.0\n", 994 | " running_f1 = 0.0\n", 995 | " \n", 996 | " model.eval()\n", 997 | " with torch.no_grad():\n", 998 | " optimizer._backup_and_load_cache()\n", 999 | " for batch_index, batch_dict in enumerate(batch_generator):\n", 1000 | " loss, y_pred = model(\n", 1001 | " input_ids = batch_dict['x_data'],\n", 1002 | " attention_mask = batch_dict['x_attn_mask'],\n", 1003 | " labels= batch_dict['y_target'].unsqueeze(1),\n", 1004 | " )[:2]\n", 1005 | " y_pred = y_pred.view(-1, len(set(dataset.data_df.label)))\n", 1006 | " \n", 1007 | " loss_t = loss.item()\n", 1008 | " running_loss += (loss_t - running_loss) / (batch_index + 1)\n", 1009 | "\n", 1010 | " y_pred = y_pred.detach()\n", 1011 | " batch_dict['y_target'] = batch_dict['y_target'].cpu()\n", 1012 | " \n", 1013 | " acc_t = transformer_general_utils\\\n", 1014 | " .compute_accuracy(y_pred, batch_dict['y_target'])\n", 1015 | " f1_t = transformer_general_utils \\\n", 1016 | " .compute_macro_f1(y_pred, batch_dict['y_target'])\n", 1017 | "\n", 1018 | " train_state['batch_preds'].append(y_pred.cpu())\n", 1019 | " train_state['batch_targets'].append(batch_dict['y_target'].cpu())\n", 1020 | " train_state['batch_indexes'].append(batch_dict['x_index'].cpu())\n", 1021 | "\n", 1022 | " running_acc += (acc_t - running_acc) / (batch_index + 1)\n", 1023 | " running_f1 += (f1_t - running_f1) / (batch_index + 1)\n", 1024 | " \n", 1025 | "\n", 1026 | " eval_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,\n", 1027 | " 
epoch=epoch_index)\n", 1028 | " eval_bar.update()\n", 1029 | " \n", 1030 | " train_state['val_accuracies'].append(running_acc)\n", 1031 | " train_state['val_losses'].append(running_loss)\n", 1032 | " \n", 1033 | " \n", 1034 | " train_state['val_preds'].append(\n", 1035 | " torch.cat(train_state['batch_preds']).cpu()\n", 1036 | " )\n", 1037 | "\n", 1038 | " train_state['val_targets'].append(\n", 1039 | " torch.cat(train_state['batch_targets']).cpu()\n", 1040 | " )\n", 1041 | " train_state['val_indexes'].append(\n", 1042 | " torch.cat(train_state['batch_indexes']).cpu()\n", 1043 | " )\n", 1044 | " val_f1 = transformer_general_utils \\\n", 1045 | " .compute_macro_f1(train_state['val_preds'][-1],\n", 1046 | " train_state['val_targets'][-1],\n", 1047 | " )\n", 1048 | " \n", 1049 | " train_state['val_f1s'].append(val_f1)\n", 1050 | " \n", 1051 | " train_state['batch_preds'] = []\n", 1052 | " train_state['batch_targets'] = []\n", 1053 | " train_state['batch_indexes'] = []\n", 1054 | " \n", 1055 | " torch.save(\n", 1056 | " {\n", 1057 | " 'model':model.state_dict(),\n", 1058 | " },\n", 1059 | " args.directory + f'_epoc_{epoch_index}_' + args.model_name,\n", 1060 | " )\n", 1061 | " \n", 1062 | " scheduler.step(val_f1)\n", 1063 | " early_stopping(val_f1, model)\n", 1064 | " optimizer._clear_and_load_backup()\n", 1065 | " epoch_bar.set_postfix( best_f1 = early_stopping.best_score, current = val_f1)\n", 1066 | " epoch_bar.update() \n", 1067 | " \n", 1068 | " if early_stopping.early_stop:\n", 1069 | " print(\"Early stopping\")\n", 1070 | " break\n" 1071 | ], 1072 | "execution_count": 0, 1073 | "outputs": [] 1074 | }, 1075 | { 1076 | "cell_type": "code", 1077 | "metadata": { 1078 | "id": "JvhJbjv3ouqM", 1079 | "colab_type": "code", 1080 | "colab": {} 1081 | }, 1082 | "source": [ 1083 | "epoch_index" 1084 | ], 1085 | "execution_count": 0, 1086 | "outputs": [] 1087 | }, 1088 | { 1089 | "cell_type": "code", 1090 | "metadata": { 1091 | "scrolled": true, 1092 | "id": "UZ_tiTQsouqQ", 1093 | "colab_type": "code", 1094 | "colab": {} 1095 | }, 1096 | "source": [ 1097 | "print(train_state['val_f1s'])" 1098 | ], 1099 | "execution_count": 0, 1100 | "outputs": [] 1101 | }, 1102 | { 1103 | "cell_type": "code", 1104 | "metadata": { 1105 | "id": "pGTvvqJOouqS", 1106 | "colab_type": "code", 1107 | "colab": {} 1108 | }, 1109 | "source": [ 1110 | "from sklearn.metrics import classification_report\n", 1111 | "from sklearn.metrics import confusion_matrix\n", 1112 | "from sklearn.metrics import accuracy_score\n", 1113 | "from sklearn.metrics import f1_score" 1114 | ], 1115 | "execution_count": 0, 1116 | "outputs": [] 1117 | }, 1118 | { 1119 | "cell_type": "code", 1120 | "metadata": { 1121 | "id": "zmsdTS5XouqU", 1122 | "colab_type": "code", 1123 | "colab": {} 1124 | }, 1125 | "source": [ 1126 | "\n", 1127 | "print('Train:',classification_report(\n", 1128 | " y_pred=(torch.argmax(train_state['train_preds'][-1],dim=1) ).cpu().long().numpy(),\n", 1129 | " y_true= train_state['train_targets'][-1].cpu().numpy(), \n", 1130 | " digits=4)\n", 1131 | ")\n", 1132 | "print('Dev:',classification_report(\n", 1133 | " y_pred=(torch.argmax(train_state['val_preds'][-1],dim=1) ).cpu().long().numpy(),\n", 1134 | " y_true= train_state['val_targets'][-1].cpu().numpy(), \n", 1135 | " digits=4)\n", 1136 | ")\n" 1137 | ], 1138 | "execution_count": 0, 1139 | "outputs": [] 1140 | }, 1141 | { 1142 | "cell_type": "code", 1143 | "metadata": { 1144 | "id": "dBvhO_3MouqX", 1145 | "colab_type": "code", 1146 | "colab": {} 1147 | }, 1148 | "source": [ 1149 
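One detail worth noting in the loop above: validation runs on Lookahead's slow weights. `optimizer._backup_and_load_cache()` swaps them in before the eval pass, and since the per-epoch `torch.save` call happens before `optimizer._clear_and_load_backup()`, the saved checkpoints also hold the slow weights. Stripped of the metric bookkeeping, the evaluation pattern is just the following sketch (reusing `model`, `optimizer`, `dataset`, `generate_batches` and `args` as defined earlier in the notebook):

```python
# Minimal sketch of the slow-weight evaluation pattern used in the training loop above.
model.eval()
dataset.set_split("val")
with torch.no_grad():
    optimizer._backup_and_load_cache()   # fast weights -> backup, cached slow weights -> model
    for batch in generate_batches(dataset, args.batch_size, device=args.device):
        logits = model(
            input_ids=batch["x_data"],
            attention_mask=batch["x_attn_mask"],
        )[0]
        # ... accumulate loss / accuracy / macro-F1 on `logits` here ...
    optimizer._clear_and_load_backup()   # restore the fast weights before training resumes
```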
| "best_run_index = train_state['val_f1s'].index(max(train_state['val_f1s']))\n", 1150 | "print('Train:',classification_report(\n", 1151 | " y_pred=(torch.argmax(train_state['train_preds'][best_run_index],dim=1) ).cpu().long().numpy(),\n", 1152 | " y_true= train_state['train_targets'][best_run_index].cpu().numpy(), \n", 1153 | " digits=4)\n", 1154 | ")\n", 1155 | "print('Dev:',classification_report(\n", 1156 | " y_pred=(torch.argmax(train_state['val_preds'][best_run_index],dim=1) ).cpu().long().numpy(),\n", 1157 | " y_true= train_state['val_targets'][best_run_index].cpu().numpy(), \n", 1158 | " digits=4)\n", 1159 | ")" 1160 | ], 1161 | "execution_count": 0, 1162 | "outputs": [] 1163 | }, 1164 | { 1165 | "cell_type": "code", 1166 | "metadata": { 1167 | "id": "ZyJlr1Ucouqa", 1168 | "colab_type": "code", 1169 | "colab": {} 1170 | }, 1171 | "source": [ 1172 | "def sort_preds(indexes, preds):\n", 1173 | " \"\"\"Sorts the predictions in order, to reverse the effects of shuffle\n", 1174 | " done by dataloader\"\"\"\n", 1175 | " indexes = indexes.cpu().numpy().reshape(-1,1)\n", 1176 | " preds = preds.cpu().numpy()\n", 1177 | " arr_concat = np.hstack((indexes,preds)) #concat the preds and their indexes\n", 1178 | " sort_arr = arr_concat[ arr_concat[:,0].argsort()] #sort based on the indexes\n", 1179 | " sorted_preds = np.delete(sort_arr,0,axis=1)\n", 1180 | " return sorted_preds" 1181 | ], 1182 | "execution_count": 0, 1183 | "outputs": [] 1184 | }, 1185 | { 1186 | "cell_type": "code", 1187 | "metadata": { 1188 | "id": "PEKLcktCourg", 1189 | "colab_type": "code", 1190 | "colab": {} 1191 | }, 1192 | "source": [ 1193 | "def get_optimal_models_v2(train_state, split):\n", 1194 | " l = zip(train_state[f'{split}_f1s'], range(len(train_state[f'{split}_f1s'])))\n", 1195 | " sorted_vals = sorted(l, key = lambda x:x[0], reverse=True)\n", 1196 | " model_idxes = [i[1] for i in sorted_vals]\n", 1197 | " \n", 1198 | " trgts= sort_preds(train_state[f'{split}_indexes'][-1],train_state[f'{split}_targets'][-1].reshape(-1,1))\n", 1199 | " total_preds = len(train_state[f'{split}_indexes'])\n", 1200 | " init = np.zeros(train_state[f'{split}_preds'][-1].shape)\n", 1201 | " max_f1 = 0\n", 1202 | " idxes = []\n", 1203 | " for i in model_idxes:\n", 1204 | " temp = sort_preds(train_state[f'{split}_indexes'][i],train_state[f'{split}_preds'][i])\n", 1205 | " temp2 = init+temp\n", 1206 | " f1 = f1_score(\n", 1207 | " y_pred=temp2.argmax(axis=1),\n", 1208 | " y_true= trgts, average ='macro'\n", 1209 | " )\n", 1210 | " if f1 > max_f1:\n", 1211 | " max_f1 = f1\n", 1212 | " init = init+temp\n", 1213 | " idxes.append(i)\n", 1214 | " print(f'Taking preds from {idxes} | Dev f1:{f1}')\n", 1215 | " return idxes" 1216 | ], 1217 | "execution_count": 0, 1218 | "outputs": [] 1219 | }, 1220 | { 1221 | "cell_type": "code", 1222 | "metadata": { 1223 | "scrolled": true, 1224 | "id": "RI0eIVAtourj", 1225 | "colab_type": "code", 1226 | "colab": {} 1227 | }, 1228 | "source": [ 1229 | "final_optimal_models = get_optimal_models_v2(train_state, 'val')\n", 1230 | "final_optimal_models" 1231 | ], 1232 | "execution_count": 0, 1233 | "outputs": [] 1234 | }, 1235 | { 1236 | "cell_type": "markdown", 1237 | "metadata": { 1238 | "id": "siZdY82mours", 1239 | "colab_type": "text" 1240 | }, 1241 | "source": [ 1242 | "# Making preds on the given test set" 1243 | ] 1244 | }, 1245 | { 1246 | "cell_type": "code", 1247 | "metadata": { 1248 | "id": "Zcztc0lGourz", 1249 | "colab_type": "code", 1250 | "colab": {} 1251 | }, 1252 | "source": [ 1253 | "test_df = 
data_df_task_c" 1254 | ], 1255 | "execution_count": 0, 1256 | "outputs": [] 1257 | }, 1258 | { 1259 | "cell_type": "code", 1260 | "metadata": { 1261 | "id": "5hkU-POXour5", 1262 | "colab_type": "code", 1263 | "colab": {} 1264 | }, 1265 | "source": [ 1266 | "test_dataset = dataset\n", 1267 | "test_dataset.set_split('test')" 1268 | ], 1269 | "execution_count": 0, 1270 | "outputs": [] 1271 | }, 1272 | { 1273 | "cell_type": "code", 1274 | "metadata": { 1275 | "id": "Iaq8vD0Xour7", 1276 | "colab_type": "code", 1277 | "colab": {} 1278 | }, 1279 | "source": [ 1280 | "test_dataset._target_df.sample(5)" 1281 | ], 1282 | "execution_count": 0, 1283 | "outputs": [] 1284 | }, 1285 | { 1286 | "cell_type": "code", 1287 | "metadata": { 1288 | "id": "M2PQOwWzousA", 1289 | "colab_type": "code", 1290 | "colab": {} 1291 | }, 1292 | "source": [ 1293 | "print(len(test_df))\n", 1294 | "print(test_dataset._target_df.split.value_counts())" 1295 | ], 1296 | "execution_count": 0, 1297 | "outputs": [] 1298 | }, 1299 | { 1300 | "cell_type": "code", 1301 | "metadata": { 1302 | "id": "uRfZ2GLHousC", 1303 | "colab_type": "code", 1304 | "colab": {} 1305 | }, 1306 | "source": [ 1307 | "def evaluate_testset(model, state, dataset, split,args):\n", 1308 | " \"\"\"Returns the final layer output of our transformer model\n", 1309 | " Puts them in the '{split}_*' keys in the state dict\n", 1310 | " Args:\n", 1311 | " model: A pytorch transformers model\n", 1312 | " state: dict to store outputs\n", 1313 | " dataset: A pytorch Dataset\n", 1314 | " split: The split on which to evaluate the model on\n", 1315 | " args: Arguments from namespace, etc\n", 1316 | " Returns:\n", 1317 | " state: all evaluated output stored in the \"test\" key\n", 1318 | " \"\"\"\n", 1319 | " eval_bar = notebook.tqdm(\n", 1320 | " desc = 'evaluation progress: ',\n", 1321 | " total=dataset.get_num_batches(args.batch_size),\n", 1322 | " position=0,\n", 1323 | " leave=False,\n", 1324 | " )\n", 1325 | " dataset.set_split(split)\n", 1326 | " batch_generator = generate_batches(\n", 1327 | " dataset= dataset, batch_size= args.batch_size, shuffle=False,\n", 1328 | " device = args.device, drop_last=False,\n", 1329 | " pinned_memory = True, n_workers = 2, \n", 1330 | " )\n", 1331 | " eval_bar.reset(\n", 1332 | " total=dataset.get_num_batches(args.batch_size),\n", 1333 | " )\n", 1334 | " model.eval()\n", 1335 | " with torch.no_grad():\n", 1336 | " for batch_index, batch_dict in enumerate(batch_generator):\n", 1337 | " y_pred = model(\n", 1338 | " input_ids = batch_dict['x_data'],\n", 1339 | " attention_mask = batch_dict['x_attn_mask'],\n", 1340 | " )[0]\n", 1341 | " y_pred = y_pred.view(-1, 3)\n", 1342 | "\n", 1343 | " y_pred = y_pred.detach()\n", 1344 | " \n", 1345 | " state['batch_preds'].append(y_pred.cpu())\n", 1346 | " state['batch_indexes'].append(batch_dict['x_index'].cpu())\n", 1347 | " \n", 1348 | " eval_bar.update()\n", 1349 | " \n", 1350 | " if torch.cuda.is_available():\n", 1351 | " torch.cuda.empty_cache()\n", 1352 | " \n", 1353 | " state[f'{split}_preds'].append(\n", 1354 | " torch.cat(state['batch_preds']).cpu()\n", 1355 | " )\n", 1356 | " state[f'{split}_indexes'].append(\n", 1357 | " torch.cat(state['batch_indexes']).cpu()\n", 1358 | " )\n", 1359 | " \n", 1360 | " state['batch_preds'] = []\n", 1361 | " state['batch_indexes'] = []\n", 1362 | " \n", 1363 | " eval_bar.close()\n", 1364 | " return state" 1365 | ], 1366 | "execution_count": 0, 1367 | "outputs": [] 1368 | }, 1369 | { 1370 | "cell_type": "code", 1371 | "metadata": { 1372 | "id": 
"UDLAcVoOousD", 1373 | "colab_type": "code", 1374 | "colab": {} 1375 | }, 1376 | "source": [ 1377 | "chosen_models = [all_model_paths[i] for i in final_optimal_models]" 1378 | ], 1379 | "execution_count": 0, 1380 | "outputs": [] 1381 | }, 1382 | { 1383 | "cell_type": "code", 1384 | "metadata": { 1385 | "id": "ZRhTG0jJousG", 1386 | "colab_type": "code", 1387 | "colab": {} 1388 | }, 1389 | "source": [ 1390 | "test_state = general_utils.make_train_state()\n", 1391 | "for model_path in notebook.tqdm(chosen_models, total=len(chosen_models)):\n", 1392 | " model.load_state_dict(torch.load(model_path)['model'])\n", 1393 | " test_state = evaluate_testset(model, test_state, test_dataset, 'test',args)" 1394 | ], 1395 | "execution_count": 0, 1396 | "outputs": [] 1397 | }, 1398 | { 1399 | "cell_type": "code", 1400 | "metadata": { 1401 | "id": "NyPLbSx_ousH", 1402 | "colab_type": "code", 1403 | "colab": {} 1404 | }, 1405 | "source": [ 1406 | "test_state['test_preds'][-1].shape" 1407 | ], 1408 | "execution_count": 0, 1409 | "outputs": [] 1410 | }, 1411 | { 1412 | "cell_type": "code", 1413 | "metadata": { 1414 | "id": "x9zIQn2PousJ", 1415 | "colab_type": "code", 1416 | "colab": {} 1417 | }, 1418 | "source": [ 1419 | "[test_state['test_preds'][i].size() for i in range(len(test_state['test_preds']))]" 1420 | ], 1421 | "execution_count": 0, 1422 | "outputs": [] 1423 | }, 1424 | { 1425 | "cell_type": "code", 1426 | "metadata": { 1427 | "id": "9k7efmwXousM", 1428 | "colab_type": "code", 1429 | "colab": {} 1430 | }, 1431 | "source": [ 1432 | "len(test_dataset._target_df)" 1433 | ], 1434 | "execution_count": 0, 1435 | "outputs": [] 1436 | }, 1437 | { 1438 | "cell_type": "code", 1439 | "metadata": { 1440 | "id": "1-mAcbFFousO", 1441 | "colab_type": "code", 1442 | "colab": {} 1443 | }, 1444 | "source": [ 1445 | "torch.zeros_like(test_state['test_preds'][0]).size()" 1446 | ], 1447 | "execution_count": 0, 1448 | "outputs": [] 1449 | }, 1450 | { 1451 | "cell_type": "code", 1452 | "metadata": { 1453 | "id": "Zw23sdIAousQ", 1454 | "colab_type": "code", 1455 | "colab": {} 1456 | }, 1457 | "source": [ 1458 | "ensemble_pred = torch.zeros_like(test_state['test_preds'][0])\n", 1459 | "for i in test_state['test_preds']:\n", 1460 | " ensemble_pred += i" 1461 | ], 1462 | "execution_count": 0, 1463 | "outputs": [] 1464 | }, 1465 | { 1466 | "cell_type": "code", 1467 | "metadata": { 1468 | "id": "W1tEtoDvousS", 1469 | "colab_type": "code", 1470 | "colab": {} 1471 | }, 1472 | "source": [ 1473 | "# label_dict[\"IND\"] = 0\n", 1474 | "# label_dict[\"GRP\"] = 1\n", 1475 | "# label_dict[\"OTH\"] = 2\n", 1476 | "#ref utils/offeval2020.py" 1477 | ], 1478 | "execution_count": 0, 1479 | "outputs": [] 1480 | }, 1481 | { 1482 | "cell_type": "code", 1483 | "metadata": { 1484 | "id": "f7l7scgnousU", 1485 | "colab_type": "code", 1486 | "colab": {} 1487 | }, 1488 | "source": [ 1489 | "int_to_label = { 0: 'IND', 1:'GRP', 2:'OTH'}" 1490 | ], 1491 | "execution_count": 0, 1492 | "outputs": [] 1493 | }, 1494 | { 1495 | "cell_type": "code", 1496 | "metadata": { 1497 | "id": "qIG3GtyDousW", 1498 | "colab_type": "code", 1499 | "colab": {} 1500 | }, 1501 | "source": [ 1502 | "t = []\n", 1503 | "for i in torch.argmax(ensemble_pred, dim=1):\n", 1504 | " t.append(int_to_label[i.item()])\n", 1505 | "\n", 1506 | "collections.Counter(t)" 1507 | ], 1508 | "execution_count": 0, 1509 | "outputs": [] 1510 | }, 1511 | { 1512 | "cell_type": "code", 1513 | "metadata": { 1514 | "id": "N43KI4P4ousY", 1515 | "colab_type": "code", 1516 | "colab": {} 1517 | }, 1518 | 
"source": [ 1519 | "assert len(t) == len(test_df)" 1520 | ], 1521 | "execution_count": 0, 1522 | "outputs": [] 1523 | }, 1524 | { 1525 | "cell_type": "code", 1526 | "metadata": { 1527 | "id": "ANkfSvUSousa", 1528 | "colab_type": "code", 1529 | "colab": {} 1530 | }, 1531 | "source": [ 1532 | "offeval_task_c_pred_analysis_df = pd.DataFrame(\n", 1533 | " data={\n", 1534 | " 'id':test_df.id,\n", 1535 | " 'text':test_df.tweet,\n", 1536 | " 'label':t,\n", 1537 | " }\n", 1538 | ")" 1539 | ], 1540 | "execution_count": 0, 1541 | "outputs": [] 1542 | }, 1543 | { 1544 | "cell_type": "code", 1545 | "metadata": { 1546 | "id": "8tYSn6VNousb", 1547 | "colab_type": "code", 1548 | "colab": {} 1549 | }, 1550 | "source": [ 1551 | "offeval_task_c_pred_label_df = pd.DataFrame(\n", 1552 | " data={\n", 1553 | " 'id':test_df.id,\n", 1554 | " 'label':t,\n", 1555 | " }\n", 1556 | ")" 1557 | ], 1558 | "execution_count": 0, 1559 | "outputs": [] 1560 | }, 1561 | { 1562 | "cell_type": "code", 1563 | "metadata": { 1564 | "id": "ompoxc6Nousc", 1565 | "colab_type": "code", 1566 | "colab": {} 1567 | }, 1568 | "source": [ 1569 | "offeval_task_c_pred_analysis_df.to_csv(\n", 1570 | " 'offeval_task_c_pred_analysis.csv',index=False,\n", 1571 | ")" 1572 | ], 1573 | "execution_count": 0, 1574 | "outputs": [] 1575 | }, 1576 | { 1577 | "cell_type": "code", 1578 | "metadata": { 1579 | "id": "BHiGB2Q-ouse", 1580 | "colab_type": "code", 1581 | "colab": {} 1582 | }, 1583 | "source": [ 1584 | "offeval_task_c_pred_label_df.to_csv(\n", 1585 | " 'offeval_task_c_pred_label.csv', index=False, header=False,\n", 1586 | ")" 1587 | ], 1588 | "execution_count": 0, 1589 | "outputs": [] 1590 | }, 1591 | { 1592 | "cell_type": "code", 1593 | "metadata": { 1594 | "id": "8IVzPSmwousf", 1595 | "colab_type": "code", 1596 | "colab": {} 1597 | }, 1598 | "source": [ 1599 | "offeval_task_c_pred_label_df.label.value_counts()\n" 1600 | ], 1601 | "execution_count": 0, 1602 | "outputs": [] 1603 | }, 1604 | { 1605 | "cell_type": "code", 1606 | "metadata": { 1607 | "id": "mutV5hWkoush", 1608 | "colab_type": "code", 1609 | "colab": {} 1610 | }, 1611 | "source": [ 1612 | "offeval_task_c_pred_label_df.label.value_counts()" 1613 | ], 1614 | "execution_count": 0, 1615 | "outputs": [] 1616 | }, 1617 | { 1618 | "cell_type": "code", 1619 | "metadata": { 1620 | "id": "jr707IBvousi", 1621 | "colab_type": "code", 1622 | "colab": {} 1623 | }, 1624 | "source": [ 1625 | "" 1626 | ], 1627 | "execution_count": 0, 1628 | "outputs": [] 1629 | } 1630 | ] 1631 | } -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cozek/OffensEval2020-code/874b86f14112326c9ba09965174142bdb0acdfb0/src/__init__.py -------------------------------------------------------------------------------- /src/lookahead/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 lonePatinet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice 
and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/lookahead/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cozek/OffensEval2020-code/874b86f14112326c9ba09965174142bdb0acdfb0/src/lookahead/__init__.py -------------------------------------------------------------------------------- /src/lookahead/optimizer.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import itertools as it 4 | from torch.optim import Optimizer 5 | from collections import defaultdict 6 | 7 | class Lookahead(Optimizer): 8 | ''' 9 | PyTorch implementation of the lookahead wrapper. 10 | Lookahead Optimizer: https://arxiv.org/abs/1907.08610 11 | ''' 12 | def __init__(self, optimizer,alpha=0.5, k=6,pullback_momentum="none"): 13 | ''' 14 | :param optimizer:inner optimizer 15 | :param k (int): number of lookahead steps 16 | :param alpha(float): linear interpolation factor. 1.0 recovers the inner optimizer. 17 | :param pullback_momentum (str): change to inner optimizer momentum on interpolation update 18 | ''' 19 | if not 0.0 <= alpha <= 1.0: 20 | raise ValueError(f'Invalid slow update rate: {alpha}') 21 | if not 1 <= k: 22 | raise ValueError(f'Invalid lookahead steps: {k}') 23 | self.optimizer = optimizer 24 | self.param_groups = self.optimizer.param_groups 25 | self.alpha = alpha 26 | self.k = k 27 | self.step_counter = 0 28 | assert pullback_momentum in ["reset", "pullback", "none"] 29 | self.pullback_momentum = pullback_momentum 30 | self.state = defaultdict(dict) 31 | 32 | # Cache the current optimizer parameters 33 | for group in self.optimizer.param_groups: 34 | for p in group['params']: 35 | param_state = self.state[p] 36 | param_state['cached_params'] = torch.zeros_like(p.data) 37 | param_state['cached_params'].copy_(p.data) 38 | 39 | def __getstate__(self): 40 | return { 41 | 'state': self.state, 42 | 'optimizer': self.optimizer, 43 | 'alpha': self.alpha, 44 | 'step_counter': self.step_counter, 45 | 'k':self.k, 46 | 'pullback_momentum': self.pullback_momentum 47 | } 48 | 49 | def zero_grad(self): 50 | self.optimizer.zero_grad() 51 | 52 | def state_dict(self): 53 | return self.optimizer.state_dict() 54 | 55 | def load_state_dict(self, state_dict): 56 | self.optimizer.load_state_dict(state_dict) 57 | 58 | def _backup_and_load_cache(self): 59 | """Useful for performing evaluation on the slow weights (which typically generalize better) 60 | """ 61 | for group in self.optimizer.param_groups: 62 | for p in group['params']: 63 | param_state = self.state[p] 64 | param_state['backup_params'] = torch.zeros_like(p.data) 65 | param_state['backup_params'].copy_(p.data) 66 | p.data.copy_(param_state['cached_params']) 67 | 68 | def _clear_and_load_backup(self): 69 | for group in self.optimizer.param_groups: 70 | for p in group['params']: 71 | 
param_state = self.state[p] 72 | p.data.copy_(param_state['backup_params']) 73 | del param_state['backup_params'] 74 | 75 | def step(self, closure=None): 76 | """Performs a single Lookahead optimization step. 77 | Arguments: 78 | closure (callable, optional): A closure that reevaluates the model 79 | and returns the loss. 80 | """ 81 | loss = self.optimizer.step(closure) 82 | self.step_counter += 1 83 | 84 | if self.step_counter >= self.k: 85 | self.step_counter = 0 86 | # Lookahead and cache the current optimizer parameters 87 | for group in self.optimizer.param_groups: 88 | for p in group['params']: 89 | param_state = self.state[p] 90 | p.data.mul_(self.alpha).add_(1.0 - self.alpha, param_state['cached_params']) # crucial line 91 | param_state['cached_params'].copy_(p.data) 92 | if self.pullback_momentum == "pullback": 93 | internal_momentum = self.optimizer.state[p]["momentum_buffer"] 94 | self.optimizer.state[p]["momentum_buffer"] = internal_momentum.mul_(self.alpha).add_( 95 | 1.0 - self.alpha, param_state["cached_mom"]) 96 | param_state["cached_mom"] = self.optimizer.state[p]["momentum_buffer"] 97 | elif self.pullback_momentum == "reset": 98 | self.optimizer.state[p]["momentum_buffer"] = torch.zeros_like(p.data) 99 | 100 | return loss 101 | -------------------------------------------------------------------------------- /src/radam/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2019] [Liyuan Liu] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /src/radam/__init__.py: -------------------------------------------------------------------------------- 1 | from .radam import RAdam, PlainRAdam, AdamW 2 | -------------------------------------------------------------------------------- /src/radam/radam.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.optim.optimizer import Optimizer, required 4 | 5 | class RAdam(Optimizer): 6 | 7 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): 8 | if not 0.0 <= lr: 9 | raise ValueError("Invalid learning rate: {}".format(lr)) 10 | if not 0.0 <= eps: 11 | raise ValueError("Invalid epsilon value: {}".format(eps)) 12 | if not 0.0 <= betas[0] < 1.0: 13 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 14 | if not 0.0 <= betas[1] < 1.0: 15 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 16 | 17 | self.degenerated_to_sgd = degenerated_to_sgd 18 | if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): 19 | for param in params: 20 | if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): 21 | param['buffer'] = [[None, None, None] for _ in range(10)] 22 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)]) 23 | super(RAdam, self).__init__(params, defaults) 24 | 25 | def __setstate__(self, state): 26 | super(RAdam, self).__setstate__(state) 27 | 28 | def step(self, closure=None): 29 | 30 | loss = None 31 | if closure is not None: 32 | loss = closure() 33 | 34 | for group in self.param_groups: 35 | 36 | for p in group['params']: 37 | if p.grad is None: 38 | continue 39 | grad = p.grad.data.float() 40 | if grad.is_sparse: 41 | raise RuntimeError('RAdam does not support sparse gradients') 42 | 43 | p_data_fp32 = p.data.float() 44 | 45 | state = self.state[p] 46 | 47 | if len(state) == 0: 48 | state['step'] = 0 49 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 50 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 51 | else: 52 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 53 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 54 | 55 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 56 | beta1, beta2 = group['betas'] 57 | 58 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 59 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 60 | 61 | state['step'] += 1 62 | buffered = group['buffer'][int(state['step'] % 10)] 63 | if state['step'] == buffered[0]: 64 | N_sma, step_size = buffered[1], buffered[2] 65 | else: 66 | buffered[0] = state['step'] 67 | beta2_t = beta2 ** state['step'] 68 | N_sma_max = 2 / (1 - beta2) - 1 69 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 70 | buffered[1] = N_sma 71 | 72 | # more conservative since it's an approximated value 73 | if N_sma >= 5: 74 | step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) 75 | elif self.degenerated_to_sgd: 76 | step_size = 1.0 / (1 - beta1 ** state['step']) 77 | else: 78 | step_size = -1 79 | buffered[2] = step_size 80 | 81 | # more conservative since it's an approximated value 82 | if N_sma >= 5: 83 | if group['weight_decay'] != 0: 84 | p_data_fp32.add_(-group['weight_decay'] * 
group['lr'], p_data_fp32) 85 | denom = exp_avg_sq.sqrt().add_(group['eps']) 86 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 87 | p.data.copy_(p_data_fp32) 88 | elif step_size > 0: 89 | if group['weight_decay'] != 0: 90 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 91 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 92 | p.data.copy_(p_data_fp32) 93 | 94 | return loss 95 | 96 | class PlainRAdam(Optimizer): 97 | 98 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): 99 | if not 0.0 <= lr: 100 | raise ValueError("Invalid learning rate: {}".format(lr)) 101 | if not 0.0 <= eps: 102 | raise ValueError("Invalid epsilon value: {}".format(eps)) 103 | if not 0.0 <= betas[0] < 1.0: 104 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 105 | if not 0.0 <= betas[1] < 1.0: 106 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 107 | 108 | self.degenerated_to_sgd = degenerated_to_sgd 109 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 110 | 111 | super(PlainRAdam, self).__init__(params, defaults) 112 | 113 | def __setstate__(self, state): 114 | super(PlainRAdam, self).__setstate__(state) 115 | 116 | def step(self, closure=None): 117 | 118 | loss = None 119 | if closure is not None: 120 | loss = closure() 121 | 122 | for group in self.param_groups: 123 | 124 | for p in group['params']: 125 | if p.grad is None: 126 | continue 127 | grad = p.grad.data.float() 128 | if grad.is_sparse: 129 | raise RuntimeError('RAdam does not support sparse gradients') 130 | 131 | p_data_fp32 = p.data.float() 132 | 133 | state = self.state[p] 134 | 135 | if len(state) == 0: 136 | state['step'] = 0 137 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 138 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 139 | else: 140 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 141 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 142 | 143 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 144 | beta1, beta2 = group['betas'] 145 | 146 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 147 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 148 | 149 | state['step'] += 1 150 | beta2_t = beta2 ** state['step'] 151 | N_sma_max = 2 / (1 - beta2) - 1 152 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 153 | 154 | 155 | # more conservative since it's an approximated value 156 | if N_sma >= 5: 157 | if group['weight_decay'] != 0: 158 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 159 | step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) 160 | denom = exp_avg_sq.sqrt().add_(group['eps']) 161 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 162 | p.data.copy_(p_data_fp32) 163 | elif self.degenerated_to_sgd: 164 | if group['weight_decay'] != 0: 165 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 166 | step_size = group['lr'] / (1 - beta1 ** state['step']) 167 | p_data_fp32.add_(-step_size, exp_avg) 168 | p.data.copy_(p_data_fp32) 169 | 170 | return loss 171 | 172 | 173 | class AdamW(Optimizer): 174 | 175 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup = 0): 176 | if not 0.0 <= lr: 177 | raise ValueError("Invalid learning rate: {}".format(lr)) 178 | if not 0.0 <= eps: 179 | raise ValueError("Invalid epsilon 
value: {}".format(eps)) 180 | if not 0.0 <= betas[0] < 1.0: 181 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 182 | if not 0.0 <= betas[1] < 1.0: 183 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 184 | 185 | defaults = dict(lr=lr, betas=betas, eps=eps, 186 | weight_decay=weight_decay, warmup = warmup) 187 | super(AdamW, self).__init__(params, defaults) 188 | 189 | def __setstate__(self, state): 190 | super(AdamW, self).__setstate__(state) 191 | 192 | def step(self, closure=None): 193 | loss = None 194 | if closure is not None: 195 | loss = closure() 196 | 197 | for group in self.param_groups: 198 | 199 | for p in group['params']: 200 | if p.grad is None: 201 | continue 202 | grad = p.grad.data.float() 203 | if grad.is_sparse: 204 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 205 | 206 | p_data_fp32 = p.data.float() 207 | 208 | state = self.state[p] 209 | 210 | if len(state) == 0: 211 | state['step'] = 0 212 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 213 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 214 | else: 215 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 216 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 217 | 218 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 219 | beta1, beta2 = group['betas'] 220 | 221 | state['step'] += 1 222 | 223 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 224 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 225 | 226 | denom = exp_avg_sq.sqrt().add_(group['eps']) 227 | bias_correction1 = 1 - beta1 ** state['step'] 228 | bias_correction2 = 1 - beta2 ** state['step'] 229 | 230 | if group['warmup'] > state['step']: 231 | scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup'] 232 | else: 233 | scheduled_lr = group['lr'] 234 | 235 | step_size = scheduled_lr * math.sqrt(bias_correction2) / bias_correction1 236 | 237 | if group['weight_decay'] != 0: 238 | p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32) 239 | 240 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 241 | 242 | p.data.copy_(p_data_fp32) 243 | 244 | return loss 245 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cozek/OffensEval2020-code/874b86f14112326c9ba09965174142bdb0acdfb0/src/utils/__init__.py -------------------------------------------------------------------------------- /src/utils/activations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def swish(x): 6 | """ 7 | Simple implementation of Swish activation function 8 | https://arxiv.org/pdf/1710.05941.pdf 9 | """ 10 | return x * torch.sigmoid(x) 11 | 12 | def mish(x): 13 | """ 14 | Simple implementation of Mish activation Function 15 | https://arxiv.org/abs/1908.08681 16 | """ 17 | tanh = nn.Tanh() 18 | softplus = nn.Softplus() 19 | return x * tanh( softplus(x)) 20 | 21 | def penalized_tanh(x): 22 | """ 23 | http://aclweb.org/anthology/D18-1472 24 | """ 25 | alpha = 0.25 26 | return torch.max(torch.tanh(x), alpha*torch.tanh(x)) -------------------------------------------------------------------------------- /src/utils/general.py: -------------------------------------------------------------------------------- 1 | """ 2 | General Utilities 3 | """ 4 | import os 5 | import 
io 6 | import mmap 7 | import torch 8 | import random 9 | import numpy as np 10 | import pandas as pd # type: ignore 11 | from tqdm import tqdm 12 | from sklearn.metrics import classification_report 13 | from sklearn.metrics import confusion_matrix 14 | from sklearn.metrics import accuracy_score 15 | from argparse import Namespace 16 | import matplotlib.pyplot as plt 17 | import seaborn as sns 18 | 19 | 20 | def alert(): 21 | from IPython.display import Audio 22 | 23 | wave = np.sin(2 * np.pi * 400 * np.arange(10000 * 0.35) / 10000) 24 | Audio(wave, rate=10000, autoplay=True) 25 | 26 | 27 | def plot_train_state(train_state): 28 | """Plot the train state 29 | Args: 30 | train_state (dict): Dict containing train state information 31 | """ 32 | 33 | sns.set(style="darkgrid") 34 | 35 | plot_df = pd.DataFrame( 36 | { 37 | "train_acc": train_state["train_accuracies"], 38 | "val_acc": train_state["val_accuracies"], 39 | } 40 | ) 41 | plot_df.index += 1 42 | num_epochs = len(plot_df) 43 | 44 | fig, ax = plt.subplots(figsize=(10, 7)) 45 | 46 | start, end = ax.get_xlim() 47 | ax.xaxis.set_ticks(np.arange(0, num_epochs + 1, 1)) 48 | plt.ylabel("accuracy") 49 | plt.xlabel("epoch") 50 | axp = sns.lineplot(ax=ax, data=plot_df, legend="full") 51 | for epoch, train_acc, val_acc in zip( 52 | range(1, num_epochs + 1), plot_df["train_acc"], plot_df["val_acc"] 53 | ): 54 | plt.annotate( 55 | f"{train_acc:.3f}", 56 | xy=(epoch, train_acc), 57 | xytext=(0, 30), 58 | textcoords="offset points", 59 | ha="center", 60 | va="top", 61 | bbox=dict(boxstyle="square,pad=0.2", alpha=0.5), 62 | # arrowprops=dict(arrowstyle = 'simple', connectionstyle='arc3,rad=0'), 63 | ) 64 | plt.annotate( 65 | f"{val_acc:.3f}", 66 | xy=(epoch, val_acc), 67 | xytext=(0, -30), 68 | textcoords="offset points", 69 | ha="center", 70 | va="bottom", 71 | bbox=dict(boxstyle="square,pad=0.2", fc="orange", alpha=0.5), 72 | # arrowprops=dict(arrowstyle = 'simple', connectionstyle='arc3,rad=0'), 73 | ) 74 | 75 | 76 | def get_misclassified_examples(torch_dataset, split_type, train_state, threshold=0.5): 77 | torch_dataset.set_split(split_type) 78 | new_df = torch_dataset._target_df.iloc[ 79 | train_state[f"{split_type}_indexes"][-1].cpu().numpy() 80 | ] 81 | new_df.reset_index(drop=True, inplace=True) 82 | y_pred = ( 83 | (torch.sigmoid(train_state[f"{split_type}_preds"][-1]) > threshold).cpu().long() 84 | ) 85 | new_df = new_df.assign(pred=pd.Series(y_pred)) 86 | new_df = new_df[new_df.label != new_df.pred][["text", "label", "pred"]] 87 | 88 | return new_df 89 | 90 | 91 | def analyse_preds(y_pred, y_target, threshold=0.5): 92 | y_pred = (torch.sigmoid(y_pred) > threshold).cpu().long().numpy() 93 | y_target = y_target.cpu().numpy() 94 | 95 | conmat = confusion_matrix(y_pred=y_pred, y_true=y_target) 96 | confusion = pd.DataFrame( 97 | conmat, index=["NOT", "HS"], columns=["predicted_NOT", "predicted_HS"] 98 | ) 99 | print("acc = ", accuracy_score(y_pred=y_pred, y_true=y_target)) 100 | print(classification_report(y_pred=y_pred, y_true=y_target, digits=4)) 101 | print(confusion) 102 | 103 | 104 | def make_train_state(): 105 | d = { 106 | "train_preds": [], 107 | "train_indexes": [], 108 | "train_targets": [], 109 | "train_accuracies": [], 110 | "train_f1s": [], 111 | "train_losses": [], 112 | "val_preds": [], 113 | "val_indexes": [], 114 | "val_targets": [], 115 | "val_accuracies": [], 116 | "val_f1s": [], 117 | "val_losses": [], 118 | "test_preds": [], 119 | "test_indexes": [], 120 | "test_targets": [], 121 | "test_accuracies": [], 122 | "test_f1s": 
[], 123 | "test_losses": [], 124 | "batch_preds": [], 125 | "batch_targets": [], 126 | "batch_indexes": [], 127 | "epoch_index": 0, 128 | # "save_path": '' 129 | } 130 | return dict(d) 131 | 132 | 133 | def compute_accuracy(y_pred, y_target): 134 | y_target = y_target.cpu() 135 | y_pred_indices = (torch.sigmoid(y_pred) > 0.5).cpu().long() 136 | n_correct = torch.eq(y_pred_indices, y_target).sum().item() 137 | return n_correct / len(y_pred_indices) * 100 138 | 139 | 140 | def set_seed_everywhere(seed=42): 141 | np.random.seed(seed) 142 | random.seed(seed) 143 | torch.manual_seed(seed) 144 | if torch.cuda.is_available(): 145 | torch.cuda.manual_seed_all(seed) 146 | 147 | 148 | def describe_tensor(x): 149 | """ 150 | Prints information about a given tensor 151 | """ 152 | print("Type: {}".format(x.type())) 153 | print("Shape/size: {}".format(x.shape)) 154 | print("Values: \n{}".format(x)) 155 | 156 | 157 | class DefaultFilePaths: 158 | """ 159 | Helper class that stores the location of datafiles, embeddings, etc. 160 | Must be set up for your local machine. Default configuration is for the maintainer's 161 | personal machine. 162 | """ 163 | 164 | def __init__(self, location="local"): 165 | if location == "local": 166 | self.PREFIX = "/Users/cozek/Documents/MTech/4th Sem/OffensEval/data" 167 | self.glove = "/Users/cozek/Documents/MTech/3rd Sem/Project/glove.twitter.27B/glove.twitter.27B.200d.txt" 168 | self.fasttext_bin = ( 169 | "/Users/cozek/Documents/MTech/3rd Sem/Project/cc.en.300.bin" 170 | ) 171 | self.bert_uncased_large = ( 172 | "/Users/cozek/Documents/MTech/4th Sem/wwm_uncased_L-24_H-1024_A-16/" 173 | ) 174 | self.gpt_2 = "/Users/cozek/Documents/MTech/4th Sem/gpt_2/" 175 | self.offeval_data = { 176 | "en": { 177 | "task_a": self.PREFIX 178 | + "/OffenseEval2020Data/English/task_a_distant.tsv", 179 | "task_b": self.PREFIX 180 | + "/OffenseEval2020Data/English/task_b_distant.tsv", 181 | "task_c": self.PREFIX 182 | + "/OffenseEval2020Data/English/task_c_distant.tsv", 183 | }, 184 | "en_presplit": self.PREFIX 185 | + "/OffenseEval2020Data/English/task_a_split.csv", 186 | "en_presplit_lite": self.PREFIX 187 | + "/OffenseEval2020Data/English/task_a_split_lite.csv", 188 | "en_presplit_tiny": self.PREFIX 189 | + "/OffenseEval2020Data/English/task_a_split_tiny.csv", 190 | } 191 | self.hasoc_data = { 192 | "en": { 193 | "train": self.PREFIX + "/hasoc_data/en/english_dataset.tsv", 194 | "test": self.PREFIX + "/hasoc_data/gold/hasoc2019_en_test-2919.tsv", 195 | }, 196 | "en_presplit_task_a": self.PREFIX 197 | + "/hasoc_data/en/en_presplit_task_a.csv", 198 | "en_presplit_task_a_lite": self.PREFIX 199 | + "/hasoc_data/en/en_presplit_task_a_tiny.csv", 200 | } 201 | elif location == "server": 202 | self.PREFIX = "/home/kaushik.das/OffensEval2020/data" 203 | self.glove = "/home/kaushik.das/embeddings/glove.twitter.27B.200d.txt" 204 | self.fasttext_bin = "/home/kaushik.das/embeddings/crawl-300d-2M-subword.bin" 205 | self.bert_uncased_large = ( 206 | "/home/kaushik.das/pytorch_transformers/bert_uncased/" 207 | ) 208 | self.memotion = { 209 | 'loc' : self.PREFIX + '/memotion_dataset_7k/', 210 | 'task_a_advprop_df': self.PREFIX + 'memotion_dataset_7k/images_advprop_df_task_a.pickle', 211 | 'task_a_simple_df': self.PREFIX + 'memotion_dataset_7k/images_simple_df_task_a.pickle', 212 | 213 | } 214 | self.gpt_2 = "/home/kaushik.das/pytorch_transformers/gpt2/" 215 | self.distilgpt2 = "/home/kaushik.das/pytorch_transformers/distilgpt2/" 216 | self.model_storage = "/home/kaushik.das/OffensEval2020/saved_models/" 
217 | self.trac_data = { 218 | "en_dev": self.PREFIX + "/TRAC/eng/trac2_eng_dev.csv", 219 | "en_train": self.PREFIX + "/TRAC/eng/trac2_eng_train.csv", 220 | "en_task_a_dataframe": self.PREFIX + "/TRAC/eng/trac2_eng_task_a_df.csv", 221 | "en_task_b_dataframe": self.PREFIX + "/TRAC/eng/trac2_eng_task_b_df.csv", 222 | 223 | 224 | "hin_dev": self.PREFIX + "/TRAC/hin/trac2_hin_dev.csv", 225 | "hin_train": self.PREFIX + "/TRAC/hin/trac2_hin_train.csv", 226 | "iben_dev": self.PREFIX + "/TRAC/iben/trac2_iben_dev.csv", 227 | "iben_train": self.PREFIX + "/TRAC/iben/trac2_iben_train.csv", 228 | } 229 | self.offeval_data = { 230 | "en": { 231 | "task_a": self.PREFIX 232 | + "/OffenseEval2020Data/English/task_a_distant.tsv", 233 | "task_b": self.PREFIX 234 | + "/OffenseEval2020Data/English/task_b_distant.tsv", 235 | "task_c": self.PREFIX 236 | + "/OffenseEval2020Data/English/task_c_distant_ann.tsv", 237 | }, 238 | # TASK C 239 | "en_task_c_presplit_final": self.PREFIX 240 | + "/OffenseEval2020Data/English/offeval2020_task_c_en_presplit.csv", 241 | "en_task_c_presplit_lite": self.PREFIX 242 | + "/OffenseEval2020Data/English/en_task_c_presplit_lite.csv", 243 | "en_task_c_presplit_full": self.PREFIX 244 | + "/OffenseEval2020Data/English/en_task_c_presplit_full.csv", 245 | # TASK B 246 | "en_task_b_presplit_lite": self.PREFIX 247 | + "/OffenseEval2020Data/English/en_task_b_presplit_lite.csv", 248 | "en_task_b_presplit_full": self.PREFIX 249 | + "/OffenseEval2020Data/English/en_task_b_presplit_full.csv", 250 | "en_public_test_b": self.PREFIX # testset 251 | + "/OffenseEval2020Data/English/task_b_test/test_b_tweets.tsv", 252 | # TASK A 253 | "en_public_test_a": self.PREFIX # testset 254 | + "/OffenseEval2020Data/English/public_data_A/test_a_tweets.tsv", 255 | "en_presplit_full": self.PREFIX 256 | + "/OffenseEval2020Data/English/task_a_split_full.csv", 257 | "en_presplit_lite": self.PREFIX 258 | + "/OffenseEval2020Data/English/task_a_split_lite.csv", 259 | "en_presplit_tiny": self.PREFIX 260 | + "/OffenseEval2020Data/English/task_a_split_tiny.csv", 261 | "en_presplit_tiny_fixed": self.PREFIX 262 | + "/OffenseEval2020Data/English/task_a_split_tiny_fixed.csv", 263 | "en_presplit_lite_fixed": self.PREFIX 264 | + "/OffenseEval2020Data/English/task_a_split_lite_fixed.csv", 265 | "en_presplit_full_fixed": self.PREFIX 266 | + "/OffenseEval2020Data/English/task_a_split_full_fixed.csv", 267 | # std <= 0.3 268 | "en_safe_presplit_full": self.PREFIX 269 | + "/OffenseEval2020Data/English/task_a_split_full_safe.csv", 270 | # std <= 0.2 271 | "en_verysafe_presplit_tiny": self.PREFIX 272 | + "/OffenseEval2020Data/English/task_a_split_tiny_verysafe.csv", 273 | "en_verysafe_presplit_full": self.PREFIX 274 | + "/OffenseEval2020Data/English/task_a_split_full_verysafe.csv", 275 | } 276 | 277 | self.hasoc_data = { 278 | "en": { 279 | "train": self.PREFIX + "/hasoc_data/en/english_dataset.tsv", 280 | "test": self.PREFIX + "/hasoc_data/gold/hasoc2019_en_test-2919.tsv", 281 | }, 282 | "en_presplit_task_a": self.PREFIX 283 | + "/hasoc_data/en/en_presplit_task_a.csv", 284 | "en_presplit_task_a_lite": self.PREFIX 285 | + "/hasoc_data/en/en_presplit_task_a_tiny.csv", 286 | } 287 | 288 | 289 | if __name__ == "__main__": 290 | d = DefaultFilePaths() 291 | -------------------------------------------------------------------------------- /src/utils/offenseval2020.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utils for OffensEval 2020 dataset 3 | """ 4 | from argparse import Namespace 5 | 
from tqdm import tqdm 6 | import pandas as pd # type: ignore 7 | import numpy as np 8 | import random 9 | import torch 10 | import mmap 11 | import io 12 | import os 13 | 14 | 15 | def load_dataset(path: str): 16 | """Loads OffensEval 2020 dataset 17 | Args: 18 | path (str): full path of a task_*_distant.tsv file 19 | provided by the organiser 20 | Returns: 21 | pandas.DataFrame containing the data 22 | """ 23 | data_df = pd.read_csv(path, sep="\t", quoting=3) 24 | # remove erroneous space characters in column names in some files 25 | data_df.columns = list(map(lambda c: c.strip(), data_df.columns)) 26 | 27 | return data_df 28 | 29 | 30 | def labeller(df: pd.DataFrame, threshold: float, task: str, drop_cols: bool): 31 | """Adds a label to the samples in the given DataFrame 32 | Args: 33 | df (pd.DataFrame): A dataframe containing the samples and their confidence scores 34 | as df.text and df.average respectively 35 | threshold (float): Minimum confidence required to label a sample as positive 36 | task: one of 'a','b','c' 37 | drop_cols: if True, drops columns that are not needed downstream 38 | Returns: 39 | df (pd.DataFrame): with an added 'label' column holding the label of each sample 40 | label_dict (dict): The labels and their corresponding integer values 41 | """ 42 | task = task.lower() 43 | 44 | assert isinstance(df, pd.DataFrame) 45 | assert 0.0 <= threshold <= 1.0 46 | assert task in ["a", "b", "c"] 47 | assert isinstance(drop_cols, bool) 48 | 49 | if task in ["a", "b"]: 50 | df["label"] = df.average >= threshold # label as positive using the caller-supplied threshold 51 | df["label"] = df["label"].astype(int) 52 | elif task == "c": 53 | cols = {"average_ind": 0, "average_grp": 1, "average_oth": 2} 54 | df["label"] = df[list(cols.keys())].idxmax(axis=1) 55 | df["label"] = df["label"].apply(lambda x: cols[x]) 56 | 57 | if drop_cols: 58 | df = df[["id", "text", "label"]] 59 | 60 | label_dict = {} 61 | if task == "a": 62 | label_dict["OFF"] = 1 63 | label_dict["NOT"] = 0 64 | elif task == "b": #bug 65 | label_dict["UNT"] = 1 66 | label_dict["TIN"] = 0 67 | elif task == "c": 68 | label_dict["IND"] = 0 69 | label_dict["GRP"] = 1 70 | label_dict["OTH"] = 2 71 | 72 | return df, label_dict 73 | -------------------------------------------------------------------------------- /src/utils/transformer/__init.py__: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cozek/OffensEval2020-code/874b86f14112326c9ba09965174142bdb0acdfb0/src/utils/transformer/__init.py__ -------------------------------------------------------------------------------- /src/utils/transformer/data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for creating the datasets for the transformer models 3 | 4 | """ 5 | from typing import Callable 6 | from torch.utils.data import Dataset, DataLoader 7 | from argparse import Namespace 8 | import collections 9 | import pandas as pd 10 | import numpy as np 11 | import string 12 | import torch 13 | import nltk 14 | 15 | class GPT2Preprocessor(): 16 | def __init__(self,transformer_tokenizer,sentence_detector): 17 | self.transformer_tokenizer = transformer_tokenizer 18 | self.sentence_detector = sentence_detector 19 | 20 | def add_eos_tokens(self, text): 21 | eos_token = ' ' + self.transformer_tokenizer.eos_token + ' ' 22 | sentences = self.sentence_detector.tokenize(text) 23 | eos_added_text = eos_token.join(sentences) + ' ' + self.transformer_tokenizer.eos_token 24 | return eos_added_text 25 | 26 | class Vectorizer(): 27 | def __init__(self,tokenizer: Callable, max_seq_len: int):
28 | """ 29 | Args: 30 | tokenizer (Callable): transformer tokenizer 31 | max_seq_len (int): Maximum sequence lenght 32 | """ 33 | self.tokenizer = tokenizer 34 | self._max_seq_len = max_seq_len 35 | 36 | def vectorize(self,text :str): 37 | sequence = \ 38 | self.tokenizer.prepare_for_tokenization(text,add_prefix_space=True) 39 | indices = self.tokenizer.encode(sequence) 40 | 41 | out_vector = np.zeros(self._max_seq_len, dtype=np.int64) 42 | out_vector[: len(indices)] = indices 43 | # max len is restricted to 1024 44 | return out_vector[:min(self._max_seq_len,1024)] 45 | 46 | class HateDataset(Dataset): 47 | def __init__(self, data_df: pd.DataFrame, tokenizer: Callable, max_len:int=None): 48 | """ 49 | Args: 50 | data_df (pandas.DataFrame): df containing the labels and text 51 | tokenizer (tokenizer module for the transformer) 52 | """ 53 | self.data_df = data_df 54 | self.tokenizer = tokenizer 55 | 56 | # measure_len = lambda context: len(context.split(" ")) 57 | # self._max_seq_length = max(map(measure_len, data_df.text)) + 2 58 | if max_len == None: 59 | self._max_seq_length = self._get_max_len(data_df,tokenizer) 60 | else: 61 | self._max_seq_length = max_len 62 | 63 | self.train_df = self.data_df[self.data_df.split == 'train'] 64 | self.train_size = len(self.train_df) 65 | 66 | self.val_df = self.data_df[self.data_df.split == 'val'] 67 | self.val_size = len(self.val_df) 68 | 69 | self.test_df = self.data_df[self.data_df.split == 'test'] 70 | self.test_size = len(self.test_df) 71 | 72 | 73 | self._vectorizer = Vectorizer(tokenizer, self._max_seq_length) 74 | 75 | 76 | self._lookup_dict = { 77 | 'train': (self.train_df, self.train_size), 78 | 'val': (self.val_df, self.val_size), 79 | 'test': (self.test_df, self.test_size) 80 | } 81 | 82 | self.set_split('train') 83 | 84 | class_counts = data_df.label.value_counts().to_dict() 85 | #sorted on the basis of class label,eg, 0,1,2.. 
86 | cts = sorted([(lbl,cts) for lbl,cts in class_counts.items()], key=lambda x: x[0]) 87 | freq = [ x[1] for x in cts ] 88 | # print(freq,cts) 89 | self.class_weights = 1.0/ torch.tensor(freq, dtype=torch.float32) 90 | 91 | def _get_max_len(self,data_df: pd.DataFrame, tokenizer: Callable): 92 | prep_func = lambda x: self.tokenizer.prepare_for_tokenization(x,add_prefix_space=True) 93 | len_func = lambda x: len(prep_func(x)) 94 | max_len = data_df.text.map(len_func).max() 95 | return max_len 96 | 97 | # max_len = 0 98 | # for seq in data_df['text']: 99 | # temp = tokenizer.prepare_for_tokenization(seq,add_prefix_space=True) 100 | # tokenized_seq = tokenizer.tokenize(temp) 101 | # if len(tokenized_seq) > max_len: 102 | # max_len = len(tokenized_seq) 103 | # return max_len 104 | 105 | 106 | 107 | def set_split(self, split="train"): 108 | """ selects the splits in the dataset using a column in the dataframe """ 109 | self._target_split = split 110 | self._target_df, self._target_size = self._lookup_dict[split] 111 | 112 | def __len__(self): 113 | return self._target_size 114 | 115 | def __getitem__(self, index): 116 | """the primary entry point method for PyTorch datasets 117 | 118 | Args: 119 | index (int): the index to the data point 120 | Returns: 121 | a dictionary holding the data point's features (x_data) and label (y_target) 122 | """ 123 | row = self._target_df.iloc[index] 124 | 125 | sequence = self._vectorizer.vectorize(row.text) 126 | 127 | label = row.label 128 | return {'x_data': sequence, 129 | 'x_index': index, 130 | 'y_target': label} 131 | 132 | def get_num_batches(self, batch_size): 133 | """Given a batch size, return the number of batches in the dataset 134 | 135 | Args: 136 | batch_size (int) 137 | Returns: 138 | number of batches in the dataset 139 | """ 140 | return len(self) // batch_size 141 | 142 | class TracDataset(Dataset): 143 | def __init__(self, data_df: pd.DataFrame, tokenizer: Callable): 144 | """ 145 | Args: 146 | data_df (pandas.DataFrame): df containing the labels and text 147 | tokenizer (tokenizer module for the transformer) 148 | """ 149 | self.data_df = data_df 150 | self.tokenizer = tokenizer 151 | 152 | # measure_len = lambda context: len(context.split(" ")) 153 | # self._max_seq_length = max(map(measure_len, data_df.text)) + 2 154 | self._max_seq_length = self._get_max_len(data_df,tokenizer) 155 | 156 | self.train_df = self.data_df[self.data_df.split == 'train'] 157 | self.train_size = len(self.train_df) 158 | 159 | self.val_df = self.data_df[self.data_df.split == 'dev'] 160 | self.val_size = len(self.val_df) 161 | 162 | self.test_df = self.data_df[self.data_df.split == 'test'] 163 | self.test_size = len(self.test_df) 164 | 165 | 166 | self._vectorizer = Vectorizer(tokenizer, self._max_seq_length) 167 | 168 | 169 | self._lookup_dict = { 170 | 'train': (self.train_df, self.train_size), 171 | 'val': (self.val_df, self.val_size), 172 | 'test': (self.test_df, self.test_size) 173 | } 174 | 175 | self.set_split('train') 176 | 177 | class_counts = data_df.label.value_counts().to_dict() 178 | #sorted on the basis of class label,eg, 0,1,2.. 
179 | cts = sorted([(lbl,cts) for lbl,cts in class_counts.items()], key=lambda x: x[0]) 180 | freq = [ x[1] for x in cts ] 181 | # print(freq,cts) 182 | self.class_weights = 1.0/ torch.tensor(freq, dtype=torch.float32) 183 | 184 | def _get_max_len(self,data_df: pd.DataFrame, tokenizer: Callable): 185 | max_len = 0 186 | for seq in data_df['text']: 187 | temp = tokenizer.prepare_for_tokenization(seq,add_prefix_space=True) 188 | tokenized_seq = tokenizer.tokenize(temp) 189 | if len(tokenized_seq) > max_len: 190 | max_len = len(tokenized_seq) 191 | return max_len 192 | 193 | 194 | 195 | def set_split(self, split="train"): 196 | """ selects the splits in the dataset using a column in the dataframe """ 197 | self._target_split = split 198 | self._target_df, self._target_size = self._lookup_dict[split] 199 | 200 | def __len__(self): 201 | return self._target_size 202 | 203 | def __getitem__(self, index): 204 | """the primary entry point method for PyTorch datasets 205 | 206 | Args: 207 | index (int): the index to the data point 208 | Returns: 209 | a dictionary holding the data point's features (x_data) and label (y_target) 210 | """ 211 | row = self._target_df.iloc[index] 212 | 213 | sequence = self._vectorizer.vectorize(row.text) 214 | 215 | label = row.label 216 | return {'x_data': sequence, 217 | 'x_index': index, 218 | 'y_target': label} 219 | 220 | def get_num_batches(self, batch_size): 221 | """Given a batch size, return the number of batches in the dataset 222 | 223 | Args: 224 | batch_size (int) 225 | Returns: 226 | number of batches in the dataset 227 | """ 228 | return len(self) // batch_size 229 | 230 | def generate_batches(dataset, batch_size, shuffle=True, 231 | drop_last=False, device="cpu", pinned_memory = False, n_workers = 0): 232 | """ 233 | A generator function which wraps the PyTorch DataLoader. It will 234 | ensure each tensor is on the write device location. 235 | """ 236 | dataloader = DataLoader(dataset=dataset, batch_size=batch_size, 237 | shuffle=shuffle, drop_last=drop_last, 238 | pin_memory= pinned_memory, 239 | num_workers = n_workers, 240 | ) 241 | 242 | for data_dict in dataloader: 243 | out_data_dict = {} 244 | # print(data_dict.items()) 245 | for name, tensor in data_dict.items(): 246 | out_data_dict[name] = data_dict[name].to(device, non_blocking= (True if pinned_memory else False) ) 247 | yield out_data_dict -------------------------------------------------------------------------------- /src/utils/transformer/general.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import classification_report 2 | from sklearn.metrics import confusion_matrix 3 | from sklearn.metrics import accuracy_score 4 | from sklearn.metrics import f1_score 5 | import torch 6 | import pandas as pd 7 | import numpy as np 8 | import torch.nn as nn 9 | 10 | class EarlyStopping: 11 | """Early stops the training if validation loss doesn't improve after a given patience.""" 12 | def __init__(self, patience=7, verbose=False, delta=0): 13 | """ 14 | Args: 15 | patience (int): How long to wait after last time validation loss improved. 16 | Default: 7 17 | verbose (bool): If True, prints a message for each validation loss improvement. 18 | Default: False 19 | delta (float): Minimum change in the monitored quantity to qualify as an improvement. 
20 | Default: 0 21 | """ 22 | self.patience = patience 23 | self.verbose = verbose 24 | self.counter = 0 25 | self.best_score = None 26 | self.early_stop = False 27 | self.val_loss_min = np.Inf 28 | self.delta = delta 29 | 30 | def __call__(self, val_loss, model): 31 | 32 | score = -val_loss # negate the loss so that a lower validation loss gives a higher score 33 | 34 | if self.best_score is None: 35 | self.best_score = score 36 | self.save_checkpoint(val_loss, model) 37 | elif score < self.best_score + self.delta: 38 | self.counter += 1 39 | print(f'EarlyStopping counter: {self.counter} out of {self.patience}') 40 | if self.counter >= self.patience: 41 | self.early_stop = True 42 | else: 43 | self.best_score = score 44 | self.save_checkpoint(val_loss, model) 45 | self.counter = 0 46 | 47 | def save_checkpoint(self, val_loss, model): 48 | '''Saves model when validation loss decreases.''' 49 | if self.verbose: 50 | print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...') 51 | torch.save(model.state_dict(), 'checkpoint.pt') 52 | self.val_loss_min = val_loss 53 | 54 | def compute_accuracy(y_pred, y_target): 55 | y_pred = y_pred.cpu() 56 | y_target = y_target.cpu() 57 | return torch.eq(torch.argmax(y_pred,dim=1),y_target).sum().item() / len(y_pred) 58 | 59 | def compute_macro_f1(y_pred, y_target, average = 'macro'): 60 | y_pred = (torch.argmax(y_pred,dim=1)).cpu().long().numpy() 61 | y_target = y_target.cpu().numpy() 62 | 63 | return f1_score(y_true = y_target, y_pred=y_pred , average=average) 64 | 65 | 66 | def analyse_preds(y_pred, y_target, threshold=0.5): 67 | y_pred = (torch.argmax(y_pred,dim=1) > threshold).cpu().long().numpy() 68 | # y_pred = (torch.argmax(y_pred > threshold,dim=1)).cpu().long().numpy() 69 | y_target = y_target.cpu().numpy() 70 | 71 | conmat = confusion_matrix(y_pred=y_pred, y_true=y_target) 72 | confusion = pd.DataFrame( 73 | conmat, index=["NOT", "HS"], columns=["predicted_NOT", "predicted_HS"] 74 | ) 75 | print("acc = ", accuracy_score(y_pred=y_pred, y_true=y_target)) 76 | print(classification_report(y_pred=y_pred, y_true=y_target, digits=4)) 77 | print(confusion) 78 | 79 | def analyse_preds2(y_pred, y_target, threshold=0.5): 80 | # y_pred = (torch.argmax(y_pred,dim=1) > threshold).cpu().long().numpy() 81 | y_pred = torch.argmax(nn.Sigmoid()(y_pred) > threshold,dim=1).cpu().long().numpy() 82 | y_target = y_target.cpu().numpy() 83 | 84 | conmat = confusion_matrix(y_pred=y_pred, y_true=y_target) 85 | confusion = pd.DataFrame( 86 | conmat, index=["NOT", "HS"], columns=["predicted_NOT", "predicted_HS"] 87 | ) 88 | print("acc = ", accuracy_score(y_pred=y_pred, y_true=y_target)) 89 | print(classification_report(y_pred=y_pred, y_true=y_target, digits=4)) 90 | print(confusion) 91 | -------------------------------------------------------------------------------- /src/utils/transformer/roberta.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import RobertaModel # needed by RobertaModel.from_pretrained in the classes below 4 | 5 | class RobertaAttention(nn.Module): 6 | """Implements Attention Head Classifier 7 | on Pretrained Roberta Transformer representations.
8 | Attention Head Implementation based on: https://www.aclweb.org/anthology/P16-2034/ 9 | """ 10 | def penalized_tanh(self,x): 11 | """ 12 | http://aclweb.org/anthology/D18-1472 13 | """ 14 | alpha = 0.25 15 | return torch.max(torch.tanh(x), alpha*torch.tanh(x)) 16 | def swish(self, x): 17 | """ 18 | Simple implementation of Swish activation function 19 | https://arxiv.org/pdf/1710.05941.pdf 20 | """ 21 | return x * torch.sigmoid(x) 22 | 23 | def mish(self, x): 24 | """ 25 | Simple implementation of Mish activation Function 26 | https://arxiv.org/abs/1908.08681 27 | """ 28 | tanh = nn.Tanh() 29 | softplus = nn.Softplus() 30 | return x * tanh( softplus(x)) 31 | 32 | def __init__(self, model_name, num_labels): 33 | """ 34 | Args: 35 | model_name: model name, eg, roberta-base' 36 | """ 37 | super().__init__() 38 | self.w = nn.Linear(768,1, bias=False) 39 | self.roberta = RobertaModel.from_pretrained(model_name) 40 | self.prediction_layer = nn.Linear(768, num_labels) 41 | 42 | self.init_weights() 43 | 44 | def init_weights(self): 45 | for name, param in self.prediction_layer.named_parameters(): 46 | if 'bias' in name: 47 | nn.init.constant_(param, 0.0) 48 | elif 'weight' in name: 49 | nn.init.xavier_uniform_(param) 50 | for name, param in self.w.named_parameters(): 51 | if 'bias' in name: 52 | nn.init.constant_(param, 0.0) 53 | elif 'weight' in name: 54 | nn.init.xavier_uniform_(param) 55 | 56 | def forward(self, input_ids,attention_mask): 57 | """ 58 | Args: 59 | input_ids: sent encoded into indices 60 | attention_mask: their respective attention masks, 61 | """ 62 | #elmo layer takes care of padding 63 | embeddings = self.roberta(input_ids = input_ids, 64 | attention_mask = attention_mask) 65 | H = embeddings[0] #final hidden layer outputs 66 | # print(H.shape) 67 | M = self.penalized_tanh(H) 68 | alpha = torch.softmax(self.w(M), dim=1) 69 | r = torch.bmm(H.permute(0,2,1),alpha) 70 | h_star = self.penalized_tanh(r) 71 | preds = self.prediction_layer(h_star.permute(0,2,1)) 72 | return preds 73 | 74 | class RobertaAttentionReg(nn.Module): 75 | """Implements Attention Head Classifier 76 | on Pretrained Roberta Transformer representations. 
77 | Attention Head Implementation based on: https://www.aclweb.org/anthology/P16-2034/ 78 | """ 79 | def swish(self, x): 80 | """ 81 | Simple implementation of Swish activation function 82 | https://arxiv.org/pdf/1710.05941.pdf 83 | """ 84 | return x * torch.sigmoid(x) 85 | 86 | def mish(self, x): 87 | """ 88 | Simple implementation of Mish activation Function 89 | https://arxiv.org/abs/1908.08681 90 | """ 91 | tanh = nn.Tanh() 92 | softplus = nn.Softplus() 93 | return x * tanh( softplus(x)) 94 | 95 | def __init__(self, model_name, num_labels): 96 | """ 97 | Args: 98 | model_name: model name, eg, roberta-base' 99 | """ 100 | super().__init__() 101 | self.w = nn.Linear(768,1, bias=False) 102 | self.roberta = RobertaModel.from_pretrained(model_name) 103 | self.prediction_layer = nn.Linear(768, num_labels) 104 | self.dropout = nn.Dropout(p=0.1) 105 | self.init_weights() 106 | 107 | def init_weights(self): 108 | for name, param in self.prediction_layer.named_parameters(): 109 | if 'bias' in name: 110 | nn.init.constant_(param, 0.0) 111 | elif 'weight' in name: 112 | nn.init.kaiming_normal_(param) 113 | for name, param in self.w.named_parameters(): 114 | if 'bias' in name: 115 | nn.init.constant_(param, 0.0) 116 | elif 'weight' in name: 117 | nn.init.kaiming_normal_(param) 118 | 119 | def forward(self, input_ids,attention_mask): 120 | """ 121 | Args: 122 | input_ids: sent encoded into indices 123 | attention_mask: their respective attention masks, 124 | """ 125 | #elmo layer takes care of padding 126 | embeddings = self.roberta(input_ids = input_ids, 127 | attention_mask = attention_mask) 128 | 129 | H = embeddings[0] #final hidden layer outputs 130 | # print(H.shape) 131 | M = self.mish(H) 132 | alpha = torch.softmax(self.w(M), dim=1) 133 | alpha = self.dropout(alpha) 134 | 135 | r = torch.bmm(H.permute(0,2,1),alpha) 136 | 137 | h_star = self.mish(r) 138 | h_star = self.dropout(h_star) 139 | 140 | preds = self.prediction_layer(h_star.permute(0,2,1)) 141 | return preds 142 | 143 | class RobertaAttentionNorm(nn.Module): 144 | """Implements Attention Head Classifier 145 | on Pretrained Roberta Transformer representations. 
146 | Attention Head Implementation based on: https://www.aclweb.org/anthology/P16-2034/ 147 | """ 148 | def swish(self, x): 149 | """ 150 | Simple implementation of Swish activation function 151 | https://arxiv.org/pdf/1710.05941.pdf 152 | """ 153 | return x * torch.sigmoid(x) 154 | 155 | def mish(self, x): 156 | """ 157 | Simple implementation of Mish activation Function 158 | https://arxiv.org/abs/1908.08681 159 | """ 160 | tanh = nn.Tanh() 161 | softplus = nn.Softplus() 162 | return x * tanh( softplus(x)) 163 | 164 | def __init__(self, model_name, num_labels, max_seq_len): 165 | """ 166 | Args: 167 | model_name: model name, eg, roberta-base' 168 | """ 169 | super().__init__() 170 | self.w = nn.Linear(768,1, bias=False) 171 | self.roberta = RobertaModel.from_pretrained(model_name) 172 | self.prediction_layer = nn.Linear(768, num_labels) 173 | self.dropout = nn.Dropout(p=0.1) 174 | self.batchnorm = nn.BatchNorm1d(max_seq_len) 175 | self.init_weights() 176 | 177 | def init_weights(self): 178 | for name, param in self.prediction_layer.named_parameters(): 179 | if 'bias' in name: 180 | nn.init.constant_(param, 0.0) 181 | elif 'weight' in name: 182 | nn.init.kaiming_normal_(param) 183 | for name, param in self.w.named_parameters(): 184 | if 'bias' in name: 185 | nn.init.constant_(param, 0.0) 186 | elif 'weight' in name: 187 | nn.init.kaiming_normal_(param) 188 | 189 | def forward(self, input_ids,attention_mask): 190 | """ 191 | Args: 192 | input_ids: sent encoded into indices 193 | attention_mask: their respective attention masks, 194 | """ 195 | #elmo layer takes care of padding 196 | embeddings = self.roberta(input_ids = input_ids, 197 | attention_mask = attention_mask) 198 | 199 | H = embeddings[0] #final hidden layer outputs 200 | 201 | H = self.batchnorm(H) 202 | 203 | M = self.swish(H) 204 | alpha = torch.softmax(self.w(M), dim=1) 205 | alpha = self.dropout(alpha) 206 | 207 | r = torch.bmm(H.permute(0,2,1),alpha) 208 | 209 | h_star = self.swish(r) 210 | h_star = self.dropout(h_star) 211 | 212 | preds = self.prediction_layer(h_star.permute(0,2,1)) 213 | 214 | return preds --------------------------------------------------------------------------------
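A minimal sketch of how the RAdam optimizer above might be dropped into a training loop, assuming the repository root is on the Python path and a PyTorch release contemporary with this code (radam.py uses the older add_/addcmul_ call signatures); the toy model and hyperparameters below are placeholders, not the settings from the paper.

import torch
import torch.nn as nn
from src.radam import RAdam

# toy classifier standing in for the transformer models used in the notebooks
model = nn.Sequential(nn.Linear(768, 256), nn.ReLU(), nn.Linear(256, 2))
criterion = nn.CrossEntropyLoss()

# constructor arguments mirror the defaults defined in src/radam/radam.py
optimizer = RAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999),
                  eps=1e-8, weight_decay=0, degenerated_to_sgd=True)

x = torch.randn(8, 768)        # fake batch of pooled features
y = torch.randint(0, 2, (8,))  # fake labels

optimizer.zero_grad()
loss = criterion(model(x), y)
loss.backward()
optimizer.step()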
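A similar sketch for the data utilities in src/utils/offenseval2020.py; the file path is hypothetical (any organiser-provided task_*_distant.tsv with id, text and average columns), and threshold=0.5 is an illustrative choice.

from src.utils.offenseval2020 import load_dataset, labeller

# hypothetical location of one of the distant-supervision files from the organisers
df = load_dataset("data/eng/task_a_distant.tsv")

# binarise the crowd confidence scores and keep only the id/text/label columns
df, label_dict = labeller(df, threshold=0.5, task="a", drop_cols=True)
print(label_dict)  # {'OFF': 1, 'NOT': 0}
print(df.head())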
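Finally, a sketch of how the transformer utilities might be combined for task A, continuing from the previous sketch. It assumes a DataFrame with text, label and split columns (the columns HateDataset expects), a Hugging Face RobertaTokenizer, a transformers release contemporary with this code (Vectorizer relies on prepare_for_tokenization accepting add_prefix_space), and that padding positions are the zeros written by Vectorizer; none of this reproduces the exact notebook configuration.

import torch
from transformers import RobertaTokenizer
from src.utils.transformer.data import HateDataset, generate_batches
from src.utils.transformer.roberta import RobertaAttention

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# illustrative: put every sample in the training split so HateDataset finds a 'split' column
df["split"] = "train"
dataset = HateDataset(data_df=df, tokenizer=tokenizer)
dataset.set_split("train")

model = RobertaAttention(model_name="roberta-base", num_labels=2)
model.eval()

with torch.no_grad():
    for batch in generate_batches(dataset, batch_size=8, device="cpu"):
        input_ids = batch["x_data"]
        attention_mask = (input_ids != 0).long()  # assumes the zero padding produced by Vectorizer
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        print(logits.shape)  # (batch, 1, num_labels)
        break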