├── .ipynb_checkpoints └── toxic_tutorial_with_answer-checkpoint.ipynb ├── Dataset └── __init__.py ├── Readme.md ├── resources ├── early-stopping-graphic.jpg ├── hybrid.png ├── kfold.png ├── onehot.png ├── overfitting.png ├── textcnn.png ├── textrnn.png └── word-embedding.jpeg ├── toxic_tutorial.ipynb └── toxic_tutorial_with_answer.ipynb /.ipynb_checkpoints/toxic_tutorial_with_answer-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# [Toxic comment classification challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)\n", 8 | "\n", 9 | "In this competition, you’re challenged to build a multi-headed model that’s capable of detecting different types of toxicity like threats, obscenity, insults, and identity-based hate better than Perspective’s current models. You’ll be using a dataset of comments from Wikipedia’s talk page edits. Improvements to the current model will hopefully help online discussion become more productive and respectful." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "collapsed": false 17 | }, 18 | "outputs": [ 19 | { 20 | "name": "stderr", 21 | "output_type": "stream", 22 | "text": [ 23 | "Using TensorFlow backend.\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "########################################\n", 29 | "# Load the packages\n", 30 | "########################################\n", 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "import re\n", 34 | "\n", 35 | "import matplotlib.pyplot as plt\n", 36 | "import seaborn as sns\n", 37 | "\n", 38 | "from nltk.stem import SnowballStemmer\n", 39 | "\n", 40 | "from keras.preprocessing.text import Tokenizer\n", 41 | "from keras.preprocessing.sequence import pad_sequences\n", 42 | "from keras.layers import Dense, Input, Embedding, Bidirectional, GRU, Conv1D, GlobalMaxPooling1D, Dropout, TimeDistributed\n", 43 | "from keras.layers.merge import concatenate\n", 44 | "from keras.models import Model\n", 45 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 46 | "\n", 47 | "%matplotlib inline" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "########################################\n", 59 | "# Define the hyperparameters\n", 60 | "########################################\n", 61 | "path = 'Dataset/'\n", 62 | "TRAIN_DATA_FILE = path + 'train.csv'\n", 63 | "TEST_DATA_FILE = path + 'test.csv'\n", 64 | "\n", 65 | "MAX_SEQUENCE_LENGTH = 100\n", 66 | "MAX_NB_WORDS = 100000\n", 67 | "EMBEDDING_DIM = 50" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Prepare the training / testing data" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 3, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "########################################\n", 86 | "# Load the training / testing sets as pandas DataFrames\n", 87 | "########################################\n", 88 | "train_df = pd.read_csv(TRAIN_DATA_FILE)\n", 89 | "test_df = pd.read_csv(TEST_DATA_FILE)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## Exploratory Data Analysis" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 4, 102 | "metadata": { 103 
| "collapsed": false 104 | }, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "A quick view of training set\n" 111 | ] 112 | }, 113 | { 114 | "data": { 115 | "text/html": [ 116 | "
\n", 117 | "\n", 130 | "\n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | "
idcomment_texttoxicsevere_toxicobscenethreatinsultidentity_hate
00000997932d777bfExplanation\\nWhy the edits made under my usern...000000
1000103f0d9cfb60fD'aww! He matches this background colour I'm s...000000
2000113f07ec002fdHey man, I'm really not trying to edit war. It...000000
30001b41b1c6bb37e\"\\nMore\\nI can't make any real suggestions on ...000000
40001d958c54c6e35You, sir, are my hero. Any chance you remember...000000
\n", 202 | "
" 203 | ], 204 | "text/plain": [ 205 | " id comment_text toxic \\\n", 206 | "0 0000997932d777bf Explanation\\nWhy the edits made under my usern... 0 \n", 207 | "1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 \n", 208 | "2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 \n", 209 | "3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... 0 \n", 210 | "4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 0 \n", 211 | "\n", 212 | " severe_toxic obscene threat insult identity_hate \n", 213 | "0 0 0 0 0 0 \n", 214 | "1 0 0 0 0 0 \n", 215 | "2 0 0 0 0 0 \n", 216 | "3 0 0 0 0 0 \n", 217 | "4 0 0 0 0 0 " 218 | ] 219 | }, 220 | "execution_count": 4, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "print(\"A quick view of training set\")\n", 227 | "train_df.head()" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 5, 233 | "metadata": { 234 | "collapsed": false 235 | }, 236 | "outputs": [ 237 | { 238 | "name": "stdout", 239 | "output_type": "stream", 240 | "text": [ 241 | "A quick view of testing set\n" 242 | ] 243 | }, 244 | { 245 | "data": { 246 | "text/html": [ 247 | "
\n", 248 | "\n", 261 | "\n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | "
idcomment_text
000001cee341fdb12Yo bitch Ja Rule is more succesful then you'll...
10000247867823ef7== From RfC == \\n\\n The title is fine as it is...
200013b17ad220c46\" \\n\\n == Sources == \\n\\n * Zawe Ashton on Lap...
300017563c3f7919a:If you have a look back at the source, the in...
400017695ad8997ebI don't anonymously edit articles at all.
\n", 297 | "
" 298 | ], 299 | "text/plain": [ 300 | " id comment_text\n", 301 | "0 00001cee341fdb12 Yo bitch Ja Rule is more succesful then you'll...\n", 302 | "1 0000247867823ef7 == From RfC == \\n\\n The title is fine as it is...\n", 303 | "2 00013b17ad220c46 \" \\n\\n == Sources == \\n\\n * Zawe Ashton on Lap...\n", 304 | "3 00017563c3f7919a :If you have a look back at the source, the in...\n", 305 | "4 00017695ad8997eb I don't anonymously edit articles at all." 306 | ] 307 | }, 308 | "execution_count": 5, 309 | "metadata": {}, 310 | "output_type": "execute_result" 311 | } 312 | ], 313 | "source": [ 314 | "print(\"A quick view of testing set\")\n", 315 | "test_df.head()" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "# Check the labels distribution" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "### Check the balance of labels\n", 330 | "\n", 331 | "We would like to know the positive ratio of training set. Because we do not want the model become a lazy guy, for a less frequent positive case, we may give it more penalty when model targets it wrong." 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 6, 337 | "metadata": { 338 | "collapsed": true 339 | }, 340 | "outputs": [], 341 | "source": [ 342 | "'''\n", 343 | "What's the positive ratio of each class ?\n", 344 | "'''\n", 345 | "def get_pos_ratio(data):\n", 346 | " return data.sum() / len(data)\n", 347 | "\n", 348 | "pos_ratio = []\n", 349 | "for col in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:\n", 350 | " pos_ratio.append(get_pos_ratio(train_df[col]))" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 7, 356 | "metadata": { 357 | "collapsed": false 358 | }, 359 | "outputs": [ 360 | { 361 | "name": "stdout", 362 | "output_type": "stream", 363 | "text": [ 364 | "Congrats, you passed the test.\n" 365 | ] 366 | } 367 | ], 368 | "source": [ 369 | "assert pos_ratio[0] == 0.09584448302009764, \"The answer is not correct.\"\n", 370 | "print(\"Congrats, you passed the test.\")" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 8, 376 | "metadata": { 377 | "collapsed": false 378 | }, 379 | "outputs": [ 380 | { 381 | "data": { 382 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAgcAAAEaCAYAAACFAfTjAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJzt3XucVXW9//HXGyYveAEJMC4iKugJ\nRVAnJU0j8Ii3wEwOcqxUVMqjpaSZ1jFNs1LrhxnqUfOC6RFvKXiXMNQ0RVAMED0gkIyigFwiERX4\n/P5Y3xn3HvbMbGBmNsy8n4/Hfuy1v+u71vqsPbPX/uzv+q7vUkRgZmZmVqlFqQMwMzOzzYuTAzMz\nM8vj5MDMzMzyODkwMzOzPE4OzMzMLI+TAzMzM8vj5MDMNmuSbpf0i1LHYdacODkwa8YkTZbUQ9Lu\nkl4pdTxmtnlwcmDWTEn6HLArMAc4AGiU5EBSWWNsx8w2npMDs+ZrH+D1yIZJLaeO5EBSSPqBpLmS\nlki6WlKLnPnDJc2StEzSk5J2rbbsWZJmA7NrWP9XJL0gabmkBZJOKVBnJ0mPSFqctvOIpC45809J\n8a2UNE/SSam8u6RnJK1Isd+zge+VWbPi5MCsmZF0qqTlwPPAl9P0ecCV6Yt5t1oW/wZZIrE/MBgY\nntZ5HPAT4HigPfAccHe1ZY8DDgJ6FoipK/A48Pu0fB9gWoHttwBuI2vx6Ap8BIxO69gOuBY4KiJ2\nAA7OWcflwFPATkCXtB0zq4GTA7NmJiJui4g2wFSgL7AvMAPYMSLaRMS8Wha/MiKWRsTbwDXAsFT+\nXeBXETErItYAvwT65LYepPlLI+KjAus9CfhzRNwdEZ9GxAcRsV5ykMofiIhVEbESuAL4ak6VdcA+\nkraNiIURMTOVf0qWUHSKiNUR8dfa3yWz5s3JgVkzIqltah1YQfbLehLwJrAXsEzSuXWsYkHO9D+A\nTml6V+B3ad3LgaWAgM41LFvdLsBbRcTfStKNkv4h6Z/As0AbSS0j4kNgKPA9YKGkRyX9W1r0ghTP\nZEkzJQ2va1tmzZmTA7NmJP1yb0P2S/8PafoJ4Oup1eCaOlaxS850V+DdNL0A+G5aR+Vj24h4IXfz\ntax3AbBHEbtwHlkic1BE7AgclsqV9u/JiPh3oCPwBnBzKn8vIs6IiE5k+369pO5FbM+sWXJyYNY8\n5V6dsB/ZKYZi/Ch1CtwFOAeo7Nj3P8BFkvYGkNRa0pANiOcu4HBJ/yGpTNLnJfUpUG8Hsn4GyyW1\nBS6pnCFpZ0mDUt+Dj4F/AWvTvCE5HReXkSUqazcgPrNmxcmBWfN0APCKpM8DayNiWZHLjSNLJKYB\njwK3AETEg8CVwNjU3D8DOKrYYFIfhqPJWgaWpvX3LlD1GmBbYAnwIlmrR6UWafl30zq+CvxXmvcl\n4CVJ/wLGA+fU0bfCrFlTdhWTmVntJAXQIyLmlDoWM2tYbjkwMzOzPE4OzMzMLE+jJAeSbpW0SNKM\nauXfl/RmurToqpzyiyTNSfMG5pQfmcrmSLowp3w3SS9Jmi3pHklbNcZ+mTUnESGfUjBrHhqr5eB2\n4MjcAklfIxthbd+I2Bv4TSrvCZwI7J2WuV5SS0ktgevIOjn1BIalupB1hBoVET3IeiKf1uB7ZGZm\n1kQ1yg1QIuJZSd2qFZ8J/DoiPk51FqXywcDYVD5P0hzgwDRvTkTMBZA0FhgsaRbQH/jPVGcMcClw\nQ11xtWvXLrp1qx6WmZlZ0zR16tQlEdG+rnqlvDvansChkq4AVgPnR8TLZCOqvZhTr4LPRllbUK38\nIODzwPI0ZGv1+uuRNAIYAdC1a1emTJlSD7uSb/jw4TzyyCN06NCBGTOyMymXXnopN998M+3bZ3+T\nX/7ylxx99NFMmDCBCy+8kE8++YStttqKq6++mv79+wNwzz33cMUVV7B27VqOOeYYrrrqqrzt3H//\n/QwZMoSXX36Z8vLyet8PMzNrWiT9o5h6peyQWEZ2E5S+wI+AeyWJNNJZNbER5QVFxE0RUR4R5ZVf\n1PXtlFNO4YknnlivfOTIkUybNo1p06Zx9NFHA9CuXTsefvhhpk+fzpgxY/j2t78NwAcffMCPfvQj\nJk6cyMyZM3n//feZOHFi1bpWrlzJtddey0EHHdQg+2BmZs1XKZODCuBPkZlMdsOUdqk8d4jWLmSD\nmtRUvoRsbPWyauUlc9hhh9G2bdui6u6333506pQNT7/33nuzevVqPv74Y+bOncuee+5Z1dJw+OGH\n88ADD1Qtd/HFF3PBBRewzTbb1P8OmJlZs1bK5OAhsr4CSNoT2Irsi348cKKkrdOtY3sAk4GXgR7p\nyoStyDotjk/3ov8LcEJa78lko7htdkaPHs2+++7L8OHDWbZs/QHpHnjgAfbbbz+23nprunfvzhtv\nvMH8+fNZs2YNDz30EAsWZGdVXn31VRYsWMCxxx7b2LtgZmbNQGNdyng38DdgL0kVkk4DbgV2T5c3\njgVOTq0IM4F7gdfJhkY9KyLWpj4FZwNPArOAe3Nux/pj4Iep8+LnSUO6bk7OPPNM3nrrLaZNm0bH\njh0577zz8ubPnDmTH//4x9x4440A7LTTTtxwww0MHTqUQw89lG7dulFWVsa6desYOXIkv/3tb0ux\nG2Zm1gw06+GTy8vLoyE6JALMnz+fY489tqpDYm3zKioq6N+/P7fddhuHHHJIwfXddNNNzJkzh5/+\n9KfssccebL/99gC89957tG3blvHjx7tTopmZ1UrS1Iio88vCIyQ2koULF1ZNP/jgg+yzzz4ALF++\nnGOOOYZf/epX6yUGixZlV3cuW7aM66+/ntNPP53WrVuzZMkS5s+fz/z58+nbt68TAzMzq1elvJSx\nyRo2bBiTJk1iyZIldOnShZ///OdMmjSJadOmIYlu3bpVnT4YPXo0c+bM4fLLL+fyyy8H4KmnnqJD\nhw6cc845vPbaawD87Gc/Y8899yzZPpmZWfPh0woNdFrBzMxsc+PTCmZmZrZRfFqhBt/63aOlDqFR\n3XnOMaUOwczMNhNuOTAzM7M8Tg7MzMwsj5MDMzMzy+PkwMzMzPI4OTAzM7M8Tg7MzMwsj5MDMzMz\ny+PkwMzMzPI4OTAzM7M8Tg7MzMwsj5MDMzMzy+PkwMzMzPI4OTAzM7M8jZIcSLpV0iJJMwrMO19S\nSGqXXkvStZLmSPq7pP1z6p4saXZ6nJxTfoCk6WmZayWpMfbLzMysKWqsloPbgSOrF0raBfh34O2c\n4qOAHukxArgh1W0LXAIcBBwIXCJpp7TMDalu5XLrbcvMzMyK0yjJQUQ8CywtMGsUcAEQOWWDgTsi\n8yLQRlJHYCAwISKWRsQyYAJwZJq3Y0T8LSICuAM4riH3x8zMrCkrWZ8DSYOAdyLitWqzOgMLcl5X\npLLayisKlNe03RGSpkiasnjx4k3YAzMzs6apJMmBpFbA
T4GfFZpdoCw2orygiLgpIsojorx9+/bF\nhGtmZtaslKrlYA9gN+A1SfOBLsArkr5A9st/l5y6XYB36yjvUqDczMzMNkJJkoOImB4RHSKiW0R0\nI/uC3z8i3gPGA99JVy30BVZExELgSeAISTuljohHAE+meSsl9U1XKXwHGFeK/TIzM2sKGutSxruB\nvwF7SaqQdFot1R8D5gJzgJuB/wKIiKXA5cDL6XFZKgM4E/hDWuYt4PGG2A8zM7PmoKwxNhIRw+qY\n3y1nOoCzaqh3K3BrgfIpwD6bFqWZmZmBR0g0MzOzapwcmJmZWR4nB2ZmZpbHyYGZmZnlcXJgZmZm\neZwcmJmZWR4nB2ZmZpbHyYGZmZnlcXJgZmZmeZwcmJmZWR4nB2ZmZpbHyYGZmZnlcXJgZmZmeZwc\nmJmZWR4nB2ZmZpbHyYGZmZnlcXJgZmZmeZwcmJmZWZ5GSQ4k3SppkaQZOWVXS3pD0t8lPSipTc68\niyTNkfSmpIE55UemsjmSLswp303SS5JmS7pH0laNsV9mZmZNUWO1HNwOHFmtbAKwT0TsC/wfcBGA\npJ7AicDeaZnrJbWU1BK4DjgK6AkMS3UBrgRGRUQPYBlwWsPujpmZWdPVKMlBRDwLLK1W9lRErEkv\nXwS6pOnBwNiI+Dgi5gFzgAPTY05EzI2IT4CxwGBJAvoD96flxwDHNegOmZmZNWGbS5+D4cDjaboz\nsCBnXkUqq6n888DynESjsrwgSSMkTZE0ZfHixfUUvpmZWdNR8uRA0k+BNcBdlUUFqsVGlBcUETdF\nRHlElLdv335DwzUzM2vyykq5cUknA8cCAyKi8gu9Atglp1oX4N00Xah8CdBGUllqPcitb2ZmZhuo\nZC0Hko4EfgwMiohVObPGAydK2lrSbkAPYDLwMtAjXZmwFVmnxfEpqfgLcEJa/mRgXGPth5mZWVNT\nVHIgaZikL6bpvSQ9K+lpSf9W5PJ3A38D9pJUIek0YDSwAzBB0jRJ/wMQETOBe4HXgSeAsyJibWoV\nOBt4EpgF3JvqQpZk/FDSHLI+CLcUtfdmZma2nmJPK/wCODhN/4bsl/y/gOvJrhSoVUQMK1Bc4xd4\nRFwBXFGg/DHgsQLlc8muZjAzM7NNVGxy0D4i3pe0DfAVsib8T8nO95uZmVkTUmxysFhSd6AX8HJE\nfCypFYWvFDAzM7MtWLHJweXAVGAtMDSVDQBea4igzMzMrHSKSg4i4nZJ96bpyisLXiK7YsDMzMya\nkA25lHFb4JuSLkivyyjxOAlmZmZW/4q9lPGrwJvAScDFqbgHcEMDxWVmZmYlUmzLwTXA0Ig4kmyo\nY8hOK/jyQTMzsyam2OSgW0RMTNOVwxx/gk8rmJmZNTnFJgevSxpYrexwYHo9x2NmZmYlVuwv//OA\nRyQ9Cmwr6Ubg68DgBovMzMzMSqKoloOIeBHYF5gJ3ArMAw6MiJcbMDYzMzMrgaJaDiRtDSyOiKty\nyj4naeuI+LjBojMzM7NGV2yfgwnAAdXKDiC7Q6KZmZk1IcUmB73ILl3MNRnoXb/hmJmZWakVmxys\nAHauVrYz8GH9hmNmZmalVmxy8ADwv5L2kdRKUi/gDuDehgvNzMzMSqHY5OCnwCyyUwkrgRfJhlP+\nSQPFZWZmZiVS7F0ZVwNnSTobaAcsiYioYzEzMzPbAhV9V0ZJrYEvkXVO/Jqk/pL6F7nsrZIWSZqR\nU9ZW0gRJs9PzTqlckq6VNEfS3yXtn7PMyan+bEkn55QfIGl6WuZaSSp2v8zMzCxfsXdlPAV4F3gY\nuCXn8Ycit3M7cGS1sguBiRHRA5iYXgMcRXbHxx7ACNKdHyW1BS4BDiK74dMllQlFqjMiZ7nq2zIz\nM7MiFdtycAVwQkTsHBG75Tx2L2bhiHgWWFqteDAwJk2PAY7LKb8jMi8CbSR1BAYCEyJiaUQsIxt7\n4cg0b8eI+Fs61XFHzrrMzMxsAxWbHJQBT9XztneOiIUA6blDKu8MLMipV5HKaiuvKFBekKQRkqZI\nmrJ48eJN3gkzM7Omptjk4ErgvyUV3UdhExTqLxAbUV5QRNwUEeURUd6+ffuNDNHMzKzpKvbLfiTw\n38BKSW/nPjZh2++nUwKk50WpvALYJadeF7L+DrWVdylQbmZmZhuh2Fs2f6sBtj0eOBn4dXoel1N+\ntqSxZJ0PV0TEQklPAr/M6YR4BHBRRCyVtFJSX7Ihnr8D/L4B4jUzM2sWih3n4JlN2Yiku4F+QDtJ\nFWRXHfwauFfSacDbwJBU/THgaGAOsAo4NcWwVNLlQOVtoi+LiMpOjmeSXRGxLfB4epiZmdlG2JBb\nNv8MGAZ8PiJaSzoC2DMiRte1fEQMq2HWgAJ1AzirhvXcCtxaoHwKsE9dcZiZmVndiu1zMIrsy/ck\nPuvsN5PsF7uZmZk1IcX2OfgG0D0iPpS0DiAi3pFU4yWDZmZmtmUqtuXgE6olEpLaAx/Ue0RmZmZW\nUsUmB/cBYyTtBlWXHo4GxjZUYGZmZlYaxSYHPwHmA9OBNsBssrEEft4wYZmZmVmp1NnnII2K+BXg\nxxFxbjqd4Fs2m5mZNVF1thxExDpgXER8nF4vdmJgVlqjRo1i7733Zp999mHYsGGsXr26at73v/99\ntt9++/WWuf/++5HElClTAPjkk0849dRT6dWrF71792bSpEmNFb6ZbeaKPa3wbBqB0MxK7J133uHa\na69lypQpzJgxg7Vr1zJ2bNb9Z8qUKSxfvny9ZVauXMm1117LQQcdVFV28803AzB9+nQmTJjAeeed\nx7p16xpnJ8xss1ZscvAP4HFJt0u6XNJllY+GDM7MCluzZg0fffQRa9asYdWqVXTq1Im1a9fyox/9\niKuuumq9+hdffDEXXHAB22yzTVXZ66+/zoAB2ThkHTp0oE2bNlWtCmbWvBWbHGwLPEQ2AFIXshsg\n7UL+DY/MrBF07tyZ888/n65du9KxY0dat27NEUccwejRoxk0aBAdO3bMq//qq6+yYMECjj322Lzy\n3r17M27cONasWcO8efOYOnUqCxYswMys2A6JfwSer+x3YGals2zZMsaNG8e8efNo06YNQ4YM4Y47\n7uC+++5br9/AunXrGDlyJLfffvt66xk+fDizZs2ivLycXXfdlYMPPpiysmLHRTOzpqzOI0FErJM0\nLiJ2aIyAzKx2f/7zn9ltt91o3749AMcffzyXXHIJH330Ed27dwdg1apVdO/enalTpzJjxgz69esH\nwHvvvcegQYMYP3485eXljBo1qmq9Bx98MD169Gj0/TGzzY87JJptYbp27cqLL77IqlWriAgmTpzI\nD3/4Q9577z3mz5/P/PnzadWqFXPmzKF169YsWbKkqrxv375VicGqVav48MMPAZgwYQJlZWX07Nmz\nxHtnZpuDYtsQKzskjgMW8NnNl4iInzVEYGZW2EEHHcQJJ5zA/vvvT1lZGfvttx8jRozY4PUsWrSI\ngQMH0qJFCzp
37swf//jHBojWzLZEKmbIAkm31TQvIk6t14gaUXl5edTUO/tbv3u0kaMprTvPOabU\nIZhZibz55psMHTq06vXcuXO57LLL6NevH9/73vdYvXo1ZWVlXH/99Rx44IFcffXV3HXXXUB25cys\nWbNYvHgxixcvLriec889t9H3yQqTNDUiyuus15zHM3Jy8BknBw1v+D3DSx1Co7l16K2lDsE20tq1\na+ncuTMvvfQSZ5xxBiNHjuSoo47iscce46qrrlqv0+vDDz/MqFGjePrpp2tcz6677tqIe2C1KTY5\nKOq0gqTda5oXEXM3JDAzM9t8TZw4kT322INdd90VSfzzn/8EYMWKFXTq1Gm9+nfffTfDhg2rdT22\n5Sm2z8Ecsn4GyimrbHJoWa8RmZlZyYwdO7bqy/6aa65h4MCBnH/++axbt44XXnghr+6qVat44okn\nGD16dK3rsS1PUVcrRESLiGiZnlsAnYCbgG9vagCSRkqaKWmGpLslbSNpN0kvSZot6R5JW6W6W6fX\nc9L8bjnruSiVvylp4KbGZWbW3HzyySeMHz+eIUOGAHDDDTcwatQoFixYwKhRozjttNPy6j/88MMc\ncsghtG3bttb12Jan2EsZ80TEe8C5wK82ZeOSOgM/AMojYh+yVogTgSuBURHRA1gGVP5HngYsi4ju\nwKhUD0k903J7A0cC10tyi4aZ2QZ4/PHH2X///dl5550BGDNmDMcffzwAQ4YMYfLkyXn1a2odqL4e\n2/JsVHKQ7AW0qocYyoBtJZWl9S0E+gP3p/ljgOPS9OD0mjR/gCSl8rER8XFEzCM7DXJgPcRmZtZs\nVO8/0KlTJ5555hkAnn766bxBslasWMEzzzzD4MGD61yPbXmK7ZD4HDljG5B9ie8NbNKNlyLiHUm/\nAd4GPgKeAqYCyyNiTapWAXRO053JxlkgItZIWgF8PpW/mLPq3GXMzKwOq1atYsKECdx4441VZTff\nfDPnnHMOa9asYZtttuGmm26qmvfggw9yxBFHsN1229W5HtvyFNsh8Q/VXn8IvBYRszdl45J2IvvV\nvxuwHLgPOKpA1crERDXMq6m80DZHACMgG2nOzMygVatWfPDBB3llX/nKV5g6dWrB+qeccgqnnHJK\nUeuxLU9RyUFEjKm71kY5HJgXEYsBJP0JOBhoI6kstR50Ad5N9SvI7gZZkU5DtAaW5pRXyl0mT0Tc\nRNaZkvLy8uY7yIOZbfHuGfVsqUNoNENHHlbqEJqVovocSPqTpEOrlR0q6f6alinS20BfSa1S34EB\nwOvAX4ATUp2TgXFpenx6TZr/dGSjOI0HTkxXM+wG9ADye86YmZlZUYo9rfBVoPo1KX8DHtqUjUfE\nSynBeAVYA7xK9qv+UWCspF+kslvSIrcAf5Q0h6zF4MS0npmS7iVLLNYAZ0XE2k2JzczMrLkqNjlY\nDWwH/DOnbHvg000NICIuAS6pVjyXAlcbRMRq1k9SKuddAVyxqfGYmZk1d8VeyvgkcKOkHQHS82jg\niYYKzMzMzEqj2OTgPGBHYKmkRWRN+q3JBkIyMzOzJqTYqxWWAcdI+gLZVQEL0iiJZmZm1sQUOwjS\nEcD8iPg/4L1UthfQNSImNGB8ZmZm1siKPa1wHbCyWtnKVG5mZmZNSLHJQYeIWFitbCHwhXqOx8zM\nzEqs2ORgrqT+1cr6AfPqNxwzMzMrtWLHObgU+JOkW4C3gD2AU9PDzMzMmpCiWg4iYhxwBNlASMek\n54Gp3MzMzJqQYlsOiIjJ+H4FZmZmTV6dLQeSukm6XdI7kj5Oz2Mk7d4YAZqZmVnjqjU5kPRFspsi\ndQB+CgxKz+2BKWm+mZmZNSF1nVb4NXBdRFxcrfz2dMfEq4CvN0hkZmZmVhJ1JQeHASfXMO+3+FJG\nMzOzJqeuPgctqfm2zJ+m+WZmZtaE1JUcvEzNYxmcAkyp12jMzMys5Oo6rXAx8GS6ydL9ZEMmdwSG\nkJ1uGNiw4ZmZmVljq7XlICJeIBv8qDcwEXgjPfcGjkzzzczMrAmpc5yDiPhbRBwG7ADsAuwYEYdG\nxPP1EYCkNpLul/SGpFmSviypraQJkman551SXUm6VtIcSX+XtH/Oek5O9WdLqqkTpZmZmdWh2Bsv\nEREfRcQ7EbGqnmP4HfBERPwbWYvELOBCYGJE9CBrqbgw1T0K6JEeI4AbACS1BS4BDgIOBC6pTCjM\nzMxswxSdHDQESTuSXS55C0BEfBIRy4HBwJhUbQxwXJoeDNwRmReBNpI6kvV9mBARSyNiGTABOLIR\nd8XMzKzJKGlyAOwOLAZuk/SqpD9I2g7YOSIWAqTnDql+Z2BBzvIVqaymcjMzM9tANSYHkq7Ome7f\nQNsvA/YHboiI/YAP+ewUQsGwCpRFLeXrr0AaIWmKpCmLFy/e0HjNzMyavNpaDkbkTD/UQNuvACoi\n4qX0+n6yZOH9dLqA9Lwop/4uOct3Ad6tpXw9EXFTRJRHRHn79u3rbUfMzMyaitrGOXhN0v3A68DW\nki4rVCkifraxG4+I9yQtkLRXRLwJDEjbe51sHIVfp+dxaZHxwNmSxpJ1PlwREQslPQn8MqcT4hHA\nRRsbl5mZWXNWW3JwAlnrwa5kzfa7FKhTsOl+A30fuEvSVsBcshEZWwD3SjoNeJts0CWAx4CjgTnA\nqlSXiFgq6XKyER0BLouIpfUQm5mZWbNTY3IQEYuAXwBIKouImoZR3iQRMQ0oLzBrQIG6AZxVw3pu\nBW6t3+jMzMyan7qGTwYgIk5NTfZfJ7sK4B3gEf86NzMza3qKupRR0peBt4DvAfsC3wXmpHIzMzNr\nQopqOQCuAf4rIsZWFkgaClwLfKkhAjMzM7PSKHYQpD2Be6uV3Q90r99wzMzMrNSKTQ5mAydWKxtC\ndqrBzMzMmpBiTyucCzwi6QfAP4BuZDc/OraB4jIzM7MSKfZqhRck7QEcA3QCHgYe89UKZmZmTU+x\nLQekux3e2YCxmJmZ2Wag1HdlNDMzs82MkwMzMzPL4+TAzMzM8hSdHEjatSEDMTMzs83DhrQcvAqQ\nLmc0MzOzJqrWqxUkTQWmkiUGLVPxpWTDJpuZmVkTVFfLwQnAU8CuQCtJrwBbS/qapNYNHp2ZmZk1\nurqSgxYRcX9EXAisBAYDAr4PTJM0u6EDNDMzs8ZV1yBI/yupK/A6sA2wE7A6Io4HkNS2geMzMzOz\nRlZrchARB0kqA3oBfwVGAztIugF4JT08hLKZmVkTUufVChGxJiJeBT6JiMOAD4FJZDdeurI+gpDU\nUtKrkh5Jr3eT9JKk2ZLukbRVKt86vZ6T5nfLWcdFqfxNSQPrIy4zM7PmaEMuZRyZniMi7omICyLi\n8HqK4xxgVs7rK4FREdEDWAaclspPA5ZFRHdgVKqHpJ5kt5TeGzgSuF5SS8zMzGyDFZ0cRMTtaXL3\n+gxAUheyuz3+Ib0W0B+4P1UZAxyXpgen16T5A1L9wcDYiPg4IuYBc4AD
6zNOMzOz5mKDh09Od2es\nT9cAFwDr0uvPA8sjYk16XQF0TtOdgQUpjjXAilS/qrzAMmZmZrYBSnpvBUnHAosiYmpucYGqUce8\n2papvs0RkqZImrJ48eINitfMzKw5KPWNlw4BBkmaD4wlO51wDdAmXSUB0AV4N01XALsApPmtya6W\nqCovsEyeiLgpIsojorx9+/b1uzdmZmZNQEmTg4i4KCK6REQ3sg6FT0fEScBfyEZnBDgZGJemx6fX\npPlPR0Sk8hPT1Qy7kV1JMbmRdsPMzKxJqWsQpFL5MTBW0i/I7utwSyq/BfijpDlkLQYnAkTETEn3\nkg3WtAY4KyLWNn7YZmZmW77NJjmIiElk4ycQEXMpcLVBRKwGhtSw/BXAFQ0XoZmZWfNQ6j4HZmZm\ntplxcmBmZmZ5nByYmZlZHicHZmZmlsfJgZmZmeVxcmBmZmZ5nByYmZlZHicHZmZmlsfJgZmZmeVx\ncmBmZmZ5nByYmZlZHicHZmZmlsfJgZmZmeVxcmBmZmZ5nByYmZlZHicHZmZmlsfJgZmZmeVxcmBm\nZmZ5nBzYZmP48OF06NCBffbZp6rs0ksvpXPnzvTp04c+ffrw2GOPAfDJJ59w6qmn0qtXL3r37s2k\nSZOqlunXrx977bVX1TKLFi1q7F0xM9uilTQ5kLSLpL9ImiVppqRzUnlbSRMkzU7PO6VySbpW0hxJ\nf5e0f866Tk71Z0s6uVT7ZBsVbACAAAATWklEQVTvlFNO4YknnlivfOTIkUybNo1p06Zx9NFHA3Dz\nzTcDMH36dCZMmMB5553HunXrqpa56667qpbp0KFD4+yAmVkTUeqWgzXAeRHxRaAvcJaknsCFwMSI\n6AFMTK8BjgJ6pMcI4AbIkgngEuAg4EDgksqEwrYchx12GG3bti2q7uuvv86AAQMA6NChA23atGHK\nlCkNGZ6ZWbNR0uQgIhZGxCtpeiUwC+gMDAbGpGpjgOPS9GDgjsi8CLSR1BEYCEyIiKURsQyYABzZ\niLtiDWj06NHsu+++DB8+nGXLlgHQu3dvxo0bx5o1a5g3bx5Tp05lwYIFVcuceuqp9OnTh8svv5yI\nKFXoZtaEFDr1Wek3v/kNkliyZAkAEcEPfvADunfvzr777ssrr7xSVbdly5ZVpz0HDRrUaPFviFK3\nHFSR1A3YD3gJ2DkiFkKWQACV7cKdgQU5i1WksprKC21nhKQpkqYsXry4PnfBGsCZZ57JW2+9xbRp\n0+jYsSPnnXcekH1Iu3TpQnl5Oeeeey4HH3wwZWVlQHZKYfr06Tz33HM899xz/PGPfyzlLphZE1HT\nqc8FCxYwYcIEunbtWlX2+OOPM3v2bGbPns1NN93EmWeeWTVv2223rTrtOX78+EaJfUNtFsmBpO2B\nB4BzI+KftVUtUBa1lK9fGHFTRJRHRHn79u03PFhrVDvvvDMtW7akRYsWnHHGGUyePBmAsrIyRo0a\nxbRp0xg3bhzLly+nR48eAHTunOWFO+ywA//5n/9ZtYyZ2aao6dTnyJEjueqqq5A++yoaN24c3/nO\nd5BE3759Wb58OQsXLmzMcDdJyZMDSZ8jSwzuiog/peL30+kC0nNld/MKYJecxbsA79ZSblu43A/T\ngw8+WNWct2rVKj788EMAJkyYQFlZGT179mTNmjVVzXqffvopjzzySMEmQDOz+jB+/Hg6d+5M7969\n88rfeecddtnls6+lLl268M477wCwevVqysvL6du3Lw899FCjxlusslJuXFmadQswKyL+X86s8cDJ\nwK/T87ic8rMljSXrfLgiIhZKehL4ZU4nxCOAixpjH6z+DBs2jEmTJrFkyRK6dOnCz3/+cyZNmsS0\nadOQRLdu3bjxxhsBWLRoEQMHDqRFixZ07ty56tTBxx9/zMCBA/n0009Zu3Ythx9+OGeccUYpd8vM\nmqhVq1ZxxRVX8NRTT603r1Bfp8qWhbfffptOnToxd+5c+vfvT69evdhjjz0aPN4NUdLkADgE+DYw\nXdK0VPYTsqTgXkmnAW8DQ9K8x4CjgTnAKuBUgIhYKuly4OVU77KIWNo4u2D15e67716v7LTTTitY\nt1u3brz55pvrlW+33XZMnTq13mMzM6vurbfeYt68eVWtBhUVFey///5MnjyZLl265HWSrqiooFOn\nTgBVz7vvvjv9+vXj1VdfdXKQKyL+SuH+AgADCtQP4Kwa1nUrcGv9RWfFeu/moaUOodF84Yx7Sh2C\nmW0mevXqlTfIWrdu3ZgyZQrt2rVj0KBBjB49mhNPPJGXXnqJ1q1b07FjR5YtW0arVq3YeuutWbJk\nCc8//zwXXHBBCfeisFK3HJiZmW0RCp36rKl18+ijj+axxx6je/futGrVittuuw2AWbNm8d3vfpcW\nLVqwbt06LrzwQnr27NmYu1EUJwdmZmZFKHTqM9f8+fOrpiVx3XXXrVfn4IMPZvr06fUdWr1zcmBm\nZk3aLRdvfs32Dem0y6/a5HWU/FJGMzMz27w4OTAzM7M8Tg7MrNlYu3Yt++23H8ceeywAhx56aNUY\n9506deK447LbuKxYsYKvf/3r9O7dm7333ruqM5lZc+E+B2bWbPzud7/ji1/8Iv/8ZzZK+3PPPVc1\n75vf/CaDBw8G4LrrrqNnz548/PDDLF68mL322ouTTjqJrbbaqiRxmzU2txyYWbNQUVHBo48+yumn\nn77evJUrV/L0009XtRxIYuXKlUQE//rXv2jbtm3Vjb3MmgP/t5tZs3Duuedy1VVXsXLlyvXmPfjg\ngwwYMIAdd9wRgLPPPptBgwbRqVMnVq5cyT333EOLFv4tZc2H/9vNrMl75JFH6NChAwcccEDB+Xff\nfTfDhg2rev3kk0/Sp08f3n33XaZNm8bZZ59ddSrCrDlwcmBmTd7zzz/P+PHj6datGyeeeCJPP/00\n3/rWtwD44IMPmDx5Msccc0xV/dtuu43jjz8eSXTv3p3ddtuNN954o1ThmzU6Jwdm1uT96le/oqKi\ngvnz5zN27Fj69+/PnXfeCcB9993HscceyzbbbFNVv2vXrkycOBGA999/nzfffJPdd9+9JLGblYKT\nAzNr1saOHZt3SgHg4osv5oUXXqBXr14MGDCAK6+8knbt2pUoQrPG5w6JZtas9OvXj379+lW9njRp\n0np1OnXqxFNPPdV4QZltZpwcmNlmZeoZI0odQqM64OabSh2C2Xp8WsHMzMzyODkwMzOzPE4OzMzM\nLE+TSg4kHSnpTUlzJF1Y6njMzMy2RE0mOZDUErgOOAroCQyT1LO0UZmZmW15mkxyABwIzImIuRHx\nCTAWGFzimMzMzLY4iohSx1AvJJ0AHBkRp6fX3wYOioizq9UbAVReK7UX8GajBlq3dsCSUgexBfD7\nVDy/V8Xx+1Q8v1fF2Rzfp10jon1dlZrSOAcqULZe5hMRNwGb7YXFkqZERHmp49jc+X0qnt+r4vh9\nKp7fq+Jsye9TUzqtUAHskvO
6C/BuiWIxMzPbYjWl5OBloIek3SRtBZwIjC9xTGZmZlucJnNaISLW\nSDobeBJoCdwaETNLHNbG2GxPeWxm/D4Vz+9Vcfw+Fc/vVXG22PepyXRINDMzs/rRlE4rmJmZWT1w\ncmBmZmZ5nBw0MEltJP3XRi5bLuna+o7JtiySukmaUeo4tgS5nzdJ/SQ90kDb6Sfp4IZYd2OT9EI9\nr6/q/1VSH0lH1+f6rXE4OWh4bYCNSg4iYkpE/KCe42kWNvXgLekySYfXZ0zWKDb485aGXt9Q/YAm\nkRxEREPuRx+gZMlBTYmPpNvTwHkbs868hEfSoMp7+Ug6bmOH7Zc0X1K7jY2jvjk5aHi/BvaQNE3S\n1ekxQ9J0SUMBJH1D0p+V6Sjp/yR9IfeXj6TtJd2Wlvu7pG+WdK8amaQNvbKmH5tw8I6In0XEnzd2\n+U0h6Yfpf2SGpHNTcZmkMelvf7+kVqnuryW9nsp/k8p2lvSgpNfS4+BU/i1Jk9P/4o2VX4qS/iXp\nilT3RUk7p/L2kh6Q9HJ6HFKCt2NDVX3egKuB7dP79YakuyQJqg7EP5P0V2CIpD0kPSFpqqTnJP1b\nqvd1SS9JejV9RneW1A34HjAyvZeHlmZX64ekf6XnfpIm1fB+Ffo/y/uCrVxPzuutgMuAoel9Gtp4\ne5VpoMQnL+GJiPER8ev08jiye/s0hoZNvCLCjwZ8AN2AGWn6m8AEskstdwbeBjqmeXcCZwOPAMNS\nWT/gkTR9JXBNznp3KvW+pTi2Ax4FXgNmAEOBA4BngKlkl5Z2BL4ITK72vvw9Ta9XP5VPAn6Z5p0H\ntAceIBvT4mXgkFre8/eAd4BpwKHArsBE4O/puWuqOw74Tpr+LnBXmr4dOCFNfwl4Ie3jZGCHBnw/\nDwCmp/d1e2AmsB/ZaJ+HpDq3AucDbcmG/6686qhNer4HODdNtwRap/f/YeBzqfz6nP0O4Otp+irg\nv9P0/wJfSdNdgVml/n/bwM9bP2AF2YBoLYC/5ezPfOCCnOUmAj3S9EHA05Wfs5z393Tgt2n6UuD8\nUu9vPb1n/6rt/arl/6zqM1JtPbl/g1OA0ZvBvgkYDbxOdrx6LOfzXdvx58r0mf8/suPIVmTH7cVk\nx5ahlftI9mNkKTAvzdsDeCUnlh7A1FpinQ/8HHiF7Bjwb6n8QLLjz6vpea8a4tiO7Njwcqo7eFPe\nuyYzzsEW4ivA3RGxFnhf0jNkXzzjge+Tfbm+GBF3F1j2cLKBnQCIiGWNEG8xjgTejYhjACS1Bh4n\n+8dcnH4tXBERwyVtJWn3iJhL9s98r6TPAb+vXh8YntbfJiK+mtb9v8CoiPirpK5kH+QvVg8oIuZL\n+h+yA0Plr5yHgTsiYoyk4cC1ZFn+COB5SfPIEpC+uetKv37uAYZGxMuSdgQ+qqf3rpCvAA9GxIdp\n+38iOygtiIjnU507gR8A1wCrgT9IepQssQToD3wHIP2vrVB2r5EDgJfTj8FtgUWp/ic5y04F/j1N\nHw70TPUBdpS0Q0SsrNc9bliTI6ICILUmdAP+mubdk8q3Jzuw35ezr1un5y7APZI6kh2Q5zVO2CVT\n6P16kcL/Z1uSb5B9qfYi+2H2OnBrEcefsog4MDXfXxIRh0v6GVAe6b49kk4BiIgXJI0n+0F3f5q3\nQlKfiJgGnEqWUNVmSUTsr6zfzPlkCekbwGGRjeVzOPDLiPhmgTh+SZbUDpfUBpgs6c+Vx5IN5eSg\ncRW6/0OlzsA6YGdJLSJiXYFlN8dBKaYDv5F0JdlBYxmwDzAhHWhbAgtT3XuB/yBr+h2aHnvVUh/S\nATzZlC+rLwPHp+k/kv1CJiLeTx+yvwDfiIil1ZbbC1gYES+n+v8sYluboqb/kep/+0gHiwOBAWSJ\n49lkiUFN6x0TERcVmPdppJ8owFo+Oy60AL4cEQ2ZDDW0j3Omc/cNoPKg2QJYHhF9Ciz/e+D/RcR4\nSf3IWgyasvXer1r+z9aQTk2n0w9bNXKsG+IwPvth9q6kp1N5XcefP6XnqWSJ0ob6A3CqpB+SHe8O\nrKN+7vYqj1etgTGSepAdBz5Xw7JHAIMknZ9eb0Nq8duIuN3noBGsBHZI08+SnX9rKak92T/sZGXn\n028D/pPsD/nDAut5iuxDCYCknRo06iJFxP/xWVP4r8hOncyMiD7p0SsijkjV7wH+Q9Ke2aIxm+xL\nq6b68NkBHD77sqqs23kTfsXmftn2Aj4AOhWo19hJ2bPAcZJaSdqO7BfPc0BXSV9OdYYBf02/eFtH\nxGPAuWTnICFrIj8Tss52qbVjInCCpA6pvK2kXeuIpfr/XKEvz81N7uetKCnhmydpCGRfdJJ6p9mt\nyU5PAZy8KdvZUtXyfzaf7LMPMJjCX1qb0/tU6HNc1/GnMlmqnlgW6wHgKOBYslMKH9RRv9D2Lgf+\nEhH7AF8n+9IvRMA3c/ala0RsVGIATg4aXPpneF7ZpT1fJjvn/RrwNNk5z/eAnwDPRcRzZInB6ZKq\nN5f/AthJWSe114CvNdpO1EJSJ2BVRNwJ/IbsfG37yi8ySZ+TtDdARLxF9k9/MZ+1CLxZU/0CNuTL\nqvpB6QU+Oy1zEqlpOf0iOorsvP75knartp43gE6SvpTq76AN7xxZtIh4hazpcTLwEtkvj2VkSePJ\nkv5Odg74BrL9eySVPQOMTKs5B/iapOlkv0D2jojXgf8Gnkr1J5D1BanND4Dy1AntdbJOeJu1ap+3\nqzdg0ZOA09JnaybZlx1kLQX3SXqO/FvvPgx8Q02gQ2IRavo/uxn4qqTJZJ/7Qs3XfyFr7StJh8Qc\nzwInpmS5I58dPzfk+FOptoQnb15ErCY7/XkD2Q/AjZGboJ5SSxxPAt9PrThI2m8jt5fZlA4LfvgB\nDCRLeKaRdYQpJ/tl8SxZEjQTOCOn/vlkGXy3nLKC9ck6BJXn1GtHllT8neyc4f/UEteeOXEdStYk\n+DQ5HRLJziu/BuyflhlEdjAT63dIfDHVfRHYvtTvux9++FH3g8IdEh9Kj8rPd53Hn3TsmZ+m26Zj\nXV6HxDTvkLSNV4E9Ullfsi/3lnXEOh9ol6bLgUlp+stkHSKfJ2tFqCmObYEbyVpxZ5A6s2/sw/dW\nMDMzayCpD0DriLi41LFsCHdINDMzawCSHiS7pLGmjsKbLbcc2BZN0qlk59hzPR8RZ5UiHjOz2qSE\noXrfph9HxJOliKcmTg7MzMwsj69WMDMzszxODszMzCyPkwMzMzPL4+TAzDaIsrs4Vj7WSfoo5/VJ\npY7PzDadOySa2UaTNB84PUp0e2szaxhuOTCzeiOps6RV6a5wlWUHSXpPUpmk0yU9K+n6dMe6WZK+\nllO3jaTbJC2UVCHpMkk+Tpk1Mn/ozKzeRMQ7ZPetGJJT/C2yO+KtSa8PJrtnRTuy4WAfzEkm7iS7\nJfYeZEPIHkN2q1sza0RODsysvo0hSwhIN6kaSnab7EoLgd9HxKcR8b/AXOAoSZ3Jbgs8
MiJWRXZT\nsmv47IZZZtZIPHyymdW3B4HrJHUF9gUWR3a3yUoVkd/Z6R9kt8velexmWO+nG8tB9gNmfoNHbGZ5\nnByYWb2KiFWSHiC7DXIf8lsNALpUe90VeBdYAKwC2kbEugYP1Mxq5NMKZtYQ7gCGk/UZuLPavI6S\nzk4dFE8k61/wREQsAJ4BfiNpR0ktJHWXdFjjhm5mTg7MrCE8C7QEXoqIimrzXgD2BpYClwLfjIhl\nad63gO2A14FlwH3AFxojYDP7jE8rmNlGi4huNZSHpArWP6UAsC4izgTOLLDcMuC79RqkmW0wtxyY\nWb2T1BfYh+yXv5ltYZwcmFm9knQX8ARwTkR8WOp4zGzDefhkMzMzy+OWAzMzM8vj5MDMzMzyODkw\nMzOzPE4OzMzMLI+TAzMzM8vz/wHhvYac232hDQAAAABJRU5ErkJggg==\n", 383 | "text/plain": [ 384 | "" 385 | ] 386 | }, 387 | "metadata": {}, 388 | "output_type": "display_data" 389 | } 390 | ], 391 | "source": [ 392 | "x = train_df.iloc[:,2:].sum()\n", 393 | "\n", 394 | "plt.figure(figsize=(8,4))\n", 395 | "ax= sns.barplot(x.index, x.values, alpha=0.8)\n", 396 | "plt.title(\"# per class\")\n", 397 | "plt.ylabel('# of Occurrences', fontsize=12)\n", 398 | "plt.xlabel('Type ', fontsize=12)\n", 399 | "\n", 400 | "rects = ax.patches\n", 401 | "labels = x.values\n", 402 | "for rect, label in zip(rects, labels):\n", 403 | " height = rect.get_height()\n", 404 | " ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')\n", 405 | "\n", 406 | "plt.show()" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "### Correlations of labels\n", 414 | "\n", 415 | "Because this is a mulit-label classification, we will want to know the relation betweens labels, which helps for feature engineering and model design. For example, if we know that a toxic comment is always a insult comment, when we have a high confident toxic comment, we can also consider it as insult comment." 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 9, 421 | "metadata": { 422 | "collapsed": false 423 | }, 424 | "outputs": [ 425 | { 426 | "data": { 427 | "text/plain": [ 428 | "" 429 | ] 430 | }, 431 | "execution_count": 9, 432 | "metadata": {}, 433 | "output_type": "execute_result" 434 | }, 435 | { 436 | "data": { 437 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAi0AAAHeCAYAAABXBztYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJzs3Xd4VNXWx/HvmklCkd4TgmBBrheF\nAAEVsCFFQbBhB1FB9NpA7Ep7wYKIvWO59oaFIlgoCgJKb4IgRTCVKiAIIcns94+EkJAAkyvTkt/H\nZx7mnLPPnDXjyWRl7b3PMeccIiIiIuHOE+oARERERPyhpEVEREQigpIWERERiQhKWkRERCQiKGkR\nERGRiKCkRURERCKCkhYRERGJCEpaREREJCIoaREREZGIEBWMg2RuWafL7vrh6hb9Qx1CRBjdfEeo\nQ4gYJ01NC3UIEWF5i9hQhxAxrlodE+oQIsb3yZMtWMcK1O/Z6BrHB+09+EOVFhEREYkISlpEREQk\nIgSle0hEREQCyJcd6giCQkmLiIhIpHO+UEcQFOoeEhERkYigSouIiEik86nSIiIiIhI2VGkRERGJ\ncK6UjGlR0iIiIhLp1D0kIiIiEj5UaREREYl0paR7SJUWERERiQiqtIiIiEQ6XRFXREREIoK6h0RE\nRETChyotIiIikU5TnkVERETChyotIiIiEa60XBFXlRYRERGJCKq0iIiIRLpSMqZFSYuIiEikU/eQ\niIiISPhQpUVERCTSlZIr4qrSIiIiIhFBlRYREZFIV0rGtChpERERiXSlZPaQuodEREQkIqjSIiIi\nEulKSfeQKi0iIiISEVRpERERiXSlZEyLkhYREZEI51zpuE5LqUtaBj72NDNmzaVa1SqMff/VUIcT\nUglnN+eGIX3weL1M/fg7xr7yeYHtHa89n07XdcaX7WPv33t57cGXSF6dRIUqFbnn1fs5oUlDfvhs\nGm8Ofi1E7yB4opu1onzvO8DjIWPKRPZ+8WGhNjGtz6XcVdfjnCN7/Vp2PzMcgHLX3UJ0i9Mxj4fM\nxfP5+83ngx1+0LRrfyaPPfEwHq+X998Zw/PPjC6w/YzWiTw64mH+fUojbrrhLiaM+zZvW934WJ59\n8VHq1o3FOcdV3W8i6Y+UYL+FoIhp2YqKt98BXg97Jk7k748Kn09lzjmXCr2uBxyZa9ey85Gc86nW\nlGlk/b4OAN/GTWwf+FAQIw++luckcvv/3YrX62HiR1/z0UufFNh++U2X0fnqC8jOzmbH1h2MvHsU\nG1M2kdC6KbcN+U9eu2NPqMew2x5l1rezg/0W5CgqdUnLxZ07cM1l3Xho+KhQhxJSHo+HPsNvZti1\ng9mWvpUR459i/pS5JK9Oymvz47jpfPfBNwAktm9Fr4G9ebTXUDIz9vHxqA84tlF96jWqH6q3EDwe\nD+X79uevoXfj27qZSiNfY9/cWfiSNxxoEluXspddy84Hb8Pt3oVVrgJAVKPGRP3rFHbedSMAlR57\nkajGCWQtXxyStxJIHo+HJ54aQveLbiA1JZ3JP3zON5Om8tuqtXltkpPTuP0/D3Dbnb0L7f/yayN5\netQrTP9+NsccUx5fSS13ezxU7Nef7ffeTfbmzVR79TUyZs8ie8OB88lbty7HXHMt2+64DbdrF1al\nSt42ty+DbTf1CUXkQefxeOj3yB3ce839bE7bwqsTX2T2dz+xYfUfeW1WL1/DLZ1vI2NvBt16XsjN\nD9/EsFsfZfHsJdzU6RYAKlapyPsz32b+9AWheiuBp4G4JVNiwqlUrlQx1GGE3IkJDUlfn8ampI1k\nZWYxa8KPtOxwWoE2e3btyXtepnxZwAGQsSeDlfN/ZV/GvmCGHDJRDU/Gl5aCb2MaZGWxb+Y0Ylq1\nLdCmTIeuZHz9JW73LgDcju152ywmBqKiICoavF58O/4MavzB0jyxCb+v28CG9UlkZmby5ecTuaBL\n+wJtkv5IYcXyVYUSkpManYA3Korp3+f8Fbx799/s2bM3aLEHU/S/TiY7NYXstJzzae+0aZRpU/B8\nKndhV/aM/RK3K/d82r69qJcq8f6V0IjU9amk/ZFOVmYW08b9QJuOrQu0WTx7CRl7MwBYsfBXasbW\nLPQ6Z3c5k7nfz8trJ5HLr0qLmV0CTHPO7chdrgKc45wbG8jgJHCq1anOlrQtectb07bQsFmjQu3O\nv64zF/a5iKjoKIZePTCYIYYNq1aD7C2b8pZ9WzcTddLJBdp44+IBqPjYi5jHw55P3iZz0VyyVi0n\nc9kiqrz1BWBkfP1lgQpNSRIbW5vU5PS85dTUdFokNvVr3xNOPI6dO3by9vsvcmz9eGb8MJthQ0aV\nyGqLp0YNfJvynU+bNxN98kHnU3zO+VT1hRfB42H322+zb95cICcJrvbqa7jsbP7+8EMyZs0MXvBB\nViO2BpvSNuctb07fwsnN/nXI9p2vvoA5388ttP7cbucwZvTnRexRgpTAn5Wi+FtpGbI/YQFwzm0H\nhgQmJAkGwwqtc84VWvfNu5O4/aybeX/EO3S/48pghBZ+rPBnxcEfldeLNzaevwb1Y9fTwzjm1nux\n8hXw1KmLN74+2/tczvY+3Yk+tTlR/24SlLCDzYr4nIo6p4oSFeXl9DMSGTLwCTqccxn1G9Tj6msv\nPdohhgc/zifzevHWjefP/v3YMXwYle69FzumAgBbrryCbbfczM5HhlPx9tvxxsUFIejQ8Pd7CqD9\npefRqMlJfPLqmALrq9WqxvH/Oo550+cHJEYJLn+TlqLaHbZKY2Z9zWy+mc1/492Pih+ZBNTW9C3U\niK2Rt1w9tgZ/btx2yPazxv9Iy46nHXJ7Sea2bsZbo1besqd6TXzbthRo49u6mX1zZ0J2Nr5N6WSn\nJuGJiyfm9DPJ+m0F7N0De/ewb+Ecok5qHOy3EBSpqenExdfJW46Lq0N62qbD7FFw32VLV7BhfRLZ\n2dlMmjiFJgkl83Pybd6Mp1a+86lmTbK3FjyfsjdvzqmgZGfjS08nKykpr/ri27o1p01aGvsWLybq\nxIbBCz7INqdtpla+7p6adWqwNX1roXbN2zajxx3X8PANg8ncl1lg27ldz2bmN7PIzirhs2ucLzCP\nMONv0jLfzJ42sxPM7HgzewY47Igm59xo51yicy6xz3VX//NI5ahas2Q1scfFUatebaKio2jT9Uzm\nTZ5ToE2dBrF5z5u3SyR9fWqwwwwLWatX4omNx1OrDkRFEdO2HZnzZhVokzlnJtGnNgPAKlbGE1cP\n38ZUfJs3Et24KXi84PUS3bgp2SW0e2jRgmUcf3wDjq0fT3R0NJdc1oVvJk31e9/KVSpTvXpVAM48\n63RWrVwTyHBDJnPlSrx14/HUyTmfyrZrR8bsgudTxsyZxDTLPZ8qVSYqvh7ZaalYhQoQHZ23PvqU\nU8nasD7YbyFoVi5ZRd3j6lKnXh2ioqNod9E5zJ78U4E2
JzY+gQEj+vPwjYPZvrXw2J92F53L1HHf\nByvk0PFlB+YRZvydPXQHMAj4BDDgO+C2QAUVSPcOGcG8RUvZvn0n513cg1t79+Syrp1CHVbQ+bJ9\nvDH4NQa+OxSP18O0T6eQvDqJKwdcw9qla5g/ZS4X9OpCk7YJZGVmsXvnLl4Y8Gze/i/PfJ1yFcsT\nFR1Fq46nMbznkAIzj0oUXzZ/v/4sFYeMypnyPHUS2UnrKXf1jWStWUnmvNlkLppLdEJLKj//Ds7n\nY887r+D+2sm+n6YTdWpzKj/3X3COzEVzyZxfMqdcZmdn88C9wxjz5Zt4vF4+fO8zVq1cwwMP38ni\nhb/wzdfTaNb8VN754CUqV6lEpwvO5f6H7qTtaV3w+XwMGTiCLya8g5mxZPFy3nv701C/pcDwZfPX\n889SdWTO+bT360lkr1/PMTfcSNaqlWTMns2+eXOJadmS6v/NOZ/+evUV3M6dRDduTMUB9+T8BWwe\ndn/0QYFZRyWNL9vH84NeZOQHj+PxePj6k29Z/9sGbrinF6uW/MbsyT9xy8C+lDumHENfHQTAxpRN\nDLxxMAC142tTM64mS35aGsq3IUeR+dvn/E9kblkX+IOUAFe36B/qECLC6OY7jtxIADhpalqoQ4gI\ny1vEHrmRAHDV6phQhxAxvk+eXMQApsDYO3dMQH7Plm11edDegz+ONC7lWedcfzObQOGhhzjnugUs\nMhEREZF8jtQ99F7uv6X7SmwiIiLhrJRMeT5s0uKc2z/Y9lfnXIFpAGZW+KIeIiIiEnxhONMnEPyd\nPfSjmV2xf8HM7ga+DExIIiIiIoX5O3voHGC0mV0O1AZ+BVoFKigREREphlLSPeRXpcU5lwZ8A5wB\nNADedc7tCmBcIiIiEgHM7HwzW2Vma8zsgSK21zezqWa21Mx+MLP4fNt6mdnq3EevIx3L33sPTQbS\ngFOAeOAtM5vhnLvH/7clIiIiARGiSouZeYGXgA5AMjDPzMY751bkazaKnGLHO2bWDngc6Glm1ci5\nJVAiOTOUF+Tue8i7yvo7puUl59x1zrntzrlfgNaALpYhIiISBpzLDsjDD62ANc65dc65fcDHwEUH\ntfk3sP/y2N/n294JmOyc25abqEwGzj/cwfztHhprZrXN7EIzuxCo5pwb7s++IiIiUmLVBfJfDj05\nd11+S4DLcp9fAlQ0s+p+7luAX0lL7syhucDlwBXAHDPr7s++IiIiEmA+X0Ae+W9+nPvoe9CRi7pi\n7sEXo70HONvMFgFnAylAlp/7FuDv7KGHgZb7r9ViZjWBKcBnfu4vIiIiEcY5NxoYfZgmyUC9fMvx\nQIG76zrnUoFLAcysAnCZc26HmSWTMzs5/74/HC4ef8e0eA66uNzWYuwrIiIigeR8gXkc2TygoZkd\nZ2YxwFXA+PwNzKyGme3PGR4E3sp9/i3Q0cyqmllVoGPuukPyt9LytZl9C3yUu3wlMMnPfUVERKQE\ncs5lmdnt5CQbXuAt59xyMxsGzHfOjSenmvK4mTlgBnBb7r7bzGw4OYkPwDDn3LbDHc/fpMUBrwFt\nyemDGg2cXqx3JiIiIoERwovLOecmcVAhwzk3ON/zzzjEcBLn3FscqLwckb9JSwfn3P3AF/tXmNn/\nAff7eyAREREJkFJy76HDJi1m9h/gVuB4M1uab1NFYFYgAxMRERHJ70iVlg+Br8m5el3+S/P+daR+\nJxEREQmSUnLvocMmLc65HeRc+fbq4IQjIiIiUjR/x7SIiIhIuNKYFhEREYkIpaR7SBeIExERkYig\nSouIiEikU6VFREREJHyo0iIiIhLpNBBXREREIoK6h0RERETChyotIiIika6UdA+p0iIiIiIRQZUW\nERGRSKcxLSIiIiLhQ5UWERGRSFdKxrQEJWm5ukX/YBwm4n204NlQhxARHm8xKNQhRIydGWtDHUJE\nePL3uFCHEDFSMlaHOgQpirqHRERERMKHuodEREQinSotIiIiIuFDlRYREZFI51yoIwgKJS0iIiKR\nTt1DIiIiIuFDlRYREZFIp0qLiIiISPhQpUVERCTS6Yq4IiIiEhHUPSQiIiISPlRpERERiXSl5Dot\nqrSIiIhIRFClRUREJNJpTIuIiIhI+FClRUREJNKVkkqLkhYREZFIV0qu06LuIREREYkIqrSIiIhE\nOOfTlGcRERGRsKFKi4iISKTTQFwRERGJCBqIKyIiIhI+VGkRERGJdBqIKyIiIhI+VGkRERGJdBqI\nKyIiIhGhlCQt6h4SERGRiKBKi4iISKRzGogrIiIiEjZUaREREYl0GtMSuRLObs5z017mhemvcfF/\nLiu0veO15/PUt8/z5KRnGf7ZCOIb1gOgQpWKDP34Ed5b8Qm9h90c7LDDzsDHnuasLldxcY9bQh1K\nWDnh7CbcOu1Jbp/+FG3+0/WQ7U7u3IrBGz4g9tTjghhdaHXseA6/LJvOihUzufee2wptb9v2NOb8\n/DV/717PpZd0yVvftMm/mTF9HIsXTWXB/Mlc3v3Qn2tJ0ejsptw39Ske+OEZzv1Pt0Lbz7i2PXd/\n8wR3TXqc28YMofaJdQHwRHm56qn/cPc3T3DvlFG0u/WiYIceMme2O4NvfvqcyXO/pO+dvQptv+GW\na5k081PG//AR73z+MnHxdUIQpQRSiUtaPB4PfYbfzKO9/o+72t9G225n5SUl+/04bjp3d7qTezv3\nZ9yrX9BrYG8AMjP28fGoD3jv0f+GIvSwc3HnDrz69COhDiOsmMe4YPj1fNhrJC+3v4/G3c6gRsO6\nhdrFHFOWVtd3InnhmuAHGSIej4fnnnuErt160rTpuVx55UWc/K+GBdokJaXQp88APv54bIH1f+/Z\nw429+5PQ7Dwu7NqDUaOGUrlypWCGH1TmMS4ZdgNvXP8ET3a4h2bdWuclJfstHDeLp86/n2c6P8j3\nr31F10E9AWja+TS8MVE8df79PHvhQ5x+zXlUja8RircRVB6PhyEj7uemq+6kc5vLufCSTpxwUsE/\nCFYsW8mlHXrS7Zyr+WbCVO4bcmeIog0BnwvMI8yUuKTlxISGpK9PY1PSRrIys5g14UdadjitQJs9\nu/bkPS9TviyQ8z8mY08GK+f/yr6MfcEMOWwlJpxK5UoVQx1GWKmbcAJ/rt/I9qTN+DKzWT7hZxp1\naFGo3Tl3d2f2q1+RVYrOpZYtE1i7dj2///4HmZmZfPrpOLp27VigzYYNySz75Vd8B5WyV6/+nTVr\nfgcgLW0jmzdvpWbN6kGLPdiOTTiRrRvS2Za0iezMbBZP+InGHRMLtMnI9z0VU75M3kBLB5QpVwaP\n10N02Riy92Wx9689lHRNmjdmw/okkjakkJmZxcSx39H+grMLtJkzawF792QAsHjBL9SOqx2KUEPD\n+QLzCDN+j2kxs2OAPc7lvAsz8wBlnXN/Byq4/0W1OtXZkrYlb3lr2hYaNmtUqN3513Xmwj4XERUd\nxdCrBwYzRIl
gFetUY0fa1rzlnWnbqNvshAJt6jSuT+W46qyetogz+nYOdoghUzculuSktLzllJR0\nWrZqVuzXSUxMICYmmrVr1x/F6MJL5dpV2Z564DzanraV+gknFmrXumcHzurThajoKF69JqfquXTS\nHBp3aMHgua8QUy6GccPfY8+O3UGLPVRqx9YiPWVj3nJ66iaatjjlkO0vv/YiZkydHYzQJIiKU2mZ\nCpTPt1wemHJ0w/nnDCu0zhUxFeybdydx+1k38/6Id+h+x5XBCE1KqvznlxkdB/Xgu0c+CF08IWKF\nf/SK/Nk7nDp1avH2f5+jz013F3vfiFLEh1XU25393mRGnN2fiSM+pP0dlwBwbNMTcNk+hp12K4+d\n2Y+z+3ShWr1agY445IpzfnXrfgGnND2ZN158N8BRhRF1DxVS1jm3a/9C7vPyh2psZn3NbL6ZzV+3\na8M/ibFYtqZvoUbsgf7d6rE1+HPjtkO2nzX+R1p2PO2Q20Xy+yt9G5VjD3RbVIqtxl8bt+ctl6lQ\nllqN6tHr44HcOfNZ4pudyFVv3l0qBuMmp6QRXy82b7lu3Tqkpab7vX/FihUYN/YdhgwZydy5CwMR\nYtjYkb6NKnEHzqMqsdXZuenPQ7ZfPOEnGnfI6T5qdlEbVk5fgi8rm11bd7J+wW/Ua3J8wGMOtfTU\nTdSpe6C7p05cLTalby7UrvVZrfjPXTdyS88BZO7LDGaIEgTFSVp2m1nz/Qtm1gI4ZEeqc260cy7R\nOZd4fIX6/yTGYlmzZDWxx8VRq15toqKjaNP1TOZNnlOgTZ0GB75Ym7dLJH19atDik8iWsmQd1Y6r\nQ5V6NfFEe2nc9XR+m7wgb3vGX3sY1ewWnm/bn+fb9id50Ro+7v0Uact+D2HUwTF//hJOPPE4GjSo\nR3R0NFdccRFffTXZr32jo6MZM+YN3v/gMz7/YmKAIw29pCVrqdGgDtXia+KN9pLQ9QyW5zuPAGo0\nODDz5eR2zdiyPicB3J66hYatGwMQU64M9ZudyKa1Jf87bNmiFTQ4rh7xx8YRHR1Fl4s7MvWbGQXa\nnHxqI4aNeohbeg5g25ZDJ4ElkfP5AvIIN8W5Tkt/YIyZ7f/piAXCrl/Fl+3jjcGvMfDdoXi8HqZ9\nOoXk1UlcOeAa1i5dw/wpc7mgVxeatE0gKzOL3Tt38cKAZ/P2f3nm65SrWJ6o6ChadTyN4T2HkLw6\nKYTvKHTuHTKCeYuWsn37Ts67uAe39u7JZV07hTqskHLZPr4e/DbXvns/5vWw+NPpbF6dwjkDLiN1\n6e/8NqVkVwgOJzs7m/79BzHxqw/weD288/YnrPj1N4YMvocFC5fw1VeTadGiKWM+fYOqVSvTpUsH\nBg8eQEKz87i8e1fObHsa1atV5bqeVwDQp89dLFm6IsTvKjB82T6+HPw2N737IOb1MO/TH9i4OplO\nd3UnadnvrJiygDa9OtKwzalkZ2WxZ8duPr77FQBmvfsdVz55C/d89yRmMG/MdNJW/hHidxR42dnZ\nDHvwSd789AW8Hi+ffTSeNavWcef9N/PL4l+Z9u0M7h9yJ+WPKcfzb44AIDV5I//pOSDEkQdJGHbl\nBIIVp9/YzKKBRoABK51zftXeutfvVjo+zX/oowXPHrmR8HiLQaEOIWI8kj491CFEhDtjzwx1CBFj\n/O7VoQ4hYvy2eX4RI3ECY/ej1wXk9+wxD78btPfgjyNWWsysnXNumpldetCmhmaGc+6LAMUmIiIi\n/gjD6cmB4E/30NnANKCoS1Q6QEmLiIiIBNwRkxbn3JDcf28IfDgiIiJSbKVkTIvfs4fM7D0zq5xv\nub6ZTQ1MWCIiIuI3ny8wjzBTnCnPM4E5ZtbZzG4CJgMaOSoiIiJB4feUZ+fca2a2HPge2AI0c875\nf+UoERERCQx1DxVkZj2Bt4DrgLeBSWbWNEBxiYiIiBRQnIvLXQa0dc5tAj4ysy/JSV6Kf0c0ERER\nOXpCOOXZzM4HngO8wBvOuREHbX8GODd3sTxQyzlXJXdbNrAsd9sfzrluhztWcbqHLj5oea6Z6aY9\nIiIipZSZeYGXgA5AMjDPzMY75/IuZ+2cuytf+zsoWOzY45xL8Pd4xekeijezL81ss5ltNLPPgZJ/\na1EREZFwF7q7PLcC1jjn1jnn9gEfAxcdpv3VwEf/69sszuyh/wLjybnnUF1gQu46ERERCaFA3TDR\nzPqa2fx8j74HHboukP8Gfcm56woxs/rAceRcsHa/srmv+7OZXVzUfvkVZ0xLTedc/iTlbTPrX4z9\nRUREJII450YDow/TpKh7Ex2qRHMV8JlzLjvfumOdc6lmdjwwzcyWOefWHupgxam0bDGzHmbmzX30\nALYWY38REREJhNB1DyUD9fItxwOph2h7FQd1DTnnUnP/XQf8wBEm9xQnabkRuAJIB9KA7oAu7S8i\nIlJ6zSPnBsrHmVkMOYnJ+IMbmVkjoCrwU751Vc2sTO7zGkAbYMXB++ZXnO6hegdPRTKzNsAfxXgN\nEREROdpCdHE551yWmd0OfEvOlOe3nHPLzWwYMN85tz+BuRr42DmXP9CTgdfMzEdOEWVE/llHRSlO\n0vIC0NyPdSIiIhJMIbxOi3NuEjDpoHWDD1oeWsR+s4FTi3OsIyYtZnYG0BqoaWYD8m2qRE5WJSIi\nIhJw/lRaYoAKuW0r5lu/k5xxLSIiIhJKpeTeQ0dMWpxz04HpZva2c27DodqZ2QvOuTuOanQiIiIi\nuYpzGf9DJiy52vzDWEREROR/4FRpERERkYhQSpKW4lynRURERCRkjmalpahL+YqIiEig+UI35TmY\nil1pMbNjDrHpuX8Yi4iIiMgh+Z20mFlrM1sB/Jq73NTMXt6/3Tn39tEPT0RERI4odPceCqriVFqe\nATqRe5NE59wS4KxABCUiIiJysGKNaXHOJZkVGLqSfai2IiIiEiRhWBUJhOIkLUlm1hpwuXdyvJPc\nriIREREJnYL3ISy5itM9dAtwG1AXSAYScpdFREREAs6vSouZeYGezrlrAxyPiIiIFFcp6R7yq9Li\nnMsGLgpwLCIiIiKHVJwxLbPM7EXgE2D3/pXOuYVHPSoRERHxXymptBQnaWmd+++wfOsc0O5IO45u\nvqM4MZVaj7cYFOoQIsKDC4aHOoSIMfLY80IdQkQYevneUIcQMVZ8UivUIUgRdMPEgzjnzg1kICIi\nIiKHU5wr4tY2szfN7Ovc5X+bWe/AhSYiIiJ+0RVxC3kb+BaIy13+Deh/tAMSERERKUpxkpYazrlP\nAR+Acy4LXRFXREQk9HwBeoSZ4gzE3W1m1ckZfIuZnQ5ohK2IiEiIaSBuYXcD44ETzGwWUBPoHpCo\nRERERA5SnNlDC8zsbKARYMAq51xmwCITERER/5SSSktxZg8tAe4D9jrnflHCIiIiIsFUnIG43YAs\n4FMzm2dm95jZsQGKS0RERPxVSgbi+p20OOc2OOdGOudaANcATYDfAxaZ
iIiI+MX5XEAe4aY4A3Ex\nswbAFcCV5Ex3vu/ohyQiIiJSmN9Ji5nNAaKBMcDlzrl1AYtKRERE/BeGXTmBUJxKSy/n3MqARSIi\nIiJyGMVJWv40szeBOOfcBWb2b+AM59ybAYpNRERE/BCO408CQfceEhERkYigew+JiIhEulIy5Vn3\nHhIREYlwLgwTjEAoTtIyAN17SEREREKkOEnLCcAFQD3gMuC0Yu4vIiIigVBKKi3FGdMyyDm3E6gK\ntAdGA68EJCoRERGRgxQnadk/6LYL8KpzbhwQc/RDEhERkeJwvsA8wk1xundSzOw1cqosT5hZGYqX\n9IiIiEgghGGCEQjFSTquIOc6Lec757YD1YB7AxKViIiIyEH8rrQ45/4Gvsi3nAakBSIoERER8V84\nduUEgrp3REREJCJoyrKIiEiEKy2VFiUtIiIiEa60JC3qHhIREZGIoEqLiIhIpHMW6giCQpUWERER\niQiqtIiIiES40jKmpUQmLdHNWlG+9x3g8ZAxZSJ7v/iwUJuY1udS7qrrcc6RvX4tu58ZDkC5624h\nusXpmMdD5uL5/P3m88EOPyROOLsJnYb0xOP1sOjjH5j1yoQi253cuRWXv9KP1y8cSNqy34McZXga\n+NjTzJg1l2pVqzD2/VdDHU5Ite9wFiOfHILX6+Gdtz/h6acKfh5t2rTiiScHccop/+L66+5k7Niv\nC2yvWLECCxZNZsL477h7wJCNXCVkAAAgAElEQVRghh5U3kbNKNOtN3g8ZM6dQub3XxTYHtP1Brwn\nngqARZfBKlRm9+AeBxqUKUf5e18g65c57Bv7ejBDD7oWZ7fg5qE34/F6+Pbjbxnz8pgC2y/pcwmd\nru5EdlY2O7bt4Nl7nmVTyiYAbnjwBlq2awnAx89/zIwJM4IevxxdJS9p8Xgo37c/fw29G9/WzVQa\n+Rr75s7Cl7zhQJPYupS97Fp2PngbbvcurHIVAKIaNSbqX6ew864bAaj02ItENU4ga/nikLyVYDGP\nccHw63n/2sfZmb6NPuOHs2rKQrasTinQLuaYsrS6vhPJC9eEJtAwdXHnDlxzWTceGj4q1KGElMfj\n4elnhtHtwp6kpKQz48dxTJo4hZUrD5wvSUkp3Nz3Xvr1u6nI1xg0eAAzf5wTrJBDwzyUuaQve0YP\nxe3YSrk7R5K1fC5uU3Jek30T/pv3PLpNZzxxxxd4iZhO15C9bnnQQg4Vj8fDrY/cysPXPsyWtC08\nO+FZfp78M0mrk/LarF2+ln5d+pGxN4POPTpz40M3MuK2EbRs15ITTzmR28+/neiYaEaOGcm87+ex\nZ9eeEL6jwHE+jWmJSFENT8aXloJvYxpkZbFv5jRiWrUt0KZMh65kfP0lbvcuANyO7XnbLCYGoqIg\nKhq8Xnw7/gxq/KFQN+EE/ly/ke1Jm/FlZrN8ws806tCiULtz7u7O7Fe/IitjXwiiDF+JCadSuVLF\nUIcRcomJTVm3dgPr1yeRmZnJZ59NoMuFHQq0+eOPFJb/shKfr3AtO6HZKdSqVYOpU38MVsgh4Tm2\nIb4tabhtGyE7i6zFM4lq3OqQ7aMSziRr8YHPxFP3eKxiZbJ/K9l/TAGclHASqetTSf8jnazMLGZM\nmMEZHc8o0GbpT0vJ2JsBwMpFK6kRWwOAYxsey7Kfl+HL9pGxJ4N1K9aReE5i0N9DsJSWGyYWO2kx\ns2MCEcjRYtVqkL1lU96yb+tmPNVrFGjjjYvHE1ePio+9SKURLxPdLOcLI2vVcjKXLaLKW19Q5a0v\nyFw8r0CFpqSqWKcaO9K25i3vTNtGxTpVC7Sp07g+leOqs3raomCHJxEiLq4OySkH7uyRkpJOXFwd\nv/Y1Mx5//GEefujxQIUXNqxSNdz2LXnLbsdWrHL1ottWqYlVq0X2mmW5K4wyXW9g31fvBCPUkKte\npzpbUg98VlvStlC9dtGfFUCnKzsx//v5ADlJyrmJlClbhkpVK9GkdZO8hEYil9/dQ2bWGngDqAAc\na2ZNgZudc7cGKrj/iRVRInMHLXu9eGPj+WtQPzzVa1Lp0RfY0e8GrFJlvPH12d7ncgAqDX2KzH83\nIWvF0sDHHW5cvg/NjI6DejDuntdCF4+EPSviZ8+5g3/4itb35p58++0PpKSUgtuZFfkdVfTnFJXQ\nlqylP+X9yRt9xvlkrVyA27G1yPYlTXHOqXMvOZeGTRpy3xX3AbDox0Wc1PQkRn05ip3bdrJywUp8\n2WFYOjhKXCmZ8lycMS3PAJ2A8QDOuSVmdtahGptZX6AvwNMJDenVIPafxOk3t3Uz3hq18pY91Wvi\n27alQBvf1s1krVoB2dn4NqWTnZqEJy6e6FMSyPptBezN6fPct3AOUSc1LvFJy1/p26gce+Cvl0qx\n1fhr44EuszIVylKrUT16fTwQgAo1K3PVm3fzce+nNBhX8qSkpBFf98DPed26dUhL2+jXvq1aNaN1\nm5bc1LcHFY4pT3RMNLt27WbI4JGBCjdk3I6tWJUDf/Fb5eq4nduKbBuV0JaML0fnLXvqN8J73L+J\nPuMCrExZ8EZBxl72ff1ewOMOhS1pW6gRd+CzqhFbg22bCn9WCW0TuPL2K7n/ivvJ2peVt/6TFz/h\nkxc/AeC+5+8j5feUQvtKZCnWQFznXNJBmW/2YdqOBkYDbLvkbP/+3DoKslavxBMbj6dWHXzbthDT\ntl3ezKD9MufMJObM89j3/TdYxcp44urh25iKr3YsZTpcyN7PvWAQ3bgpeyd8FqzQQyZlyTqqHVeH\nKvVqsjN9G427ns6Xd76Utz3jrz2ManZL3vJ1Hz/M5Ec/VMIiBSxYsJQTTmxA/frxpKZupHv3rtx4\nQz+/9u194115z6/tcRnNmzcpkQkLgC9pNZ4asVjVWrid23ISkw+fKdTOasZh5Srg27Aqb13GR8/m\nPY9KPBdP/IklNmEB+G3Jb8QdF0fterXZmr6Vs7qexcg7C54Xxzc+njsev4NBPQexY+uOvPUej4dj\nKh3DX9v/osG/GtDg5AYsvGthsN9C0ITj+JNAKE7SkpTbReTMLAa4E/g1MGH9A75s/n79WSoOGZUz\n5XnqJLKT1lPu6hvJWrOSzHmzyVw0l+iEllR+/h2cz8eed17B/bWTfT9NJ+rU5lR+7r/gHJmL5pI5\nf3ao31HAuWwfXw9+m2vfvR/zelj86XQ2r07hnAGXkbr0d36bUnJ/0I+Ge4eMYN6ipWzfvpPzLu7B\nrb17clnXTqEOK+iys7O5e8AQxo5/F6/Xw3vvjuHXX1czcNBdLFy4jEkTp9C8RRM++vhVqlSpzAWd\nz+Phgf1pmVjKPiufj4yxr1PupiG5U56n4tuYREzHq8lOXkP2inkARCecSdbimSEONrR82T5eGfQK\nj7z3CB6vh+8++Y4/fvuDHgN6sHrZauZMnkPvh3tTtnxZHnzlQQA2p25mWO9heKO9PPn5kwD8/dff\njOo3qmR3D5WS2UPmb5+zmdU
AngPaAwZ8B/Rzzh2xczWYlZZI9uLC+FCHEBEeXDD8yI0EgKrHnhfq\nECJC+m0JoQ4hYlzxSUaoQ4gYk/6YFLRMIqnleQH5PVtv3tSwyob8rrQ457YA1wYwFhEREfkf+Fl/\niHjFmT1UE7gJaJB/P+fcjUc/LBEREZGCijOmZRzwIzCFwwzAFRERkeAqLWNaipO0lHfO3R+wSERE\nROR/UlqSluJcEfcrM+scsEhEREREDqM4lZZ+wENmtg/YR84MIuecqxSQyERERMQvGoh7EOec7ggn\nIiIiIeN395Dl6GFmg3KX65nZoW9NKiIiIkHhfBaQhz/M7HwzW2Vma8zsgUO0ucLMVpjZcjP7MN/6\nXma2OvfR60jHKk730MuAD2gHDAd2AS8BLYvxGiIiIlJCmJmXnFygA5AMzDOz8c65FfnaNAQeBNo4\n5/40s1q566sBQ4BEcm5tvCB33z8PdbziDMQ9zTl3G7AXIPdFY4r17kREROSoc84C8vBDK2CNc26d\nc24f8DFw0UFtbgJe2p+MOOc25a7vBEx2zm3L3TYZOP9wBytOpSUzN6NykHexuZJ7IwcREZEIEcIb\nJtYFkvItJwOnHdTmJAAzmwV4gaHOuW8OsW/dwx2sOEnL88CXQC0zexToDgwsxv4iIiISQcysL9A3\n36rRzrnR+ZsUsdvBc5migIbAOUA88KOZneLnvoVeyC/OuQ/MbAFwXu6BLnbOhd9dnkVEREoZn39d\nOcWWm6CMPkyTZKBevuV4ILWINj875zKB381sFTlJTDI5iUz+fX84XDzFmT10OpDinHvJOfcikGxm\nB5eAREREpPSYBzQ0s+PMLAa4Chh/UJuxwLkAZlaDnO6idcC3QEczq2pmVYGOuesOqTjdQ68AzfMt\n7y5inYiIiASZn4NmA3Bcl2Vmt5OTbHiBt5xzy81sGDDfOTeeA8nJCnLuXXivc24rgJkNJyfxARjm\nnNt2uOMVJ2kx5w5cc8855zOz4uwvIiIiARDKew855yYBkw5aNzjfcwcMyH0cvO9bwFv+Hqs4U57X\nmdmdZhad++hHTnlHREREJOCKk7TcArQGUjgwpanvYfcQERGRgHMuMI9wU5zZQ5vIGWAjIiIiEnTF\nmT000swq5XYNTTWzLWbWI5DBiYiIyJGF8t5DwVSc7qGOzrmdwIXkdA+dBNwbkKhERETEbz5nAXmE\nm+IkLdG5/3YGPjrStCQRERGRo6k4U5YnmNlKYA9wa+69h/YGJiwRERHxV6iu0xJsfldanHMPAGcA\nibmX4t1N4Ts5ioiIiASE35UWMysL3AC0NTMHzCTnirgiIiISQuE4PTkQitM99C7wF/BC7vLVwHvA\n5Uc7KBEREZGDFSdpaeSca5pv+XszW3K0AxIREZHiCceZPoFQnKRlkZmd7pz7GSD3Ds+zAhOWiIiI\n+Ku0DMQ9YtJiZssAR86U5+vM7I/c5frAisCGJyIiIpLDn0rLhfmeVwXOzH0+A9h+1CMSERGRYikt\nA3GPOOXZObfBObcBuJicgbc1gJq5z7sFNjwRERGRHOb8TM/MbClwhnNud+7yMcBPzrkmR9q3RqWT\nSkkO+M/szPg71CFEhBhv9JEbCQB//jE11CFEhFoNOoY6hIjx1749oQ4hYmTtSwnaQJP58RcH5Pds\nYvLYsBosU5yBuAZk51vOzl0nIiIiIaSBuIX9F5hjZl/mLl8MvHn0QxIREREpzO+kxTn3tJn9ALQl\np8Jyg3NuUaACExEREf/oOi1FcM4tBBYGKBYRERGRQypW0iIiIiLhp7TMdlHSIiIiEuFKS/fQEa/T\nIiIiIhIOVGkRERGJcKVlyrMqLSIiIhIRVGkRERGJcL5QBxAkqrSIiIhIRFClRUREJMK5UnJXHSUt\nIiIiEc5XSi7Uou4hERERiQiqtIiIiEQ4XynpHlKlRURERCKCKi0iIiIRTgNxRUREJCLoOi0iIiIi\nYUSVFhERkQhXWrqHVGkRERGRiKBKi4iISIQrLWNalLSIiIhEuNKStKh7SERERCKCKi0iIiIRTgNx\nRURERMKIKi0iIiIRzlc6Ci2qtIiIiEhkUKVFREQkwpWWuzwraREREYlwLtQBBIm6h0RERCQilMik\npV37M/l5wTfMXTyZO+/qW2j7Ga0TmTbjS9K3raDrRZ0KbKsbH8uYsW8xe97XzJo7iXrH1g1W2EHX\nseM5/LJsOitWzOTee24rtL1t29OY8/PX/L17PZde0iVvfdMm/2bG9HEsXjSVBfMnc3n3rsEMOyTa\ndziLhYunsmTZ9wy4+5ZC29u0acXM2RPYvnM1F198QaHtFStW4Lc1P/HU0/8XjHDD0sDHnuasLldx\ncY/Cn19pc177s5i78DsWLJlK/wE3F9reuk1Lfpg5js3bV9Lt4vMLbNuyYxUzZo9nxuzxfPjJa8EK\nOWQ6dTyH5b/MYOWKmdx3b+HvqTPbnsbcOd+w9+8NXHppvu+ppo2ZOWM8SxZPY+GCyVx+ebdghh10\nvgA9wk2J6x7yeDw88dQQul90A6kp6Uz+4XO+mTSV31atzWuTnJzG7f95gNvu7F1o/5dfG8nTo15h\n+vezOeaY8vh84fi/7Z/zeDw899wjdO58DcnJafw0eyJfffUdv65cndcmKSmFPn0GcNddBb9U/96z\nhxt792fNmt+Jja3Nzz9N4rvJ09mxY2ew30ZQeDwenn5mGN0u7ElKSjozfhzHpIlTWLlyTV6bpKQU\nbu57L/363VTkawwaPICZP84JVshh6eLOHbjmsm48NHxUqEMJKY/Hw5NPD+WSbr1ITUln2owv+HrS\nVFYVOJ9Sue3m+7i9X59C++/Zs5ezWpfsX8D7eTwenn/uUc7vfDXJyWn8/NMkJnz1Hb/+euB76o+k\nFHr3uYsBdxVMhv/+ew/X39gv73tq7s9f8913P5TY76nSosQlLc0Tm/D7ug1sWJ8EwJefT+SCLu0L\nJC1Jf6QAFEpITmp0At6oKKZ/PxuA3bv/DlLUwdeyZQJr167n99//AODTT8fRtWvHAknLhg3JQOHP\nafXq3/Oep6VtZPPmrdSsWb3EfhkkJjZl3doNrM89pz77bAJdLuxQIGn54xDnFEBCs1OoVasGkydP\np3nzJsEJOgwlJpxKStrGUIcRci0Sm7Iu33fUF59NpHOX9gWTlsOcT6VJq5bNCn1PdevaqUDScujv\nqXV5z9PSNrKphH9P+ax0DMT1q3vIzNr4sy4cxMbWJjU5PW85NTWd2Ljafu17wonHsXPHTt5+/0Wm\n/TiWocPvw+MpkT1o1I2LJTkpLW85JSWduLqxxX6dxMQEYmKiWbt2/VGMLrzExdUhOeWgzyqujl/7\nmhmPP/4wDz/0eKDCkwgTG1eblOQD51Nqiv/fUQBly5Zh2owv+W7aZ3S+sH0gQgwbcXXrkJScmrec\nnJLm989efi1LwfeUC9Aj3PhbaXkBaO7HupCzIrJN5/z76KOivJx+RiLnnnkxyUmpvPH2
s1x97aV8\n8N5nRzvMkCsqKff3c9qvTp1avP3f57ix913F3jeS/JNzqu/NPfn22x9IyZf0SOn2T84ngFP/dRbp\n6Zuo36Ae4ye+x4rlv7E+txJR0vzTzwpyv6fefp4bb+xfor+nSovDJi1mdgbQGqhpZgPybaoEeI+w\nb1+gL8AxZWpRNqbyPwzVP6mp6cTFH8jE4+LqkJ62ye99ly1dkVe2nTRxCoktE0pk0pKckkZ8vQOV\nlbp165CWmn6YPQqqWLEC48a+w5AhI5k7d2EgQgwbKSlpxNc96LPys5ujVatmtG7Tkpv69qDCMeWJ\njolm167dDBk8MlDhSphLTUmnbvyB8ymurv/fUQDp6TltN6xPYuaPc2jS9N8lNmlJSU6jXnxc3nJ8\n3Vi/f/Yg53tq/Lh3GTxkJHNK+PdUaelIPFLfRwxQgZzkpmK+x06g++F2dM6Nds4lOucSg5WwACxa\nsIzjj2/AsfXjiY6O5pLLuvDNpKl+71u5SmWqV68KwJlnnV6gn7kkmT9/CSeeeBwNGtQjOjqaK664\niK++muzXvtHR0YwZ8wbvf/AZn38xMcCRht6CBUs54cQG1M89p7p378qkiVP82rf3jXdxcqO2ND75\nTB566DE++vBLJSyl3MIFSznhhPp531GXdu/C135+R1WuUomYmBgAqlWvymmntyix31EA8+YvLvQ9\nNeGr7/zaNzo6ms/HvMn773/G559/FeBIJVjMn3KZmdV3zm34Xw9So9JJQa3Jte94No+OeAiP18uH\n733GM6Ne5YGH72Txwl/45utpNGt+Ku988BKVq1QiIyODTRu30Pa0nKlyZ5/bmmGPPoCZsWTxcgbc\nOYjMzMygxL0zI7gDf88/vx1PjRqKx+vhnbc/YcQTLzBk8D0sWLiEr76aTIsWTRnz6RtUrVqZvXsz\n2LhxEwnNzuOaqy/l9defYsWK3/Jeq0+fu1iydEVQ4o7xRgflOPl17HQOT4wcjNfr4b13x/DkyJcY\nOOguFi5cxqSJU2jeogkfffwqVarkfFabNm6mZWLB6fTX9riM5s2bcPeAIUGL+88//PtlGAz3DhnB\nvEVL2b59J9WrVeHW3j25rGunI+8YBLUadAzq8Tp0PJvHnhiI1+vlg/fG8NSTr/DgwH4sXvgLX0+a\nSrPmp/LeR69QpUolMvZmsHHTFlq3vIBWpzXjmecfwefz4fF4eOWlt3n/3TFBjf2vfXuCerwLzm/H\nU0/9H16Ph7ff+YTHRzzP0CH3MH9BzvdUYoumfDbmzbzvqfSNm2ia0I5rrrmUN19/muX5vqd697mL\nJUuWBy32rH0pQRsd+1HctQH5PXt16gdhNcLX36SlJnAf0Bgou3+9c66dPwcJdtISqYKdtESqUCQt\nkSqckpZwFuykJZIFO2mJZMFMWj6I6xGQ37PXpr4fVkmLv1NjPgBWAscB/wesB+YFKCYRERGRQvxN\nWqo7594EMp1z051zNwKnBzAuERER8ZOmPBe0f1BHmpl1AVKB+MCEJCIiIlKYv0nLI2ZWGbibnOuz\nVALuClhUIiIi4jdfWI08CRy/khbn3P75YjuAcwMXjoiIiEjR/L2M/0lmNtXMfsldbmJmAwMbmoiI\niPijtNzl2d+BuK8DD5I7tsU5txS4KlBBiYiIiP9Ky0Bcf5OW8s65uQetyzrawYiIiIgcir8DcbeY\n2QnkJl5m1h3QHeBERETCQGkZiOtvpeU24DXgX2aWAvQHbglYVCIiIhIRzOx8M1tlZmvM7IHDtOtu\nZs7MEnOXG5jZHjNbnPt49UjHOmKlxcw8QKJzrr2ZHQN4nHN/FecNiYiISOCEatCsmXmBl4AOQDIw\nz8zGO+dWHNSuInAnMOegl1jrnEvw93hHrLQ453zA7bnPdythERERCS8hnD3UCljjnFvnnNsHfAxc\nVES74cBIYO//8Pby+Ns9NNnM7jGzemZWbf/jnxxYREREIl5dICnfcnLuujxm1gyol++ab/kdZ2aL\nzGy6mZ15pIP5OxD3xtx/b8u3zgHH+7m/iIiIBIgL0EBcM+sL9M23arRzbnT+JkWFk29/D/AMcH0R\n7dKAY51zW82sBTDWzBo753YeKh5/r4h7nD/tREREpOTITVBGH6ZJMlAv33I8Ofcn3K8icArwg5kB\n1AHGm1k359x8ICP3OAvMbC1wEjD/UAfzt9KCmbUGGuTfxzn3rr/7i4iISGCE8Oq184CGZnYckELO\nhWev2b/RObcDqLF/2cx+AO5xzs03s5rANudctpkdDzQE1h3uYH4lLWb2HnACsBjI3h8LoKRFREQk\nxEKVtDjnsszsduBbwAu85ZxbbmbDgPnOufGH2f0sYJiZZZGTW9zinNt2uOP5W2lJBP7tnAvHq/qK\niIhIiDjnJgGTDlo3+BBtz8n3/HPg8+Icy9/ZQ7+Q0w8lIiIiYaa03HvosJUWM5tATtwVgRVmNpfc\nQTMAzrlugQ1PREREJMeRuodGkTOd6Qng4nzr968TERGRECst9x46bNLinJsOYGbR+5/vZ2blAhmY\niIiISH5H6h76D3ArcLyZLc23qSIwK5CBiYiIiH9COOU5qI7UPfQh8DXwOJD/zo1/HWlakoiIiASH\nkhbyLgqzA7g6OOGIiIiIFM3vK+KKiIhIeArH6cmB4O91WkRERERCSpUWERGRCKcpzyIiIhIRSstA\nXHUPiYiISERQpUVERCTCaSCuiIiISBgJSqVleYvYYBwm4j35e1yoQ4gIQy/fG+oQIkatBh1DHUJE\n2LT+u1CHEDFuTrwv1CFIEXylpNai7iEREZEIp4G4IiIiImFElRYREZEIVzo6h1RpERERkQihSouI\niEiEKy1jWpS0iIiIRLjSchl/dQ+JiIhIRFClRUREJMKVluu0qNIiIiIiEUGVFhERkQhXOuosqrSI\niIhIhFClRUREJMJpyrOIiIhEBA3EFREREQkjqrSIiIhEuNJRZ1GlRURERCKEKi0iIiIRTgNxRURE\nJCJoIK6IiIhIGFGlRUREJMKVjjqLKi0iIiISIVRpERERiXAaiCsiIiIRwZWSDiJ1D4mIiEhEUKVF\nREQkwpWW7iFVWkRERCQiqNIiIiIS4XRxOREREZEwokqLiIhIhCsddRYlLSIiIhFP3UMiIiIiYaRE\nVlpiWrai4u13gNfDnokT+fujDwu1KXPOuVTodT3gyFy7lp2PDAeg1pRpZP2+DgDfxk1sH/hQECMP\nrkZnN+Wiwdfh8XqY88n3fP/K+ALbz7i2Pa17dsDn87Fv914+e/ANNq5JwRPl5Yon+lK3cQM8UV4W\nfPEj014eF6J3ERzeRs0o0603eDxkzp1C5vdfFNge0/UGvCeeCoBFl8EqVGb34B4HGpQpR/l7XyDr\nlznsG/t6MEMPqvPan8XjIwfi9Xp5751Pefbp1wpsb92mJY89MZDGpzSi9/X9GT/2m7xtW3asYsXy\nVQAkJ6VxzZU3BzX2cDLwsaeZMWsu1apWYez
7r4Y6nJA65ewErhl8A+b18OMnU5n0ytgC28+5tiPt\nenbC5/ORsXsv7zz4GqlrkvO2V4urwSOTn2Hcs2P49vXxB798iVFapjyXvKTF46Fiv/5sv/dusjdv\nptqrr5ExexbZGzbkNfHWrcsx11zLtjtuw+3ahVWpkrfN7ctg2019QhF5UJnHuGTYDYzu8Rg70rfS\nb/yjrJi8gI1rUvLaLBw3i58+mALAv9u3oOugnrzRawRNO5+GNyaKp86/n+iyMdw7ZRSLxs/iz+Qt\noXo7gWUeylzSlz2jh+J2bKXcnSPJWj4Xt+nAF+O+Cf/Nex7dpjOeuOMLvERMp2vIXrc8aCGHgsfj\n4cmnh3JJt16kpqQzbcYXfD1pKqtWrslrk5SUym0338ft/Qr/jO3Zs5ezWncLZshh6+LOHbjmsm48\nNHxUqEMJKfN46DGsD0/1GMa29G0MHj+CxZPnF0hKfh73Iz988B0ACe0TuXJQL57p9Wje9qsGXc+y\nHxYHPXYJDL+7h8zscn/WhVr0v04mOzWF7LQ0yMpi77RplGnTtkCbchd2Zc/YL3G7dgHgtm8PRagh\ndWzCiWzdkM62pE1kZ2azeMJPNO6YWKBNxq49ec9jypcBl9Nn6oAy5crg8XqILhtD9r4s9v61h5LK\nc2xDfFvScNs2QnYWWYtnEtW41SHbRyWcSdbiHw/sX/d4rGJlsn8r2V+cLRKbsm7dBjasTyIzM5Mv\nPptI5y7tC7RJ+iOF5ctX4fOVlr8L/zeJCadSuVLFUIcRcscnnMimDelsTtpEdmYWcybMIqFjywJt\n9ub7nipTvkyBEanNOrZk8x8bSV2dFKyQQ8YF6L9wU5wxLQ/6uS6kPDVq4Nu0KW/Zt3kz3ho1CrTx\nxsfjrVePqi+8SNWXXiam5YFfQBYTQ7VXX6PqSy8XSnZKksq1q7I9dWve8va0rVSuXbVQu9Y9O/DA\n9Ge58IFrGDv0HQCWTppDxp4MBs99hYGzX+CH179iz47dQYs92KxSNdz2A1Ukt2MrVrl60W2r1MSq\n1SJ7zbLcFUaZrjew76t3ghFqSMXG1SYlOS1vOTUlndi42n7vX7ZsGabN+JLvpn1G5wvbH3kHKfGq\n1K7GttQDP3t/pm2lau1qhdq163k+I6a/yOUP9OSDoW8CEFOuDBfccjHjnxsTtHhDyRegR7g5YveQ\nmV0AdAbqmtnz+TZVArICFdj/zKzwuoOSRfN68daN58/+/fDUrEm1519g6w034Hbv4v/bu/P4KOr7\nj+OvT0LwQLnkDKig4irurr0AACAASURBVFkgKGgRVMADwXKoIFql3kjxBBW1Klpq1aq1/XnhXRWt\ngloVRVEKKiIqhwYR5FKinHJ4URUkyef3x0xgEzbJbshmd5P3M4997Bzfmf3MNzOz3/1+vzOzftBp\nFG7YQGbz5jS4+x/kL/uSglWrqib2qhQlnzxKoXrG2MnMGDuZDn2P5LhLT+a5K8ewV/t98YJCRh8x\njF3r1WHY+JtYMv0zvl2+dvsVVAdR96nov0Bq5XQl/9MPwIPDPavzieQvnIP/sCFq+urEou5Tsf9S\na3vg0axZs5a9W+3JhIljWTB/MXnLvq7MECXNxLpPTR07ialjJ3FE3670uXQAj115H/2HD2LyY6+x\n+edNVRGqVJFY+rSsAuYAfcP3IhuB4aUtZGZDgCEAd+7fhsHZzXcgzNgVrltHRpMmW8czGjemYEPx\nvhYF69axZcECKCigcM0a8pcvJ7NlS/IXLaRwQ/DlUrB6Nb/m5lJrvzbVstDyw5pvqZ+9rbagfvM9\n+HHtd6Wmz331A0655XwAOvTrwsJ351KYX8D/NvxI3pzF7Nlun2pbaPEfNmD1t9XWWb098B+/jZq2\nVk5XNr/08NbxjL0PILP1wWR17oXttDNk1oLNm/j1jbEJj7uqrVq5hhYttx3n2S2asWZ17PvEmjVB\n2q/yljP9vY9o1/5gFVpquO/WbKBh9rZjr0HzPfi+jPPUzFffZ/AtFwKwT04bOvb+LQOvG8yudetQ\nWFjIls2/MvWpSaUun85SsSknEcptHnL3ue7+BLCvuz8Z8fqPu5e697j7w+7e0d07VlWBBWDLwoVk\ntmhJRrNmUKsWO/foweYZ7xdLs3n6dGp36ACA1a1HrZZ7UrB6FbbbbpCVtXV61m/akv9VXpXFXpWW\nz/2CRq2a0bBlYzKzMsnp05n5k+cUS9OoVbOtwwf16MD6vDUAfL9qPW2OPAQIqmD37rAfa7+ofgW7\nIoXLl5DRqDnWoAlk1qJWTlcKFszaLp01zsZ22Y3CrxZtnbb52X/y861D+Pm2i9j82hNsmfNOtSyw\nAHw851P23Xdv9tq7JVlZWZwy4CTeeH1KTMvWq1+X2rVrA9BwjwYc8dvDinXglZpp2dylNG3VnEYt\nm5CZVYsj+nQhd3LxY69JxHmqXY9DWRuep24/7UZGdh3GyK7DmPz4RCbe/1K1LbDUJLE0D80jbGAp\npaquXeWHtQMKC9h4zz9pcMddkJHBpjdepyAvjzrnnkf+ooVsnjGDX2fNpHanTuzxryfxwkI2PjgG\n//FHsg45hN1HXBVU7VsGPz37TLGrjqqTwoJCXhr1BBc+dR2WmcGs8e/wzZIV9Bw+gOXzlrHgv3Po\ncvYJtOnSloL8fH754Seeu3IMAO8/9RaD7hzKVW/diRnMev5dVi+sxr+ICwvZ/PIj7HLhTeElz1Mo\n/GY5tU84g4IVS7cWYLJyjiI/d3qSg02egoICRl75Z158+V9kZmbyzNjnWfj5Eq674XJyP/6MN16f\nQodD2zL22THUr1+XE3v14NrrL+fITr044IB9+cc9t1BYWEhGRgb/vPuhGl1oufqm25n1yad8//2P\nHNv/LIadP5hT+/RMdlhVrrCgkKdHPcqIp24gIzOD6eOnsmrJCvoPH0TevC/I/e9sjj27Fwd3aUdB\nfj4//fATj155b7LDTopU7H+SCFZem7OZ7V3WfHcv91v9m+7H1Ix6qx1057LsZIeQFm4eqDbqWO05\nZl6yQ0gLa/PeSnYIaeOijiOTHULaeDzvhSgd4hJj8N6nJOR7duxX/6mybYhFuTUtsRRKRERERBIt\n5pvLmdlGtl2HUxvIAn5y97qJCExERERiU1OaM2IutLh7sTsdmVl/oPQ7bImIiIhUogrfxt/dXzaz\nayszGBEREYlfTXnKczzNQ6dEjGYAHak5NVIiIiKSZPHUtPSJGM4H8oB+lRqNiIiIxK2m3Fwunj4t\n5yYyEBEREamYmnKflnie8nyHmdU1sywzm2Jm683srEQGJyIiIlIknqc8n+DuPwK/A1YA+wNXJyQq\nERERiVkhnpBXqomn0JIVvvcGnnX36E+MExEREUmAeAotr5rZQoKrhqaYWWNA91MXERFJMk/QXyzM\n7EQzW2RmS6PdCsXMhprZPDPLNbPpZnZwxLzrwuUWmVm5D9iKudDi7tcCnYGO7r4F+AldPSQiIpJ0\nhQ
... [base64-encoded PNG output truncated: seaborn heatmap of the pairwise correlations between the six toxicity labels] ...\n", 438 | "text/plain": [ 439 | "" 440 | ] 441 | }, 442 | "metadata": {}, 443 | "output_type": "display_data" 444 | } 445 | ], 446 | "source": [ 447 | "corr = train_df.corr()\n", 448 | "plt.figure(figsize=(10,8))\n", 449 | "sns.heatmap(corr,\n", 450 | "            xticklabels=corr.columns.values,\n", 451 | "            yticklabels=corr.columns.values, annot=True)" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "## Preprocessing\n", 459 | "\n", 460 | "We apply two preprocessing methods to the texts:\n", 461 | "\n", 462 | "1. Lowercase all letters:\n", 463 | "   * This is very important. We do not want the model to consider 'Hello' and 'hello' as different words.\n", 464 | "2. Remove special tokens and expand contractions:\n", 465 | "   * For example: what's -> what is, aren't -> are not. Mapping the same concept to the same token helps regularization.\n", 466 | "\n", 467 | "Always remember to preprocess for NLP tasks. In many cases, a model trained on cleaned text significantly outperforms one trained on raw data. 
Knowing your data is always the best policy." 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 10, 473 | "metadata": { 474 | "collapsed": false 475 | }, 476 | "outputs": [ 477 | { 478 | "name": "stdout", 479 | "output_type": "stream", 480 | "text": [ 481 | "Processing text dataset\n" 482 | ] 483 | } 484 | ], 485 | "source": [ 486 | "########################################\n", 487 | "## Text pre-processing and cleaning\n", 488 | "########################################\n", 489 | "print('Processing text dataset')\n", 490 | "from collections import defaultdict\n", 491 | "\n", 492 | "# regex that matches everything except letters, digits and spaces\n", 493 | "special_character_removal = re.compile(r'[^a-z\\d ]', re.IGNORECASE)\n", 494 | "\n", 495 | "# regex that matches runs of digits (removed below)\n", 496 | "replace_numbers = re.compile(r'\\d+', re.IGNORECASE)\n", 497 | "\n", 498 | "def clean_text(text, stem_words=False):\n", 499 | "    # Lowercase, expand common contractions, then strip punctuation, digits and special characters. (The stem_words flag is currently unused.)\n", 500 | "    text = text.lower()\n", 501 | "    text = re.sub(r\"what's\", \"what is \", text)\n", 502 | "    text = re.sub(r\"\\'s\", \" \", text)\n", 503 | "    text = re.sub(r\"\\'ve\", \" have \", text)\n", 504 | "    text = re.sub(r\"can't\", \"cannot \", text)\n", 505 | "    text = re.sub(r\"n't\", \" not \", text)\n", 506 | "    text = re.sub(r\"i'm\", \"i am \", text)\n", 507 | "    text = re.sub(r\"i’m\", \"i am\", text)\n", 508 | "    text = re.sub(r\"\\'re\", \" are \", text)\n", 509 | "    text = re.sub(r\"\\'d\", \" would \", text)\n", 510 | "    text = re.sub(r\"\\'ll\", \" will \", text)\n", 511 | "    text = re.sub(r\",\", \" \", text)\n", 512 | "    text = re.sub(r\"\\.\", \" \", text)\n", 513 | "    text = re.sub(r\"'\", \" \", text)\n", 514 | "    text = re.sub(r\"\\s{2,}\", \" \", text)\n", 515 | "    text = replace_numbers.sub('', text)\n", 516 | "    text = special_character_removal.sub('', text)\n", 517 | "    \n", 518 | "    return text" 519 | ] 520 | },
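{ "cell_type": "markdown", "metadata": {}, "source": [ "A quick sanity check of `clean_text` on a made-up sentence (illustrative only; the exact spacing depends on the substitution order above — digits are removed after whitespace is collapsed, so a double space can remain):" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "sample = \"What's this? I'm testing 2 sentences, aren't I?\"\n", "print(clean_text(sample))\n", "# expected output (roughly): what is this i am testing  sentences are not i" ] },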
521 | { 522 | "cell_type": "code", 523 | "execution_count": 11, 524 | "metadata": { 525 | "collapsed": true 526 | }, 527 | "outputs": [], 528 | "source": [ 529 | "'''\n", 530 | "Apply preprocessing and extract the training and testing sentences from the pandas dataframes.\n", 531 | "Note that there are some N/A comments in the train/test set. Fill them in first.\n", 532 | "'''\n", 533 | "train_comments = []\n", 534 | "test_comments = []" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": 12, 540 | "metadata": { 541 | "collapsed": true 542 | }, 543 | "outputs": [], 544 | "source": [ 545 | "list_sentences_train = train_df[\"comment_text\"].fillna(\"no comment\").values\n", 546 | "list_classes = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n", 547 | "train_labels = train_df[list_classes].values\n", 548 | "list_sentences_test = test_df[\"comment_text\"].fillna(\"no comment\").values\n", 549 | "\n", 550 | "train_comments = [clean_text(text) for text in list_sentences_train]\n", 551 | "test_comments = [clean_text(text) for text in list_sentences_test]" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 13, 557 | "metadata": { 558 | "collapsed": true 559 | }, 560 | "outputs": [], 561 | "source": [ 562 | "assert len(train_comments) == 159571 and len(test_comments) == 153164, \"It seems that you lost some data.\"\n", 563 | "assert 'E' not in train_comments[0], \"It seems you did not preprocess the sentences. I found an upper-case letter in your train set.\"" 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": {}, 569 | "source": [ 570 | "### Let's compare the cleaned text with the original" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 14, 576 | "metadata": { 577 | "collapsed": false 578 | }, 579 | "outputs": [ 580 | { 581 | "name": "stdout", 582 | "output_type": "stream", 583 | "text": [ 584 | "Cleaned\n", 585 | " explanationwhy the edits made under my username hardcore metallica fan were reverted they were not vandalisms just closure on some gas after i voted at new york dolls fac and please do not remove the template from the talk page since i am retired now \n", 586 | "\n", 587 | "Raw\n", 588 | " Explanation\n", 589 | "Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27\n", 590 | "\n", 591 | "------------------\n", 592 | "Cleaned\n", 593 | " d aww he matches this background colour i am seemingly stuck with thanks talk january utc\n", 594 | "\n", 595 | "Raw\n", 596 | " D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)\n", 597 | "\n", 598 | "------------------\n", 599 | "Cleaned\n", 600 | " hey man i am really not trying to edit war it just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info \n", 601 | "\n", 602 | "Raw\n", 603 | " Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.\n", 604 | "\n", 605 | "------------------\n", 606 | "Cleaned\n", 607 | " morei cannot make any real suggestions on improvement i wondered if the section statistics should be later on or a subsection of types of accidents i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if noone else does first if you have any preferences for formatting style on references or want to do it yourself please let me know there appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up it listed in the relevant form eg wikipediagoodarticlenominationstransport \n", 608 | "\n", 609 | "Raw\n", 610 | " \"\n", 611 | "More\n", 612 | "I can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of \"\"types of accidents\"\"  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n", 613 | "\n", 614 | "There appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport  \"\n", 615 | "\n", 616 | "------------------\n", 617 | "Cleaned\n", 618 | " you sir are my hero any chance you remember what page that on\n", 619 | "\n", 620 | "Raw\n", 621 | " You, sir, are my hero. Any chance you remember what page that's on?\n", 622 | "\n", 623 | "------------------\n" 624 | ] 625 | } 626 | ], 627 | "source": [ 628 | "for i in range(5):\n", 629 | "    print(\"Cleaned\\n\", train_comments[i] + '\\n')\n", 630 | "    print(\"Raw\\n\", train_df.iloc[i]['comment_text'] + '\\n')\n", 631 | "    print(\"------------------\")" 632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "metadata": {}, 637 | "source": [] 638 | }, 639 | { 640 | "cell_type": "markdown", 641 | "metadata": {}, 642 | "source": [ 643 | "# Tokenization\n", 644 | "\n", 645 | "Tokenization splits a sentence into words on whitespace, for example:\n", 646 | "\n", 647 | "* \"Hello world\" -> [\"Hello\", \"world\"]\n", 648 | "\n", 649 | "A neural network consumes numbers, not words, so we have to apply one-hot encoding or index encoding to the tokens.\n", 650 | "![onehot](resources/onehot.png)\n", 651 | "Now we use the Keras tokenizer to learn the encoding table." 652 | ] 653 | },
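{ "cell_type": "markdown", "metadata": {}, "source": [ "To make the two encodings concrete, here is a tiny hand-built sketch (a hypothetical three-word vocabulary, not the Keras tokenizer below):" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "vocab = {'hello': 1, 'world': 2, 'again': 3}  # index encoding: word -> integer id\n", "sentence = ['hello', 'world', 'again']\n", "\n", "index_encoded = [vocab[w] for w in sentence]      # [1, 2, 3]\n", "one_hot = np.eye(len(vocab) + 1)[index_encoded]   # one-hot: each id becomes a vector with a single 1; row 0 is reserved for padding\n", "print(index_encoded)\n", "print(one_hot)" ] },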
654 | { 655 | "cell_type": "code", 656 | "execution_count": 15, 657 | "metadata": { 658 | "collapsed": true 659 | }, 660 | "outputs": [], 661 | "source": [ 662 | "# Create a tokenizer, which transforms a sentence into a list of ids\n", 663 | "tokenizer = Tokenizer(num_words=MAX_NB_WORDS)\n", 664 | "# Build the mapping between words and ids\n", 665 | "tokenizer.fit_on_texts(train_comments + test_comments)" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": 16, 671 | "metadata": { 672 | "collapsed": false 673 | }, 674 | "outputs": [ 675 | { 676 | "data": { 677 | "text/plain": [ 678 | "{'the': 1,\n", 679 | " 'to': 2,\n", 680 | " 'i': 3,\n", 681 | " 'of': 4,\n", 682 | " 'a': 5,\n", 683 | " 'and': 6,\n", 684 | " 'you': 7,\n", 685 | " 'is': 8,\n", 686 | " 'that': 9,\n", 687 | " 'it': 10,\n", 688 | " 'in': 11,\n", 689 | " 'not': 12,\n", 690 | " 'for': 13,\n", 691 | " 'this': 14,\n", 692 | " 'on': 15,\n", 693 | " 'are': 16,\n", 694 | " 'have': 17,\n", 695 | " 'be': 18,\n", 696 | " 'as': 19,\n", 697 | " 'do': 20,\n", 698 | " 'your': 21,\n", 699 | " 'with': 22,\n", 700 | " 'if': 23,\n", 701 | " 'article': 24,\n", 702 | " 'was': 25,\n", 703 | " 'but': 26,\n", 704 | " 'or': 27,\n", 705 | " 'an': 28,\n", 706 | " 'my': 29,\n", 707 | " 'page': 30,\n", 708 | " 'from': 31,\n", 709 | " 'by': 32,\n", 710 | " 'at': 33,\n", 711 | " 'wikipedia': 34,\n", 712 | " 'would': 35,\n", 713 | " 'will': 36,\n", 714 | " 'about': 37,\n", 715 | " 'so': 38,\n", 716 | " 'there': 39,\n", 717 | " 'am': 40,\n", 718 | " 'what': 41,\n", 719 | " 'me': 42,\n", 720 | " 'can': 43,\n", 721 | " 'all': 44,\n", 722 | " 'has': 45,\n", 723 | " 'they': 46,\n", 724 | " 'he': 47,\n", 725 | " 'no': 48,\n", 726 | " 'one': 49,\n", 727 | " 'like': 50,\n", 728 | " 'just': 51,\n", 729 | " 'please': 52,\n", 730 | " 'we': 53,\n", 731 | " 'should': 54,\n", 732 | " 'which': 55,\n", 733 | " 'any': 56,\n", 734 | " 'talk': 57,\n", 735 | " 'been': 58,\n", 736 | " 'more': 59,\n", 737 | " 'some': 60,\n", 738 | " 'who': 61,\n", 739 | " 'other': 62,\n", 740 | " 'here': 63,\n", 741 | " 'see': 64,\n", 742 | " 'think': 65,\n", 743 | " 'also': 66,\n", 744 | " 'did': 67,\n", 745 | " 'his': 68,\n", 746 | " 'does': 69,\n", 747 | " 'fuck': 70,\n", 748 | " 'because': 71,\n", 749 | " 'know': 72,\n", 750 | " 'people': 73,\n", 751 | " 'up': 74,\n", 752 | " 'how': 75,\n", 753 | " 'only': 76,\n", 754 | " 'out': 77,\n", 755 | " 'why': 78,\n", 756 | " 'when': 79,\n", 757 | " 'edit': 80,\n", 758 | " 'use': 81,\n", 759 | " 'then': 82,\n", 760 | " 'were': 83,\n", 761 | " 'may': 84,\n", 762 | " 
'articles': 85,\n", 763 | " 'time': 86,\n", 764 | " 'them': 87,\n", 765 | " 'now': 88,\n", 766 | " 'being': 89,\n", 767 | " 'their': 90,\n", 768 | " 'than': 91,\n", 769 | " 'get': 92,\n", 770 | " 'even': 93,\n", 771 | " 'thanks': 94,\n", 772 | " 'make': 95,\n", 773 | " 'had': 96,\n", 774 | " 'could': 97,\n", 775 | " 'good': 98,\n", 776 | " 'very': 99,\n", 777 | " 'its': 100,\n", 778 | " 'information': 101,\n", 779 | " 'sources': 102,\n", 780 | " 'well': 103,\n", 781 | " 'want': 104,\n", 782 | " 'such': 105,\n", 783 | " 'way': 106,\n", 784 | " 'name': 107,\n", 785 | " 'these': 108,\n", 786 | " 'first': 109,\n", 787 | " 'say': 110,\n", 788 | " 'section': 111,\n", 789 | " 'new': 112,\n", 790 | " 'go': 113,\n", 791 | " 'source': 114,\n", 792 | " 'need': 115,\n", 793 | " 'help': 116,\n", 794 | " 'deletion': 117,\n", 795 | " 'pages': 118,\n", 796 | " 'really': 119,\n", 797 | " 'where': 120,\n", 798 | " 'much': 121,\n", 799 | " 'again': 122,\n", 800 | " 'editing': 123,\n", 801 | " 'many': 124,\n", 802 | " 'made': 125,\n", 803 | " 'most': 126,\n", 804 | " 'used': 127,\n", 805 | " 'into': 128,\n", 806 | " 'thank': 129,\n", 807 | " 'find': 130,\n", 808 | " 'discussion': 131,\n", 809 | " 'same': 132,\n", 810 | " 'edits': 133,\n", 811 | " 'those': 134,\n", 812 | " 'user': 135,\n", 813 | " 'cannot': 136,\n", 814 | " 'since': 137,\n", 815 | " 'work': 138,\n", 816 | " 'point': 139,\n", 817 | " 'look': 140,\n", 818 | " 'deleted': 141,\n", 819 | " 'before': 142,\n", 820 | " 'after': 143,\n", 821 | " 'someone': 144,\n", 822 | " 'right': 145,\n", 823 | " 'still': 146,\n", 824 | " 'add': 147,\n", 825 | " 'two': 148,\n", 826 | " 'over': 149,\n", 827 | " 'too': 150,\n", 828 | " 'him': 151,\n", 829 | " 'read': 152,\n", 830 | " 'take': 153,\n", 831 | " 'image': 154,\n", 832 | " 'back': 155,\n", 833 | " 'something': 156,\n", 834 | " 'going': 157,\n", 835 | " 'fact': 158,\n", 836 | " 'said': 159,\n", 837 | " 'list': 160,\n", 838 | " 'link': 161,\n", 839 | " 'u': 162,\n", 840 | " 'own': 163,\n", 841 | " 'stop': 164,\n", 842 | " 'added': 165,\n", 843 | " 'our': 166,\n", 844 | " 'she': 167,\n", 845 | " 'without': 168,\n", 846 | " 'content': 169,\n", 847 | " 'her': 170,\n", 848 | " 'might': 171,\n", 849 | " 'another': 172,\n", 850 | " 'under': 173,\n", 851 | " 'sure': 174,\n", 852 | " 'history': 175,\n", 853 | " 'removed': 176,\n", 854 | " 'blocked': 177,\n", 855 | " 'seems': 178,\n", 856 | " 'however': 179,\n", 857 | " 'note': 180,\n", 858 | " 'editors': 181,\n", 859 | " 'never': 182,\n", 860 | " 'welcome': 183,\n", 861 | " 'better': 184,\n", 862 | " 'actually': 185,\n", 863 | " 'gay': 186,\n", 864 | " 'put': 187,\n", 865 | " 'place': 188,\n", 866 | " 'case': 189,\n", 867 | " 'us': 190,\n", 868 | " 'let': 191,\n", 869 | " 'hi': 192,\n", 870 | " 'done': 193,\n", 871 | " 'while': 194,\n", 872 | " 'using': 195,\n", 873 | " 'comment': 196,\n", 874 | " 'off': 197,\n", 875 | " 'both': 198,\n", 876 | " 'feel': 199,\n", 877 | " 'person': 200,\n", 878 | " 'anything': 201,\n", 879 | " 'question': 202,\n", 880 | " 'reason': 203,\n", 881 | " 'block': 204,\n", 882 | " 'ask': 205,\n", 883 | " 'things': 206,\n", 884 | " 'believe': 207,\n", 885 | " 'best': 208,\n", 886 | " 'yourself': 209,\n", 887 | " 'part': 210,\n", 888 | " 'vandalism': 211,\n", 889 | " 'hope': 212,\n", 890 | " 'thing': 213,\n", 891 | " 'links': 214,\n", 892 | " 'shit': 215,\n", 893 | " 'fucking': 216,\n", 894 | " 'comments': 217,\n", 895 | " 'little': 218,\n", 896 | " 'already': 219,\n", 897 | " 'though': 220,\n", 898 | " 'nigger': 221,\n", 899 | " 'policy': 
222,\n", 900 | " 'subject': 223,\n", 901 | " 'change': 224,\n", 902 | " 'nothing': 225,\n", 903 | " 'free': 226,\n", 904 | " 'personal': 227,\n", 905 | " 'world': 228,\n", 906 | " 'must': 229,\n", 907 | " 'keep': 230,\n", 908 | " 'wrong': 231,\n", 909 | " 'against': 232,\n", 910 | " 'utc': 233,\n", 911 | " 'problem': 234,\n", 912 | " 'anyone': 235,\n", 913 | " 'above': 236,\n", 914 | " 'give': 237,\n", 915 | " 'remove': 238,\n", 916 | " 'few': 239,\n", 917 | " 'agree': 240,\n", 918 | " 'rather': 241,\n", 919 | " 'last': 242,\n", 920 | " 'wiki': 243,\n", 921 | " 'trying': 244,\n", 922 | " 'reliable': 245,\n", 923 | " 'text': 246,\n", 924 | " 'different': 247,\n", 925 | " 'long': 248,\n", 926 | " 'years': 249,\n", 927 | " 'come': 250,\n", 928 | " 'issue': 251,\n", 929 | " 'understand': 252,\n", 930 | " 'mean': 253,\n", 931 | " 'others': 254,\n", 932 | " 'copyright': 255,\n", 933 | " 'tag': 256,\n", 934 | " 'reference': 257,\n", 935 | " 'english': 258,\n", 936 | " 'word': 259,\n", 937 | " 'editor': 260,\n", 938 | " 'got': 261,\n", 939 | " 's': 262,\n", 940 | " 'probably': 263,\n", 941 | " 'says': 264,\n", 942 | " 'try': 265,\n", 943 | " 'great': 266,\n", 944 | " 'sorry': 267,\n", 945 | " 'found': 268,\n", 946 | " 'questions': 269,\n", 947 | " 'making': 270,\n", 948 | " 'suck': 271,\n", 949 | " 'speedy': 272,\n", 950 | " 'references': 273,\n", 951 | " 'doing': 274,\n", 952 | " 'original': 275,\n", 953 | " 'stupid': 276,\n", 954 | " 'either': 277,\n", 955 | " 'every': 278,\n", 956 | " 'enough': 279,\n", 957 | " 'continue': 280,\n", 958 | " 'state': 281,\n", 959 | " 'else': 282,\n", 960 | " 'simply': 283,\n", 961 | " 'least': 284,\n", 962 | " 'leave': 285,\n", 963 | " 'around': 286,\n", 964 | " 'e': 287,\n", 965 | " 'show': 288,\n", 966 | " 'life': 289,\n", 967 | " 'yes': 290,\n", 968 | " 'poop': 291,\n", 969 | " 'example': 292,\n", 970 | " 'adding': 293,\n", 971 | " 'ip': 294,\n", 972 | " 'far': 295,\n", 973 | " 'day': 296,\n", 974 | " 'consensus': 297,\n", 975 | " 'etc': 298,\n", 976 | " 'needs': 299,\n", 977 | " 'between': 300,\n", 978 | " 'opinion': 301,\n", 979 | " 'thought': 302,\n", 980 | " 'through': 303,\n", 981 | " 'down': 304,\n", 982 | " 'check': 305,\n", 983 | " 'hello': 306,\n", 984 | " 'given': 307,\n", 985 | " 'real': 308,\n", 986 | " 'lot': 309,\n", 987 | " 'request': 310,\n", 988 | " 'material': 311,\n", 989 | " 'book': 312,\n", 990 | " 'ever': 313,\n", 991 | " 'called': 314,\n", 992 | " 'war': 315,\n", 993 | " 'fair': 316,\n", 994 | " 'site': 317,\n", 995 | " 'support': 318,\n", 996 | " 'yet': 319,\n", 997 | " 'maybe': 320,\n", 998 | " 'created': 321,\n", 999 | " 'view': 322,\n", 1000 | " 'having': 323,\n", 1001 | " 'matter': 324,\n", 1002 | " 'penis': 325,\n", 1003 | " 'saying': 326,\n", 1004 | " 'bit': 327,\n", 1005 | " 'delete': 328,\n", 1006 | " 'write': 329,\n", 1007 | " 'seem': 330,\n", 1008 | " 'hate': 331,\n", 1009 | " 'notable': 332,\n", 1010 | " 'term': 333,\n", 1011 | " 'ass': 334,\n", 1012 | " 'always': 335,\n", 1013 | " 'quite': 336,\n", 1014 | " 'old': 337,\n", 1015 | " 'tell': 338,\n", 1016 | " 'reverted': 339,\n", 1017 | " 'perhaps': 340,\n", 1018 | " 'images': 341,\n", 1019 | " 'correct': 342,\n", 1020 | " 'number': 343,\n", 1021 | " 'message': 344,\n", 1022 | " 'instead': 345,\n", 1023 | " 'clearly': 346,\n", 1024 | " 'whether': 347,\n", 1025 | " 'encyclopedia': 348,\n", 1026 | " 'true': 349,\n", 1027 | " 'clear': 350,\n", 1028 | " 'account': 351,\n", 1029 | " 'until': 352,\n", 1030 | " 'post': 353,\n", 1031 | " 'mention': 354,\n", 1032 | " 'language': 
355,\n", 1033 | " 'evidence': 356,\n", 1034 | " 'states': 357,\n", 1035 | " 'bad': 358,\n", 1036 | " 'important': 359,\n", 1037 | " 'makes': 360,\n", 1038 | " 'd': 361,\n", 1039 | " 'further': 362,\n", 1040 | " 'research': 363,\n", 1041 | " 'bitch': 364,\n", 1042 | " 'review': 365,\n", 1043 | " 'times': 366,\n", 1044 | " 'claim': 367,\n", 1045 | " 'getting': 368,\n", 1046 | " 'idea': 369,\n", 1047 | " 'man': 370,\n", 1048 | " 'written': 371,\n", 1049 | " 'media': 372,\n", 1050 | " 'website': 373,\n", 1051 | " 'title': 374,\n", 1052 | " 'users': 375,\n", 1053 | " 'once': 376,\n", 1054 | " 'consider': 377,\n", 1055 | " 'version': 378,\n", 1056 | " 'words': 379,\n", 1057 | " 'dont': 380,\n", 1058 | " 'die': 381,\n", 1059 | " 'considered': 382,\n", 1060 | " 'c': 383,\n", 1061 | " 'top': 384,\n", 1062 | " 'means': 385,\n", 1063 | " 'changes': 386,\n", 1064 | " 'httpwww': 387,\n", 1065 | " 'contributions': 388,\n", 1066 | " 'current': 389,\n", 1067 | " 'big': 390,\n", 1068 | " 'several': 391,\n", 1069 | " 'guidelines': 392,\n", 1070 | " 'cunt': 393,\n", 1071 | " 'oh': 394,\n", 1072 | " 'year': 395,\n", 1073 | " 'each': 396,\n", 1074 | " 'based': 397,\n", 1075 | " 'criteria': 398,\n", 1076 | " 'pov': 399,\n", 1077 | " 'admin': 400,\n", 1078 | " 'main': 401,\n", 1079 | " 'revert': 402,\n", 1080 | " 'possible': 403,\n", 1081 | " 'whole': 404,\n", 1082 | " 'start': 405,\n", 1083 | " 'listed': 406,\n", 1084 | " 'redirect': 407,\n", 1085 | " 'course': 408,\n", 1086 | " 'group': 409,\n", 1087 | " 'seen': 410,\n", 1088 | " 'general': 411,\n", 1089 | " 'mentioned': 412,\n", 1090 | " 'template': 413,\n", 1091 | " 'include': 414,\n", 1092 | " 'kind': 415,\n", 1093 | " 'second': 416,\n", 1094 | " 'faggot': 417,\n", 1095 | " 'following': 418,\n", 1096 | " 'regarding': 419,\n", 1097 | " 'left': 420,\n", 1098 | " 'notice': 421,\n", 1099 | " 'statement': 422,\n", 1100 | " 'address': 423,\n", 1101 | " 'date': 424,\n", 1102 | " 'call': 425,\n", 1103 | " 'care': 426,\n", 1104 | " 'end': 427,\n", 1105 | " 'ok': 428,\n", 1106 | " 'issues': 429,\n", 1107 | " 'three': 430,\n", 1108 | " 'less': 431,\n", 1109 | " 'suggest': 432,\n", 1110 | " 'topic': 433,\n", 1111 | " 'american': 434,\n", 1112 | " 'move': 435,\n", 1113 | " 'sentence': 436,\n", 1114 | " 'sense': 437,\n", 1115 | " 'including': 438,\n", 1116 | " 'sex': 439,\n", 1117 | " 'facts': 440,\n", 1118 | " 'love': 441,\n", 1119 | " 'rules': 442,\n", 1120 | " 'appropriate': 443,\n", 1121 | " 'school': 444,\n", 1122 | " 'happy': 445,\n", 1123 | " 'changed': 446,\n", 1124 | " 'hey': 447,\n", 1125 | " 'create': 448,\n", 1126 | " 'project': 449,\n", 1127 | " 'days': 450,\n", 1128 | " 'picture': 451,\n", 1129 | " 'next': 452,\n", 1130 | " 'provide': 453,\n", 1131 | " 'mind': 454,\n", 1132 | " 'anyway': 455,\n", 1133 | " 'th': 456,\n", 1134 | " 'looking': 457,\n", 1135 | " 'myself': 458,\n", 1136 | " 'although': 459,\n", 1137 | " 'known': 460,\n", 1138 | " 'per': 461,\n", 1139 | " 'jpg': 462,\n", 1140 | " 'included': 463,\n", 1141 | " 'info': 464,\n", 1142 | " 'away': 465,\n", 1143 | " 'relevant': 466,\n", 1144 | " 'started': 467,\n", 1145 | " 'everyone': 468,\n", 1146 | " 'four': 469,\n", 1147 | " 'later': 470,\n", 1148 | " 'god': 471,\n", 1149 | " 'pretty': 472,\n", 1150 | " 'specific': 473,\n", 1151 | " 'explain': 474,\n", 1152 | " 'summary': 475,\n", 1153 | " 'attack': 476,\n", 1154 | " 'b': 477,\n", 1155 | " 'looks': 478,\n", 1156 | " 'answer': 479,\n", 1157 | " 'sign': 480,\n", 1158 | " 'common': 481,\n", 1159 | " 'recent': 482,\n", 1160 | " 'removing': 483,\n", 1161 
| " 'currently': 484,\n", 1162 | " 'lead': 485,\n", 1163 | " 'full': 486,\n", 1164 | " 'stuff': 487,\n", 1165 | " 'itself': 488,\n", 1166 | " 'claims': 489,\n", 1167 | " 'p': 490,\n", 1168 | " 'line': 491,\n", 1169 | " 'especially': 492,\n", 1170 | " 'order': 493,\n", 1171 | " 'country': 494,\n", 1172 | " 'notability': 495,\n", 1173 | " 'names': 496,\n", 1174 | " 'dick': 497,\n", 1175 | " 'wrote': 498,\n", 1176 | " 'during': 499,\n", 1177 | " 'certainly': 500,\n", 1178 | " 'related': 501,\n", 1179 | " 'interested': 502,\n", 1180 | " 'taken': 503,\n", 1181 | " 'appears': 504,\n", 1182 | " 'within': 505,\n", 1183 | " 'public': 506,\n", 1184 | " 'g': 507,\n", 1185 | " 'community': 508,\n", 1186 | " 'able': 509,\n", 1187 | " 'black': 510,\n", 1188 | " 'com': 511,\n", 1189 | " 'wish': 512,\n", 1190 | " 'discuss': 513,\n", 1191 | " 'unless': 514,\n", 1192 | " 'writing': 515,\n", 1193 | " 'neutral': 516,\n", 1194 | " 'hell': 517,\n", 1195 | " 'official': 518,\n", 1196 | " 'below': 519,\n", 1197 | " 'single': 520,\n", 1198 | " 'interest': 521,\n", 1199 | " 'completely': 522,\n", 1200 | " 'cock': 523,\n", 1201 | " 'hard': 524,\n", 1202 | " 'youfuck': 525,\n", 1203 | " 'nice': 526,\n", 1204 | " 'game': 527,\n", 1205 | " 'policies': 528,\n", 1206 | " 'wanted': 529,\n", 1207 | " 'today': 530,\n", 1208 | " 'everything': 531,\n", 1209 | " 'position': 532,\n", 1210 | " 'news': 533,\n", 1211 | " 'according': 534,\n", 1212 | " 'report': 535,\n", 1213 | " 'lol': 536,\n", 1214 | " 'came': 537,\n", 1215 | " 'remember': 538,\n", 1216 | " 'sucks': 539,\n", 1217 | " 'learn': 540,\n", 1218 | " 'guy': 541,\n", 1219 | " 'balls': 542,\n", 1220 | " 'quote': 543,\n", 1221 | " 'warning': 544,\n", 1222 | " 'style': 545,\n", 1223 | " 'wo': 546,\n", 1224 | " 'reading': 547,\n", 1225 | " 'process': 548,\n", 1226 | " 'due': 549,\n", 1227 | " 'therefore': 550,\n", 1228 | " 'paragraph': 551,\n", 1229 | " 'obviously': 552,\n", 1230 | " 'live': 553,\n", 1231 | " 'entry': 554,\n", 1232 | " 'talking': 555,\n", 1233 | " 'stay': 556,\n", 1234 | " 'useful': 557,\n", 1235 | " 'future': 558,\n", 1236 | " 'white': 559,\n", 1237 | " 'google': 560,\n", 1238 | " 'similar': 561,\n", 1239 | " 'government': 562,\n", 1240 | " 'whatever': 563,\n", 1241 | " 'attacks': 564,\n", 1242 | " 'argument': 565,\n", 1243 | " 'past': 566,\n", 1244 | " 'involved': 567,\n", 1245 | " 'political': 568,\n", 1246 | " 't': 569,\n", 1247 | " 'city': 570,\n", 1248 | " 'faith': 571,\n", 1249 | " 'working': 572,\n", 1250 | " 'system': 573,\n", 1251 | " 'ago': 574,\n", 1252 | " 'exactly': 575,\n", 1253 | " 'asked': 576,\n", 1254 | " 'often': 577,\n", 1255 | " 'nonsense': 578,\n", 1256 | " 'web': 579,\n", 1257 | " 'published': 580,\n", 1258 | " 'high': 581,\n", 1259 | " 'email': 582,\n", 1260 | " 'particular': 583,\n", 1261 | " 'o': 584,\n", 1262 | " 'nor': 585,\n", 1263 | " 'de': 586,\n", 1264 | " 'response': 587,\n", 1265 | " 'form': 588,\n", 1266 | " 'guess': 589,\n", 1267 | " 'reasons': 590,\n", 1268 | " 'sandbox': 591,\n", 1269 | " 'edited': 592,\n", 1270 | " 'regards': 593,\n", 1271 | " 'admins': 594,\n", 1272 | " 'noticed': 595,\n", 1273 | " 'almost': 596,\n", 1274 | " 'become': 597,\n", 1275 | " 'truth': 598,\n", 1276 | " 'british': 599,\n", 1277 | " 'company': 600,\n", 1278 | " 'united': 601,\n", 1279 | " 'definition': 602,\n", 1280 | " 'needed': 603,\n", 1281 | " 'placed': 604,\n", 1282 | " 'fine': 605,\n", 1283 | " 'major': 606,\n", 1284 | " 'party': 607,\n", 1285 | " 'terms': 608,\n", 1286 | " 'small': 609,\n", 1287 | " 'law': 610,\n", 1288 | " 'cheers': 
611,\n", 1289 | " 'film': 612,\n", 1290 | " 'vandalize': 613,\n", 1291 | " 'side': 614,\n", 1292 | " 'posted': 615,\n", 1293 | " 'likely': 616,\n", 1294 | " 'search': 617,\n", 1295 | " 'along': 618,\n", 1296 | " 'books': 619,\n", 1297 | " 'five': 620,\n", 1298 | " 'music': 621,\n", 1299 | " 'problems': 622,\n", 1300 | " 'otherwise': 623,\n", 1301 | " 'appreciate': 624,\n", 1302 | " 'power': 625,\n", 1303 | " 'took': 626,\n", 1304 | " 'f': 627,\n", 1305 | " 'saw': 628,\n", 1306 | " 'fat': 629,\n", 1307 | " 'moved': 630,\n", 1308 | " 'generally': 631,\n", 1309 | " 'mother': 632,\n", 1310 | " 'false': 633,\n", 1311 | " 'tried': 634,\n", 1312 | " 'points': 635,\n", 1313 | " 'shows': 636,\n", 1314 | " 'short': 637,\n", 1315 | " 'cited': 638,\n", 1316 | " 'kill': 639,\n", 1317 | " 'taking': 640,\n", 1318 | " 'area': 641,\n", 1319 | " 'present': 642,\n", 1320 | " 'status': 643,\n", 1321 | " 'national': 644,\n", 1322 | " 'certain': 645,\n", 1323 | " 'stated': 646,\n", 1324 | " 'm': 647,\n", 1325 | " 'sort': 648,\n", 1326 | " 'dispute': 649,\n", 1327 | " 'entire': 650,\n", 1328 | " 'recently': 651,\n", 1329 | " 'deleting': 652,\n", 1330 | " 'large': 653,\n", 1331 | " 'piece': 654,\n", 1332 | " 'username': 655,\n", 1333 | " 'jews': 656,\n", 1334 | " 'death': 657,\n", 1335 | " 'soon': 658,\n", 1336 | " 'set': 659,\n", 1337 | " 'citation': 660,\n", 1338 | " 'indeed': 661,\n", 1339 | " 'family': 662,\n", 1340 | " 'im': 663,\n", 1341 | " 'provided': 664,\n", 1342 | " 'rule': 665,\n", 1343 | " 'description': 666,\n", 1344 | " 'explanation': 667,\n", 1345 | " 'guys': 668,\n", 1346 | " 'wp': 669,\n", 1347 | " 'actual': 670,\n", 1348 | " 'follow': 671,\n", 1349 | " 'story': 672,\n", 1350 | " 'interesting': 673,\n", 1351 | " 'band': 674,\n", 1352 | " 'reverting': 675,\n", 1353 | " 'uploaded': 676,\n", 1354 | " 'context': 677,\n", 1355 | " 'views': 678,\n", 1356 | " 'aware': 679,\n", 1357 | " 'open': 680,\n", 1358 | " 'human': 681,\n", 1359 | " 'category': 682,\n", 1360 | " 'reply': 683,\n", 1361 | " 'knowledge': 684,\n", 1362 | " 'appear': 685,\n", 1363 | " 'works': 686,\n", 1364 | " 'week': 687,\n", 1365 | " 'themselves': 688,\n", 1366 | " 'file': 689,\n", 1367 | " 'obvious': 690,\n", 1368 | " 'decide': 691,\n", 1369 | " 'improve': 692,\n", 1370 | " 'theory': 693,\n", 1371 | " 'simple': 694,\n", 1372 | " 'university': 695,\n", 1373 | " 'cite': 696,\n", 1374 | " 'went': 697,\n", 1375 | " 'yeah': 698,\n", 1376 | " 'thus': 699,\n", 1377 | " 'type': 700,\n", 1378 | " 'alone': 701,\n", 1379 | " 'px': 702,\n", 1380 | " 'ones': 703,\n", 1381 | " 'told': 704,\n", 1382 | " 'external': 705,\n", 1383 | " 'john': 706,\n", 1384 | " 'seriously': 707,\n", 1385 | " 'various': 708,\n", 1386 | " 'proposed': 709,\n", 1387 | " 'administrator': 710,\n", 1388 | " 'attention': 711,\n", 1389 | " 'disagree': 712,\n", 1390 | " 'german': 713,\n", 1391 | " 'comes': 714,\n", 1392 | " 'banned': 715,\n", 1393 | " 'hand': 716,\n", 1394 | " 'citations': 717,\n", 1395 | " 'result': 718,\n", 1396 | " 'job': 719,\n", 1397 | " 'internet': 720,\n", 1398 | " 'cause': 721,\n", 1399 | " 'mr': 722,\n", 1400 | " 'test': 723,\n", 1401 | " 'complete': 724,\n", 1402 | " 'goes': 725,\n", 1403 | " 'previous': 726,\n", 1404 | " 'meaning': 727,\n", 1405 | " 're': 728,\n", 1406 | " 'longer': 729,\n", 1407 | " 'happened': 730,\n", 1408 | " 'avoid': 731,\n", 1409 | " 'solid': 732,\n", 1410 | " 'author': 733,\n", 1411 | " 'members': 734,\n", 1412 | " 'allowed': 735,\n", 1413 | " 'addition': 736,\n", 1414 | " 'proper': 737,\n", 1415 | " 'heard': 738,\n", 1416 | 
" 'contributing': 739,\n", 1417 | " 'deal': 740,\n", 1418 | " 'usually': 741,\n", 1419 | " 'enjoy': 742,\n", 1420 | " 'idiot': 743,\n", 1421 | " 'series': 744,\n", 1422 | " 'r': 745,\n", 1423 | " 'contact': 746,\n", 1424 | " 'middle': 747,\n", 1425 | " 'himself': 748,\n", 1426 | " 'necessary': 749,\n", 1427 | " 'exist': 750,\n", 1428 | " 'creating': 751,\n", 1429 | " 'sourced': 752,\n", 1430 | " 'together': 753,\n", 1431 | " 'living': 754,\n", 1432 | " 'conflict': 755,\n", 1433 | " 'available': 756,\n", 1434 | " 'head': 757,\n", 1435 | " 'vandal': 758,\n", 1436 | " 'ban': 759,\n", 1437 | " 'uk': 760,\n", 1438 | " 'n': 761,\n", 1439 | " 'fag': 762,\n", 1440 | " 'calling': 763,\n", 1441 | " 'sites': 764,\n", 1442 | " 'npov': 765,\n", 1443 | " 'standard': 766,\n", 1444 | " 'sections': 767,\n", 1445 | " 'automatically': 768,\n", 1446 | " 'video': 769,\n", 1447 | " 'rest': 770,\n", 1448 | " 'tags': 771,\n", 1449 | " 'copy': 772,\n", 1450 | " 'opinions': 773,\n", 1451 | " 'july': 774,\n", 1452 | " 'contribs': 775,\n", 1453 | " 'worked': 776,\n", 1454 | " 'serious': 777,\n", 1455 | " 'statements': 778,\n", 1456 | " 'valid': 779,\n", 1457 | " 'science': 780,\n", 1458 | " 'act': 781,\n", 1459 | " 'hours': 782,\n", 1460 | " 'personally': 783,\n", 1461 | " 'space': 784,\n", 1462 | " 'debate': 785,\n", 1463 | " 'multiple': 786,\n", 1464 | " 'assume': 787,\n", 1465 | " 'wikiproject': 788,\n", 1466 | " 'separate': 789,\n", 1467 | " 'crap': 790,\n", 1468 | " 'fix': 791,\n", 1469 | " 'respect': 792,\n", 1470 | " 'asshole': 793,\n", 1471 | " 'member': 794,\n", 1472 | " 'accurate': 795,\n", 1473 | " 'details': 796,\n", 1474 | " 'afd': 797,\n", 1475 | " 'accept': 798,\n", 1476 | " 'bias': 799,\n", 1477 | " 'doubt': 800,\n", 1478 | " 'couple': 801,\n", 1479 | " 'tildes': 802,\n", 1480 | " 'gets': 803,\n", 1481 | " 'months': 804,\n", 1482 | " 'moron': 805,\n", 1483 | " 'online': 806,\n", 1484 | " 'record': 807,\n", 1485 | " 'attempt': 808,\n", 1486 | " 'data': 809,\n", 1487 | " 'refer': 810,\n", 1488 | " 'k': 811,\n", 1489 | " 'criticism': 812,\n", 1490 | " 'historical': 813,\n", 1491 | " 'play': 814,\n", 1492 | " 'women': 815,\n", 1493 | " 'close': 816,\n", 1494 | " 'accepted': 817,\n", 1495 | " 'church': 818,\n", 1496 | " 'early': 819,\n", 1497 | " 'third': 820,\n", 1498 | " 'table': 821,\n", 1499 | " 'biased': 822,\n", 1500 | " 'rights': 823,\n", 1501 | " 'level': 824,\n", 1502 | " 'indicate': 825,\n", 1503 | " 'actions': 826,\n", 1504 | " 'tagged': 827,\n", 1505 | " 'song': 828,\n", 1506 | " 'action': 829,\n", 1507 | " 'wikipedian': 830,\n", 1508 | " 'vote': 831,\n", 1509 | " 'lack': 832,\n", 1510 | " 'violation': 833,\n", 1511 | " 'face': 834,\n", 1512 | " 'run': 835,\n", 1513 | " 'apparently': 836,\n", 1514 | " 'racist': 837,\n", 1515 | " 'photo': 838,\n", 1516 | " 'south': 839,\n", 1517 | " 'difference': 840,\n", 1518 | " 'directly': 841,\n", 1519 | " 'countries': 842,\n", 1520 | " 'uses': 843,\n", 1521 | " 'meant': 844,\n", 1522 | " 'situation': 845,\n", 1523 | " 'sometimes': 846,\n", 1524 | " 'none': 847,\n", 1525 | " 'legal': 848,\n", 1526 | " 'except': 849,\n", 1527 | " 'upon': 850,\n", 1528 | " 'primary': 851,\n", 1529 | " 'period': 852,\n", 1530 | " 'dicks': 853,\n", 1531 | " 'born': 854,\n", 1532 | " 'thinking': 855,\n", 1533 | " 'specifically': 856,\n", 1534 | " 'explaining': 857,\n", 1535 | " 'damn': 858,\n", 1536 | " 'significant': 859,\n", 1537 | " 'jewish': 860,\n", 1538 | " 'httpen': 861,\n", 1539 | " 'helpful': 862,\n", 1540 | " 'rationale': 863,\n", 1541 | " 'described': 864,\n", 1542 | " 
'cases': 865,\n", 1543 | " 'march': 866,\n", 1544 | " 'dead': 867,\n", 1545 | " 'children': 868,\n", 1546 | " 'supposed': 869,\n", 1547 | " 'asking': 870,\n", 1548 | " 'among': 871,\n", 1549 | " 'quality': 872,\n", 1550 | " 'modern': 873,\n", 1551 | " 'changing': 874,\n", 1552 | " 'x': 875,\n", 1553 | " 'niggers': 876,\n", 1554 | " 'india': 877,\n", 1555 | " 'incorrect': 878,\n", 1556 | " 'men': 879,\n", 1557 | " 'box': 880,\n", 1558 | " 'inclusion': 881,\n", 1559 | " 'unsigned': 882,\n", 1560 | " 'release': 883,\n", 1561 | " 'proof': 884,\n", 1562 | " 'prove': 885,\n", 1563 | " 'outside': 886,\n", 1564 | " 'wanker': 887,\n", 1565 | " 'culture': 888,\n", 1566 | " 'album': 889,\n", 1567 | " 'team': 890,\n", 1568 | " 'coming': 891,\n", 1569 | " 'speak': 892,\n", 1570 | " 'friend': 893,\n", 1571 | " 'worth': 894,\n", 1572 | " 'international': 895,\n", 1573 | " 'okay': 896,\n", 1574 | " 'august': 897,\n", 1575 | " 'wait': 898,\n", 1576 | " 'preceding': 899,\n", 1577 | " 'character': 900,\n", 1578 | " 'bastard': 901,\n", 1579 | " 'messages': 902,\n", 1580 | " 'particularly': 903,\n", 1581 | " 'purpose': 904,\n", 1582 | " 'red': 905,\n", 1583 | " 'readers': 906,\n", 1584 | " 'military': 907,\n", 1585 | " 'america': 908,\n", 1586 | " 'field': 909,\n", 1587 | " 'behavior': 910,\n", 1588 | " 'house': 911,\n", 1589 | " 'error': 912,\n", 1590 | " 'computer': 913,\n", 1591 | " 'bring': 914,\n", 1592 | " 'jew': 915,\n", 1593 | " 'existing': 916,\n", 1594 | " 'earlier': 917,\n", 1595 | " 'putting': 918,\n", 1596 | " 'numbers': 919,\n", 1597 | " 'arguments': 920,\n", 1598 | " 'looked': 921,\n", 1599 | " 'st': 922,\n", 1600 | " 'linked': 923,\n", 1601 | " 'pillars': 924,\n", 1602 | " 'happen': 925,\n", 1603 | " 'control': 926,\n", 1604 | " 'gave': 927,\n", 1605 | " 'sock': 928,\n", 1606 | " 'reader': 929,\n", 1607 | " 'january': 930,\n", 1608 | " 'discussed': 931,\n", 1609 | " 'jesus': 932,\n", 1610 | " 'produce': 933,\n", 1611 | " 'figure': 934,\n", 1612 | " 'million': 935,\n", 1613 | " 'contribute': 936,\n", 1614 | " 'neither': 937,\n", 1615 | " 'examples': 938,\n", 1616 | " 'irrelevant': 939,\n", 1617 | " 'friends': 940,\n", 1618 | " 'june': 941,\n", 1619 | " 'giving': 942,\n", 1620 | " 'disruptive': 943,\n", 1621 | " 'business': 944,\n", 1622 | " 'entirely': 945,\n", 1623 | " 'gives': 946,\n", 1624 | " 'possibly': 947,\n", 1625 | " 'body': 948,\n", 1626 | " 'north': 949,\n", 1627 | " 'april': 950,\n", 1628 | " 'huge': 951,\n", 1629 | " 'eat': 952,\n", 1630 | " 'pictures': 953,\n", 1631 | " 'israel': 954,\n", 1632 | " 'lost': 955,\n", 1633 | " 'totally': 956,\n", 1634 | " 'behind': 957,\n", 1635 | " 'loser': 958,\n", 1636 | " 'religion': 959,\n", 1637 | " 'allow': 960,\n", 1638 | " 'absolutely': 961,\n", 1639 | " 'wants': 962,\n", 1640 | " 'french': 963,\n", 1641 | " 'decided': 964,\n", 1642 | " 'poor': 965,\n", 1643 | " 'sound': 966,\n", 1644 | " 'across': 967,\n", 1645 | " 'finally': 968,\n", 1646 | " 'html': 969,\n", 1647 | " 'nobody': 970,\n", 1648 | " 'light': 971,\n", 1649 | " 'independent': 972,\n", 1650 | " 'knows': 973,\n", 1651 | " 'sounds': 974,\n", 1652 | " 'named': 975,\n", 1653 | " 'infobox': 976,\n", 1654 | " 'watch': 977,\n", 1655 | " 'majority': 978,\n", 1656 | " 'gone': 979,\n", 1657 | " 'individual': 980,\n", 1658 | " 'killed': 981,\n", 1659 | " 'mistake': 982,\n", 1660 | " 'home': 983,\n", 1661 | " 'map': 984,\n", 1662 | " 'reported': 985,\n", 1663 | " 'greek': 986,\n", 1664 | " 'acceptable': 987,\n", 1665 | " 'width': 988,\n", 1666 | " 'scientific': 989,\n", 1667 | " 'christian': 
990,\n", 1668 | " 'abuse': 991,\n", 1669 | " 'money': 992,\n", 1670 | " 'concerns': 993,\n", 1671 | " 'chance': 994,\n", 1672 | " 'takes': 995,\n", 1673 | " 'press': 996,\n", 1674 | " 'population': 997,\n", 1675 | " 'blocking': 998,\n", 1676 | " 'meet': 999,\n", 1677 | " 'groups': 1000,\n", 1678 | " ...}" 1679 | ] 1680 | }, 1681 | "execution_count": 16, 1682 | "metadata": {}, 1683 | "output_type": "execute_result" 1684 | } 1685 | ], 1686 | "source": [ 1687 | "tokenizer.word_index # map 'the' to 1, map 'to' to 2,......" 1688 | ] 1689 | }, 1690 | { 1691 | "cell_type": "code", 1692 | "execution_count": 17, 1693 | "metadata": { 1694 | "collapsed": true 1695 | }, 1696 | "outputs": [], 1697 | "source": [ 1698 | "# Transform training/testing sentences to training/testing sequences.\n", 1699 | "train_sequences = tokenizer.texts_to_sequences(train_comments)\n", 1700 | "test_sequences = tokenizer.texts_to_sequences(test_comments)" 1701 | ] 1702 | }, 1703 | { 1704 | "cell_type": "markdown", 1705 | "metadata": {}, 1706 | "source": [ 1707 | "## have a look on transformed sequences" 1708 | ] 1709 | }, 1710 | { 1711 | "cell_type": "code", 1712 | "execution_count": 18, 1713 | "metadata": { 1714 | "collapsed": false 1715 | }, 1716 | "outputs": [ 1717 | { 1718 | "name": "stdout", 1719 | "output_type": "stream", 1720 | "text": [ 1721 | "Transformed\n", 1722 | " [1, 133, 125, 173, 29, 655, 4387, 9203, 1159, 83, 339, 46, 83, 12, 13230, 51, 6379, 15, 60, 2522, 143, 3, 2650, 33, 112, 1128, 16118, 2514, 6, 52, 20, 12, 238, 1, 413, 31, 1, 57, 30, 137, 3, 40, 3881, 88]\n", 1723 | "\n", 1724 | "Cleaned\n", 1725 | " explanationwhy the edits made under my username hardcore metallica fan were reverted they were not vandalisms just closure on some gas after i voted at new york dolls fac and please do not remove the template from the talk page since i am retired now \n", 1726 | "\n", 1727 | "------------------\n" 1728 | ] 1729 | } 1730 | ], 1731 | "source": [ 1732 | "for i in range(1):\n", 1733 | " print(\"Transformed\\n\", str(train_sequences[i]) + '\\n')\n", 1734 | " print(\"Cleaned\\n\", train_comments[i] + '\\n')\n", 1735 | " print(\"------------------\")" 1736 | ] 1737 | }, 1738 | { 1739 | "cell_type": "code", 1740 | "execution_count": 19, 1741 | "metadata": { 1742 | "collapsed": false 1743 | }, 1744 | "outputs": [ 1745 | { 1746 | "name": "stdout", 1747 | "output_type": "stream", 1748 | "text": [ 1749 | "Found 382324 unique tokens\n", 1750 | "Shape of data tensor: (159571, 100)\n", 1751 | "Shape of label tensor: (159571, 6)\n", 1752 | "Shape of test_data tensor: (153164, 100)\n" 1753 | ] 1754 | } 1755 | ], 1756 | "source": [ 1757 | "word_index = tokenizer.word_index\n", 1758 | "print('Found %s unique tokens' % len(word_index))\n", 1759 | "\n", 1760 | "train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", 1761 | "print('Shape of data tensor:', train_data.shape)\n", 1762 | "print('Shape of label tensor:', train_labels.shape)\n", 1763 | "\n", 1764 | "test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", 1765 | "print('Shape of test_data tensor:', test_data.shape)" 1766 | ] 1767 | }, 1768 | { 1769 | "cell_type": "code", 1770 | "execution_count": 20, 1771 | "metadata": { 1772 | "collapsed": true 1773 | }, 1774 | "outputs": [], 1775 | "source": [ 1776 | "'''\n", 1777 | "Try to build a tokenzier, which transform [['Hello', 'World'], ['Greeting', 'my', 'friend'], ['Hello', 'have', 'a', 'nice', 'day']]\n", 1778 | "to a list of index sequences. 
1768 | { 1769 | "cell_type": "code", 1770 | "execution_count": 20, 1771 | "metadata": { 1772 | "collapsed": true 1773 | }, 1774 | "outputs": [], 1775 | "source": [ 1776 | "'''\n", 1777 | "Try to build a tokenizer which transforms [['Hello', 'World'], ['Greeting', 'my', 'friend'], ['Hello', 'have', 'a', 'nice', 'day']]\n", 1778 | "into a list of index sequences. Note that the indices should start from 1, because 0 is reserved for the padding token in some frameworks.\n", 1779 | "'''\n", 1780 | "tests_input_sentences = [['Hello', 'World'], ['Greeting', 'my', 'friend'], ['Hello', 'have', 'a', 'nice', 'day']]\n", 1781 | "transform_this_sentences = [['Hello', 'my', 'friend']]\n", 1782 | "\n", 1783 | "def index_encoding(sentences, raw_sent):\n", 1784 | "    word2idx = {}\n", 1785 | "    idx2word = {}\n", 1786 | "    ctr = 1  # start from 1; 0 is the padding id\n", 1787 | "    for sentence in sentences:\n", 1788 | "        for word in sentence:\n", 1789 | "            if word not in word2idx:\n", 1790 | "                word2idx[word] = ctr\n", 1791 | "                idx2word[ctr] = word\n", 1792 | "                ctr += 1\n", 1793 | "    results = []\n", 1794 | "    for sent in raw_sent:\n", 1795 | "        results.append([word2idx[word] for word in sent])\n", 1796 | "    return results" 1797 | ] 1798 | }, 1799 | { 1800 | "cell_type": "code", 1801 | "execution_count": 21, 1802 | "metadata": { 1803 | "collapsed": false 1804 | }, 1805 | "outputs": [ 1806 | { 1807 | "name": "stdout", 1808 | "output_type": "stream", 1809 | "text": [ 1810 | "Congrats, you passed the test.\n" 1811 | ] 1812 | } 1813 | ], 1814 | "source": [ 1815 | "transformed = index_encoding(tests_input_sentences, transform_this_sentences)\n", 1816 | "assert transformed == [[1, 4, 5]], \"The answer is not correct.\"\n", 1817 | "print(\"Congrats, you passed the test.\")" 1818 | ] 1819 | }, 1820 | { 1821 | "cell_type": "markdown", 1822 | "metadata": {}, 1823 | "source": [ 1824 | "## Models" 1825 | ] 1826 | }, 1827 | { 1828 | "cell_type": "markdown", 1829 | "metadata": {}, 1830 | "source": [ 1831 | "## Text RNN\n", 1832 | "\n", 1833 | "![TextRNN](resources/textrnn.png)\n", 1834 | "\n", 1835 | "Here we present a classical structure for text classification: a 2-layer bidirectional GRU. Instead of adding a fully connected layer over all time steps, we keep only the last hidden state of the sequence. (LB: 50%, AUC: 0.982)" 1836 | ] 1837 | }, 1838 | { 1839 | "cell_type": "code", 1840 | "execution_count": 22, 1841 | "metadata": { 1842 | "collapsed": true 1843 | }, 1844 | "outputs": [], 1845 | "source": [ 1846 | "########################################\n", 1847 | "## Define the text rnn model structure\n", 1848 | "########################################\n", 1849 | "def get_text_rnn():\n", 1850 | "    recurrent_units = 48\n", 1851 | "    dense_units = 32\n", 1852 | "    output_units = 6\n", 1853 | "    \n", 1854 | "    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))\n", 1855 | "    embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(input_layer)\n", 1856 | "    \n", 1857 | "    x = Bidirectional(GRU(recurrent_units, return_sequences=True))(embedding_layer)\n", 1858 | "    x = Bidirectional(GRU(recurrent_units, return_sequences=False))(x)\n", 1859 | "    \n", 1860 | "    x = Dense(dense_units, activation=\"relu\")(x)\n", 1861 | "    output_layer = Dense(output_units, activation=\"sigmoid\")(x)\n", 1862 | "    \n", 1863 | "    model = Model(inputs=input_layer, outputs=output_layer)\n", 1864 | "    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", 1865 | "    return model" 1866 | ] 1867 | },
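{ "cell_type": "markdown", "metadata": {}, "source": [ "Because `return_sequences=False` on the second GRU keeps only the final hidden state, the recurrent stack collapses the `(batch, 100, 96)` sequence into a single `(batch, 96)` vector. You can verify the layer shapes and parameter counts before training (output omitted here):" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "text_rnn = get_text_rnn()\n", "text_rnn.summary()" ] },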
1868 | { 1869 | "cell_type": "markdown", 1870 | "metadata": {}, 1871 | "source": [ 1872 | "## TextCNN\n", 1873 | "\n", 1874 | "![TextCNN](resources/textcnn.png)\n", 1875 | "\n", 1876 | "Convolution in natural language processing can be considered a special kind of n-gram extractor. We simply use kernels with window sizes (2, 3, 4) to extract regional features. (LB: 50%, AUC: 0.982)" 1877 | ] 1878 | }, 1879 | { 1880 | "cell_type": "code", 1881 | "execution_count": 23, 1882 | "metadata": { 1883 | "collapsed": true 1884 | }, 1885 | "outputs": [], 1886 | "source": [ 1887 | "########################################\n", 1888 | "## Define the text cnn model structure\n", 1889 | "########################################\n", 1890 | "def get_text_cnn():\n", 1891 | "    filter_nums = 120\n", 1892 | "    dense_units = 72\n", 1893 | "    output_units = 6\n", 1894 | "    \n", 1895 | "    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n", 1896 | "    embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(input_layer)\n", 1897 | "    \n", 1898 | "    conv_0 = Conv1D(filter_nums, 2, kernel_initializer=\"normal\", padding=\"valid\", activation=\"relu\")(embedding_layer)\n", 1899 | "    conv_1 = Conv1D(filter_nums, 3, kernel_initializer=\"normal\", padding=\"valid\", activation=\"relu\")(embedding_layer)\n", 1900 | "    conv_2 = Conv1D(filter_nums, 4, kernel_initializer=\"normal\", padding=\"valid\", activation=\"relu\")(embedding_layer)\n", 1901 | "\n", 1902 | "    maxpool_0 = GlobalMaxPooling1D()(conv_0)\n", 1903 | "    maxpool_1 = GlobalMaxPooling1D()(conv_1)\n", 1904 | "    maxpool_2 = GlobalMaxPooling1D()(conv_2)\n", 1905 | "\n", 1906 | "    merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2])\n", 1907 | "    h1 = Dense(units=dense_units, activation=\"relu\")(merged_tensor)\n", 1908 | "    output = Dense(units=output_units, activation='sigmoid')(h1)\n", 1909 | "\n", 1910 | "    model = Model(inputs=input_layer, outputs=output)\n", 1911 | "    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", 1912 | "    return model" 1913 | ] 1914 | },
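{ "cell_type": "markdown", "metadata": {}, "source": [ "`GlobalMaxPooling1D` simply takes the maximum of each filter's activations over the time axis ('max over time'). A tiny NumPy sketch with made-up numbers:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "feature_map = np.array([[0.1, 0.9],\n", "                        [0.7, 0.2],\n", "                        [0.3, 0.4]])  # (timesteps, filters)\n", "print(feature_map.max(axis=0))        # -> [0.7 0.9]" ] },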
1915 | { 1916 | "cell_type": "markdown", 1917 | "metadata": {}, 1918 | "source": [ 1919 | "## Hybrid Text NN\n", 1920 | "\n", 1921 | "![hybrid](resources/hybrid.png)\n", 1922 | "\n", 1923 | "This structure mixes the feature-representation ideas of RNNs and CNNs. We first place a recurrent layer after the embedding to build word-level sequential information, then connect it to a convolutional layer that extracts regional features from the hidden states. (LB: 30%, AUC: 0.983)" 1924 | ] 1925 | }, 1926 | { 1927 | "cell_type": "code", 1928 | "execution_count": 24, 1929 | "metadata": { 1930 | "collapsed": true 1931 | }, 1932 | "outputs": [], 1933 | "source": [ 1934 | "########################################\n", 1935 | "## Define the text hybrid model structure\n", 1936 | "########################################\n", 1937 | "def get_hybrid_textnn():\n", 1938 | "    recurrent_units = 48\n", 1939 | "    dense_units = 32\n", 1940 | "    filter_nums = 64\n", 1941 | "    output_units = 6\n", 1942 | "\n", 1943 | "    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))\n", 1944 | "    embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(input_layer)\n", 1945 | "    \n", 1946 | "    x = Bidirectional(GRU(recurrent_units, return_sequences=True))(embedding_layer)\n", 1947 | "    x = Conv1D(filter_nums, 2, kernel_initializer=\"normal\", padding=\"valid\", activation=\"relu\")(x)\n", 1948 | "    \n", 1949 | "    max_pool = GlobalMaxPooling1D()(x)\n", 1950 | "    max_pool = Dropout(0.5)(max_pool)\n", 1951 | "    \n", 1952 | "    output_layer = Dense(output_units, activation=\"sigmoid\")(max_pool)\n", 1953 | "\n", 1954 | "    model = Model(inputs=input_layer, outputs=output_layer)\n", 1955 | "    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", 1956 | "    return model" 1957 | ] 1958 | }, 1959 | { 1960 | "cell_type": "markdown", 1961 | "metadata": {}, 1962 | "source": [ 1963 | "### K-Fold Cross-Validation\n", 1964 | "\nWe split the training data into K folds; each fold serves once as the validation set while the model trains on the remaining folds, and the per-fold validation losses are averaged." 1965 | ] 1966 | },
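{ "cell_type": "markdown", "metadata": {}, "source": [ "For reference, the manual index arithmetic in the next cell can also be expressed with scikit-learn's `KFold` (an equivalent sketch, assuming scikit-learn is available; the tutorial keeps its own loop so every step stays explicit):" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn.model_selection import KFold\n", "\n", "kf = KFold(n_splits=2, shuffle=False)  # contiguous splits, like the manual version below\n", "for fold_id, (train_idx, val_idx) in enumerate(kf.split(train_data)):\n", "    print('fold', fold_id, '-> train:', len(train_idx), 'val:', len(val_idx))" ] },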
\n", 1981 | " early_stopping = EarlyStopping(monitor='val_loss', patience=5)\n", 1982 | " bst_model_path = \"ToxicModel\" + str(fold_id) + '.h5'\n", 1983 | " model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)\n", 1984 | " \n", 1985 | " # training on given fold data\n", 1986 | " hist = model.fit(train_x, train_y,\n", 1987 | " validation_data=(val_x, val_y),\n", 1988 | " epochs=50, batch_size=batch_size, shuffle=True,\n", 1989 | " callbacks=[early_stopping, model_checkpoint])\n", 1990 | " \n", 1991 | " # get the minimal validation log loss on this fold\n", 1992 | " bst_val_score = min(hist.history['val_loss'])\n", 1993 | "\n", 1994 | " # return the model with best weight, best fold-val score\n", 1995 | " return model, bst_val_score\n", 1996 | "\n", 1997 | "def train_folds(X, y, fold_count, batch_size, get_model_func):\n", 1998 | " fold_size = len(X) // fold_count\n", 1999 | " models = []\n", 2000 | " score = 0\n", 2001 | " \n", 2002 | " # split the whole dataset to `fold_count` fold, and train our model on each fold\n", 2003 | " for fold_id in range(0, fold_count):\n", 2004 | " fold_start = fold_size * fold_id\n", 2005 | " fold_end = fold_start + fold_size\n", 2006 | "\n", 2007 | " if fold_id == fold_size - 1:\n", 2008 | " fold_end = len(X)\n", 2009 | "\n", 2010 | " # Generate the train/val data on fold i\n", 2011 | " train_x = np.concatenate([X[:fold_start], X[fold_end:]])\n", 2012 | " train_y = np.concatenate([y[:fold_start], y[fold_end:]])\n", 2013 | "\n", 2014 | " val_x = X[fold_start:fold_end]\n", 2015 | " val_y = y[fold_start:fold_end]\n", 2016 | " \n", 2017 | " print(\"Training on fold #\", fold_id)\n", 2018 | " model, bst_val_score = _train_model_by_logloss(get_model_func(), batch_size, train_x, train_y, val_x, val_y, fold_id)\n", 2019 | " score += bst_val_score\n", 2020 | " models.append(model)\n", 2021 | " return models, score / fold_count" 2022 | ] 2023 | }, 2024 | { 2025 | "cell_type": "code", 2026 | "execution_count": 26, 2027 | "metadata": { 2028 | "collapsed": false 2029 | }, 2030 | "outputs": [ 2031 | { 2032 | "name": "stdout", 2033 | "output_type": "stream", 2034 | "text": [ 2035 | "Training on fold # 0\n", 2036 | "Train on 79786 samples, validate on 79785 samples\n", 2037 | "Epoch 1/50\n", 2038 | "79786/79786 [==============================] - 9s 107us/step - loss: 0.1187 - acc: 0.9692 - val_loss: 0.0560 - val_acc: 0.9803\n", 2039 | "Epoch 2/50\n", 2040 | "79786/79786 [==============================] - 5s 59us/step - loss: 0.0484 - acc: 0.9823 - val_loss: 0.0530 - val_acc: 0.9806\n", 2041 | "Epoch 3/50\n", 2042 | "79786/79786 [==============================] - 5s 59us/step - loss: 0.0405 - acc: 0.9848 - val_loss: 0.0538 - val_acc: 0.9811\n", 2043 | "Epoch 4/50\n", 2044 | "79786/79786 [==============================] - 5s 59us/step - loss: 0.0342 - acc: 0.9870 - val_loss: 0.0574 - val_acc: 0.9807\n", 2045 | "Epoch 5/50\n", 2046 | "79786/79786 [==============================] - 5s 59us/step - loss: 0.0287 - acc: 0.9893 - val_loss: 0.0644 - val_acc: 0.9801\n", 2047 | "Epoch 6/50\n", 2048 | "79786/79786 [==============================] - 5s 59us/step - loss: 0.0237 - acc: 0.9914 - val_loss: 0.0689 - val_acc: 0.9794\n", 2049 | "Epoch 7/50\n", 2050 | "79786/79786 [==============================] - 5s 60us/step - loss: 0.0190 - acc: 0.9931 - val_loss: 0.0799 - val_acc: 0.9793\n", 2051 | "Training on fold # 1\n", 2052 | "Train on 79786 samples, validate on 79785 samples\n", 2053 | "Epoch 1/50\n", 2054 | "79786/79786 
[==============================] - 5s 64us/step - loss: 0.1229 - acc: 0.9661 - val_loss: 0.0563 - val_acc: 0.9803\n", 2055 | "Epoch 2/50\n", 2056 | "79786/79786 [==============================] - 5s 59us/step - loss: 0.0494 - acc: 0.9819 - val_loss: 0.0518 - val_acc: 0.9815\n", 2057 | "Epoch 3/50\n", 2058 | "79786/79786 [==============================] - 5s 59us/step - loss: 0.0412 - acc: 0.9844 - val_loss: 0.0536 - val_acc: 0.9815\n", 2059 | "Epoch 4/50\n", 2060 | "79786/79786 [==============================] - 5s 59us/step - loss: 0.0348 - acc: 0.9868 - val_loss: 0.0592 - val_acc: 0.9811\n", 2061 | "Epoch 5/50\n", 2062 | "79786/79786 [==============================] - 5s 60us/step - loss: 0.0289 - acc: 0.9891 - val_loss: 0.0629 - val_acc: 0.9801\n", 2063 | "Epoch 6/50\n", 2064 | "79786/79786 [==============================] - 5s 60us/step - loss: 0.0231 - acc: 0.9916 - val_loss: 0.0673 - val_acc: 0.9797\n", 2065 | "Epoch 7/50\n", 2066 | "79786/79786 [==============================] - 5s 59us/step - loss: 0.0177 - acc: 0.9936 - val_loss: 0.0785 - val_acc: 0.9797\n" 2067 | ] 2068 | } 2069 | ], 2070 | "source": [ 2071 | "models, val_loss = train_folds(train_data, train_labels, 2, 256, get_text_cnn)" 2072 | ] 2073 | }, 2074 | { 2075 | "cell_type": "markdown", 2076 | "metadata": {}, 2077 | "source": [ 2078 | "# Practice: Try to beat the baseline: val_loss=0.050" 2079 | ] 2080 | }, 2081 | { 2082 | "cell_type": "code", 2083 | "execution_count": 27, 2084 | "metadata": { 2085 | "collapsed": true 2086 | }, 2087 | "outputs": [], 2088 | "source": [ 2089 | "your_batch_size = 256\n", 2090 | "\n", 2091 | "def get_your_model():\n", 2092 | " '''your show time'''\n", 2093 | " return model" 2094 | ] 2095 | }, 2096 | { 2097 | "cell_type": "code", 2098 | "execution_count": 28, 2099 | "metadata": { 2100 | "collapsed": true 2101 | }, 2102 | "outputs": [], 2103 | "source": [ 2104 | "#models, val_loss = train_folds(train_data, train_labels, 2, your_batch_size, get_your_model)" 2105 | ] 2106 | }, 2107 | { 2108 | "cell_type": "markdown", 2109 | "metadata": {}, 2110 | "source": [ 2111 | "# Make the predictions" 2112 | ] 2113 | }, 2114 | { 2115 | "cell_type": "code", 2116 | "execution_count": 29, 2117 | "metadata": { 2118 | "collapsed": false 2119 | }, 2120 | "outputs": [ 2121 | { 2122 | "name": "stdout", 2123 | "output_type": "stream", 2124 | "text": [ 2125 | "Predicting testing results...\n", 2126 | "153164/153164 [==============================] - 2s 10us/step\n", 2127 | "153164/153164 [==============================] - 1s 9us/step\n" 2128 | ] 2129 | } 2130 | ], 2131 | "source": [ 2132 | "#test_data = test_df\n", 2133 | "CLASSES = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n", 2134 | "submit_path_prefix = \"ToxicNN-\" + str(MAX_SEQUENCE_LENGTH) \n", 2135 | "\n", 2136 | "print(\"Predicting testing results...\")\n", 2137 | "test_predicts_list = []\n", 2138 | "for fold_id, model in enumerate(models):\n", 2139 | " test_predicts = model.predict(test_data, batch_size=256, verbose=1)\n", 2140 | " test_predicts_list.append(test_predicts)\n", 2141 | "\n", 2142 | "# merge the folds' predictions by averaging\n", 2143 | "test_predicts = np.zeros(test_predicts_list[0].shape)\n", 2144 | "for fold_predict in test_predicts_list:\n", 2145 | " test_predicts += fold_predict\n", 2146 | "test_predicts /= len(test_predicts_list)\n", 2147 | "\n", 2148 | "# create the submission file\n", 2149 | "test_ids = test_df[\"id\"].values\n", 2150 | "test_ids = test_ids.reshape((len(test_ids), 1))\n",
2151 | "test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)\n", 2152 | "test_predicts[\"id\"] = test_ids\n", 2153 | "test_predicts = test_predicts[[\"id\"] + CLASSES]\n", 2154 | "submit_path = submit_path_prefix + \"-Loss{:4f}.csv\".format(val_loss)\n", 2155 | "test_predicts.to_csv(submit_path, index=False)" 2156 | ] 2157 | }, 2158 | { 2159 | "cell_type": "markdown", 2160 | "metadata": {}, 2161 | "source": [ 2162 | "## Discussions\n", 2163 | "\n", 2164 | "\n", 2165 | "### Better method to compose a sequence of vectors into a single vector ?\n", 2166 | "\n", 2167 | "Either in CNN or RNN, the outputs are a sequence of vectors, which means a best practice to compose a sequence of vectors into a single one is very important. We have tried to simply select the last one (in RNN) or select the one with max value (in CNN and hybrid NN) to represent a sequence, and there clearly is much room for improvement. For example, how about selecting the top K max vectors? or averaging the whole sequence to get one vector? Furthermore, we can apply **weighted averaging** to the sequence, which is called **Attention** in natural language processing and it does help a lot on catching information in long sequences." 2168 | ] 2169 | }, 2170 | { 2171 | "cell_type": "markdown", 2172 | "metadata": {}, 2173 | "source": [ 2174 | "### Jointly train or not ?\n", 2175 | "\n", 2176 | "This is a multilabel classification challenge, so why do we jointly train 6 labels together rather than train them one by one? Indeed, this is a good question. In some cases which are labeled sparsely or not clearly related to other classes (like threat in this dataset), training these labels independently might get a better socre because these cases should build their own unique feature representations. You can give it a try, and find the best combination on training labels." 2177 | ] 2178 | }, 2179 | { 2180 | "cell_type": "markdown", 2181 | "metadata": {}, 2182 | "source": [ 2183 | "### The power of unsupervised learing\n", 2184 | "\n", 2185 | "In the above tutorial, we just defined a **random initialized** embedding matrix for text classificaiton. With this method, the embedding matrix will fit well on the training set but it would also be biasd by some special tokens or noise since our dataset is not that large, that cause overfitting.\n", 2186 | "\n", 2187 | "We can deal with this by using some pretrained resources, like:\n", 2188 | "\n", 2189 | "* [GloVe embedding](https://nlp.stanford.edu/projects/glove/)\n", 2190 | "* [Fasttext embedding](https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md)\n", 2191 | "* [Word2Vec](https://radimrehurek.com/gensim/models/word2vec.html)\n", 2192 | "\n", 2193 | "You might get a significant boost by replacing the old matrix with vectors pretrained on a big corpus which catch the similarity between words." 2194 | ] 2195 | }, 2196 | { 2197 | "cell_type": "code", 2198 | "execution_count": 32, 2199 | "metadata": { 2200 | "collapsed": false 2201 | }, 2202 | "outputs": [ 2203 | { 2204 | "name": "stdout", 2205 | "output_type": "stream", 2206 | "text": [ 2207 | "Total 400000 word vectors.\n" 2208 | ] 2209 | } 2210 | ], 2211 | "source": [ 2212 | "'''\n", 2213 | "Try to load the word embedding as the format\n", 2214 | "{\n", 2215 | " 'word1': embedding,\n", 2216 | " 'word2': embedding,\n", 2217 | " ...\n", 2218 | "}\n", 2219 | "i.e. 
A key-value pair whose key is the word and whose value is the embedding\n", 2220 | "'''\n", 2221 | "GLOVE_EMBEDDING = {}\n", 2222 | "\n", 2223 | "def load_embedding(embeddings_index, embedding_path='glove.6B.50d.txt'):\n", 2224 | " '''return a dict whose key is a word and whose value is its pretrained embedding'''\n", 2225 | " f = open(embedding_path, 'r', encoding='utf-8')\n", 2226 | " for line in f:\n", 2227 | " values = line.split()\n", 2228 | " try:\n", 2229 | " word = values[0]\n", 2230 | " coefs = np.asarray(values[1:], dtype='float32')\n", 2231 | " embeddings_index[word] = coefs\n", 2232 | " except ValueError:\n", 2233 | " print(\"Err on \", values[:2])\n", 2234 | " f.close()\n", 2235 | " print('Total %s word vectors.' % len(embeddings_index))\n", 2236 | " return embeddings_index\n", 2237 | "\n", 2238 | "GLOVE_EMBEDDING = load_embedding(GLOVE_EMBEDDING)" 2239 | ] 2240 | }, 2241 | { 2242 | "cell_type": "code", 2243 | "execution_count": 33, 2244 | "metadata": { 2245 | "collapsed": false 2246 | }, 2247 | "outputs": [ 2248 | { 2249 | "name": "stdout", 2250 | "output_type": "stream", 2251 | "text": [ 2252 | "Congrats, you passed the test.\n" 2253 | ] 2254 | } 2255 | ], 2256 | "source": [ 2257 | "assert len(GLOVE_EMBEDDING) == 400000 , \"Failed. Did you load the whole file ?\"\n", 2258 | "assert 'hello' in GLOVE_EMBEDDING.keys(), \"Oops, it seems that you miss some words\"\n", 2259 | "assert len(GLOVE_EMBEDDING['hello']) == 50, \"You have a wrong dimension. Check it again.\"\n", 2260 | "\n", 2261 | "print(\"Congrats, you passed the test.\")" 2262 | ] 2263 | }, 2264 | { 2265 | "cell_type": "code", 2266 | "execution_count": null, 2267 | "metadata": { 2268 | "collapsed": true 2269 | }, 2270 | "outputs": [], 2271 | "source": [ 2272 | "def build_embedding_matrix(embeddings_index, word_index):\n", 2273 | " embedding_matrix = np.zeros((MAX_NB_WORDS, EMBEDDING_DIM))\n", 2274 | "\n", 2275 | " for word, i in word_index.items():\n", 2276 | " embedding_vector = embeddings_index.get(word)\n", 2277 | " if i >= MAX_NB_WORDS:\n", 2278 | " continue\n", 2279 | " if embedding_vector is not None:\n", 2280 | " embedding_matrix[i] = embedding_vector\n", 2281 | " print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))\n", 2282 | " return embedding_matrix\n", 2283 | "\n", 2284 | "GLOVE_EMBEDDING = build_embedding_matrix(GLOVE_EMBEDDING, word_index)" 2285 | ] 2286 | }, 2287 | { 2288 | "cell_type": "markdown", 2289 | "metadata": {}, 2290 | "source": [ 2291 | "## Replacing the embedding with a pretrained one is easy\n", 2292 | "\n", 2293 | "Whether to freeze or keep training the word embedding depends on your task.
Sometimes we add a WordEncoder on top of the word embedding.\n", 2294 | "The WordEncoder can be a simple Dense layer or something more complicated (for example, a Highway network).\n", 2295 | "\n", 2296 | "```\n", 2297 | "embedding_layer = Embedding(MAX_NB_WORDS,\n", 2298 | " EMBEDDING_DIM,\n", 2299 | " weights=[GLOVE_EMBEDDING],\n", 2300 | " input_length=MAX_SEQUENCE_LENGTH,\n", 2301 | " trainable=False)(TENSOR)\n", 2302 | "```" 2303 | ] 2304 | }, 2305 | { 2306 | "cell_type": "code", 2307 | "execution_count": null, 2308 | "metadata": { 2309 | "collapsed": true 2310 | }, 2311 | "outputs": [], 2312 | "source": [ 2313 | "'''\n", 2314 | "Try to create a model with a pretrained word embedding and a single-layer Dense WordEncoder.\n", 2315 | "Reference: https://keras.io/layers/wrappers/\n", 2316 | "'''\n", 2317 | "def get_your_model_with_pretrained_embedding():\n", 2318 | " '''your show time'''\n", 2319 | " recurrent_units = 48\n", 2320 | " dense_units = 32\n", 2321 | " filter_nums = 64\n", 2322 | " output_units = 6\n", 2323 | "\n", 2324 | " input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))\n", 2325 | " embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, weights=[GLOVE_EMBEDDING], input_length=MAX_SEQUENCE_LENGTH)(input_layer)\n", 2326 | " embedding_layer = TimeDistributed(Dense(50, activation='relu'))(embedding_layer)\n", 2327 | " x = Bidirectional(GRU(recurrent_units, return_sequences=True))(embedding_layer)\n", 2328 | " x = Conv1D(filter_nums, 2, kernel_initializer=\"normal\", padding=\"valid\", activation=\"relu\")(x) \n", 2329 | "\n", 2330 | " max_pool = GlobalMaxPooling1D()(x)\n", 2331 | " max_pool = Dropout(0.5)(max_pool)\n", 2332 | "\n", 2333 | " output_layer = Dense(output_units, activation=\"sigmoid\")(max_pool)\n", 2334 | "\n", 2335 | " model = Model(inputs=input_layer, outputs=output_layer)\n", 2336 | " model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", 2337 | " return model" 2338 | ] 2339 | }, 2340 | { 2341 | "cell_type": "code", 2342 | "execution_count": null, 2343 | "metadata": { 2344 | "collapsed": true 2345 | }, 2346 | "outputs": [], 2347 | "source": [ 2348 | "assert type(get_your_model_with_pretrained_embedding().layers[2]) == TimeDistributed, \"Your model does not have a word encoder.\"\n", 2349 | "print(\"Congrats, you passed the test.\")" 2350 | ] 2351 | }, 2352 | { 2353 | "cell_type": "code", 2354 | "execution_count": null, 2355 | "metadata": { 2356 | "collapsed": true 2357 | }, 2358 | "outputs": [], 2359 | "source": [ 2360 | "models, val_loss = train_folds(train_data, train_labels, 2, 256, get_your_model_with_pretrained_embedding)" 2361 | ] 2362 | }, 2363 | { 2364 | "cell_type": "markdown", 2365 | "metadata": {}, 2366 | "source": [ 2367 | "# Reference: [My solution to this challenge](https://github.com/zake7749/DeepToxic)" 2368 | ] 2369 | }, 2370 | { 2371 | "cell_type": "code", 2372 | "execution_count": null, 2373 | "metadata": { 2374 | "collapsed": true 2375 | }, 2376 | "outputs": [], 2377 | "source": [] 2378 | } 2379 | ], 2380 | "metadata": { 2381 | "kernelspec": { 2382 | "display_name": "Python 3", 2383 | "language": "python", 2384 | "name": "python3" 2385 | }, 2386 | "language_info": { 2387 | "codemirror_mode": { 2388 | "name": "ipython", 2389 | "version": 3 2390 | }, 2391 | "file_extension": ".py", 2392 | "mimetype": "text/x-python", 2393 | "name": "python", 2394 | "nbconvert_exporter": "python", 2395 | "pygments_lexer": "ipython3", 2396 | "version": "3.4.1" 2397 | } 2398 | }, 2399 | "nbformat": 4, 2400 | "nbformat_minor": 1
2401 | } 2402 | -------------------------------------------------------------------------------- /Dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IKMLab/sentence_classification_tutorial/9cde37484f94b84fe1e47bb199553640b8b99a5e/Dataset/__init__.py -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | Please check the [slide](https://docs.google.com/presentation/d/1Un2Q18Sy1R4Qf2fP_gwnOeO3Mt2vURQ1uySoiCvblqM/edit?usp=sharing) for more details. 2 | -------------------------------------------------------------------------------- /resources/early-stopping-graphic.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IKMLab/sentence_classification_tutorial/9cde37484f94b84fe1e47bb199553640b8b99a5e/resources/early-stopping-graphic.jpg -------------------------------------------------------------------------------- /resources/hybrid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IKMLab/sentence_classification_tutorial/9cde37484f94b84fe1e47bb199553640b8b99a5e/resources/hybrid.png -------------------------------------------------------------------------------- /resources/kfold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IKMLab/sentence_classification_tutorial/9cde37484f94b84fe1e47bb199553640b8b99a5e/resources/kfold.png -------------------------------------------------------------------------------- /resources/onehot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IKMLab/sentence_classification_tutorial/9cde37484f94b84fe1e47bb199553640b8b99a5e/resources/onehot.png -------------------------------------------------------------------------------- /resources/overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IKMLab/sentence_classification_tutorial/9cde37484f94b84fe1e47bb199553640b8b99a5e/resources/overfitting.png -------------------------------------------------------------------------------- /resources/textcnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IKMLab/sentence_classification_tutorial/9cde37484f94b84fe1e47bb199553640b8b99a5e/resources/textcnn.png -------------------------------------------------------------------------------- /resources/textrnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IKMLab/sentence_classification_tutorial/9cde37484f94b84fe1e47bb199553640b8b99a5e/resources/textrnn.png -------------------------------------------------------------------------------- /resources/word-embedding.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IKMLab/sentence_classification_tutorial/9cde37484f94b84fe1e47bb199553640b8b99a5e/resources/word-embedding.jpeg -------------------------------------------------------------------------------- /toxic_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6
| "source": [ 7 | "# [Toxic comment classification challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)\n", 8 | "\n", 9 | "In this competition, you’re challenged to build a multi-headed model that’s capable of detecting different types of of toxicity like threats, obscenity, insults, and identity-based hate better than Perspective’s current models. You’ll be using a dataset of comments from Wikipedia’s talk page edits. Improvements to the current model will hopefully help online discussion become more productive and respectful." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "########################################\n", 21 | "# Load the packages\n", 22 | "########################################\n", 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "import re\n", 26 | "\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "import seaborn as sns\n", 29 | "\n", 30 | "from nltk.stem import SnowballStemmer\n", 31 | "\n", 32 | "from keras.preprocessing.text import Tokenizer\n", 33 | "from keras.preprocessing.sequence import pad_sequences\n", 34 | "from keras.layers import Dense, Input, Embedding, Bidirectional, GRU, Conv1D, GlobalMaxPooling1D, Dropout, TimeDistributed\n", 35 | "from keras.layers.merge import concatenate\n", 36 | "from keras.models import Model\n", 37 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 38 | "\n", 39 | "%matplotlib inline" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "########################################\n", 51 | "# Define the hyper parameters\n", 52 | "########################################\n", 53 | "path = 'Dataset/'\n", 54 | "TRAIN_DATA_FILE=path + 'train.csv'\n", 55 | "TEST_DATA_FILE=path + 'test.csv'\n", 56 | "\n", 57 | "MAX_SEQUENCE_LENGTH = 100\n", 58 | "MAX_NB_WORDS = 100000\n", 59 | "EMBEDDING_DIM = 50" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Prepare the training / testing data" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "########################################\n", 78 | "# Load the training / testing set with pandas csv format\n", 79 | "########################################\n", 80 | "train_df = pd.read_csv(TRAIN_DATA_FILE)\n", 81 | "test_df = pd.read_csv(TEST_DATA_FILE)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Expolary Data Analysis" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "print(\"A quick view of training set\")\n", 98 | "train_df.head()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "print(\"A quick view of testing set\")\n", 108 | "test_df.head()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "# Check the labels distribution" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "### Check the balance of labels\n", 123 | "\n", 124 | "We would like to know the positive ratio of training set. 
because we do not want the model to become lazy: for a less frequent positive class, we may give the model more penalty when it gets a positive case wrong (see the weighted-loss sketch after the preprocessing notes below)." 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "'''\n", 136 | "What's the positive ratio of each class ?\n", 137 | "'''\n", 138 | "def get_pos_ratio(data):\n", 139 | " pass\n", 140 | "\n", 141 | "pos_ratio = []\n", 142 | "for col in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:\n", 143 | " pos_ratio.append(get_pos_ratio(train_df[col]))" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "assert pos_ratio[0] == 0.09584448302009764, \"The answer is not correct.\"\n", 153 | "print(\"Congrats, you passed the test.\")" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "x = train_df.iloc[:,2:].sum()\n", 163 | "\n", 164 | "plt.figure(figsize=(8,4))\n", 165 | "ax= sns.barplot(x.index, x.values, alpha=0.8)\n", 166 | "plt.title(\"# per class\")\n", 167 | "plt.ylabel('# of Occurrences', fontsize=12)\n", 168 | "plt.xlabel('Type ', fontsize=12)\n", 169 | "\n", 170 | "rects = ax.patches\n", 171 | "labels = x.values\n", 172 | "for rect, label in zip(rects, labels):\n", 173 | " height = rect.get_height()\n", 174 | " ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')\n", 175 | "\n", 176 | "plt.show()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "### Correlations of labels\n", 184 | "\n", 185 | "Because this is a multi-label classification task, we want to know the relations between labels, which helps with feature engineering and model design. For example, if we know that a toxic comment is always an insult, then when we have a high-confidence toxic prediction, we can also consider it an insult." 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "corr=train_df.corr()\n", 195 | "plt.figure(figsize=(10,8))\n", 196 | "sns.heatmap(corr,\n", 197 | " xticklabels=corr.columns.values,\n", 198 | " yticklabels=corr.columns.values, annot=True)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "## Preprocessing\n", 206 | "\n", 207 | "We apply 2 preprocessing methods on the texts:\n", 208 | "\n", 209 | "1. Make all letters lower case:\n", 210 | " * This is very important: we do not want the model to consider 'Hello' and 'hello' as different words.\n", 211 | "2. Remove special tokens and expand contractions:\n", 212 | " * For example: what's -> what is, aren't -> are not. Mapping the same concept to the same token helps regularization.\n", 213 | "\n", 214 | "Always remember to preprocess for NLP tasks. In many cases, a model trained on cleaned text significantly outperforms a model trained on raw data. Knowing your data is always the best policy."
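A note from the editor on the class-imbalance remark above ("give the model more penalty when it gets a positive case wrong"): one common way to make this concrete is a weighted binary cross-entropy. The sketch below is a minimal illustration under the notebooks' Keras setup, not part of the original tutorial; `pos_weight` is a hypothetical per-run constant you would derive from the positive ratios computed above.

```python
import keras.backend as K

def weighted_binary_crossentropy(pos_weight):
    # Binary cross-entropy whose positive-class term is scaled by pos_weight.
    # pos_weight > 1 penalizes missed positives more, which suits rare labels.
    def loss(y_true, y_pred):
        y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())  # avoid log(0)
        return -K.mean(pos_weight * y_true * K.log(y_pred)
                       + (1 - y_true) * K.log(1 - y_pred))
    return loss

# e.g. with a ~10% positive ratio, one might weight positives roughly 9x:
# model.compile(loss=weighted_binary_crossentropy(9.0), optimizer='adam')
```

Setting `pos_weight` to the negative/positive ratio is only a starting heuristic; it trades precision for recall and should be tuned against the validation log loss.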
215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "########################################\n", 224 | "## Text pre-processing and cleaning\n", 225 | "########################################\n", 226 | "print('Processing text dataset')\n", 227 | "from collections import defaultdict\n", 228 | "\n", 229 | "# regex to remove all Non-Alpha Numeric and space\n", 230 | "special_character_removal=re.compile(r'[^a-z\\d ]',re.IGNORECASE)\n", 231 | "\n", 232 | "# regex to replace all numeric\n", 233 | "replace_numbers=re.compile(r'\\d+',re.IGNORECASE)\n", 234 | "\n", 235 | "def clean_text(text, stem_words=False):\n", 236 | " # Clean the text, with the option to remove stopwords and to stem words.\n", 237 | " text = text.lower()\n", 238 | " text = re.sub(r\"what's\", \"what is \", text)\n", 239 | " text = re.sub(r\"\\'s\", \" \", text)\n", 240 | " text = re.sub(r\"\\'ve\", \" have \", text)\n", 241 | " text = re.sub(r\"can't\", \"cannot \", text)\n", 242 | " text = re.sub(r\"n't\", \" not \", text)\n", 243 | " text = re.sub(r\"i'm\", \"i am \", text)\n", 244 | " text = re.sub(r\"i’m\", \"i am\", text)\n", 245 | " text = re.sub(r\"\\'re\", \" are \", text)\n", 246 | " text = re.sub(r\"\\'d\", \" would \", text)\n", 247 | " text = re.sub(r\"\\'ll\", \" will \", text)\n", 248 | " text = re.sub(r\",\", \" \", text)\n", 249 | " text = re.sub(r\"\\.\", \" \", text)\n", 250 | " text = re.sub(r\"'\", \" \", text)\n", 251 | " text = re.sub(r\"\\s{2,}\", \" \", text)\n", 252 | " text = replace_numbers.sub('', text)\n", 253 | " text = special_character_removal.sub('',text)\n", 254 | " \n", 255 | " return text" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": { 262 | "collapsed": true 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "'''\n", 267 | "Apply preprocessing and extract the training sentences and testing sentences from the pandas dataframes.\n", 268 | "Note that there are some N/A comments in the train/test set. Fill them up first.\n", 269 | "'''\n", 270 | "train_comments = []\n", 271 | "test_comments = []" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": { 278 | "collapsed": true 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "assert len(train_comments) == 159571 and len(test_comments) == 153164, \"It seems that you lost some data.\"\n", 283 | "assert 'E' not in train_comments[0], \"It seems you did not preprocess the sentences.
I found an upper-case letter in your train set.\"" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "### Let's have a comparison between the cleaned text and the original one" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "for i in range(5):\n", 300 | " print(\"Cleaned\\n\", train_comments[i] + '\\n')\n", 301 | " print(\"Raw\\n\", train_df.iloc[i]['comment_text'] + '\\n')\n", 302 | " print(\"------------------\")" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "# Tokenization\n", 315 | "\n", 316 | "Tokenization separates a sentence into words by space, for example:\n", 317 | "\n", 318 | "* \"Hello world\" -> [\"Hello\", \"world\"]\n", 319 | "\n", 320 | "The input to the neural network is a number, not a word. So we have to apply one-hot encoding or index encoding to the words.\n", 321 | "![onehot](resources/onehot.png)\n", 322 | "Now we use the Keras tokenizer to learn the encoding table." 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "collapsed": true 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "# Create a tokenizer, which transforms a sentence into a list of ids\n", 334 | "tokenizer = Tokenizer(num_words=MAX_NB_WORDS)\n", 335 | "# Build the relation between words and ids \n", 336 | "tokenizer.fit_on_texts(train_comments + test_comments)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": { 343 | "collapsed": true 344 | }, 345 | "outputs": [], 346 | "source": [ 347 | "tokenizer.word_index # map 'the' to 1, map 'to' to 2,......"
348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": { 354 | "collapsed": true 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "# Transform training/testing sentences to training/testing sequences.\n", 359 | "train_sequences = tokenizer.texts_to_sequences(train_comments)\n", 360 | "test_sequences = tokenizer.texts_to_sequences(test_comments)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "## Have a look at the transformed sequences" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": { 374 | "collapsed": true 375 | }, 376 | "outputs": [], 377 | "source": [ 378 | "for i in range(1):\n", 379 | " print(\"Transformed\\n\", str(train_sequences[i]) + '\\n')\n", 380 | " print(\"Cleaned\\n\", train_comments[i] + '\\n')\n", 381 | " print(\"------------------\")" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": { 388 | "collapsed": true 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "word_index = tokenizer.word_index\n", 393 | "print('Found %s unique tokens' % len(word_index))\n", 394 | "\n", 395 | "train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", 396 | "print('Shape of data tensor:', train_data.shape)\n", 397 | "print('Shape of label tensor:', train_labels.shape)\n", 398 | "\n", 399 | "test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", 400 | "print('Shape of test_data tensor:', test_data.shape)" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": { 407 | "collapsed": true 408 | }, 409 | "outputs": [], 410 | "source": [ 411 | "'''\n", 412 | "Try to build a tokenizer, which transforms [['Hello', 'World'], ['Greeting', 'my', 'friend'], ['Hello', 'have', 'a', 'nice', 'day']]\n", 413 | "into a list of index sequences. Note that the index should start from 1 because 0 is reserved for the padding token in some frameworks.\n", 414 | "'''\n", 415 | "tests_input_sentences = [['Hello', 'World'], ['Greeting', 'my', 'friend'], ['Hello', 'have', 'a', 'nice', 'day']]\n", 416 | "transform_this_sentences = [['Hello', 'my', 'friend']]\n", 417 | "\n", 418 | "def index_encoding(sentences, raw_sent):\n", 419 | " pass\n", 420 | " return results" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": { 427 | "collapsed": true 428 | }, 429 | "outputs": [], 430 | "source": [ 431 | "transformed = index_encoding(tests_input_sentences, transform_this_sentences)\n", 432 | "assert transformed == [[1, 4, 5]], \"The answer is not correct.\"\n", 433 | "print(\"Congrats, you passed the test.\")" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "## Models" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "## Text RNN\n", 448 | "\n", 449 | "![TextRNN](resources/textrnn.png)\n", 450 | "\n", 451 | "Here we present a classical structure, a 2-layer bidirectional GRU, for text classification. Instead of adding a fully connected layer after all time steps, here we only select the last hidden state of the sequence.
(LB: 50%, AUC: 0.982)" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": { 458 | "collapsed": true 459 | }, 460 | "outputs": [], 461 | "source": [ 462 | "########################################\n", 463 | "## Define the text rnn model structure\n", 464 | "########################################\n", 465 | "def get_text_rnn():\n", 466 | " recurrent_units = 48\n", 467 | " dense_units = 32\n", 468 | " output_units = 6\n", 469 | " \n", 470 | " input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))\n", 471 | " embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(input_layer)\n", 472 | " \n", 473 | " x = Bidirectional(GRU(recurrent_units, return_sequences=True))(embedding_layer)\n", 474 | " x = Bidirectional(GRU(recurrent_units, return_sequences=False))(x)\n", 475 | " \n", 476 | " x = Dense(dense_units, activation=\"relu\")(x)\n", 477 | " output_layer = Dense(output_units, activation=\"sigmoid\")(x)\n", 478 | " \n", 479 | " model = Model(inputs=input_layer, outputs=output_layer)\n", 480 | " model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", 481 | " return model" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": {}, 487 | "source": [ 488 | "## TextCNN\n", 489 | "\n", 490 | "![TextCNN](resources/textcnn.png)\n", 491 | "\n", 492 | "Convolution in natural langauge proceessing can be consider as a special type of ngram. We simply select the kernels with window sizes (2, 3, 4) to extract regional features. (LB: 50%, AUC: 0.982)" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "metadata": { 499 | "collapsed": true 500 | }, 501 | "outputs": [], 502 | "source": [ 503 | "########################################\n", 504 | "## Define the text cnn model structure\n", 505 | "########################################\n", 506 | "def get_text_cnn():\n", 507 | " filter_nums = 120\n", 508 | " dense_units = 72\n", 509 | " output_units = 6\n", 510 | " \n", 511 | " input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n", 512 | " embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH,)(input_layer)\n", 513 | " \n", 514 | " conv_0 = Conv1D(filter_nums, 2, kernel_initializer=\"normal\", padding=\"valid\", activation=\"relu\")(embedding_layer)\n", 515 | " conv_1 = Conv1D(filter_nums, 3, kernel_initializer=\"normal\", padding=\"valid\", activation=\"relu\")(embedding_layer)\n", 516 | " conv_2 = Conv1D(filter_nums, 4, kernel_initializer=\"normal\", padding=\"valid\", activation=\"relu\")(embedding_layer)\n", 517 | "\n", 518 | " maxpool_0 = GlobalMaxPooling1D()(conv_0)\n", 519 | " maxpool_1 = GlobalMaxPooling1D()(conv_1)\n", 520 | " maxpool_2 = GlobalMaxPooling1D()(conv_2)\n", 521 | "\n", 522 | " merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2])\n", 523 | " h1 = Dense(units=dense_units, activation=\"relu\")(merged_tensor)\n", 524 | " output = Dense(units=output_units, activation='sigmoid')(h1)\n", 525 | "\n", 526 | " model = Model(inputs=input_layer, outputs=output)\n", 527 | " model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", 528 | " return model" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "## Hybrid Text NN\n", 536 | "\n", 537 | "![hybrid](resources/hybrid.png)\n", 538 | "\n", 539 | "This structure mixed the feature representation ideas of RNN and CNN. 
We first place a recurrent layer after the embedding to build word-level sequential information, and connect it to a convolutional layer to extract regional features from the hidden states. (LB: 30%, AUC: 0.983)" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": null, 545 | "metadata": { 546 | "collapsed": true 547 | }, 548 | "outputs": [], 549 | "source": [ 550 | "########################################\n", 551 | "## Define the text hybrid model structure\n", 552 | "########################################\n", 553 | "def get_hybrid_textnn():\n", 554 | " recurrent_units = 48\n", 555 | " dense_units = 32\n", 556 | " filter_nums = 64\n", 557 | " output_units = 6\n", 558 | "\n", 559 | " input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))\n", 560 | " embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(input_layer)\n", 561 | " \n", 562 | " x = Bidirectional(GRU(recurrent_units, return_sequences=True))(embedding_layer)\n", 563 | " x = Conv1D(filter_nums, 2, kernel_initializer=\"normal\", padding=\"valid\", activation=\"relu\")(x) \n", 564 | " \n", 565 | " max_pool = GlobalMaxPooling1D()(x)\n", 566 | " max_pool = Dropout(0.5)(max_pool)\n", 567 | " \n", 568 | " output_layer = Dense(output_units, activation=\"sigmoid\")(max_pool)\n", 569 | "\n", 570 | " model = Model(inputs=input_layer, outputs=output_layer)\n", 571 | " model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", 572 | " return model" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": {}, 578 | "source": [ 579 | "### K-Fold Cross Validation\n", 580 | "![kfold](resources/kfold.png)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": { 587 | "collapsed": true 588 | }, 589 | "outputs": [], 590 | "source": [ 591 | "########################################\n", 592 | "## Construct the cross-validation framework\n", 593 | "########################################\n", 594 | "def _train_model_by_logloss(model, batch_size, train_x, train_y, val_x, val_y, fold_id):\n", 595 | " # set an early stopping checker.\n", 596 | " # training stops once the validation log loss has failed to improve for `patience` consecutive epochs.
\n", 597 | " early_stopping = EarlyStopping(monitor='val_loss', patience=5)\n", 598 | " bst_model_path = \"ToxicModel\" + str(fold_id) + '.h5'\n", 599 | " model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)\n", 600 | " \n", 601 | " # training on given fold data\n", 602 | " hist = model.fit(train_x, train_y,\n", 603 | " validation_data=(val_x, val_y),\n", 604 | " epochs=50, batch_size=batch_size, shuffle=True,\n", 605 | " callbacks=[early_stopping, model_checkpoint])\n", 606 | " \n", 607 | " # get the minimal validation log loss on this fold\n", 608 | " bst_val_score = min(hist.history['val_loss'])\n", 609 | "\n", 610 | " # return the model with best weight, best fold-val score\n", 611 | " return model, bst_val_score\n", 612 | "\n", 613 | "def train_folds(X, y, fold_count, batch_size, get_model_func):\n", 614 | " fold_size = len(X) // fold_count\n", 615 | " models = []\n", 616 | " score = 0\n", 617 | " \n", 618 | " # split the whole dataset to `fold_count` fold, and train our model on each fold\n", 619 | " for fold_id in range(0, fold_count):\n", 620 | " fold_start = fold_size * fold_id\n", 621 | " fold_end = fold_start + fold_size\n", 622 | "\n", 623 | " if fold_id == fold_size - 1:\n", 624 | " fold_end = len(X)\n", 625 | "\n", 626 | " # Generate the train/val data on fold i\n", 627 | " train_x = np.concatenate([X[:fold_start], X[fold_end:]])\n", 628 | " train_y = np.concatenate([y[:fold_start], y[fold_end:]])\n", 629 | "\n", 630 | " val_x = X[fold_start:fold_end]\n", 631 | " val_y = y[fold_start:fold_end]\n", 632 | " \n", 633 | " print(\"Training on fold #\", fold_id)\n", 634 | " model, bst_val_score = _train_model_by_logloss(get_model_func(), batch_size, train_x, train_y, val_x, val_y, fold_id)\n", 635 | " score += bst_val_score\n", 636 | " models.append(model)\n", 637 | " return models, score / fold_count" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": { 644 | "collapsed": true 645 | }, 646 | "outputs": [], 647 | "source": [ 648 | "models, val_loss = train_folds(train_data, train_labels, 2, 256, get_text_cnn)" 649 | ] 650 | }, 651 | { 652 | "cell_type": "markdown", 653 | "metadata": {}, 654 | "source": [ 655 | "# Practice: Try to beat the baseline: val_loss=0.050" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "metadata": { 662 | "collapsed": true 663 | }, 664 | "outputs": [], 665 | "source": [ 666 | "your_batch_size = 256\n", 667 | "\n", 668 | "def get_your_model():\n", 669 | " '''your show time'''\n", 670 | " return model" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": null, 676 | "metadata": { 677 | "collapsed": true 678 | }, 679 | "outputs": [], 680 | "source": [ 681 | "models, val_loss = train_folds(train_data, train_labels, 2, your_batch_size, get_your_model)" 682 | ] 683 | }, 684 | { 685 | "cell_type": "markdown", 686 | "metadata": {}, 687 | "source": [ 688 | "# Make the predections" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": null, 694 | "metadata": { 695 | "collapsed": true 696 | }, 697 | "outputs": [], 698 | "source": [ 699 | "#test_data = test_df\n", 700 | "CLASSES = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n", 701 | "submit_path_prefix = \"ToxicNN-\" + str(MAX_SEQUENCE_LENGTH) \n", 702 | "\n", 703 | "print(\"Predicting testing results...\")\n", 704 | "test_predicts_list = []\n", 705 | "for fold_id, model in enumerate(models):\n", 
706 | " test_predicts = model.predict(test_data, batch_size=256, verbose=1)\n", 707 | " test_predicts_list.append(test_predicts)\n", 708 | "\n", 709 | "# merge each folds' predictions by averaging\n", 710 | "test_predicts = np.zeros(test_predicts_list[0].shape)\n", 711 | "for fold_predict in test_predicts_list:\n", 712 | " test_predicts += fold_predict\n", 713 | "test_predicts /= len(test_predicts_list)\n", 714 | "\n", 715 | "# create the submission file\n", 716 | "test_ids = test_df[\"id\"].values\n", 717 | "test_ids = test_ids.reshape((len(test_ids), 1))\n", 718 | "test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)\n", 719 | "test_predicts[\"id\"] = test_ids\n", 720 | "test_predicts = test_predicts[[\"id\"] + CLASSES]\n", 721 | "submit_path = submit_path_prefix + \"-Loss{:4f}.csv\".format(val_loss)\n", 722 | "test_predicts.to_csv(submit_path, index=False)" 723 | ] 724 | }, 725 | { 726 | "cell_type": "markdown", 727 | "metadata": {}, 728 | "source": [ 729 | "## Discussions\n", 730 | "\n", 731 | "\n", 732 | "### Better method to compose a sequence of vectors into a single vector ?\n", 733 | "\n", 734 | "Either in CNN or RNN, the outputs are a sequence of vectors, which means a best practice to compose a sequence of vectors into a single one is very important. We have tried to simply select the last one (in RNN) or select the one with max value (in CNN and hybrid NN) to represent a sequence, and there clearly is much room for improvement. For example, how about selecting the top K max vectors? or averaging the whole sequence to get one vector? Furthermore, we can apply **weighted averaging** to the sequence, which is called **Attention** in natural language processing and it does help a lot on catching information in long sequences." 735 | ] 736 | }, 737 | { 738 | "cell_type": "markdown", 739 | "metadata": {}, 740 | "source": [ 741 | "### Jointly train or not ?\n", 742 | "\n", 743 | "This is a multilabel classification challenge, so why do we jointly train 6 labels together rather than train them one by one? Indeed, this is a good question. In some cases which are labeled sparsely or not clearly related to other classes (like threat in this dataset), training these labels independently might get a better socre because these cases should build their own unique feature representations. You can give it a try, and find the best combination on training labels." 744 | ] 745 | }, 746 | { 747 | "cell_type": "markdown", 748 | "metadata": {}, 749 | "source": [ 750 | "### The power of unsupervised learing\n", 751 | "\n", 752 | "In the above tutorial, we just defined a **random initialized** embedding matrix for text classificaiton. With this method, the embedding matrix will fit well on the training set but it would also be biasd by some special tokens or noise since our dataset is not that large, that cause overfitting.\n", 753 | "\n", 754 | "We can deal with this by using some pretrained resources, like:\n", 755 | "\n", 756 | "* [GloVe embedding](https://nlp.stanford.edu/projects/glove/)\n", 757 | "* [Fasttext embedding](https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md)\n", 758 | "* [Word2Vec](https://radimrehurek.com/gensim/models/word2vec.html)\n", 759 | "\n", 760 | "You might get a significant boost by replacing the old matrix with vectors pretrained on a big corpus which catch the similarity between words." 
761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": null, 766 | "metadata": { 767 | "collapsed": true 768 | }, 769 | "outputs": [], 770 | "source": [ 771 | "'''\n", 772 | "Try to load the word embedding in the format\n", 773 | "{\n", 774 | " 'word1': embedding,\n", 775 | " 'word2': embedding,\n", 776 | " ...\n", 777 | "}\n", 778 | "i.e. A key-value pair whose key is the word and whose value is the embedding\n", 779 | "'''\n", 780 | "GLOVE_EMBEDDING = {}\n", 781 | "\n", 782 | "def load_embedding(embeddings_index, embedding_path='glove.6B.50d.txt'):\n", 783 | " '''return a dict whose key is a word and whose value is its pretrained word embedding'''\n", 784 | " return embeddings_index\n", 785 | "\n", 786 | "GLOVE_EMBEDDING = load_embedding(GLOVE_EMBEDDING)" 787 | ] 788 | }, 789 | { 790 | "cell_type": "code", 791 | "execution_count": null, 792 | "metadata": { 793 | "collapsed": true 794 | }, 795 | "outputs": [], 796 | "source": [ 797 | "assert len(GLOVE_EMBEDDING) == 400000 , \"Failed. Did you load the whole file ?\"\n", 798 | "assert 'hello' in GLOVE_EMBEDDING.keys(), \"Oops, it seems that you miss some words\"\n", 799 | "assert len(GLOVE_EMBEDDING['hello']) == 50, \"You have a wrong dimension. Check it again.\"\n", 800 | "\n", 801 | "print(\"Congrats, you passed the test.\")" 802 | ] 803 | }, 804 | { 805 | "cell_type": "code", 806 | "execution_count": null, 807 | "metadata": { 808 | "collapsed": true 809 | }, 810 | "outputs": [], 811 | "source": [ 812 | "def build_embedding_matrix(embeddings_index, word_index):\n", 813 | " embedding_matrix = np.zeros((MAX_NB_WORDS, EMBEDDING_DIM))\n", 814 | "\n", 815 | " for word, i in word_index.items():\n", 816 | " embedding_vector = embeddings_index.get(word)\n", 817 | " if i >= MAX_NB_WORDS:\n", 818 | " continue\n", 819 | " if embedding_vector is not None:\n", 820 | " embedding_matrix[i] = embedding_vector\n", 821 | " print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))\n", 822 | " return embedding_matrix\n", 823 | "\n", 824 | "GLOVE_EMBEDDING = build_embedding_matrix(GLOVE_EMBEDDING, word_index)" 825 | ] 826 | }, 827 | { 828 | "cell_type": "markdown", 829 | "metadata": {}, 830 | "source": [ 831 | "## Replacing the embedding with a pretrained one is easy\n", 832 | "\n", 833 | "Whether to freeze or keep training the word embedding depends on your task.
Sometimes we add a WordEncoder on top of the word embedding.\n", 834 | "The WordEncoder can be a simple Dense layer or something more complicated (for example, a Highway network).\n", 835 | "\n", 836 | "```\n", 837 | "embedding_layer = Embedding(MAX_NB_WORDS,\n", 838 | " EMBEDDING_DIM,\n", 839 | " weights=[GLOVE_EMBEDDING],\n", 840 | " input_length=MAX_SEQUENCE_LENGTH,\n", 841 | " trainable=False)(TENSOR)\n", 842 | "```" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": null, 848 | "metadata": { 849 | "collapsed": true 850 | }, 851 | "outputs": [], 852 | "source": [ 853 | "'''\n", 854 | "Try to create a model with a pretrained word embedding and a single-layer Dense WordEncoder.\n", 855 | "Reference: https://keras.io/layers/wrappers/\n", 856 | "'''\n", 857 | "def get_your_model_with_pretrained_embedding():\n", 858 | " '''your show time'''\n", 859 | " return model" 860 | ] 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": null, 865 | "metadata": { 866 | "collapsed": true 867 | }, 868 | "outputs": [], 869 | "source": [ 870 | "assert type(get_your_model_with_pretrained_embedding().layers[2]) == TimeDistributed, \"Your model does not have a word encoder.\"\n", 871 | "print(\"Congrats, you passed the test.\")" 872 | ] 873 | }, 874 | { 875 | "cell_type": "code", 876 | "execution_count": null, 877 | "metadata": { 878 | "collapsed": true 879 | }, 880 | "outputs": [], 881 | "source": [ 882 | "models, val_loss = train_folds(train_data, train_labels, 2, 256, get_your_model_with_pretrained_embedding)" 883 | ] 884 | }, 885 | { 886 | "cell_type": "markdown", 887 | "metadata": {}, 888 | "source": [ 889 | "# Reference: [My solution to this challenge](https://github.com/zake7749/DeepToxic)" 890 | ] 891 | }, 892 | { 893 | "cell_type": "code", 894 | "execution_count": null, 895 | "metadata": { 896 | "collapsed": true 897 | }, 898 | "outputs": [], 899 | "source": [] 900 | } 901 | ], 902 | "metadata": { 903 | "kernelspec": { 904 | "display_name": "Python 3", 905 | "language": "python", 906 | "name": "python3" 907 | }, 908 | "language_info": { 909 | "codemirror_mode": { 910 | "name": "ipython", 911 | "version": 3 912 | }, 913 | "file_extension": ".py", 914 | "mimetype": "text/x-python", 915 | "name": "python", 916 | "nbconvert_exporter": "python", 917 | "pygments_lexer": "ipython3", 918 | "version": "3.6.3" 919 | } 920 | }, 921 | "nbformat": 4, 922 | "nbformat_minor": 1 923 | } 924 | --------------------------------------------------------------------------------
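A closing editor's note on the "Jointly train or not ?" discussion that appears in both notebooks: training each label independently just means one single-output binary model per class, with the six prediction columns stacked afterwards. The sketch below is a hypothetical illustration reusing the notebooks' existing names (`CLASSES`, `train_data`, `train_df`, `test_data`, and the Keras layers already imported); the epoch count and batch size are placeholders, and in practice each model would be wrapped in the k-fold framework above rather than fit once.

```python
def get_single_label_model():
    # same hybrid-style body as get_hybrid_textnn, but with one sigmoid output
    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))
    x = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(input_layer)
    x = Bidirectional(GRU(48, return_sequences=True))(x)
    x = GlobalMaxPooling1D()(x)
    output_layer = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

per_label_predicts = []
for class_name in CLASSES:
    model = get_single_label_model()                      # fresh model per class
    model.fit(train_data, train_df[class_name].values,
              epochs=2, batch_size=256)                   # placeholder schedule
    per_label_predicts.append(model.predict(test_data, batch_size=256))

# stack the six one-column predictions into the same layout the submission code expects
test_predicts = np.hstack(per_label_predicts)
```

Whether this beats joint training is an empirical question: independent models can specialize on sparse classes such as threat, but they give up the shared representation that the correlated labels (toxic, obscene, insult) clearly benefit from.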