├── Baselines └── Naive Bayes.ipynb ├── CNN ├── .ipynb_checkpoints │ ├── cnn_keras-checkpoint.ipynb │ └── cnn_nlp_nb-checkpoint.ipynb ├── cnn-keras.ipynb └── cnn_nlp_nb.ipynb ├── Dataset └── train_E6oV3lV.csv ├── Experience_Paper.pdf ├── GRU └── gru.ipynb ├── Project_Synopsis.pdf ├── README.md └── Reference ├── .ipynb_checkpoints └── Naive Bayes-checkpoint.ipynb ├── test_predictions.csv ├── test_tweets.csv └── train.csv /Baselines/Naive Bayes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 73, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Import libraries\n", 10 | "import pandas as pd\n", 11 | "from sklearn.model_selection import train_test_split\n", 12 | "from sklearn.feature_extraction.text import CountVectorizer\n", 13 | "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n", 14 | "import pandas as pd\n", 15 | "import numpy as np" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 74, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "Index(['id', 'label', 'tweet'], dtype='object')" 27 | ] 28 | }, 29 | "execution_count": 74, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "df = pd.read_csv('../Dataset/train_E6oV3lV.csv')\n", 36 | "df.columns" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 75, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "from sklearn.utils import shuffle\n", 46 | "df = shuffle(df)\n", 47 | "\n", 48 | "X_train, X_test, y_train, y_test = train_test_split(df['tweet'], \n", 49 | " df['label'], \n", 50 | " random_state=1)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 76, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Instantiate the CountVectorizer method\n", 60 | "count_vector = CountVectorizer(stop_words = 'english')\n", 61 | "\n", 62 | "# Fit the training data and then return the matrix\n", 63 | "training_data = count_vector.fit_transform(X_train)\n", 64 | "\n", 65 | "# Transform testing data and return the matrix. 
Note we are not fitting the testing data into the CountVectorizer()\n", 66 | "testing_data = count_vector.transform(X_test)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 77, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "Accuracy score: 0.9564510073833062\n", 79 | "Precision score: 0.864406779661017\n", 80 | "Recall score: 0.45293072824156305\n", 81 | "F1 score: 0.5944055944055944\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "from sklearn.naive_bayes import MultinomialNB\n", 87 | "naive_bayes = MultinomialNB()\n", 88 | "naive_bayes.fit(training_data, y_train)\n", 89 | "predictions = naive_bayes.predict(testing_data)\n", 90 | "\n", 91 | "print('Accuracy score: ', format(accuracy_score(y_test, predictions)))\n", 92 | "print('Precision score: ', format(precision_score(y_test, predictions)))\n", 93 | "print('Recall score: ', format(recall_score(y_test, predictions)))\n", 94 | "print('F1 score: ', format(f1_score(y_test, predictions)))" 95 | ] 96 | }], 97 | "metadata": { 98 | "kernelspec": { 99 | "display_name": "Python 3", 100 | "language": "python", 101 | "name": "python3" 102 | }, 103 | "language_info": { 104 | "codemirror_mode": { 105 | "name": "ipython", 106 | "version": 3 107 | }, 108 | "file_extension": ".py", 109 | "mimetype": "text/x-python", 110 | "name": "python", 111 | "nbconvert_exporter": "python", 112 | "pygments_lexer": "ipython3", 113 | "version": "3.5.2" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 2 118 | } 119 | -------------------------------------------------------------------------------- /CNN/.ipynb_checkpoints/cnn_keras-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "['test_tweets_anuFYb8.csv', 'train_E6oV3lV.csv']\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "# This Python 3 environment comes with many helpful analytics libraries installed\n", 21 | "# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n", 22 | "# For example, here's several helpful packages to load in \n", 23 | "\n", 24 | "import numpy as np # linear algebra\n", 25 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 26 | "\n", 27 | "# Input data files are available in the \"../input/\" directory.\n", 28 | "# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\n", 29 | "\n", 30 | "import os\n", 31 | "print(os.listdir(\"../input\"))\n", 32 | "\n", 33 | "# Any results you write to the current directory are saved as output." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "import numpy as np\n", 45 | "import pandas as pd" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 34, 51 | "metadata": { 52 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 53 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 54 | }, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | " id ... tweet\n", 61 | "0 1 ... 
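The scikit-learn baseline in Baselines/Naive Bayes.ipynb above reports 95.6% accuracy but only 0.45 recall on the hate-speech class, so plain accuracy is flattered by the class imbalance in train_E6oV3lV.csv. For comparison, a minimal sketch of the same CountVectorizer + MultinomialNB baseline wrapped in a Pipeline with a per-class report; the stratified split and the classification_report call are additions here, not part of the notebook.

# Sketch only: same bag-of-words + MultinomialNB baseline as the notebook above,
# with a stratified split and a per-class precision/recall/F1 report.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

df = pd.read_csv('../Dataset/train_E6oV3lV.csv')
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'], df['label'], random_state=1, stratify=df['label'])

clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                ('nb', MultinomialNB())])
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))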
@user when a father is dysfunctional and is s...\n", 62 | "1 2 ... @user @user thanks for #lyft credit i can't us...\n", 63 | "2 3 ... bihday your majesty\n", 64 | "3 4 ... #model i love u take with u all the time in ...\n", 65 | "4 5 ... factsguide: society now #motivation\n", 66 | "\n", 67 | "[5 rows x 3 columns]\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "df = pd.read_csv('../input/train_E6oV3lV.csv')\n", 73 | "df['label'] = df['label'].map({0: 2, 1: 1})\n", 74 | "print(df.head())\n", 75 | "# print(df['label'].head())\n", 76 | "df = df.drop('id', axis=1)\n", 77 | "\n", 78 | "# print(df[:100])\n", 79 | "# zero = 0\n", 80 | "# one = 0\n", 81 | "# for i in df['label']:\n", 82 | "# if(i==0):\n", 83 | "# zero+=1\n", 84 | "# else:\n", 85 | "# one+=1\n", 86 | " \n", 87 | "# print(zero,one)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 35, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "from keras.preprocessing.text import Tokenizer\n", 99 | "from keras.preprocessing.sequence import pad_sequences\n", 100 | "from sklearn.model_selection import train_test_split\n", 101 | "import copy" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 36, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "25569\n", 114 | "6393\n", 115 | " label tweet\n", 116 | "20529 2 @user “be with those who help your being.â€...\n", 117 | "17833 2 @user a comic to make you #smile and laugh. \n", 118 | "27050 2 lmao @ guys liking random girls pictures like ...\n", 119 | "17683 2 #goodmorning ☀️🔥😃 #neymar #playing ð...\n", 120 | "25301 2 happy at work c.onference: right mindset leads...\n", 121 | "\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "\n", 127 | "train, test = train_test_split(df, test_size=0.2)\n", 128 | "print(len(train))\n", 129 | "print(len(test))\n", 130 | "print(train.head())\n", 131 | "\n", 132 | "print(type(train))" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 37, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "train_df = copy.deepcopy(train)\n", 144 | "test_df = copy.deepcopy(test)\n", 145 | "\n", 146 | "# concatenate column 1 and column 2 as one text\n", 147 | "# print(train_df[1])\n", 148 | "\n", 149 | "# convert string to lower case\n", 150 | "train_texts = train_df['tweet'].values\n", 151 | "train_texts = [s.lower() for s in train_texts]\n", 152 | "\n", 153 | "test_texts = test_df['tweet'].values\n", 154 | "test_texts = [s.lower() for s in test_texts]\n", 155 | "\n", 156 | "# print(train_texts.head())\n", 157 | "# print(train_texts)\n" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 38, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "\n", 169 | "# =======================Convert string to index================\n", 170 | "# Tokenizer\n", 171 | "tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')\n", 172 | "tk.fit_on_texts(train_texts)\n", 173 | "# If we already have a character list, then replace the tk.word_index\n", 174 | "# If not, just skip below part\n", 175 | "\n", 176 | "# -----------------------Skip part start--------------------------\n", 177 | "# construct a new vocabulary\n", 178 | "alphabet = \"abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\\\"/\\\\|_@#$%^&*~`+-=<>()[]{}\"\n", 179 | "char_dict = {}\n", 180 | "for i, char in 
enumerate(alphabet):\n", 181 | " char_dict[char] = i + 1\n", 182 | "\n", 183 | "# Use char_dict to replace the tk.word_index\n", 184 | "tk.word_index = char_dict.copy()\n", 185 | "# Add 'UNK' to the vocabulary\n", 186 | "tk.word_index[tk.oov_token] = max(char_dict.values()) + 1\n", 187 | "# -----------------------Skip part end----------------------------\n", 188 | "\n", 189 | "# Convert string to index\n", 190 | "train_sequences = tk.texts_to_sequences(train_texts)\n", 191 | "test_texts = tk.texts_to_sequences(test_texts)\n", 192 | "\n", 193 | "# Padding\n", 194 | "train_data = pad_sequences(train_sequences, maxlen=1014, padding='post')\n", 195 | "test_data = pad_sequences(test_texts, maxlen=1014, padding='post')\n", 196 | "\n", 197 | "# Convert to numpy array\n", 198 | "train_data = np.array(train_data, dtype='float32')\n", 199 | "test_data = np.array(test_data, dtype='float32')\n", 200 | "\n", 201 | "# =======================Get classes================\n", 202 | "train_classes = train_df['label'].values\n", 203 | "train_class_list = [x - 1 for x in train_classes]\n", 204 | "\n", 205 | "test_classes = test_df['label'].values\n", 206 | "test_class_list = [x - 1 for x in test_classes]\n", 207 | "\n", 208 | "from keras.utils import to_categorical\n", 209 | "\n", 210 | "train_classes = to_categorical(train_class_list)\n", 211 | "test_classes = to_categorical(test_class_list)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 39, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "6393\n", 224 | "{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '0': 27, '1': 28, '2': 29, '3': 30, '4': 31, '5': 32, '6': 33, '7': 34, '8': 35, '9': 36, ' ': 37, ',': 38, ';': 39, '.': 40, '!': 41, '?': 42, ':': 43, \"'\": 44, '\"': 45, '/': 46, '\\\\': 47, '|': 48, '_': 49, '@': 50, '#': 51, '$': 52, '%': 53, '^': 54, '&': 55, '*': 56, '~': 57, '`': 58, '+': 59, '-': 60, '=': 61, '<': 62, '>': 63, '(': 64, ')': 65, '[': 66, ']': 67, '{': 68, '}': 69, 'UNK': 70}\n" 225 | ] 226 | } 227 | ], 228 | "source": [ 229 | "print(len(test_classes))\n", 230 | "print(tk.word_index)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 40, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "70" 242 | ] 243 | }, 244 | "execution_count": 40, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "\n", 251 | "\n", 252 | "vocab_size = len(tk.word_index)\n", 253 | "vocab_size\n", 254 | "\n" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 41, 260 | "metadata": { 261 | "collapsed": true 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "embedding_weights = [] #(71, 70)\n", 266 | "embedding_weights.append(np.zeros(vocab_size)) # first row is pad\n", 267 | "\n", 268 | "for char, i in tk.word_index.items(): # from index 1 to 70\n", 269 | " onehot = np.zeros(vocab_size)\n", 270 | " onehot[i-1] = 1\n", 271 | " embedding_weights.append(onehot)\n", 272 | "embedding_weights = np.array(embedding_weights)\n" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 42, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "name": "stdout", 282 | "output_type": "stream", 283 | "text": [ 284 | "(71, 70)\n" 285 
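The (71, 70) table printed here is the hand-built one-hot lookup that later seeds the Keras Embedding layer: row 0 is the all-zero padding vector, and row i (for i = 1..70) is the one-hot vector of the character mapped to index i in tk.word_index. A small sanity check, assuming the variables defined in the cells above:

# Sketch only: verify the structure of the one-hot character embedding table.
assert embedding_weights.shape == (vocab_size + 1, vocab_size)   # (71, 70)
assert not embedding_weights[0].any()                            # PAD row is all zeros
assert (embedding_weights[1:].argmax(axis=1) == np.arange(vocab_size)).all()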
| ] 286 | }, 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "array([[0., 0., 0., ..., 0., 0., 0.],\n", 291 | " [1., 0., 0., ..., 0., 0., 0.],\n", 292 | " [0., 1., 0., ..., 0., 0., 0.],\n", 293 | " ...,\n", 294 | " [0., 0., 0., ..., 1., 0., 0.],\n", 295 | " [0., 0., 0., ..., 0., 1., 0.],\n", 296 | " [0., 0., 0., ..., 0., 0., 1.]])" 297 | ] 298 | }, 299 | "execution_count": 42, 300 | "metadata": {}, 301 | "output_type": "execute_result" 302 | } 303 | ], 304 | "source": [ 305 | "print(embedding_weights.shape) # first row all 0 for PAD, 69 char, last row for UNK\n", 306 | "embedding_weights" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 43, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "from keras.layers import Input, Embedding, Activation, Flatten, Dense\n", 318 | "from keras.layers import Conv1D, MaxPooling1D, Dropout\n", 319 | "from keras.models import Model" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 44, 325 | "metadata": { 326 | "collapsed": true 327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "# parameter \n", 331 | "input_size = 1014\n", 332 | "# vocab_size = 69\n", 333 | "embedding_size = 70\n", 334 | "conv_layers = [[256, 7, 3], \n", 335 | " [256, 7, 3], \n", 336 | " [256, 3, -1], \n", 337 | " [256, 3, -1], \n", 338 | " [256, 3, -1], \n", 339 | " [256, 3, 3]]\n", 340 | "\n", 341 | "fully_connected_layers = [1024, 1024]\n", 342 | "num_of_classes = 2\n", 343 | "dropout_p = 0.5\n", 344 | "optimizer = 'adam'\n", 345 | "loss = 'categorical_crossentropy'" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 45, 351 | "metadata": { 352 | "collapsed": true 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "# Embedding layer Initialization\n", 357 | "embedding_layer = Embedding(vocab_size+1, \n", 358 | " embedding_size,\n", 359 | " input_length=input_size,\n", 360 | " weights=[embedding_weights])" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 46, 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "name": "stdout", 370 | "output_type": "stream", 371 | "text": [ 372 | "_________________________________________________________________\n", 373 | "Layer (type) Output Shape Param # \n", 374 | "=================================================================\n", 375 | "input (InputLayer) (None, 1014) 0 \n", 376 | "_________________________________________________________________\n", 377 | "embedding_4 (Embedding) (None, 1014, 70) 4970 \n", 378 | "_________________________________________________________________\n", 379 | "conv1d_13 (Conv1D) (None, 1008, 256) 125696 \n", 380 | "_________________________________________________________________\n", 381 | "activation_13 (Activation) (None, 1008, 256) 0 \n", 382 | "_________________________________________________________________\n", 383 | "max_pooling1d_7 (MaxPooling1 (None, 336, 256) 0 \n", 384 | "_________________________________________________________________\n", 385 | "conv1d_14 (Conv1D) (None, 330, 256) 459008 \n", 386 | "_________________________________________________________________\n", 387 | "activation_14 (Activation) (None, 330, 256) 0 \n", 388 | "_________________________________________________________________\n", 389 | "max_pooling1d_8 (MaxPooling1 (None, 110, 256) 0 \n", 390 | "_________________________________________________________________\n", 391 | "conv1d_15 (Conv1D) (None, 108, 256) 196864 \n", 392 | 
"_________________________________________________________________\n", 393 | "activation_15 (Activation) (None, 108, 256) 0 \n", 394 | "_________________________________________________________________\n", 395 | "conv1d_16 (Conv1D) (None, 106, 256) 196864 \n", 396 | "_________________________________________________________________\n", 397 | "activation_16 (Activation) (None, 106, 256) 0 \n", 398 | "_________________________________________________________________\n", 399 | "conv1d_17 (Conv1D) (None, 104, 256) 196864 \n", 400 | "_________________________________________________________________\n", 401 | "activation_17 (Activation) (None, 104, 256) 0 \n", 402 | "_________________________________________________________________\n", 403 | "conv1d_18 (Conv1D) (None, 102, 256) 196864 \n", 404 | "_________________________________________________________________\n", 405 | "activation_18 (Activation) (None, 102, 256) 0 \n", 406 | "_________________________________________________________________\n", 407 | "max_pooling1d_9 (MaxPooling1 (None, 34, 256) 0 \n", 408 | "_________________________________________________________________\n", 409 | "flatten_3 (Flatten) (None, 8704) 0 \n", 410 | "_________________________________________________________________\n", 411 | "dense_7 (Dense) (None, 1024) 8913920 \n", 412 | "_________________________________________________________________\n", 413 | "dropout_5 (Dropout) (None, 1024) 0 \n", 414 | "_________________________________________________________________\n", 415 | "dense_8 (Dense) (None, 1024) 1049600 \n", 416 | "_________________________________________________________________\n", 417 | "dropout_6 (Dropout) (None, 1024) 0 \n", 418 | "_________________________________________________________________\n", 419 | "dense_9 (Dense) (None, 2) 2050 \n", 420 | "=================================================================\n", 421 | "Total params: 11,342,700\n", 422 | "Trainable params: 11,342,700\n", 423 | "Non-trainable params: 0\n", 424 | "_________________________________________________________________\n" 425 | ] 426 | } 427 | ], 428 | "source": [ 429 | "# Model \n", 430 | "\n", 431 | "# Input\n", 432 | "inputs = Input(shape=(input_size,), name='input', dtype='int64') # shape=(?, 1014)\n", 433 | "# Embedding \n", 434 | "x = embedding_layer(inputs)\n", 435 | "# Conv \n", 436 | "for filter_num, filter_size, pooling_size in conv_layers:\n", 437 | " x = Conv1D(filter_num, filter_size)(x) \n", 438 | " x = Activation('relu')(x)\n", 439 | " if pooling_size != -1:\n", 440 | " x = MaxPooling1D(pool_size=pooling_size)(x) # Final shape=(None, 34, 256)\n", 441 | "x = Flatten()(x) # (None, 8704)\n", 442 | "# Fully connected layers \n", 443 | "for dense_size in fully_connected_layers:\n", 444 | " x = Dense(dense_size, activation='relu')(x) # dense_size == 1024\n", 445 | " x = Dropout(dropout_p)(x)\n", 446 | "# Output Layer\n", 447 | "predictions = Dense(num_of_classes, activation='softmax')(x)\n", 448 | "# Build model\n", 449 | "model = Model(inputs=inputs, outputs=predictions)\n", 450 | "model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) # Adam, categorical_crossentropy\n", 451 | "model.summary()" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 49, 457 | "metadata": { 458 | "collapsed": true 459 | }, 460 | "outputs": [], 461 | "source": [ 462 | "# 1000 training samples and 100 testing samples\n", 463 | "indices = np.arange(train_data.shape[0])\n", 464 | "np.random.shuffle(indices)\n", 465 | "\n", 466 | "x_train = 
train_data[indices]\n", 467 | "y_train = train_classes[indices]\n", 468 | "\n", 469 | "x_test = test_data\n", 470 | "y_test = test_classes" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 50, 476 | "metadata": {}, 477 | "outputs": [ 478 | { 479 | "name": "stdout", 480 | "output_type": "stream", 481 | "text": [ 482 | "Train on 25569 samples, validate on 6393 samples\n", 483 | "Epoch 1/10\n", 484 | " - 13s - loss: 0.1876 - acc: 0.9315 - val_loss: 0.1360 - val_acc: 0.9496\n", 485 | "Epoch 2/10\n", 486 | " - 13s - loss: 0.1238 - acc: 0.9555 - val_loss: 0.1172 - val_acc: 0.9571\n", 487 | "Epoch 3/10\n", 488 | " - 13s - loss: 0.0919 - acc: 0.9678 - val_loss: 0.1198 - val_acc: 0.9546\n", 489 | "Epoch 4/10\n", 490 | " - 13s - loss: 0.0641 - acc: 0.9774 - val_loss: 0.1294 - val_acc: 0.9537\n", 491 | "Epoch 5/10\n", 492 | " - 13s - loss: 0.0593 - acc: 0.9795 - val_loss: 0.1109 - val_acc: 0.9629\n", 493 | "Epoch 6/10\n", 494 | " - 13s - loss: 0.0362 - acc: 0.9870 - val_loss: 0.1243 - val_acc: 0.9611\n", 495 | "Epoch 7/10\n", 496 | " - 13s - loss: 0.0374 - acc: 0.9867 - val_loss: 0.1432 - val_acc: 0.9604\n", 497 | "Epoch 8/10\n", 498 | " - 13s - loss: 0.0213 - acc: 0.9923 - val_loss: 0.1498 - val_acc: 0.9620\n", 499 | "Epoch 9/10\n", 500 | " - 13s - loss: 0.0153 - acc: 0.9947 - val_loss: 0.1718 - val_acc: 0.9617\n", 501 | "Epoch 10/10\n", 502 | " - 13s - loss: 0.0126 - acc: 0.9960 - val_loss: 0.3250 - val_acc: 0.9565\n" 503 | ] 504 | }, 505 | { 506 | "data": { 507 | "text/plain": [ 508 | "" 509 | ] 510 | }, 511 | "execution_count": 50, 512 | "metadata": {}, 513 | "output_type": "execute_result" 514 | } 515 | ], 516 | "source": [ 517 | "\n", 518 | "\n", 519 | "# Training\n", 520 | "model.fit(x_train, y_train,\n", 521 | " validation_data=(x_test, y_test),\n", 522 | " batch_size=128,\n", 523 | " epochs=10,\n", 524 | " verbose=2)\n", 525 | "\n" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "metadata": { 532 | "collapsed": true 533 | }, 534 | "outputs": [], 535 | "source": [] 536 | } 537 | ], 538 | "metadata": { 539 | "kernelspec": { 540 | "display_name": "Python 3", 541 | "language": "python", 542 | "name": "python3" 543 | }, 544 | "language_info": { 545 | "codemirror_mode": { 546 | "name": "ipython", 547 | "version": 3 548 | }, 549 | "file_extension": ".py", 550 | "mimetype": "text/x-python", 551 | "name": "python", 552 | "nbconvert_exporter": "python", 553 | "pygments_lexer": "ipython3", 554 | "version": "3.6.8" 555 | } 556 | }, 557 | "nbformat": 4, 558 | "nbformat_minor": 1 559 | } 560 | -------------------------------------------------------------------------------- /CNN/.ipynb_checkpoints/cnn_nlp_nb-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "['test_tweets_anuFYb8.csv', 'train_E6oV3lV.csv']\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "# This Python 3 environment comes with many helpful analytics libraries installed\n", 21 | "# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n", 22 | "# For example, here's several helpful packages to load in \n", 23 | "\n", 24 | "import numpy as np # linear algebra\n", 25 | "import pandas 
as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 26 | "\n", 27 | "# Input data files are available in the \"../input/\" directory.\n", 28 | "# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\n", 29 | "\n", 30 | "import os\n", 31 | "print(os.listdir(\"../input\"))\n", 32 | "\n", 33 | "# Any results you write to the current directory are saved as output.\n", 34 | "\n", 35 | "# import keras\n", 36 | "# import tensorflow\n", 37 | "\n", 38 | "# print(keras.__version__)\n", 39 | "# print(tensorflow.__version__)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import numpy as np\n", 49 | "import pandas as pd" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": { 56 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 57 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 58 | }, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | " id ... tweet\n", 65 | "0 1 ... @user when a father is dysfunctional and is s...\n", 66 | "1 2 ... @user @user thanks for #lyft credit i can't us...\n", 67 | "2 3 ... bihday your majesty\n", 68 | "3 4 ... #model i love u take with u all the time in ...\n", 69 | "4 5 ... factsguide: society now #motivation\n", 70 | "\n", 71 | "[5 rows x 3 columns]\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "df = pd.read_csv('../input/train_E6oV3lV.csv')\n", 77 | "df['label'] = df['label'].map({0: 2, 1: 1})\n", 78 | "print(df.head())\n", 79 | "# print(df['label'].head())\n", 80 | "df = df.drop('id', axis=1)\n", 81 | "\n", 82 | "# print(df[:100])\n", 83 | "# zero = 0\n", 84 | "# one = 0\n", 85 | "# for i in df['label']:\n", 86 | "# if(i==0):\n", 87 | "# zero+=1\n", 88 | "# else:\n", 89 | "# one+=1\n", 90 | " \n", 91 | "# print(zero,one)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "name": "stderr", 101 | "output_type": "stream", 102 | "text": [ 103 | "Using TensorFlow backend.\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "from keras.preprocessing.text import Tokenizer\n", 109 | "from keras.preprocessing.sequence import pad_sequences\n", 110 | "from sklearn.model_selection import train_test_split\n", 111 | "import copy" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 5, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "25569\n", 124 | "6393\n", 125 | " label tweet\n", 126 | "11704 2 @user #diwali !!! #besteduonline !!! contac...\n", 127 | "6885 2 @user i was at this trump rally last night. ma...\n", 128 | "744 2 i don't know where you're going but do you hav...\n", 129 | "13467 2 watch on #periscope: the reading of victims na...\n", 130 | "31250 2 9 months later.. 
i am back to my pre pregnancy...\n", 131 | "\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "\n", 137 | "train, test = train_test_split(df, test_size=0.2)\n", 138 | "print(len(train))\n", 139 | "print(len(test))\n", 140 | "print(train.head())\n", 141 | "\n", 142 | "print(type(train))" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "train_df = copy.deepcopy(train)\n", 152 | "test_df = copy.deepcopy(test)\n", 153 | "\n", 154 | "# concatenate column 1 and column 2 as one text\n", 155 | "# print(train_df[1])\n", 156 | "\n", 157 | "# convert string to lower case\n", 158 | "train_texts = train_df['tweet'].values\n", 159 | "train_texts = [s.lower() for s in train_texts]\n", 160 | "\n", 161 | "test_texts = test_df['tweet'].values\n", 162 | "test_texts = [s.lower() for s in test_texts]\n", 163 | "\n", 164 | "# print(train_texts.head())\n", 165 | "# print(train_texts)\n" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 7, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "\n", 175 | "# =======================Convert string to index================\n", 176 | "# Tokenizer\n", 177 | "tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')\n", 178 | "tk.fit_on_texts(train_texts)\n", 179 | "# If we already have a character list, then replace the tk.word_index\n", 180 | "# If not, just skip below part\n", 181 | "\n", 182 | "# -----------------------Skip part start--------------------------\n", 183 | "# construct a new vocabulary\n", 184 | "alphabet = \"abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\\\"/\\\\|_@#$%^&*~`+-=<>()[]{}\"\n", 185 | "char_dict = {}\n", 186 | "for i, char in enumerate(alphabet):\n", 187 | " char_dict[char] = i + 1\n", 188 | "\n", 189 | "# Use char_dict to replace the tk.word_index\n", 190 | "tk.word_index = char_dict.copy()\n", 191 | "# Add 'UNK' to the vocabulary\n", 192 | "tk.word_index[tk.oov_token] = max(char_dict.values()) + 1\n", 193 | "# -----------------------Skip part end----------------------------\n", 194 | "\n", 195 | "# Convert string to index\n", 196 | "train_sequences = tk.texts_to_sequences(train_texts)\n", 197 | "test_texts = tk.texts_to_sequences(test_texts)\n", 198 | "\n", 199 | "# Padding\n", 200 | "train_data = pad_sequences(train_sequences, maxlen=1014, padding='post')\n", 201 | "test_data = pad_sequences(test_texts, maxlen=1014, padding='post')\n", 202 | "\n", 203 | "# Convert to numpy array\n", 204 | "train_data = np.array(train_data, dtype='float32')\n", 205 | "test_data = np.array(test_data, dtype='float32')\n", 206 | "\n", 207 | "# =======================Get classes================\n", 208 | "train_classes = train_df['label'].values\n", 209 | "train_class_list = [x - 1 for x in train_classes]\n", 210 | "\n", 211 | "test_classes = test_df['label'].values\n", 212 | "test_class_list = [x - 1 for x in test_classes]\n", 213 | "\n", 214 | "from keras.utils import to_categorical\n", 215 | "\n", 216 | "train_classes = to_categorical(train_class_list)\n", 217 | "test_classes = to_categorical(test_class_list)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 8, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | "6393\n", 230 | "{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 
22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '0': 27, '1': 28, '2': 29, '3': 30, '4': 31, '5': 32, '6': 33, '7': 34, '8': 35, '9': 36, ' ': 37, ',': 38, ';': 39, '.': 40, '!': 41, '?': 42, ':': 43, \"'\": 44, '\"': 45, '/': 46, '\\\\': 47, '|': 48, '_': 49, '@': 50, '#': 51, '$': 52, '%': 53, '^': 54, '&': 55, '*': 56, '~': 57, '`': 58, '+': 59, '-': 60, '=': 61, '<': 62, '>': 63, '(': 64, ')': 65, '[': 66, ']': 67, '{': 68, '}': 69, 'UNK': 70}\n" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "print(len(test_classes))\n", 236 | "print(tk.word_index)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 9, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "70" 248 | ] 249 | }, 250 | "execution_count": 9, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "\n", 257 | "\n", 258 | "vocab_size = len(tk.word_index)\n", 259 | "vocab_size\n", 260 | "\n" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 10, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "embedding_weights = [] #(71, 70)\n", 270 | "embedding_weights.append(np.zeros(vocab_size)) # first row is pad\n", 271 | "\n", 272 | "for char, i in tk.word_index.items(): # from index 1 to 70\n", 273 | " onehot = np.zeros(vocab_size)\n", 274 | " onehot[i-1] = 1\n", 275 | " embedding_weights.append(onehot)\n", 276 | "embedding_weights = np.array(embedding_weights)\n" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 11, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "(71, 70)\n" 289 | ] 290 | }, 291 | { 292 | "data": { 293 | "text/plain": [ 294 | "array([[0., 0., 0., ..., 0., 0., 0.],\n", 295 | " [1., 0., 0., ..., 0., 0., 0.],\n", 296 | " [0., 1., 0., ..., 0., 0., 0.],\n", 297 | " ...,\n", 298 | " [0., 0., 0., ..., 1., 0., 0.],\n", 299 | " [0., 0., 0., ..., 0., 1., 0.],\n", 300 | " [0., 0., 0., ..., 0., 0., 1.]])" 301 | ] 302 | }, 303 | "execution_count": 11, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "print(embedding_weights.shape) # first row all 0 for PAD, 69 char, last row for UNK\n", 310 | "embedding_weights" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 12, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "from keras.layers import Input, Embedding, Activation, Flatten, Dense\n", 320 | "from keras.layers import Conv1D, MaxPooling1D, Dropout\n", 321 | "from keras.models import Model" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 13, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# parameter \n", 331 | "input_size = 1014\n", 332 | "# vocab_size = 69\n", 333 | "embedding_size = 70\n", 334 | "conv_layers = [[256, 7, 3], \n", 335 | " [256, 7, 3], \n", 336 | " [256, 3, -1], \n", 337 | " [256, 3, -1], \n", 338 | " [256, 3, -1], \n", 339 | " [256, 3, 3]]\n", 340 | "\n", 341 | "fully_connected_layers = [1024, 1024]\n", 342 | "num_of_classes = 2\n", 343 | "dropout_p = 0.5\n", 344 | "optimizer = 'adam'\n", 345 | "loss = 'categorical_crossentropy'" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 14, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "# Embedding layer Initialization\n", 355 | "embedding_layer = Embedding(vocab_size+1, \n", 356 | " embedding_size,\n", 357 | " 
input_length=input_size,\n", 358 | " weights=[embedding_weights])" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 15, 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", 371 | "Instructions for updating:\n", 372 | "Colocations handled automatically by placer.\n", 373 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", 374 | "Instructions for updating:\n", 375 | "Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.\n", 376 | "_________________________________________________________________\n", 377 | "Layer (type) Output Shape Param # \n", 378 | "=================================================================\n", 379 | "input (InputLayer) (None, 1014) 0 \n", 380 | "_________________________________________________________________\n", 381 | "embedding_1 (Embedding) (None, 1014, 70) 4970 \n", 382 | "_________________________________________________________________\n", 383 | "conv1d_1 (Conv1D) (None, 1008, 256) 125696 \n", 384 | "_________________________________________________________________\n", 385 | "activation_1 (Activation) (None, 1008, 256) 0 \n", 386 | "_________________________________________________________________\n", 387 | "max_pooling1d_1 (MaxPooling1 (None, 336, 256) 0 \n", 388 | "_________________________________________________________________\n", 389 | "conv1d_2 (Conv1D) (None, 330, 256) 459008 \n", 390 | "_________________________________________________________________\n", 391 | "activation_2 (Activation) (None, 330, 256) 0 \n", 392 | "_________________________________________________________________\n", 393 | "max_pooling1d_2 (MaxPooling1 (None, 110, 256) 0 \n", 394 | "_________________________________________________________________\n", 395 | "conv1d_3 (Conv1D) (None, 108, 256) 196864 \n", 396 | "_________________________________________________________________\n", 397 | "activation_3 (Activation) (None, 108, 256) 0 \n", 398 | "_________________________________________________________________\n", 399 | "conv1d_4 (Conv1D) (None, 106, 256) 196864 \n", 400 | "_________________________________________________________________\n", 401 | "activation_4 (Activation) (None, 106, 256) 0 \n", 402 | "_________________________________________________________________\n", 403 | "conv1d_5 (Conv1D) (None, 104, 256) 196864 \n", 404 | "_________________________________________________________________\n", 405 | "activation_5 (Activation) (None, 104, 256) 0 \n", 406 | "_________________________________________________________________\n", 407 | "conv1d_6 (Conv1D) (None, 102, 256) 196864 \n", 408 | "_________________________________________________________________\n", 409 | "activation_6 (Activation) (None, 102, 256) 0 \n", 410 | "_________________________________________________________________\n", 411 | "max_pooling1d_3 (MaxPooling1 (None, 34, 256) 0 \n", 412 | "_________________________________________________________________\n", 413 | "flatten_1 (Flatten) (None, 8704) 0 \n", 414 | "_________________________________________________________________\n", 
415 | "dense_1 (Dense) (None, 1024) 8913920 \n", 416 | "_________________________________________________________________\n", 417 | "dropout_1 (Dropout) (None, 1024) 0 \n", 418 | "_________________________________________________________________\n", 419 | "dense_2 (Dense) (None, 1024) 1049600 \n", 420 | "_________________________________________________________________\n", 421 | "dropout_2 (Dropout) (None, 1024) 0 \n", 422 | "_________________________________________________________________\n", 423 | "dense_3 (Dense) (None, 2) 2050 \n", 424 | "=================================================================\n", 425 | "Total params: 11,342,700\n", 426 | "Trainable params: 11,342,700\n", 427 | "Non-trainable params: 0\n", 428 | "_________________________________________________________________\n" 429 | ] 430 | } 431 | ], 432 | "source": [ 433 | "# Model \n", 434 | "\n", 435 | "# Input\n", 436 | "inputs = Input(shape=(input_size,), name='input', dtype='int64') # shape=(?, 1014)\n", 437 | "# Embedding \n", 438 | "x = embedding_layer(inputs)\n", 439 | "# Conv \n", 440 | "for filter_num, filter_size, pooling_size in conv_layers:\n", 441 | " x = Conv1D(filter_num, filter_size)(x) \n", 442 | " x = Activation('relu')(x)\n", 443 | " if pooling_size != -1:\n", 444 | " x = MaxPooling1D(pool_size=pooling_size)(x) # Final shape=(None, 34, 256)\n", 445 | "x = Flatten()(x) # (None, 8704)\n", 446 | "# Fully connected layers \n", 447 | "for dense_size in fully_connected_layers:\n", 448 | " x = Dense(dense_size, activation='relu')(x) # dense_size == 1024\n", 449 | " x = Dropout(dropout_p)(x)\n", 450 | "# Output Layer\n", 451 | "predictions = Dense(num_of_classes, activation='softmax')(x)\n", 452 | "# Build model\n", 453 | "model = Model(inputs=inputs, outputs=predictions)\n", 454 | "model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) # Adam, categorical_crossentropy\n", 455 | "model.summary()" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 16, 461 | "metadata": {}, 462 | "outputs": [ 463 | { 464 | "name": "stdout", 465 | "output_type": "stream", 466 | "text": [ 467 | "[37. 50. 21. ... 0. 0. 0.]\n", 468 | "[50. 21. 19. ... 0. 0. 
0.]\n" 469 | ] 470 | } 471 | ], 472 | "source": [ 473 | "print(train_data[0])\n", 474 | "print(train_data[1])\n", 475 | "\n", 476 | "# print(train_classes[1])\n", 477 | "\n", 478 | "# for i in range(100):\n", 479 | "# print(train_classes[i])\n", 480 | "# for i in train_data[0]:\n", 481 | "# print(i)\n", 482 | " \n", 483 | "# print(train)\n", 484 | "# for i in train_data[0]:\n", 485 | "# print(i)" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 17, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "# 1000 training samples and 100 testing samples\n", 495 | "indices = np.arange(train_data.shape[0])\n", 496 | "np.random.shuffle(indices)\n", 497 | "\n", 498 | "x_train = train_data[indices]\n", 499 | "y_train = train_classes[indices]\n", 500 | "\n", 501 | "x_test = test_data\n", 502 | "y_test = test_classes" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 18, 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", 515 | "Instructions for updating:\n", 516 | "Use tf.cast instead.\n", 517 | "Train on 25569 samples, validate on 6393 samples\n", 518 | "Epoch 1/10\n", 519 | " - 16s - loss: 0.2343 - acc: 0.9287 - val_loss: 0.1542 - val_acc: 0.9321\n", 520 | "Epoch 2/10\n", 521 | " - 13s - loss: 0.1346 - acc: 0.9514 - val_loss: 0.1125 - val_acc: 0.9582\n", 522 | "Epoch 3/10\n", 523 | " - 13s - loss: 0.0975 - acc: 0.9664 - val_loss: 0.1058 - val_acc: 0.9657\n", 524 | "Epoch 4/10\n", 525 | " - 13s - loss: 0.0690 - acc: 0.9767 - val_loss: 0.0992 - val_acc: 0.9645\n", 526 | "Epoch 5/10\n", 527 | " - 13s - loss: 0.0449 - acc: 0.9847 - val_loss: 0.1074 - val_acc: 0.9667\n", 528 | "Epoch 6/10\n", 529 | " - 13s - loss: 0.0343 - acc: 0.9895 - val_loss: 0.1266 - val_acc: 0.9692\n", 530 | "Epoch 7/10\n", 531 | " - 13s - loss: 0.0278 - acc: 0.9907 - val_loss: 0.1406 - val_acc: 0.9626\n", 532 | "Epoch 8/10\n", 533 | " - 13s - loss: 0.0242 - acc: 0.9926 - val_loss: 0.1558 - val_acc: 0.9582\n", 534 | "Epoch 9/10\n", 535 | " - 13s - loss: 0.0174 - acc: 0.9944 - val_loss: 0.1468 - val_acc: 0.9651\n", 536 | "Epoch 10/10\n", 537 | " - 13s - loss: 0.0200 - acc: 0.9932 - val_loss: 0.1749 - val_acc: 0.9596\n" 538 | ] 539 | }, 540 | { 541 | "data": { 542 | "text/plain": [ 543 | "" 544 | ] 545 | }, 546 | "execution_count": 18, 547 | "metadata": {}, 548 | "output_type": "execute_result" 549 | } 550 | ], 551 | "source": [ 552 | "\n", 553 | "\n", 554 | "# Training\n", 555 | "model.fit(x_train, y_train,\n", 556 | " validation_data=(x_test, y_test),\n", 557 | " batch_size=128,\n", 558 | " epochs=10,\n", 559 | " verbose=2)\n", 560 | "\n" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 19, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "train_data_nb = copy.deepcopy(train_data)\n", 570 | "train_classes_nb = copy.deepcopy(train_classes)\n", 571 | "\n", 572 | "test_data_nb = copy.deepcopy(test_data)\n", 573 | "test_classes_nb = copy.deepcopy(test_classes)" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 20, 579 | "metadata": {}, 580 | "outputs": [ 581 | { 582 | "name": "stdout", 583 | "output_type": "stream", 584 | "text": [ 585 | "{0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 
14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0}\n" 586 | ] 587 | } 588 | ], 589 | "source": [ 590 | "# print(train_data_nb)\n", 591 | "set1 = set()\n", 592 | "\n", 593 | "\n", 594 | "for i in train_data_nb:\n", 595 | " for j in i:\n", 596 | " set1.add(j)\n", 597 | " \n", 598 | "print((set1))" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 21, 604 | "metadata": {}, 605 | "outputs": [ 606 | { 607 | "name": "stdout", 608 | "output_type": "stream", 609 | "text": [ 610 | "69\n", 611 | "0\n" 612 | ] 613 | } 614 | ], 615 | "source": [ 616 | "print(len(set1))\n", 617 | "total_vocab = len(set1)\n", 618 | "\n", 619 | "set2 = set()\n", 620 | "train_classes_nb_l = len(train_classes_nb) * [0]\n", 621 | "test_classes_nb_l = len(test_classes_nb) * [0]\n", 622 | "\n", 623 | "for idx, i in enumerate(train_classes_nb):\n", 624 | " if(i[0]==0.0 and i[1]==1.0):\n", 625 | " train_classes_nb_l[idx]=copy.deepcopy(int(0))\n", 626 | " elif(i[0]==1.0 and i[1]==0.0):\n", 627 | " train_classes_nb_l[idx]=copy.deepcopy(int(1))\n", 628 | " else:\n", 629 | " print(i)\n", 630 | "\n", 631 | "for idx, i in enumerate(test_classes_nb):\n", 632 | " if(i[0]==0.0 and i[1]==1.0):\n", 633 | " test_classes_nb_l[idx]=copy.deepcopy(int(0))\n", 634 | " elif(i[0]==1.0 and i[1]==0.0):\n", 635 | " test_classes_nb_l[idx]=copy.deepcopy(int(1))\n", 636 | " else:\n", 637 | " print(i)\n", 638 | " \n", 639 | "print(len(set2))" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 22, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [ 648 | "dict1=dict()\n", 649 | "dict1[0]=dict()\n", 650 | "dict1[1]=dict()" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": 23, 656 | "metadata": {}, 657 | "outputs": [ 658 | { 659 | "name": "stdout", 660 | "output_type": "stream", 661 | "text": [ 662 | "0 {'freq': 0, 'prob_class': 0, 0.0: 0, 1.0: 0, 2.0: 0, 3.0: 0, 4.0: 0, 5.0: 0, 6.0: 0, 7.0: 0, 8.0: 0, 9.0: 0, 10.0: 0, 11.0: 0, 12.0: 0, 13.0: 0, 14.0: 0, 15.0: 0, 16.0: 0, 17.0: 0, 18.0: 0, 19.0: 0, 20.0: 0, 21.0: 0, 22.0: 0, 23.0: 0, 24.0: 0, 25.0: 0, 26.0: 0, 27.0: 0, 28.0: 0, 29.0: 0, 30.0: 0, 31.0: 0, 32.0: 0, 33.0: 0, 34.0: 0, 35.0: 0, 36.0: 0, 37.0: 0, 38.0: 0, 39.0: 0, 40.0: 0, 41.0: 0, 42.0: 0, 43.0: 0, 44.0: 0, 45.0: 0, 46.0: 0, 47.0: 0, 48.0: 0, 49.0: 0, 50.0: 0, 51.0: 0, 52.0: 0, 53.0: 0, 54.0: 0, 55.0: 0, 56.0: 0, 57.0: 0, 58.0: 0, 59.0: 0, 60.0: 0, 61.0: 0, 64.0: 0, 65.0: 0, 66.0: 0, 67.0: 0, 68.0: 0, 69.0: 0, 70.0: 0}\n", 663 | "1 {'freq': 0, 'prob_class': 0, 0.0: 0, 1.0: 0, 2.0: 0, 3.0: 0, 4.0: 0, 5.0: 0, 6.0: 0, 7.0: 0, 8.0: 0, 9.0: 0, 10.0: 0, 11.0: 0, 12.0: 0, 13.0: 0, 14.0: 0, 15.0: 0, 16.0: 0, 17.0: 0, 18.0: 0, 19.0: 0, 20.0: 0, 21.0: 0, 22.0: 0, 23.0: 0, 24.0: 0, 25.0: 0, 26.0: 0, 27.0: 0, 28.0: 0, 29.0: 0, 30.0: 0, 31.0: 0, 32.0: 0, 33.0: 0, 34.0: 0, 35.0: 0, 36.0: 0, 37.0: 0, 38.0: 0, 39.0: 0, 40.0: 0, 41.0: 0, 42.0: 0, 43.0: 0, 44.0: 0, 45.0: 0, 46.0: 0, 47.0: 0, 48.0: 0, 49.0: 0, 50.0: 0, 51.0: 0, 52.0: 0, 53.0: 0, 54.0: 0, 55.0: 0, 56.0: 0, 57.0: 0, 58.0: 0, 59.0: 0, 60.0: 0, 61.0: 0, 64.0: 0, 65.0: 0, 66.0: 0, 67.0: 0, 68.0: 0, 69.0: 0, 70.0: 0}\n" 664 | ] 665 | } 666 | ], 667 | "source": [ 668 | "for key,val in dict1.items():\n", 669 | " dict1[key][\"freq\"] = 0\n", 
670 | " dict1[key][\"prob_class\"] = 0\n", 671 | " for j in set1:\n", 672 | " dict1[key][j] = 0\n", 673 | " print(key,dict1[key])\n", 674 | " \n" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": 24, 680 | "metadata": {}, 681 | "outputs": [ 682 | { 683 | "name": "stdout", 684 | "output_type": "stream", 685 | "text": [ 686 | "{0: {'freq': 24093654, 'prob_class': 23761, 0.0: 22087777, 1.0: 115637, 2.0: 25252, 3.0: 33900, 4.0: 51837, 5.0: 163930, 6.0: 32893, 7.0: 36043, 8.0: 59369, 9.0: 100088, 10.0: 3825, 11.0: 17322, 12.0: 66879, 13.0: 42361, 14.0: 88241, 15.0: 113170, 16.0: 31965, 17.0: 1069, 18.0: 87011, 19.0: 97758, 20.0: 108461, 21.0: 56213, 22.0: 17425, 23.0: 29233, 24.0: 3352, 25.0: 42683, 26.0: 1899, 27.0: 2305, 28.0: 2600, 29.0: 2121, 30.0: 1081, 31.0: 1005, 32.0: 748, 33.0: 1250, 34.0: 489, 35.0: 427, 36.0: 578, 37.0: 358139, 38.0: 4544, 39.0: 1672, 40.0: 18668, 41.0: 11219, 42.0: 1803, 43.0: 2397, 44.0: 6222, 45.0: 1254, 46.0: 524, 47.0: 26, 48.0: 152, 49.0: 649, 50.0: 13087, 51.0: 57560, 52.0: 170, 53.0: 195, 54.0: 47, 55.0: 1529, 56.0: 190, 57.0: 110, 58.0: 8, 59.0: 148, 60.0: 2265, 61.0: 106, 64.0: 623, 65.0: 832, 66.0: 81, 67.0: 79, 68.0: 5, 69.0: 4, 70.0: 81149}, 1: {'freq': 1833312, 'prob_class': 1808, 0.0: 1669726, 1.0: 10363, 2.0: 2253, 3.0: 3816, 4.0: 3627, 5.0: 14133, 6.0: 2115, 7.0: 2422, 8.0: 4827, 9.0: 9398, 10.0: 412, 11.0: 1315, 12.0: 5558, 13.0: 4100, 14.0: 7303, 15.0: 8679, 16.0: 3078, 17.0: 94, 18.0: 8335, 19.0: 9657, 20.0: 9506, 21.0: 4938, 22.0: 1122, 23.0: 2366, 24.0: 266, 25.0: 2511, 26.0: 157, 27.0: 189, 28.0: 176, 29.0: 180, 30.0: 37, 31.0: 72, 32.0: 27, 33.0: 94, 34.0: 72, 35.0: 20, 36.0: 37, 37.0: 27256, 38.0: 538, 39.0: 274, 40.0: 1318, 41.0: 385, 42.0: 270, 43.0: 195, 44.0: 599, 45.0: 233, 46.0: 61, 47.0: 0, 48.0: 17, 49.0: 9, 50.0: 1503, 51.0: 3913, 52.0: 11, 53.0: 4, 54.0: 0, 55.0: 255, 56.0: 23, 57.0: 11, 58.0: 0, 59.0: 14, 60.0: 207, 61.0: 20, 64.0: 26, 65.0: 26, 66.0: 0, 67.0: 0, 68.0: 0, 69.0: 0, 70.0: 3163}}\n" 687 | ] 688 | } 689 | ], 690 | "source": [ 691 | "for i in range(len(train_data_nb)):\n", 692 | " dict1[train_classes_nb_l[i]]['freq'] += len(train_data_nb[i]) \n", 693 | " dict1[train_classes_nb_l[i]]['prob_class'] +=1\n", 694 | " for j in train_data_nb[i]:\n", 695 | " dict1[train_classes_nb_l[i]][j] +=1\n", 696 | "# print(len(train_data_nb[i]))\n", 697 | "# print(train_classes_nb_l[i])\n", 698 | "\n", 699 | "print(dict1)" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": 25, 705 | "metadata": {}, 706 | "outputs": [ 707 | { 708 | "name": "stdout", 709 | "output_type": "stream", 710 | "text": [ 711 | "69\n", 712 | "25569\n" 713 | ] 714 | } 715 | ], 716 | "source": [ 717 | "print(total_vocab)\n", 718 | "total_data_len = len(train_data_nb)\n", 719 | "print(total_data_len)\n", 720 | "\n", 721 | "\n", 722 | "# p(class) = dict[class][prob_class]/total_data_len\n", 723 | "# p(word/class) = (dict[class][word]+1)/(dict[class][freq]+total_vocab)\n", 724 | "\n", 725 | "\n" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": 26, 731 | "metadata": {}, 732 | "outputs": [], 733 | "source": [ 734 | "def prob_fun(op_class,sentence):\n", 735 | " result = dict1[op_class]['prob_class']/total_data_len\n", 736 | " for i in sentence:\n", 737 | " result *= ((dict1[op_class][i]+1)/(dict1[op_class]['freq']+total_vocab))\n", 738 | " return result" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": 27, 744 | "metadata": {}, 745 | "outputs": [ 746 | { 747 | 
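prob_fun above multiplies roughly 1,014 per-character probabilities (one per padded position), so the product can underflow toward 0.0; summing log-probabilities is the standard, numerically safer form of the same Laplace-smoothed model. A sketch reusing dict1, total_data_len and total_vocab from the cells above (the .get(i, 0) guard for characters unseen in training is an addition here):

import math

# Sketch only: log-space version of prob_fun to avoid floating-point underflow.
def log_prob_fun(op_class, sentence):
    result = math.log(dict1[op_class]['prob_class'] / total_data_len)
    for i in sentence:
        count = dict1[op_class].get(i, 0)   # unseen characters fall back to count 0
        result += math.log((count + 1) / (dict1[op_class]['freq'] + total_vocab))
    return result

# The class comparison then becomes log_prob_fun(0, s) > log_prob_fun(1, s).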
"name": "stdout", 748 | "output_type": "stream", 749 | "text": [ 750 | "Accuracy is 79.08650086031597\n" 751 | ] 752 | } 753 | ], 754 | "source": [ 755 | "correct = 0\n", 756 | "for idx, sentence in enumerate(test_data_nb):\n", 757 | " res0 = prob_fun(0,sentence)\n", 758 | " res1 = prob_fun(1,sentence) \n", 759 | " if(res0>res1 and test_classes_nb_l[idx] ==0):\n", 760 | " correct+=1\n", 761 | " elif(res1>res0 and test_classes_nb_l[idx] ==1):\n", 762 | " correct+=1\n", 763 | " \n", 764 | "accuracy = (correct/len(test_data_nb))*100\n", 765 | "print(\"Accuracy is \",accuracy)" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": 28, 771 | "metadata": {}, 772 | "outputs": [], 773 | "source": [] 774 | } 775 | ], 776 | "metadata": { 777 | "kernelspec": { 778 | "display_name": "Python 3", 779 | "language": "python", 780 | "name": "python3" 781 | }, 782 | "language_info": { 783 | "codemirror_mode": { 784 | "name": "ipython", 785 | "version": 3 786 | }, 787 | "file_extension": ".py", 788 | "mimetype": "text/x-python", 789 | "name": "python", 790 | "nbconvert_exporter": "python", 791 | "pygments_lexer": "ipython3", 792 | "version": "3.5.2" 793 | } 794 | }, 795 | "nbformat": 4, 796 | "nbformat_minor": 1 797 | } 798 | -------------------------------------------------------------------------------- /CNN/cnn-keras.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "['test_tweets_anuFYb8.csv', 'train_E6oV3lV.csv']\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "# This Python 3 environment comes with many helpful analytics libraries installed\n", 21 | "# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n", 22 | "# For example, here's several helpful packages to load in \n", 23 | "\n", 24 | "import numpy as np # linear algebra\n", 25 | "import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n", 26 | "\n", 27 | "# Input data files are available in the \"../input/\" directory.\n", 28 | "# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\n", 29 | "\n", 30 | "import os\n", 31 | "print(os.listdir(\"../input\"))\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 39 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "df = pd.read_csv('../input/train_E6oV3lV.csv')\n", 44 | "df['label'] = df['label'].map({0: 2, 1: 1})\n", 45 | "df = df.drop('id', axis=1)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stderr", 55 | "output_type": "stream", 56 | "text": [ 57 | "Using TensorFlow backend.\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "from keras.preprocessing.text import Tokenizer\n", 63 | "from keras.preprocessing.sequence import pad_sequences\n", 64 | "from sklearn.model_selection import train_test_split\n", 65 | "import copy" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "train, test = train_test_split(df, test_size=0.2)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 5, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "train_df = copy.deepcopy(train)\n", 84 | "test_df = copy.deepcopy(test)\n", 85 | "\n", 86 | "train_texts = train_df['tweet'].values\n", 87 | "train_texts = [s.lower() for s in train_texts]\n", 88 | "\n", 89 | "test_texts = test_df['tweet'].values\n", 90 | "test_texts = [s.lower() for s in test_texts]\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 6, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')\n", 100 | "tk.fit_on_texts(train_texts)\n", 101 | "\n", 102 | "alphabet = \"abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\\\"/\\\\|_@#$%^&*~`+-=<>()[]{}\"\n", 103 | "\n", 104 | "char_dict = {}\n", 105 | "for i, char in enumerate(alphabet):\n", 106 | " char_dict[char] = i + 1\n", 107 | "tk.word_index = char_dict.copy()\n", 108 | "tk.word_index[tk.oov_token] = max(char_dict.values()) + 1\n", 109 | "\n", 110 | "train_sequences = tk.texts_to_sequences(train_texts)\n", 111 | "test_texts = tk.texts_to_sequences(test_texts)\n", 112 | "\n", 113 | "train_data = pad_sequences(train_sequences, maxlen=150, padding='post')\n", 114 | "test_data = pad_sequences(test_texts, maxlen=150, padding='post')\n", 115 | "\n", 116 | "train_data = np.array(train_data, dtype='float32')\n", 117 | "test_data = np.array(test_data, dtype='float32')\n", 118 | "\n", 119 | "train_classes = train_df['label'].values\n", 120 | "train_class_list = [x - 1 for x in train_classes]\n", 121 | "\n", 122 | "test_classes = test_df['label'].values\n", 123 | "test_class_list = [x - 1 for x in test_classes]\n", 124 | "\n", 125 | "from keras.utils import to_categorical\n", 126 | "\n", 127 | "train_classes = to_categorical(train_class_list)\n", 128 | "test_classes = to_categorical(test_class_list)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 7, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "vocab_size = len(tk.word_index)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 8, 143 | "metadata": {}, 
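This final notebook pads to 150 characters instead of the 1014 used in the checkpoint versions, which shrinks the flattened feature vector from 8704 to 512 and the parameter count from about 11.3M to about 2.95M (compare the two model summaries). Tweets are short, so a quick length check, reusing train_texts from the cell above, is enough to confirm how much of a 1014-wide input would be padding:

# Sketch only: inspect character-length percentiles to sanity-check maxlen=150.
lengths = np.array([len(t) for t in train_texts])
print(lengths.min(), int(np.percentile(lengths, 95)), lengths.max())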
144 | "outputs": [], 145 | "source": [ 146 | "embedding_weights = [] #(71, 70)\n", 147 | "embedding_weights.append(np.zeros(vocab_size)) # first row is pad\n", 148 | "\n", 149 | "for char, i in tk.word_index.items(): # from index 1 to 70\n", 150 | " onehot = np.zeros(vocab_size)\n", 151 | " onehot[i-1] = 1\n", 152 | " embedding_weights.append(onehot)\n", 153 | "embedding_weights = np.array(embedding_weights)\n" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 9, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "from keras.layers import Input, Embedding, Activation, Flatten, Dense\n", 163 | "from keras.layers import Conv1D, MaxPooling1D, Dropout\n", 164 | "from keras.models import Model" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 10, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "input_size = 150\n", 174 | "embedding_size = 70\n", 175 | "conv_layers = [[256, 7, 3], \n", 176 | " [256, 7, 3], \n", 177 | " [256, 3, -1], \n", 178 | " [256, 3, -1], \n", 179 | " [256, 3, -1], \n", 180 | " [256, 3, 3]]\n", 181 | "\n", 182 | "fully_connected_layers = [1024, 1024]\n", 183 | "num_of_classes = 2\n", 184 | "dropout_p = 0.5\n", 185 | "optimizer = 'adam'\n", 186 | "loss = 'categorical_crossentropy'" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 11, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "embedding_layer = Embedding(vocab_size+1, \n", 196 | " embedding_size,\n", 197 | " input_length=input_size,\n", 198 | " weights=[embedding_weights])" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 12, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", 211 | "Instructions for updating:\n", 212 | "Colocations handled automatically by placer.\n", 213 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", 214 | "Instructions for updating:\n", 215 | "Please use `rate` instead of `keep_prob`. 
Rate should be set to `rate = 1 - keep_prob`.\n", 216 | "_________________________________________________________________\n", 217 | "Layer (type) Output Shape Param # \n", 218 | "=================================================================\n", 219 | "input (InputLayer) (None, 150) 0 \n", 220 | "_________________________________________________________________\n", 221 | "embedding_1 (Embedding) (None, 150, 70) 4970 \n", 222 | "_________________________________________________________________\n", 223 | "conv1d_1 (Conv1D) (None, 144, 256) 125696 \n", 224 | "_________________________________________________________________\n", 225 | "activation_1 (Activation) (None, 144, 256) 0 \n", 226 | "_________________________________________________________________\n", 227 | "max_pooling1d_1 (MaxPooling1 (None, 48, 256) 0 \n", 228 | "_________________________________________________________________\n", 229 | "conv1d_2 (Conv1D) (None, 42, 256) 459008 \n", 230 | "_________________________________________________________________\n", 231 | "activation_2 (Activation) (None, 42, 256) 0 \n", 232 | "_________________________________________________________________\n", 233 | "max_pooling1d_2 (MaxPooling1 (None, 14, 256) 0 \n", 234 | "_________________________________________________________________\n", 235 | "conv1d_3 (Conv1D) (None, 12, 256) 196864 \n", 236 | "_________________________________________________________________\n", 237 | "activation_3 (Activation) (None, 12, 256) 0 \n", 238 | "_________________________________________________________________\n", 239 | "conv1d_4 (Conv1D) (None, 10, 256) 196864 \n", 240 | "_________________________________________________________________\n", 241 | "activation_4 (Activation) (None, 10, 256) 0 \n", 242 | "_________________________________________________________________\n", 243 | "conv1d_5 (Conv1D) (None, 8, 256) 196864 \n", 244 | "_________________________________________________________________\n", 245 | "activation_5 (Activation) (None, 8, 256) 0 \n", 246 | "_________________________________________________________________\n", 247 | "conv1d_6 (Conv1D) (None, 6, 256) 196864 \n", 248 | "_________________________________________________________________\n", 249 | "activation_6 (Activation) (None, 6, 256) 0 \n", 250 | "_________________________________________________________________\n", 251 | "max_pooling1d_3 (MaxPooling1 (None, 2, 256) 0 \n", 252 | "_________________________________________________________________\n", 253 | "flatten_1 (Flatten) (None, 512) 0 \n", 254 | "_________________________________________________________________\n", 255 | "dense_1 (Dense) (None, 1024) 525312 \n", 256 | "_________________________________________________________________\n", 257 | "dropout_1 (Dropout) (None, 1024) 0 \n", 258 | "_________________________________________________________________\n", 259 | "dense_2 (Dense) (None, 1024) 1049600 \n", 260 | "_________________________________________________________________\n", 261 | "dropout_2 (Dropout) (None, 1024) 0 \n", 262 | "_________________________________________________________________\n", 263 | "dense_3 (Dense) (None, 2) 2050 \n", 264 | "=================================================================\n", 265 | "Total params: 2,954,092\n", 266 | "Trainable params: 2,954,092\n", 267 | "Non-trainable params: 0\n", 268 | "_________________________________________________________________\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "inputs = Input(shape=(input_size,), name='input', dtype='int64') \n", 274 | "x = 
embedding_layer(inputs)\n", 275 | "for filter_num, filter_size, pooling_size in conv_layers:\n", 276 | " x = Conv1D(filter_num, filter_size)(x) \n", 277 | " x = Activation('relu')(x)\n", 278 | " if pooling_size != -1:\n", 279 | " x = MaxPooling1D(pool_size=pooling_size)(x) \n", 280 | "x = Flatten()(x) \n", 281 | "\n", 282 | "for dense_size in fully_connected_layers:\n", 283 | " x = Dense(dense_size, activation='relu')(x) \n", 284 | " x = Dropout(dropout_p)(x)\n", 285 | "\n", 286 | "predictions = Dense(num_of_classes, activation='softmax')(x)\n", 287 | "\n", 288 | "model = Model(inputs=inputs, outputs=predictions)\n", 289 | "model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) \n", 290 | "model.summary()" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 13, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "indices = np.arange(train_data.shape[0])\n", 300 | "np.random.shuffle(indices)\n", 301 | "\n", 302 | "x_train = train_data[indices]\n", 303 | "y_train = train_classes[indices]\n", 304 | "\n", 305 | "x_test = test_data\n", 306 | "y_test = test_classes" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 14, 312 | "metadata": {}, 313 | "outputs": [ 314 | { 315 | "name": "stdout", 316 | "output_type": "stream", 317 | "text": [ 318 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", 319 | "Instructions for updating:\n", 320 | "Use tf.cast instead.\n", 321 | "Train on 25569 samples, validate on 6393 samples\n", 322 | "Epoch 1/10\n", 323 | " - 8s - loss: 0.2342 - acc: 0.9275 - val_loss: 0.1753 - val_acc: 0.9249\n", 324 | "Epoch 2/10\n", 325 | " - 3s - loss: 0.1532 - acc: 0.9428 - val_loss: 0.1418 - val_acc: 0.9485\n", 326 | "Epoch 3/10\n", 327 | " - 3s - loss: 0.0928 - acc: 0.9668 - val_loss: 0.1041 - val_acc: 0.9571\n", 328 | "Epoch 4/10\n", 329 | " - 3s - loss: 0.0589 - acc: 0.9800 - val_loss: 0.0875 - val_acc: 0.9717\n", 330 | "Epoch 5/10\n", 331 | " - 3s - loss: 0.0435 - acc: 0.9855 - val_loss: 0.1206 - val_acc: 0.9559\n", 332 | "Epoch 6/10\n", 333 | " - 3s - loss: 0.0340 - acc: 0.9891 - val_loss: 0.1127 - val_acc: 0.9668\n", 334 | "Epoch 7/10\n", 335 | " - 3s - loss: 0.0241 - acc: 0.9920 - val_loss: 0.1443 - val_acc: 0.9698\n", 336 | "Epoch 8/10\n", 337 | " - 3s - loss: 0.0242 - acc: 0.9914 - val_loss: 0.1556 - val_acc: 0.9673\n", 338 | "Epoch 9/10\n", 339 | " - 3s - loss: 0.0143 - acc: 0.9951 - val_loss: 0.1586 - val_acc: 0.9676\n", 340 | "Epoch 10/10\n", 341 | " - 3s - loss: 0.0132 - acc: 0.9953 - val_loss: 0.1504 - val_acc: 0.9697\n" 342 | ] 343 | }, 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "" 348 | ] 349 | }, 350 | "execution_count": 14, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "# Training\n", 357 | "model.fit(x_train, y_train,\n", 358 | " validation_data=(x_test, y_test),\n", 359 | " batch_size=128,\n", 360 | " epochs=10,\n", 361 | " verbose=2)\n", 362 | "\n" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 15, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [] 371 | } 372 | ], 373 | "metadata": { 374 | "kernelspec": { 375 | "display_name": "Python 3", 376 | "language": "python", 377 | "name": "python3" 378 | }, 379 | "language_info": { 380 | "codemirror_mode": { 381 | "name": "ipython", 382 | "version": 3 383 | }, 384 | 
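The run above reports only accuracy on the held-out split. Since offensive tweets are a small minority of the data, per-class precision, recall and F1 are more informative. A sketch, assuming the `model`, `x_test` and one-hot `y_test` defined above are still in scope; with the label remapping used here (1 → 1, 0 → 2, then shifted by −1), column 0 of the one-hot targets corresponds to the original offensive label 1:

```python
# Sketch: per-class metrics for the offensive class (assumes model, x_test, y_test from above).
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

probs = model.predict(x_test)          # softmax outputs, shape (n_samples, 2)
y_pred = np.argmax(probs, axis=1)
y_true = np.argmax(y_test, axis=1)     # undo the one-hot encoding

print('Precision:', precision_score(y_true, y_pred, pos_label=0))
print('Recall:   ', recall_score(y_true, y_pred, pos_label=0))
print('F1:       ', f1_score(y_true, y_pred, pos_label=0))
```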
"file_extension": ".py", 385 | "mimetype": "text/x-python", 386 | "name": "python", 387 | "nbconvert_exporter": "python", 388 | "pygments_lexer": "ipython3", 389 | "version": "3.6.4" 390 | } 391 | }, 392 | "nbformat": 4, 393 | "nbformat_minor": 1 394 | } 395 | -------------------------------------------------------------------------------- /CNN/cnn_nlp_nb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "['test_tweets_anuFYb8.csv', 'train_E6oV3lV.csv']\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "# This Python 3 environment comes with many helpful analytics libraries installed\n", 21 | "# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n", 22 | "# For example, here's several helpful packages to load in \n", 23 | "\n", 24 | "import numpy as np # linear algebra\n", 25 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 26 | "\n", 27 | "# Input data files are available in the \"../input/\" directory.\n", 28 | "# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\n", 29 | "\n", 30 | "import os\n", 31 | "print(os.listdir(\"../input\"))\n", 32 | "\n", 33 | "# Any results you write to the current directory are saved as output.\n", 34 | "\n", 35 | "# import keras\n", 36 | "# import tensorflow\n", 37 | "\n", 38 | "# print(keras.__version__)\n", 39 | "# print(tensorflow.__version__)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import numpy as np\n", 49 | "import pandas as pd" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": { 56 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 57 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 58 | }, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | " id ... tweet\n", 65 | "0 1 ... @user when a father is dysfunctional and is s...\n", 66 | "1 2 ... @user @user thanks for #lyft credit i can't us...\n", 67 | "2 3 ... bihday your majesty\n", 68 | "3 4 ... #model i love u take with u all the time in ...\n", 69 | "4 5 ... 
factsguide: society now #motivation\n", 70 | "\n", 71 | "[5 rows x 3 columns]\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "df = pd.read_csv('../input/train_E6oV3lV.csv')\n", 77 | "df['label'] = df['label'].map({0: 2, 1: 1})\n", 78 | "print(df.head())\n", 79 | "# print(df['label'].head())\n", 80 | "df = df.drop('id', axis=1)\n", 81 | "\n", 82 | "# print(df[:100])\n", 83 | "# zero = 0\n", 84 | "# one = 0\n", 85 | "# for i in df['label']:\n", 86 | "# if(i==0):\n", 87 | "# zero+=1\n", 88 | "# else:\n", 89 | "# one+=1\n", 90 | " \n", 91 | "# print(zero,one)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "name": "stderr", 101 | "output_type": "stream", 102 | "text": [ 103 | "Using TensorFlow backend.\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "from keras.preprocessing.text import Tokenizer\n", 109 | "from keras.preprocessing.sequence import pad_sequences\n", 110 | "from sklearn.model_selection import train_test_split\n", 111 | "import copy" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 5, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "25569\n", 124 | "6393\n", 125 | " label tweet\n", 126 | "11704 2 @user #diwali !!! #besteduonline !!! contac...\n", 127 | "6885 2 @user i was at this trump rally last night. ma...\n", 128 | "744 2 i don't know where you're going but do you hav...\n", 129 | "13467 2 watch on #periscope: the reading of victims na...\n", 130 | "31250 2 9 months later.. i am back to my pre pregnancy...\n", 131 | "\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "\n", 137 | "train, test = train_test_split(df, test_size=0.2)\n", 138 | "print(len(train))\n", 139 | "print(len(test))\n", 140 | "print(train.head())\n", 141 | "\n", 142 | "print(type(train))" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "train_df = copy.deepcopy(train)\n", 152 | "test_df = copy.deepcopy(test)\n", 153 | "\n", 154 | "# concatenate column 1 and column 2 as one text\n", 155 | "# print(train_df[1])\n", 156 | "\n", 157 | "# convert string to lower case\n", 158 | "train_texts = train_df['tweet'].values\n", 159 | "train_texts = [s.lower() for s in train_texts]\n", 160 | "\n", 161 | "test_texts = test_df['tweet'].values\n", 162 | "test_texts = [s.lower() for s in test_texts]\n", 163 | "\n", 164 | "# print(train_texts.head())\n", 165 | "# print(train_texts)\n" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 7, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "\n", 175 | "# =======================Convert string to index================\n", 176 | "# Tokenizer\n", 177 | "tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')\n", 178 | "tk.fit_on_texts(train_texts)\n", 179 | "# If we already have a character list, then replace the tk.word_index\n", 180 | "# If not, just skip below part\n", 181 | "\n", 182 | "# -----------------------Skip part start--------------------------\n", 183 | "# construct a new vocabulary\n", 184 | "alphabet = \"abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\\\"/\\\\|_@#$%^&*~`+-=<>()[]{}\"\n", 185 | "char_dict = {}\n", 186 | "for i, char in enumerate(alphabet):\n", 187 | " char_dict[char] = i + 1\n", 188 | "\n", 189 | "# Use char_dict to replace the tk.word_index\n", 190 | "tk.word_index = char_dict.copy()\n", 191 | "# Add 'UNK' to the 
vocabulary\n", 192 | "tk.word_index[tk.oov_token] = max(char_dict.values()) + 1\n", 193 | "# -----------------------Skip part end----------------------------\n", 194 | "\n", 195 | "# Convert string to index\n", 196 | "train_sequences = tk.texts_to_sequences(train_texts)\n", 197 | "test_texts = tk.texts_to_sequences(test_texts)\n", 198 | "\n", 199 | "# Padding\n", 200 | "train_data = pad_sequences(train_sequences, maxlen=1014, padding='post')\n", 201 | "test_data = pad_sequences(test_texts, maxlen=1014, padding='post')\n", 202 | "\n", 203 | "# Convert to numpy array\n", 204 | "train_data = np.array(train_data, dtype='float32')\n", 205 | "test_data = np.array(test_data, dtype='float32')\n", 206 | "\n", 207 | "# =======================Get classes================\n", 208 | "train_classes = train_df['label'].values\n", 209 | "train_class_list = [x - 1 for x in train_classes]\n", 210 | "\n", 211 | "test_classes = test_df['label'].values\n", 212 | "test_class_list = [x - 1 for x in test_classes]\n", 213 | "\n", 214 | "from keras.utils import to_categorical\n", 215 | "\n", 216 | "train_classes = to_categorical(train_class_list)\n", 217 | "test_classes = to_categorical(test_class_list)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 8, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | "6393\n", 230 | "{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '0': 27, '1': 28, '2': 29, '3': 30, '4': 31, '5': 32, '6': 33, '7': 34, '8': 35, '9': 36, ' ': 37, ',': 38, ';': 39, '.': 40, '!': 41, '?': 42, ':': 43, \"'\": 44, '\"': 45, '/': 46, '\\\\': 47, '|': 48, '_': 49, '@': 50, '#': 51, '$': 52, '%': 53, '^': 54, '&': 55, '*': 56, '~': 57, '`': 58, '+': 59, '-': 60, '=': 61, '<': 62, '>': 63, '(': 64, ')': 65, '[': 66, ']': 67, '{': 68, '}': 69, 'UNK': 70}\n" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "print(len(test_classes))\n", 236 | "print(tk.word_index)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 9, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "70" 248 | ] 249 | }, 250 | "execution_count": 9, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "\n", 257 | "\n", 258 | "vocab_size = len(tk.word_index)\n", 259 | "vocab_size\n", 260 | "\n" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 10, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "embedding_weights = [] #(71, 70)\n", 270 | "embedding_weights.append(np.zeros(vocab_size)) # first row is pad\n", 271 | "\n", 272 | "for char, i in tk.word_index.items(): # from index 1 to 70\n", 273 | " onehot = np.zeros(vocab_size)\n", 274 | " onehot[i-1] = 1\n", 275 | " embedding_weights.append(onehot)\n", 276 | "embedding_weights = np.array(embedding_weights)\n" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 11, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "(71, 70)\n" 289 | ] 290 | }, 291 | { 292 | "data": { 293 | "text/plain": [ 294 | "array([[0., 0., 0., ..., 0., 0., 0.],\n", 295 | " [1., 0., 0., ..., 0., 0., 0.],\n", 296 | " [0., 1., 0., ..., 0., 0., 0.],\n", 297 | " ...,\n", 298 | " [0., 0., 0., 
..., 1., 0., 0.],\n", 299 | " [0., 0., 0., ..., 0., 1., 0.],\n", 300 | " [0., 0., 0., ..., 0., 0., 1.]])" 301 | ] 302 | }, 303 | "execution_count": 11, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "print(embedding_weights.shape) # first row all 0 for PAD, 69 char, last row for UNK\n", 310 | "embedding_weights" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 12, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "from keras.layers import Input, Embedding, Activation, Flatten, Dense\n", 320 | "from keras.layers import Conv1D, MaxPooling1D, Dropout\n", 321 | "from keras.models import Model" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 13, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# parameter \n", 331 | "input_size = 1014\n", 332 | "# vocab_size = 69\n", 333 | "embedding_size = 70\n", 334 | "conv_layers = [[256, 7, 3], \n", 335 | " [256, 7, 3], \n", 336 | " [256, 3, -1], \n", 337 | " [256, 3, -1], \n", 338 | " [256, 3, -1], \n", 339 | " [256, 3, 3]]\n", 340 | "\n", 341 | "fully_connected_layers = [1024, 1024]\n", 342 | "num_of_classes = 2\n", 343 | "dropout_p = 0.5\n", 344 | "optimizer = 'adam'\n", 345 | "loss = 'categorical_crossentropy'" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 14, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "# Embedding layer Initialization\n", 355 | "embedding_layer = Embedding(vocab_size+1, \n", 356 | " embedding_size,\n", 357 | " input_length=input_size,\n", 358 | " weights=[embedding_weights])" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 15, 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", 371 | "Instructions for updating:\n", 372 | "Colocations handled automatically by placer.\n", 373 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", 374 | "Instructions for updating:\n", 375 | "Please use `rate` instead of `keep_prob`. 
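With the settings above (70-dimensional character embedding, 71-row vocabulary, 256 filters of width 7 in the first two convolutions and width 3 afterwards), the larger entries in the summary that follows can be checked by hand:

```python
# Parameter-count sanity check for the summary below (values taken from the config above).
embedding_params = (70 + 1) * 70          # (vocab_size + 1) * embedding_size     = 4,970
conv1_params = 7 * 70 * 256 + 256         # kernel * in_channels * filters + bias = 125,696
conv2_params = 7 * 256 * 256 + 256        #                                       = 459,008
conv3_params = 3 * 256 * 256 + 256        # width-3 conv blocks                   = 196,864
print(embedding_params, conv1_params, conv2_params, conv3_params)
```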
Rate should be set to `rate = 1 - keep_prob`.\n", 376 | "_________________________________________________________________\n", 377 | "Layer (type) Output Shape Param # \n", 378 | "=================================================================\n", 379 | "input (InputLayer) (None, 1014) 0 \n", 380 | "_________________________________________________________________\n", 381 | "embedding_1 (Embedding) (None, 1014, 70) 4970 \n", 382 | "_________________________________________________________________\n", 383 | "conv1d_1 (Conv1D) (None, 1008, 256) 125696 \n", 384 | "_________________________________________________________________\n", 385 | "activation_1 (Activation) (None, 1008, 256) 0 \n", 386 | "_________________________________________________________________\n", 387 | "max_pooling1d_1 (MaxPooling1 (None, 336, 256) 0 \n", 388 | "_________________________________________________________________\n", 389 | "conv1d_2 (Conv1D) (None, 330, 256) 459008 \n", 390 | "_________________________________________________________________\n", 391 | "activation_2 (Activation) (None, 330, 256) 0 \n", 392 | "_________________________________________________________________\n", 393 | "max_pooling1d_2 (MaxPooling1 (None, 110, 256) 0 \n", 394 | "_________________________________________________________________\n", 395 | "conv1d_3 (Conv1D) (None, 108, 256) 196864 \n", 396 | "_________________________________________________________________\n", 397 | "activation_3 (Activation) (None, 108, 256) 0 \n", 398 | "_________________________________________________________________\n", 399 | "conv1d_4 (Conv1D) (None, 106, 256) 196864 \n", 400 | "_________________________________________________________________\n", 401 | "activation_4 (Activation) (None, 106, 256) 0 \n", 402 | "_________________________________________________________________\n", 403 | "conv1d_5 (Conv1D) (None, 104, 256) 196864 \n", 404 | "_________________________________________________________________\n", 405 | "activation_5 (Activation) (None, 104, 256) 0 \n", 406 | "_________________________________________________________________\n", 407 | "conv1d_6 (Conv1D) (None, 102, 256) 196864 \n", 408 | "_________________________________________________________________\n", 409 | "activation_6 (Activation) (None, 102, 256) 0 \n", 410 | "_________________________________________________________________\n", 411 | "max_pooling1d_3 (MaxPooling1 (None, 34, 256) 0 \n", 412 | "_________________________________________________________________\n", 413 | "flatten_1 (Flatten) (None, 8704) 0 \n", 414 | "_________________________________________________________________\n", 415 | "dense_1 (Dense) (None, 1024) 8913920 \n", 416 | "_________________________________________________________________\n", 417 | "dropout_1 (Dropout) (None, 1024) 0 \n", 418 | "_________________________________________________________________\n", 419 | "dense_2 (Dense) (None, 1024) 1049600 \n", 420 | "_________________________________________________________________\n", 421 | "dropout_2 (Dropout) (None, 1024) 0 \n", 422 | "_________________________________________________________________\n", 423 | "dense_3 (Dense) (None, 2) 2050 \n", 424 | "=================================================================\n", 425 | "Total params: 11,342,700\n", 426 | "Trainable params: 11,342,700\n", 427 | "Non-trainable params: 0\n", 428 | "_________________________________________________________________\n" 429 | ] 430 | } 431 | ], 432 | "source": [ 433 | "# Model \n", 434 | "\n", 435 | "# Input\n", 436 | "inputs = 
Input(shape=(input_size,), name='input', dtype='int64') # shape=(?, 1014)\n", 437 | "# Embedding \n", 438 | "x = embedding_layer(inputs)\n", 439 | "# Conv \n", 440 | "for filter_num, filter_size, pooling_size in conv_layers:\n", 441 | " x = Conv1D(filter_num, filter_size)(x) \n", 442 | " x = Activation('relu')(x)\n", 443 | " if pooling_size != -1:\n", 444 | " x = MaxPooling1D(pool_size=pooling_size)(x) # Final shape=(None, 34, 256)\n", 445 | "x = Flatten()(x) # (None, 8704)\n", 446 | "# Fully connected layers \n", 447 | "for dense_size in fully_connected_layers:\n", 448 | " x = Dense(dense_size, activation='relu')(x) # dense_size == 1024\n", 449 | " x = Dropout(dropout_p)(x)\n", 450 | "# Output Layer\n", 451 | "predictions = Dense(num_of_classes, activation='softmax')(x)\n", 452 | "# Build model\n", 453 | "model = Model(inputs=inputs, outputs=predictions)\n", 454 | "model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) # Adam, categorical_crossentropy\n", 455 | "model.summary()" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 16, 461 | "metadata": {}, 462 | "outputs": [ 463 | { 464 | "name": "stdout", 465 | "output_type": "stream", 466 | "text": [ 467 | "[37. 50. 21. ... 0. 0. 0.]\n", 468 | "[50. 21. 19. ... 0. 0. 0.]\n" 469 | ] 470 | } 471 | ], 472 | "source": [ 473 | "print(train_data[0])\n", 474 | "print(train_data[1])\n", 475 | "\n", 476 | "# print(train_classes[1])\n", 477 | "\n", 478 | "# for i in range(100):\n", 479 | "# print(train_classes[i])\n", 480 | "# for i in train_data[0]:\n", 481 | "# print(i)\n", 482 | " \n", 483 | "# print(train)\n", 484 | "# for i in train_data[0]:\n", 485 | "# print(i)" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 17, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "# 1000 training samples and 100 testing samples\n", 495 | "indices = np.arange(train_data.shape[0])\n", 496 | "np.random.shuffle(indices)\n", 497 | "\n", 498 | "x_train = train_data[indices]\n", 499 | "y_train = train_classes[indices]\n", 500 | "\n", 501 | "x_test = test_data\n", 502 | "y_test = test_classes" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 18, 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", 515 | "Instructions for updating:\n", 516 | "Use tf.cast instead.\n", 517 | "Train on 25569 samples, validate on 6393 samples\n", 518 | "Epoch 1/10\n", 519 | " - 16s - loss: 0.2343 - acc: 0.9287 - val_loss: 0.1542 - val_acc: 0.9321\n", 520 | "Epoch 2/10\n", 521 | " - 13s - loss: 0.1346 - acc: 0.9514 - val_loss: 0.1125 - val_acc: 0.9582\n", 522 | "Epoch 3/10\n", 523 | " - 13s - loss: 0.0975 - acc: 0.9664 - val_loss: 0.1058 - val_acc: 0.9657\n", 524 | "Epoch 4/10\n", 525 | " - 13s - loss: 0.0690 - acc: 0.9767 - val_loss: 0.0992 - val_acc: 0.9645\n", 526 | "Epoch 5/10\n", 527 | " - 13s - loss: 0.0449 - acc: 0.9847 - val_loss: 0.1074 - val_acc: 0.9667\n", 528 | "Epoch 6/10\n", 529 | " - 13s - loss: 0.0343 - acc: 0.9895 - val_loss: 0.1266 - val_acc: 0.9692\n", 530 | "Epoch 7/10\n", 531 | " - 13s - loss: 0.0278 - acc: 0.9907 - val_loss: 0.1406 - val_acc: 0.9626\n", 532 | "Epoch 8/10\n", 533 | " - 13s - loss: 0.0242 - acc: 0.9926 - val_loss: 0.1558 - val_acc: 0.9582\n", 534 | "Epoch 9/10\n", 
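In this run, as in the 150-length model earlier, the validation loss bottoms out after a few epochs and then drifts back up while the training loss keeps falling, which points to overfitting. One option is to let Keras stop early and keep the best weights instead of always running 10 epochs — a sketch, assuming the `model` and data arrays defined above:

```python
# Sketch: early stopping instead of a fixed 10 epochs
# (assumes model, x_train, y_train, x_test, y_test from the cells above).
from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          batch_size=128,
          epochs=10,
          verbose=2,
          callbacks=[early_stop])
```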
535 | " - 13s - loss: 0.0174 - acc: 0.9944 - val_loss: 0.1468 - val_acc: 0.9651\n", 536 | "Epoch 10/10\n", 537 | " - 13s - loss: 0.0200 - acc: 0.9932 - val_loss: 0.1749 - val_acc: 0.9596\n" 538 | ] 539 | }, 540 | { 541 | "data": { 542 | "text/plain": [ 543 | "" 544 | ] 545 | }, 546 | "execution_count": 18, 547 | "metadata": {}, 548 | "output_type": "execute_result" 549 | } 550 | ], 551 | "source": [ 552 | "\n", 553 | "\n", 554 | "# Training\n", 555 | "model.fit(x_train, y_train,\n", 556 | " validation_data=(x_test, y_test),\n", 557 | " batch_size=128,\n", 558 | " epochs=10,\n", 559 | " verbose=2)\n", 560 | "\n" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 19, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "train_data_nb = copy.deepcopy(train_data)\n", 570 | "train_classes_nb = copy.deepcopy(train_classes)\n", 571 | "\n", 572 | "test_data_nb = copy.deepcopy(test_data)\n", 573 | "test_classes_nb = copy.deepcopy(test_classes)" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 20, 579 | "metadata": {}, 580 | "outputs": [ 581 | { 582 | "name": "stdout", 583 | "output_type": "stream", 584 | "text": [ 585 | "{0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0}\n" 586 | ] 587 | } 588 | ], 589 | "source": [ 590 | "# print(train_data_nb)\n", 591 | "set1 = set()\n", 592 | "\n", 593 | "\n", 594 | "for i in train_data_nb:\n", 595 | " for j in i:\n", 596 | " set1.add(j)\n", 597 | " \n", 598 | "print((set1))" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 21, 604 | "metadata": {}, 605 | "outputs": [ 606 | { 607 | "name": "stdout", 608 | "output_type": "stream", 609 | "text": [ 610 | "69\n", 611 | "0\n" 612 | ] 613 | } 614 | ], 615 | "source": [ 616 | "print(len(set1))\n", 617 | "total_vocab = len(set1)\n", 618 | "\n", 619 | "set2 = set()\n", 620 | "train_classes_nb_l = len(train_classes_nb) * [0]\n", 621 | "test_classes_nb_l = len(test_classes_nb) * [0]\n", 622 | "\n", 623 | "for idx, i in enumerate(train_classes_nb):\n", 624 | " if(i[0]==0.0 and i[1]==1.0):\n", 625 | " train_classes_nb_l[idx]=copy.deepcopy(int(0))\n", 626 | " elif(i[0]==1.0 and i[1]==0.0):\n", 627 | " train_classes_nb_l[idx]=copy.deepcopy(int(1))\n", 628 | " else:\n", 629 | " print(i)\n", 630 | "\n", 631 | "for idx, i in enumerate(test_classes_nb):\n", 632 | " if(i[0]==0.0 and i[1]==1.0):\n", 633 | " test_classes_nb_l[idx]=copy.deepcopy(int(0))\n", 634 | " elif(i[0]==1.0 and i[1]==0.0):\n", 635 | " test_classes_nb_l[idx]=copy.deepcopy(int(1))\n", 636 | " else:\n", 637 | " print(i)\n", 638 | " \n", 639 | "print(len(set2))" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 22, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [ 648 | "dict1=dict()\n", 649 | "dict1[0]=dict()\n", 650 | "dict1[1]=dict()" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": 23, 656 | "metadata": {}, 657 | "outputs": [ 658 | { 659 | "name": "stdout", 660 | "output_type": "stream", 661 | "text": [ 662 | "0 {'freq': 0, 'prob_class': 0, 0.0: 0, 1.0: 0, 2.0: 0, 3.0: 0, 4.0: 0, 5.0: 0, 6.0: 0, 7.0: 0, 8.0: 0, 9.0: 0, 10.0: 0, 11.0: 0, 12.0: 0, 13.0: 0, 
14.0: 0, 15.0: 0, 16.0: 0, 17.0: 0, 18.0: 0, 19.0: 0, 20.0: 0, 21.0: 0, 22.0: 0, 23.0: 0, 24.0: 0, 25.0: 0, 26.0: 0, 27.0: 0, 28.0: 0, 29.0: 0, 30.0: 0, 31.0: 0, 32.0: 0, 33.0: 0, 34.0: 0, 35.0: 0, 36.0: 0, 37.0: 0, 38.0: 0, 39.0: 0, 40.0: 0, 41.0: 0, 42.0: 0, 43.0: 0, 44.0: 0, 45.0: 0, 46.0: 0, 47.0: 0, 48.0: 0, 49.0: 0, 50.0: 0, 51.0: 0, 52.0: 0, 53.0: 0, 54.0: 0, 55.0: 0, 56.0: 0, 57.0: 0, 58.0: 0, 59.0: 0, 60.0: 0, 61.0: 0, 64.0: 0, 65.0: 0, 66.0: 0, 67.0: 0, 68.0: 0, 69.0: 0, 70.0: 0}\n", 663 | "1 {'freq': 0, 'prob_class': 0, 0.0: 0, 1.0: 0, 2.0: 0, 3.0: 0, 4.0: 0, 5.0: 0, 6.0: 0, 7.0: 0, 8.0: 0, 9.0: 0, 10.0: 0, 11.0: 0, 12.0: 0, 13.0: 0, 14.0: 0, 15.0: 0, 16.0: 0, 17.0: 0, 18.0: 0, 19.0: 0, 20.0: 0, 21.0: 0, 22.0: 0, 23.0: 0, 24.0: 0, 25.0: 0, 26.0: 0, 27.0: 0, 28.0: 0, 29.0: 0, 30.0: 0, 31.0: 0, 32.0: 0, 33.0: 0, 34.0: 0, 35.0: 0, 36.0: 0, 37.0: 0, 38.0: 0, 39.0: 0, 40.0: 0, 41.0: 0, 42.0: 0, 43.0: 0, 44.0: 0, 45.0: 0, 46.0: 0, 47.0: 0, 48.0: 0, 49.0: 0, 50.0: 0, 51.0: 0, 52.0: 0, 53.0: 0, 54.0: 0, 55.0: 0, 56.0: 0, 57.0: 0, 58.0: 0, 59.0: 0, 60.0: 0, 61.0: 0, 64.0: 0, 65.0: 0, 66.0: 0, 67.0: 0, 68.0: 0, 69.0: 0, 70.0: 0}\n" 664 | ] 665 | } 666 | ], 667 | "source": [ 668 | "for key,val in dict1.items():\n", 669 | " dict1[key][\"freq\"] = 0\n", 670 | " dict1[key][\"prob_class\"] = 0\n", 671 | " for j in set1:\n", 672 | " dict1[key][j] = 0\n", 673 | " print(key,dict1[key])\n", 674 | " \n" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": 24, 680 | "metadata": {}, 681 | "outputs": [ 682 | { 683 | "name": "stdout", 684 | "output_type": "stream", 685 | "text": [ 686 | "{0: {'freq': 24093654, 'prob_class': 23761, 0.0: 22087777, 1.0: 115637, 2.0: 25252, 3.0: 33900, 4.0: 51837, 5.0: 163930, 6.0: 32893, 7.0: 36043, 8.0: 59369, 9.0: 100088, 10.0: 3825, 11.0: 17322, 12.0: 66879, 13.0: 42361, 14.0: 88241, 15.0: 113170, 16.0: 31965, 17.0: 1069, 18.0: 87011, 19.0: 97758, 20.0: 108461, 21.0: 56213, 22.0: 17425, 23.0: 29233, 24.0: 3352, 25.0: 42683, 26.0: 1899, 27.0: 2305, 28.0: 2600, 29.0: 2121, 30.0: 1081, 31.0: 1005, 32.0: 748, 33.0: 1250, 34.0: 489, 35.0: 427, 36.0: 578, 37.0: 358139, 38.0: 4544, 39.0: 1672, 40.0: 18668, 41.0: 11219, 42.0: 1803, 43.0: 2397, 44.0: 6222, 45.0: 1254, 46.0: 524, 47.0: 26, 48.0: 152, 49.0: 649, 50.0: 13087, 51.0: 57560, 52.0: 170, 53.0: 195, 54.0: 47, 55.0: 1529, 56.0: 190, 57.0: 110, 58.0: 8, 59.0: 148, 60.0: 2265, 61.0: 106, 64.0: 623, 65.0: 832, 66.0: 81, 67.0: 79, 68.0: 5, 69.0: 4, 70.0: 81149}, 1: {'freq': 1833312, 'prob_class': 1808, 0.0: 1669726, 1.0: 10363, 2.0: 2253, 3.0: 3816, 4.0: 3627, 5.0: 14133, 6.0: 2115, 7.0: 2422, 8.0: 4827, 9.0: 9398, 10.0: 412, 11.0: 1315, 12.0: 5558, 13.0: 4100, 14.0: 7303, 15.0: 8679, 16.0: 3078, 17.0: 94, 18.0: 8335, 19.0: 9657, 20.0: 9506, 21.0: 4938, 22.0: 1122, 23.0: 2366, 24.0: 266, 25.0: 2511, 26.0: 157, 27.0: 189, 28.0: 176, 29.0: 180, 30.0: 37, 31.0: 72, 32.0: 27, 33.0: 94, 34.0: 72, 35.0: 20, 36.0: 37, 37.0: 27256, 38.0: 538, 39.0: 274, 40.0: 1318, 41.0: 385, 42.0: 270, 43.0: 195, 44.0: 599, 45.0: 233, 46.0: 61, 47.0: 0, 48.0: 17, 49.0: 9, 50.0: 1503, 51.0: 3913, 52.0: 11, 53.0: 4, 54.0: 0, 55.0: 255, 56.0: 23, 57.0: 11, 58.0: 0, 59.0: 14, 60.0: 207, 61.0: 20, 64.0: 26, 65.0: 26, 66.0: 0, 67.0: 0, 68.0: 0, 69.0: 0, 70.0: 3163}}\n" 687 | ] 688 | } 689 | ], 690 | "source": [ 691 | "for i in range(len(train_data_nb)):\n", 692 | " dict1[train_classes_nb_l[i]]['freq'] += len(train_data_nb[i]) \n", 693 | " dict1[train_classes_nb_l[i]]['prob_class'] +=1\n", 694 | " for j in train_data_nb[i]:\n", 695 
| " dict1[train_classes_nb_l[i]][j] +=1\n", 696 | "# print(len(train_data_nb[i]))\n", 697 | "# print(train_classes_nb_l[i])\n", 698 | "\n", 699 | "print(dict1)" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": 25, 705 | "metadata": {}, 706 | "outputs": [ 707 | { 708 | "name": "stdout", 709 | "output_type": "stream", 710 | "text": [ 711 | "69\n", 712 | "25569\n" 713 | ] 714 | } 715 | ], 716 | "source": [ 717 | "print(total_vocab)\n", 718 | "total_data_len = len(train_data_nb)\n", 719 | "print(total_data_len)\n", 720 | "\n", 721 | "\n", 722 | "# p(class) = dict[class][prob_class]/total_data_len\n", 723 | "# p(word/class) = (dict[class][word]+1)/(dict[class][freq]+total_vocab)\n", 724 | "\n", 725 | "\n" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": 26, 731 | "metadata": {}, 732 | "outputs": [], 733 | "source": [ 734 | "def prob_fun(op_class,sentence):\n", 735 | " result = dict1[op_class]['prob_class']/total_data_len\n", 736 | " for i in sentence:\n", 737 | " result *= ((dict1[op_class][i]+1)/(dict1[op_class]['freq']+total_vocab))\n", 738 | " return result" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": 27, 744 | "metadata": {}, 745 | "outputs": [ 746 | { 747 | "name": "stdout", 748 | "output_type": "stream", 749 | "text": [ 750 | "Accuracy is 79.08650086031597\n" 751 | ] 752 | } 753 | ], 754 | "source": [ 755 | "correct = 0\n", 756 | "for idx, sentence in enumerate(test_data_nb):\n", 757 | " res0 = prob_fun(0,sentence)\n", 758 | " res1 = prob_fun(1,sentence) \n", 759 | " if(res0>res1 and test_classes_nb_l[idx] ==0):\n", 760 | " correct+=1\n", 761 | " elif(res1>res0 and test_classes_nb_l[idx] ==1):\n", 762 | " correct+=1\n", 763 | " \n", 764 | "accuracy = (correct/len(test_data_nb))*100\n", 765 | "print(\"Accuracy is \",accuracy)" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": 28, 771 | "metadata": {}, 772 | "outputs": [], 773 | "source": [] 774 | } 775 | ], 776 | "metadata": { 777 | "kernelspec": { 778 | "display_name": "Python 3", 779 | "language": "python", 780 | "name": "python3" 781 | }, 782 | "language_info": { 783 | "codemirror_mode": { 784 | "name": "ipython", 785 | "version": 3 786 | }, 787 | "file_extension": ".py", 788 | "mimetype": "text/x-python", 789 | "name": "python", 790 | "nbconvert_exporter": "python", 791 | "pygments_lexer": "ipython3", 792 | "version": "3.5.2" 793 | } 794 | }, 795 | "nbformat": 4, 796 | "nbformat_minor": 1 797 | } 798 | -------------------------------------------------------------------------------- /Experience_Paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manishshettym/Offensive-Text-Detection/ed633804a09fa8d6b6c1d252ac5de371e0bdef15/Experience_Paper.pdf -------------------------------------------------------------------------------- /GRU/gru.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "gru.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "toc_visible": true 10 | }, 11 | "language_info": { 12 | "codemirror_mode": { 13 | "name": "ipython", 14 | "version": 3 15 | }, 16 | "file_extension": ".py", 17 | "mimetype": "text/x-python", 18 | "name": "python", 19 | "nbconvert_exporter": "python", 20 | "pygments_lexer": "ipython3", 21 | "version": "3.5.2" 22 | }, 23 | "kernelspec": { 24 | "display_name": "Python 3", 25 | 
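One caveat about the hand-rolled character-level Naive Bayes in `cnn_nlp_nb.ipynb` above: `prob_fun` multiplies roughly a thousand per-character probabilities per tweet, so the products shrink toward floating-point underflow and can make the class comparison fragile. Summing log-probabilities gives the same decision rule without that risk — a sketch, assuming the `dict1`, `total_data_len` and `total_vocab` built above:

```python
import math

# Log-space version of prob_fun from cnn_nlp_nb.ipynb
# (assumes dict1, total_data_len and total_vocab already exist).
def log_prob_fun(op_class, sentence):
    score = math.log(dict1[op_class]['prob_class'] / total_data_len)
    for ch in sentence:
        score += math.log((dict1[op_class][ch] + 1) /
                          (dict1[op_class]['freq'] + total_vocab))
    return score

# Same argmax decision as before, but numerically stable:
# prediction = 0 if log_prob_fun(0, sentence) > log_prob_fun(1, sentence) else 1
```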
"language": "python", 26 | "name": "python3" 27 | } 28 | }, 29 | "cells": [ 30 | { 31 | "metadata": { 32 | "id": "RKwUedd0ojVV", 33 | "colab_type": "code", 34 | "colab": {} 35 | }, 36 | "cell_type": "code", 37 | "source": [ 38 | "import pandas as pd\n", 39 | "import numpy as np\n", 40 | "\n", 41 | "from itertools import chain\n", 42 | "import matplotlib.pyplot as plt\n", 43 | "import os\n", 44 | "import nltk\n", 45 | "from nltk.corpus import stopwords \n", 46 | "from nltk.tokenize import word_tokenize" 47 | ], 48 | "execution_count": 0, 49 | "outputs": [] 50 | }, 51 | { 52 | "metadata": { 53 | "id": "JrTpwLOHojVe", 54 | "colab_type": "code", 55 | "colab": {} 56 | }, 57 | "cell_type": "code", 58 | "source": [ 59 | "from sklearn.model_selection import train_test_split\n", 60 | "from sklearn.metrics import recall_score,accuracy_score\n", 61 | "from sklearn.preprocessing import MinMaxScaler\n", 62 | "\n", 63 | "from keras.preprocessing.text import Tokenizer\n", 64 | "from keras.preprocessing.sequence import pad_sequences\n", 65 | "from keras.utils import to_categorical\n", 66 | "from keras.layers import Dense, Input, GlobalMaxPooling1D\n", 67 | "from keras.layers import GRU, MaxPooling1D, Embedding\n", 68 | "from keras.models import Model\n", 69 | "from keras import layers, Input\n", 70 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 71 | "from keras.models import load_model" 72 | ], 73 | "execution_count": 0, 74 | "outputs": [] 75 | }, 76 | { 77 | "metadata": { 78 | "id": "1Nfbtx28rRVO", 79 | "colab_type": "code", 80 | "colab": { 81 | "base_uri": "https://localhost:8080/", 82 | "height": 1217 83 | }, 84 | "outputId": "093e35fe-003a-418f-e4dc-8e94c12001c1" 85 | }, 86 | "cell_type": "code", 87 | "source": [ 88 | "!pip3 install hyperas" 89 | ], 90 | "execution_count": 4, 91 | "outputs": [ 92 | { 93 | "output_type": "stream", 94 | "text": [ 95 | "Collecting hyperas\n", 96 | " Downloading https://files.pythonhosted.org/packages/04/34/87ad6ffb42df9c1fa9c4c906f65813d42ad70d68c66af4ffff048c228cd4/hyperas-0.4.1-py3-none-any.whl\n", 97 | "Requirement already satisfied: nbconvert in /usr/local/lib/python3.6/dist-packages (from hyperas) (5.4.1)\n", 98 | "Requirement already satisfied: hyperopt in /usr/local/lib/python3.6/dist-packages (from hyperas) (0.1.2)\n", 99 | "Requirement already satisfied: jupyter in /usr/local/lib/python3.6/dist-packages (from hyperas) (1.0.0)\n", 100 | "Requirement already satisfied: entrypoints in /usr/local/lib/python3.6/dist-packages (from hyperas) (0.3)\n", 101 | "Requirement already satisfied: nbformat in /usr/local/lib/python3.6/dist-packages (from hyperas) (4.4.0)\n", 102 | "Requirement already satisfied: keras in /usr/local/lib/python3.6/dist-packages (from hyperas) (2.2.4)\n", 103 | "Requirement already satisfied: jupyter-core in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (4.4.0)\n", 104 | "Requirement already satisfied: testpath in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (0.4.2)\n", 105 | "Requirement already satisfied: defusedxml in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (0.6.0)\n", 106 | "Requirement already satisfied: mistune>=0.8.1 in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (0.8.4)\n", 107 | "Requirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (4.3.2)\n", 108 | "Requirement already satisfied: bleach in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (3.1.0)\n", 109 | 
"Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (1.4.2)\n", 110 | "Requirement already satisfied: pygments in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (2.1.3)\n", 111 | "Requirement already satisfied: jinja2 in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (2.10.1)\n", 112 | "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas) (1.2.1)\n", 113 | "Requirement already satisfied: networkx in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas) (2.3)\n", 114 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas) (1.16.3)\n", 115 | "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas) (4.28.1)\n", 116 | "Requirement already satisfied: pymongo in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas) (3.8.0)\n", 117 | "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas) (0.16.0)\n", 118 | "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas) (1.12.0)\n", 119 | "Requirement already satisfied: qtconsole in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas) (4.4.3)\n", 120 | "Requirement already satisfied: ipykernel in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas) (4.6.1)\n", 121 | "Requirement already satisfied: notebook in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas) (5.2.2)\n", 122 | "Requirement already satisfied: ipywidgets in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas) (7.4.2)\n", 123 | "Requirement already satisfied: jupyter-console in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas) (6.0.0)\n", 124 | "Requirement already satisfied: ipython-genutils in /usr/local/lib/python3.6/dist-packages (from nbformat->hyperas) (0.2.0)\n", 125 | "Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in /usr/local/lib/python3.6/dist-packages (from nbformat->hyperas) (2.6.0)\n", 126 | "Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from keras->hyperas) (1.0.9)\n", 127 | "Requirement already satisfied: keras-applications>=1.0.6 in /usr/local/lib/python3.6/dist-packages (from keras->hyperas) (1.0.7)\n", 128 | "Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from keras->hyperas) (3.13)\n", 129 | "Requirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from keras->hyperas) (2.8.0)\n", 130 | "Requirement already satisfied: decorator in /usr/local/lib/python3.6/dist-packages (from traitlets>=4.2->nbconvert->hyperas) (4.4.0)\n", 131 | "Requirement already satisfied: webencodings in /usr/local/lib/python3.6/dist-packages (from bleach->nbconvert->hyperas) (0.5.1)\n", 132 | "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/dist-packages (from jinja2->nbconvert->hyperas) (1.1.1)\n", 133 | "Requirement already satisfied: jupyter-client>=4.1 in /usr/local/lib/python3.6/dist-packages (from qtconsole->jupyter->hyperas) (5.2.4)\n", 134 | "Requirement already satisfied: ipython>=4.0.0 in /usr/local/lib/python3.6/dist-packages (from ipykernel->jupyter->hyperas) (5.5.0)\n", 135 | "Requirement already satisfied: tornado>=4.0 in /usr/local/lib/python3.6/dist-packages (from ipykernel->jupyter->hyperas) (4.5.3)\n", 136 | 
"Requirement already satisfied: terminado>=0.3.3; sys_platform != \"win32\" in /usr/local/lib/python3.6/dist-packages (from notebook->jupyter->hyperas) (0.8.2)\n", 137 | "Requirement already satisfied: widgetsnbextension~=3.4.0 in /usr/local/lib/python3.6/dist-packages (from ipywidgets->jupyter->hyperas) (3.4.2)\n", 138 | "Collecting prompt-toolkit<2.1.0,>=2.0.0 (from jupyter-console->jupyter->hyperas)\n", 139 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f7/a7/9b1dd14ef45345f186ef69d175bdd2491c40ab1dfa4b2b3e4352df719ed7/prompt_toolkit-2.0.9-py3-none-any.whl (337kB)\n", 140 | "\u001b[K 100% |████████████████████████████████| 337kB 26.9MB/s \n", 141 | "\u001b[?25hRequirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from jupyter-client>=4.1->qtconsole->jupyter->hyperas) (2.5.3)\n", 142 | "Requirement already satisfied: pyzmq>=13 in /usr/local/lib/python3.6/dist-packages (from jupyter-client>=4.1->qtconsole->jupyter->hyperas) (17.0.0)\n", 143 | "Requirement already satisfied: simplegeneric>0.8 in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0->ipykernel->jupyter->hyperas) (0.8.1)\n", 144 | "Requirement already satisfied: pickleshare in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0->ipykernel->jupyter->hyperas) (0.7.5)\n", 145 | "Requirement already satisfied: pexpect; sys_platform != \"win32\" in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0->ipykernel->jupyter->hyperas) (4.7.0)\n", 146 | "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0->ipykernel->jupyter->hyperas) (40.9.0)\n", 147 | "Requirement already satisfied: ptyprocess; os_name != \"nt\" in /usr/local/lib/python3.6/dist-packages (from terminado>=0.3.3; sys_platform != \"win32\"->notebook->jupyter->hyperas) (0.6.0)\n", 148 | "Requirement already satisfied: wcwidth in /usr/local/lib/python3.6/dist-packages (from prompt-toolkit<2.1.0,>=2.0.0->jupyter-console->jupyter->hyperas) (0.1.7)\n", 149 | "\u001b[31mipython 5.5.0 has requirement prompt-toolkit<2.0.0,>=1.0.4, but you'll have prompt-toolkit 2.0.9 which is incompatible.\u001b[0m\n", 150 | "Installing collected packages: hyperas, prompt-toolkit\n", 151 | " Found existing installation: prompt-toolkit 1.0.16\n", 152 | " Uninstalling prompt-toolkit-1.0.16:\n", 153 | " Successfully uninstalled prompt-toolkit-1.0.16\n", 154 | "Successfully installed hyperas-0.4.1 prompt-toolkit-2.0.9\n" 155 | ], 156 | "name": "stdout" 157 | }, 158 | { 159 | "output_type": "display_data", 160 | "data": { 161 | "application/vnd.colab-display-data+json": { 162 | "pip_warning": { 163 | "packages": [ 164 | "prompt_toolkit" 165 | ] 166 | } 167 | } 168 | }, 169 | "metadata": { 170 | "tags": [] 171 | } 172 | } 173 | ] 174 | }, 175 | { 176 | "metadata": { 177 | "id": "fCT1mIKErqZt", 178 | "colab_type": "code", 179 | "colab": { 180 | "base_uri": "https://localhost:8080/", 181 | "height": 1074 182 | }, 183 | "outputId": "1a9622d4-3b4a-45aa-e6ce-4d204dfdd09e" 184 | }, 185 | "cell_type": "code", 186 | "source": [ 187 | "!pip3 install git+https://github.com/maxpumperla/hyperas.git" 188 | ], 189 | "execution_count": 6, 190 | "outputs": [ 191 | { 192 | "output_type": "stream", 193 | "text": [ 194 | "Collecting git+https://github.com/maxpumperla/hyperas.git\n", 195 | " Cloning https://github.com/maxpumperla/hyperas.git to /tmp/pip-req-build-ym0r1e2g\n", 196 | "Requirement already satisfied (use --upgrade to upgrade): hyperas==0.4.1 from 
git+https://github.com/maxpumperla/hyperas.git in /usr/local/lib/python3.6/dist-packages\n", 197 | "Requirement already satisfied: keras in /usr/local/lib/python3.6/dist-packages (from hyperas==0.4.1) (2.2.4)\n", 198 | "Requirement already satisfied: hyperopt in /usr/local/lib/python3.6/dist-packages (from hyperas==0.4.1) (0.1.2)\n", 199 | "Requirement already satisfied: entrypoints in /usr/local/lib/python3.6/dist-packages (from hyperas==0.4.1) (0.3)\n", 200 | "Requirement already satisfied: jupyter in /usr/local/lib/python3.6/dist-packages (from hyperas==0.4.1) (1.0.0)\n", 201 | "Requirement already satisfied: nbformat in /usr/local/lib/python3.6/dist-packages (from hyperas==0.4.1) (4.4.0)\n", 202 | "Requirement already satisfied: nbconvert in /usr/local/lib/python3.6/dist-packages (from hyperas==0.4.1) (5.4.1)\n", 203 | "Requirement already satisfied: keras-applications>=1.0.6 in /usr/local/lib/python3.6/dist-packages (from keras->hyperas==0.4.1) (1.0.7)\n", 204 | "Requirement already satisfied: scipy>=0.14 in /usr/local/lib/python3.6/dist-packages (from keras->hyperas==0.4.1) (1.2.1)\n", 205 | "Requirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from keras->hyperas==0.4.1) (2.8.0)\n", 206 | "Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from keras->hyperas==0.4.1) (1.0.9)\n", 207 | "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.6/dist-packages (from keras->hyperas==0.4.1) (1.12.0)\n", 208 | "Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from keras->hyperas==0.4.1) (3.13)\n", 209 | "Requirement already satisfied: numpy>=1.9.1 in /usr/local/lib/python3.6/dist-packages (from keras->hyperas==0.4.1) (1.16.3)\n", 210 | "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas==0.4.1) (0.16.0)\n", 211 | "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas==0.4.1) (4.28.1)\n", 212 | "Requirement already satisfied: networkx in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas==0.4.1) (2.3)\n", 213 | "Requirement already satisfied: pymongo in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas==0.4.1) (3.8.0)\n", 214 | "Requirement already satisfied: ipywidgets in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas==0.4.1) (7.4.2)\n", 215 | "Requirement already satisfied: jupyter-console in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas==0.4.1) (6.0.0)\n", 216 | "Requirement already satisfied: qtconsole in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas==0.4.1) (4.4.3)\n", 217 | "Requirement already satisfied: notebook in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas==0.4.1) (5.2.2)\n", 218 | "Requirement already satisfied: ipykernel in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas==0.4.1) (4.6.1)\n", 219 | "Requirement already satisfied: jupyter-core in /usr/local/lib/python3.6/dist-packages (from nbformat->hyperas==0.4.1) (4.4.0)\n", 220 | "Requirement already satisfied: ipython-genutils in /usr/local/lib/python3.6/dist-packages (from nbformat->hyperas==0.4.1) (0.2.0)\n", 221 | "Requirement already satisfied: traitlets>=4.1 in /usr/local/lib/python3.6/dist-packages (from nbformat->hyperas==0.4.1) (4.3.2)\n", 222 | "Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in /usr/local/lib/python3.6/dist-packages (from nbformat->hyperas==0.4.1) (2.6.0)\n", 223 | 
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas==0.4.1) (2.10.1)\n", 224 | "Requirement already satisfied: pygments in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas==0.4.1) (2.1.3)\n", 225 | "Requirement already satisfied: bleach in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas==0.4.1) (3.1.0)\n", 226 | "Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas==0.4.1) (1.4.2)\n", 227 | "Requirement already satisfied: mistune>=0.8.1 in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas==0.4.1) (0.8.4)\n", 228 | "Requirement already satisfied: defusedxml in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas==0.4.1) (0.6.0)\n", 229 | "Requirement already satisfied: testpath in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas==0.4.1) (0.4.2)\n", 230 | "Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.6/dist-packages (from networkx->hyperopt->hyperas==0.4.1) (4.4.0)\n", 231 | "Requirement already satisfied: widgetsnbextension~=3.4.0 in /usr/local/lib/python3.6/dist-packages (from ipywidgets->jupyter->hyperas==0.4.1) (3.4.2)\n", 232 | "Requirement already satisfied: ipython>=4.0.0; python_version >= \"3.3\" in /usr/local/lib/python3.6/dist-packages (from ipywidgets->jupyter->hyperas==0.4.1) (5.5.0)\n", 233 | "Requirement already satisfied: jupyter-client in /usr/local/lib/python3.6/dist-packages (from jupyter-console->jupyter->hyperas==0.4.1) (5.2.4)\n", 234 | "Requirement already satisfied: prompt-toolkit<2.1.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from jupyter-console->jupyter->hyperas==0.4.1) (2.0.9)\n", 235 | "Requirement already satisfied: tornado>=4 in /usr/local/lib/python3.6/dist-packages (from notebook->jupyter->hyperas==0.4.1) (4.5.3)\n", 236 | "Requirement already satisfied: terminado>=0.3.3; sys_platform != \"win32\" in /usr/local/lib/python3.6/dist-packages (from notebook->jupyter->hyperas==0.4.1) (0.8.2)\n", 237 | "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/dist-packages (from jinja2->nbconvert->hyperas==0.4.1) (1.1.1)\n", 238 | "Requirement already satisfied: webencodings in /usr/local/lib/python3.6/dist-packages (from bleach->nbconvert->hyperas==0.4.1) (0.5.1)\n", 239 | "Requirement already satisfied: pexpect; sys_platform != \"win32\" in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets->jupyter->hyperas==0.4.1) (4.7.0)\n", 240 | "Requirement already satisfied: simplegeneric>0.8 in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets->jupyter->hyperas==0.4.1) (0.8.1)\n", 241 | "Requirement already satisfied: pickleshare in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets->jupyter->hyperas==0.4.1) (0.7.5)\n", 242 | "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets->jupyter->hyperas==0.4.1) (40.9.0)\n", 243 | "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from jupyter-client->jupyter-console->jupyter->hyperas==0.4.1) (2.5.3)\n", 244 | "Requirement already satisfied: pyzmq>=13 in /usr/local/lib/python3.6/dist-packages (from jupyter-client->jupyter-console->jupyter->hyperas==0.4.1) (17.0.0)\n", 245 | "Requirement already 
satisfied: wcwidth in /usr/local/lib/python3.6/dist-packages (from prompt-toolkit<2.1.0,>=2.0.0->jupyter-console->jupyter->hyperas==0.4.1) (0.1.7)\n", 246 | "Requirement already satisfied: ptyprocess; os_name != \"nt\" in /usr/local/lib/python3.6/dist-packages (from terminado>=0.3.3; sys_platform != \"win32\"->notebook->jupyter->hyperas==0.4.1) (0.6.0)\n", 247 | "Building wheels for collected packages: hyperas\n", 248 | " Building wheel for hyperas (setup.py) ... \u001b[?25ldone\n", 249 | "\u001b[?25h Stored in directory: /tmp/pip-ephem-wheel-cache-k663ho3s/wheels/27/c7/75/b70097065b73570eda25350a796d87c41cd967471a04064cc2\n", 250 | "Successfully built hyperas\n" 251 | ], 252 | "name": "stdout" 253 | } 254 | ] 255 | }, 256 | { 257 | "metadata": { 258 | "id": "QDuittmhojVo", 259 | "colab_type": "code", 260 | "colab": { 261 | "base_uri": "https://localhost:8080/", 262 | "height": 353 263 | }, 264 | "outputId": "b4dbd669-01f0-4320-cbb2-0a9365a3c5a4" 265 | }, 266 | "cell_type": "code", 267 | "source": [ 268 | "from hyperopt import Trials, STATUS_OK, tpe\n", 269 | "from hyperas import optim\n", 270 | "from hyperas.distributions import choice, uniform, conditional" 271 | ], 272 | "execution_count": 7, 273 | "outputs": [ 274 | { 275 | "output_type": "error", 276 | "ename": "ImportError", 277 | "evalue": "ignored", 278 | "traceback": [ 279 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 280 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 281 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mhyperopt\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mTrials\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSTATUS_OK\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtpe\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mhyperas\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0moptim\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mhyperas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdistributions\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mchoice\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muniform\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconditional\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 282 | "\u001b[0;31mImportError\u001b[0m: cannot import name 'conditional'", 283 | "", 284 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0;32m\nNOTE: If your import is failing due to a missing package, you can\nmanually install dependencies using either !pip or !apt.\n\nTo view examples of installing some common dependencies, click the\n\"Open Examples\" button below.\n\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n" 285 | ] 286 | } 287 | ] 288 | }, 289 | { 290 | "metadata": { 291 | "id": "q8qIM2c-t9DO", 292 | "colab_type": "code", 293 | "colab": { 294 | "base_uri": "https://localhost:8080/", 295 | "height": 128 296 | }, 297 | "outputId": "c5a7fdad-49ff-412b-a1d7-cff21b560886" 298 | }, 299 | "cell_type": "code", 300 | "source": [ 301 | "from google.colab import drive\n", 302 | "drive.mount('/content/gdrive')" 303 | ], 304 | "execution_count": 8, 305 | "outputs": [ 306 | { 307 | "output_type": "stream", 308 | "text": [ 309 | "Go to this URL in a browser: 
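The `ImportError` above is raised specifically on `conditional`, which the installed hyperas build does not export; `Trials`, `optim`, `choice` and `uniform` resolve fine. If `conditional` is not actually needed in the search space, a narrower import is a simple workaround (a sketch, not a fix guaranteed for every hyperas version):

```python
# Workaround sketch for the ImportError above: import only what the search space needs.
from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform   # 'conditional' dropped; not exported by this hyperas build
```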
https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code\n", 310 | "\n", 311 | "Enter your authorization code:\n", 312 | "··········\n", 313 | "Mounted at /content/gdrive\n" 314 | ], 315 | "name": "stdout" 316 | } 317 | ] 318 | }, 319 | { 320 | "metadata": { 321 | "id": "_rodD_a6ojVr", 322 | "colab_type": "code", 323 | "colab": {} 324 | }, 325 | "cell_type": "code", 326 | "source": [ 327 | "data = pd.read_csv('/content/gdrive/My Drive/NLP/Dataset/train_E6oV3lV.csv')\n", 328 | "data.shape[0] - data.dropna().shape[0]\n", 329 | "data_copy = data.copy()" 330 | ], 331 | "execution_count": 0, 332 | "outputs": [] 333 | }, 334 | { 335 | "metadata": { 336 | "id": "9OT54nvLojVu", 337 | "colab_type": "text" 338 | }, 339 | "cell_type": "markdown", 340 | "source": [ 341 | "### Tokenization" 342 | ] 343 | }, 344 | { 345 | "metadata": { 346 | "id": "bb6dScp0un5C", 347 | "colab_type": "code", 348 | "colab": { 349 | "base_uri": "https://localhost:8080/", 350 | "height": 72 351 | }, 352 | "outputId": "913c37fc-f861-4ad6-bc74-be9601f319d2" 353 | }, 354 | "cell_type": "code", 355 | "source": [ 356 | " nltk.download('punkt')" 357 | ], 358 | "execution_count": 12, 359 | "outputs": [ 360 | { 361 | "output_type": "stream", 362 | "text": [ 363 | "[nltk_data] Downloading package punkt to /root/nltk_data...\n", 364 | "[nltk_data] Unzipping tokenizers/punkt.zip.\n" 365 | ], 366 | "name": "stdout" 367 | }, 368 | { 369 | "output_type": "execute_result", 370 | "data": { 371 | "text/plain": [ 372 | "True" 373 | ] 374 | }, 375 | "metadata": { 376 | "tags": [] 377 | }, 378 | "execution_count": 12 379 | } 380 | ] 381 | }, 382 | { 383 | "metadata": { 384 | "id": "sCqlatYfojVv", 385 | "colab_type": "code", 386 | "colab": { 387 | "base_uri": "https://localhost:8080/", 388 | "height": 399 389 | }, 390 | "outputId": "a0c4623f-e181-4a80-d760-1569884141a5" 391 | }, 392 | "cell_type": "code", 393 | "source": [ 394 | "tokenized_single_posts = [nltk.tokenize.word_tokenize(i) for i in data.tweet]\n", 395 | "len(tokenized_single_posts)\n" 396 | ], 397 | "execution_count": 107, 398 | "outputs": [ 399 | { 400 | "output_type": "execute_result", 401 | "data": { 402 | "text/plain": [ 403 | "['@',\n", 404 | " 'user',\n", 405 | " 'when',\n", 406 | " 'a',\n", 407 | " 'father',\n", 408 | " 'is',\n", 409 | " 'dysfunctional',\n", 410 | " 'and',\n", 411 | " 'is',\n", 412 | " 'so',\n", 413 | " 'selfish',\n", 414 | " 'he',\n", 415 | " 'drags',\n", 416 | " 'his',\n", 417 | " 'kids',\n", 418 | " 'into',\n", 419 | " 'his',\n", 420 | " 'dysfunction',\n", 421 | " '.',\n", 422 | " '#',\n", 423 | " 'run']" 424 | ] 425 | }, 426 | "metadata": { 427 | "tags": [] 428 | }, 429 | "execution_count": 107 430 | } 431 | ] 432 | }, 433 | { 434 | "metadata": { 435 | "id": "Kx6wgMedBdeY", 436 | "colab_type": "code", 437 | "colab": { 438 | "base_uri": "https://localhost:8080/", 439 | "height": 55 440 | }, 441 | "outputId": "8ee9ce9a-2d1f-4e4a-fce3-866fcc42918c" 442 | }, 443 | "cell_type": "code", 444 | "source": [ 445 | "print(tokenized_single_posts[0])" 446 | ], 447 | "execution_count": 111, 448 | "outputs": [ 449 | { 450 | "output_type": "stream", 451 | "text": [ 452 | "['@', 'user', 'when', 
'a', 'father', 'is', 'dysfunctional', 'and', 'is', 'so', 'selfish', 'he', 'drags', 'his', 'kids', 'into', 'his', 'dysfunction', '.', '#', 'run']\n" 453 | ], 454 | "name": "stdout" 455 | } 456 | ] 457 | }, 458 | { 459 | "metadata": { 460 | "id": "YqyqR-pkojV3", 461 | "colab_type": "code", 462 | "colab": {} 463 | }, 464 | "cell_type": "code", 465 | "source": [ 466 | "leng = []\n", 467 | "for i in range(len(tokenized_single_posts)):\n", 468 | " length = len(tokenized_single_posts[i])\n", 469 | " leng.append(length)" 470 | ], 471 | "execution_count": 0, 472 | "outputs": [] 473 | }, 474 | { 475 | "metadata": { 476 | "id": "jvoU7GSgojV_", 477 | "colab_type": "text" 478 | }, 479 | "cell_type": "markdown", 480 | "source": [ 481 | "### stopwords removal" 482 | ] 483 | }, 484 | { 485 | "metadata": { 486 | "id": "twzVwef4u40S", 487 | "colab_type": "code", 488 | "colab": { 489 | "base_uri": "https://localhost:8080/", 490 | "height": 72 491 | }, 492 | "outputId": "7c75b940-abff-4b48-91e5-98f44517bf6a" 493 | }, 494 | "cell_type": "code", 495 | "source": [ 496 | "nltk.download('stopwords')" 497 | ], 498 | "execution_count": 16, 499 | "outputs": [ 500 | { 501 | "output_type": "stream", 502 | "text": [ 503 | "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", 504 | "[nltk_data] Unzipping corpora/stopwords.zip.\n" 505 | ], 506 | "name": "stdout" 507 | }, 508 | { 509 | "output_type": "execute_result", 510 | "data": { 511 | "text/plain": [ 512 | "True" 513 | ] 514 | }, 515 | "metadata": { 516 | "tags": [] 517 | }, 518 | "execution_count": 16 519 | } 520 | ] 521 | }, 522 | { 523 | "metadata": { 524 | "id": "tDQHKY7tojWA", 525 | "colab_type": "code", 526 | "colab": {} 527 | }, 528 | "cell_type": "code", 529 | "source": [ 530 | "import string\n", 531 | "stp_removed = []\n", 532 | "for i in range (len(tokenized_single_posts)):\n", 533 | " stp = [word for word in tokenized_single_posts[i] if word not in (stopwords.words('english')+list(string.punctuation))]\n", 534 | " stp_removed.append(stp)" 535 | ], 536 | "execution_count": 0, 537 | "outputs": [] 538 | }, 539 | { 540 | "metadata": { 541 | "id": "B72rZxyVojWF", 542 | "colab_type": "code", 543 | "colab": { 544 | "base_uri": "https://localhost:8080/", 545 | "height": 54 546 | }, 547 | "outputId": "97af4154-3db9-4a76-9dcd-9968345bae0f" 548 | }, 549 | "cell_type": "code", 550 | "source": [ 551 | "print(len(stp_removed))\n", 552 | "print(stp_removed[0])" 553 | ], 554 | "execution_count": 113, 555 | "outputs": [ 556 | { 557 | "output_type": "stream", 558 | "text": [ 559 | "31962\n", 560 | "['user', 'father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run']\n" 561 | ], 562 | "name": "stdout" 563 | } 564 | ] 565 | }, 566 | { 567 | "metadata": { 568 | "id": "zlQsyxpSojWN", 569 | "colab_type": "text" 570 | }, 571 | "cell_type": "markdown", 572 | "source": [ 573 | "### Lemmatize the stop removed posts" 574 | ] 575 | }, 576 | { 577 | "metadata": { 578 | "id": "ycFaoKT0vf2l", 579 | "colab_type": "code", 580 | "colab": { 581 | "base_uri": "https://localhost:8080/", 582 | "height": 72 583 | }, 584 | "outputId": "17bed81f-3344-48d8-de11-9cce0309cbe2" 585 | }, 586 | "cell_type": "code", 587 | "source": [ 588 | "nltk.download('wordnet')" 589 | ], 590 | "execution_count": 20, 591 | "outputs": [ 592 | { 593 | "output_type": "stream", 594 | "text": [ 595 | "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", 596 | "[nltk_data] Unzipping corpora/wordnet.zip.\n" 597 | ], 598 | "name": "stdout" 599 | }, 600 | { 601 | "output_type": 
"execute_result", 602 | "data": { 603 | "text/plain": [ 604 | "True" 605 | ] 606 | }, 607 | "metadata": { 608 | "tags": [] 609 | }, 610 | "execution_count": 20 611 | } 612 | ] 613 | }, 614 | { 615 | "metadata": { 616 | "id": "UUwmc3oTojWO", 617 | "colab_type": "code", 618 | "colab": {} 619 | }, 620 | "cell_type": "code", 621 | "source": [ 622 | "words_lemma = []\n", 623 | "lemma = nltk.WordNetLemmatizer()\n", 624 | "for i in range(len(stp_removed)):\n", 625 | " words = [lemma.lemmatize(word) for word in stp_removed[i]]\n", 626 | " words_lemma.append(words)" 627 | ], 628 | "execution_count": 0, 629 | "outputs": [] 630 | }, 631 | { 632 | "metadata": { 633 | "id": "D0y-HBS2ojWR", 634 | "colab_type": "code", 635 | "colab": { 636 | "base_uri": "https://localhost:8080/", 637 | "height": 54 638 | }, 639 | "outputId": "f138001a-1d7d-4b65-abab-38d17d4b1fc3" 640 | }, 641 | "cell_type": "code", 642 | "source": [ 643 | "print(len(words_lemma))\n", 644 | "print(words_lemma[0])" 645 | ], 646 | "execution_count": 116, 647 | "outputs": [ 648 | { 649 | "output_type": "stream", 650 | "text": [ 651 | "31962\n", 652 | "['user', 'father', 'dysfunctional', 'selfish', 'drag', 'kid', 'dysfunction', 'run']\n" 653 | ], 654 | "name": "stdout" 655 | } 656 | ] 657 | }, 658 | { 659 | "metadata": { 660 | "id": "mAK3gKzmojWd", 661 | "colab_type": "text" 662 | }, 663 | "cell_type": "markdown", 664 | "source": [ 665 | "### Remove all digit words" 666 | ] 667 | }, 668 | { 669 | "metadata": { 670 | "id": "PgtMb70xojWe", 671 | "colab_type": "code", 672 | "colab": {} 673 | }, 674 | "cell_type": "code", 675 | "source": [ 676 | "words_noNum = []\n", 677 | "for i in range(len(words_lemma)):\n", 678 | " words = [word for word in words_lemma[i] if word.isdigit() == False]\n", 679 | " words_noNum.append(words)" 680 | ], 681 | "execution_count": 0, 682 | "outputs": [] 683 | }, 684 | { 685 | "metadata": { 686 | "id": "R_Xjne6hojWi", 687 | "colab_type": "code", 688 | "colab": { 689 | "base_uri": "https://localhost:8080/", 690 | "height": 35 691 | }, 692 | "outputId": "3f1b462b-c0b9-420f-bf39-9299332933f9" 693 | }, 694 | "cell_type": "code", 695 | "source": [ 696 | "len(words_noNum)\n", 697 | "print(words_noNum[0])" 698 | ], 699 | "execution_count": 119, 700 | "outputs": [ 701 | { 702 | "output_type": "stream", 703 | "text": [ 704 | "['user', 'father', 'dysfunctional', 'selfish', 'drag', 'kid', 'dysfunction', 'run']\n" 705 | ], 706 | "name": "stdout" 707 | } 708 | ] 709 | }, 710 | { 711 | "metadata": { 712 | "id": "6hkQCk29ojWp", 713 | "colab_type": "text" 714 | }, 715 | "cell_type": "markdown", 716 | "source": [ 717 | "### Remove single character words" 718 | ] 719 | }, 720 | { 721 | "metadata": { 722 | "id": "QHDfso6-ojWr", 723 | "colab_type": "code", 724 | "colab": {} 725 | }, 726 | "cell_type": "code", 727 | "source": [ 728 | "words_nonSingle = []\n", 729 | "for i in range(len(words_noNum)):\n", 730 | " words = [word for word in words_noNum[i] if len(word) > 1]\n", 731 | " words_nonSingle.append(words)" 732 | ], 733 | "execution_count": 0, 734 | "outputs": [] 735 | }, 736 | { 737 | "metadata": { 738 | "id": "iFXZg0J9ojWw", 739 | "colab_type": "code", 740 | "colab": { 741 | "base_uri": "https://localhost:8080/", 742 | "height": 35 743 | }, 744 | "outputId": "428664cf-3dfa-4527-fbc5-51b477193876" 745 | }, 746 | "cell_type": "code", 747 | "source": [ 748 | "len(words_nonSingle)" 749 | ], 750 | "execution_count": 121, 751 | "outputs": [ 752 | { 753 | "output_type": "execute_result", 754 | "data": { 755 | "text/plain": [ 756 | "31962" 
757 | ] 758 | }, 759 | "metadata": { 760 | "tags": [] 761 | }, 762 | "execution_count": 121 763 | } 764 | ] 765 | }, 766 | { 767 | "metadata": { 768 | "id": "zCgZ2vsIojW6", 769 | "colab_type": "text" 770 | }, 771 | "cell_type": "markdown", 772 | "source": [ 773 | "### Remove non-alphabetic words" 774 | ] 775 | }, 776 | { 777 | "metadata": { 778 | "id": "37xdOC53ojW9", 779 | "colab_type": "code", 780 | "colab": {} 781 | }, 782 | "cell_type": "code", 783 | "source": [ 784 | "words_alpha = []\n", 785 | "for i in range(len(words_nonSingle)):\n", 786 | " words = [word for word in words_nonSingle[i] if word.isalpha()]\n", 787 | " words_alpha.append(words)" 788 | ], 789 | "execution_count": 0, 790 | "outputs": [] 791 | }, 792 | { 793 | "metadata": { 794 | "id": "1-V3stkaojXB", 795 | "colab_type": "code", 796 | "colab": { 797 | "base_uri": "https://localhost:8080/", 798 | "height": 54 799 | }, 800 | "outputId": "294c87b6-2e9c-4647-922c-5a675b6e8207" 801 | }, 802 | "cell_type": "code", 803 | "source": [ 804 | "print(len(words_alpha))\n", 805 | "print(words_alpha[0])" 806 | ], 807 | "execution_count": 124, 808 | "outputs": [ 809 | { 810 | "output_type": "stream", 811 | "text": [ 812 | "31962\n", 813 | "['user', 'father', 'dysfunctional', 'selfish', 'drag', 'kid', 'dysfunction', 'run']\n" 814 | ], 815 | "name": "stdout" 816 | } 817 | ] 818 | }, 819 | { 820 | "metadata": { 821 | "id": "4Z0egXsbojXH", 822 | "colab_type": "code", 823 | "colab": {} 824 | }, 825 | "cell_type": "code", 826 | "source": [ 827 | "data_copy['words_count'] = [len(i) for i in words_alpha]" 828 | ], 829 | "execution_count": 0, 830 | "outputs": [] 831 | }, 832 | { 833 | "metadata": { 834 | "id": "jjK46C2OwR58", 835 | "colab_type": "code", 836 | "colab": { 837 | "base_uri": "https://localhost:8080/", 838 | "height": 90 839 | }, 840 | "outputId": "18a37d60-36f8-4404-ca61-dccd81fc3aef" 841 | }, 842 | "cell_type": "code", 843 | "source": [ 844 | "nltk.download('averaged_perceptron_tagger')" 845 | ], 846 | "execution_count": 33, 847 | "outputs": [ 848 | { 849 | "output_type": "stream", 850 | "text": [ 851 | "[nltk_data] Downloading package averaged_perceptron_tagger to\n", 852 | "[nltk_data] /root/nltk_data...\n", 853 | "[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n" 854 | ], 855 | "name": "stdout" 856 | }, 857 | { 858 | "output_type": "execute_result", 859 | "data": { 860 | "text/plain": [ 861 | "True" 862 | ] 863 | }, 864 | "metadata": { 865 | "tags": [] 866 | }, 867 | "execution_count": 33 868 | } 869 | ] 870 | }, 871 | { 872 | "metadata": { 873 | "id": "i__VNp6CojXK", 874 | "colab_type": "code", 875 | "colab": {} 876 | }, 877 | "cell_type": "code", 878 | "source": [ 879 | "noun_freq = []\n", 880 | "verb_freq = []\n", 881 | "adjective_freq = []\n", 882 | "adverb_freq = []\n", 883 | "for i in range(len(words_alpha)):\n", 884 | " word_pos_tag = nltk.pos_tag(words_alpha[i])\n", 885 | " count_noun = 0\n", 886 | " count_verb = 0\n", 887 | " count_adjective = 0\n", 888 | " count_adverb = 0\n", 889 | " for j in range(len(word_pos_tag)):\n", 890 | " if word_pos_tag[j][1] == \"NN\":\n", 891 | " count_noun += 1\n", 892 | " if word_pos_tag[j][1] == 'VB':\n", 893 | " count_verb += 1\n", 894 | " if word_pos_tag[j][1] == 'JJ':\n", 895 | " count_adjective += 1\n", 896 | " if word_pos_tag[j][1] == 'RB':\n", 897 | " count_adverb += 1\n", 898 | " noun_freq.append(count_noun/(len(words_alpha[i]) + 1))\n", 899 | " verb_freq.append(count_verb/(len(words_alpha[i])+1))\n", 900 | " 
adjective_freq.append(count_adjective/(len(words_alpha[i])+1))\n", 901 | " adverb_freq.append(count_adverb/(len(words_alpha[i])+1))" 902 | ], 903 | "execution_count": 0, 904 | "outputs": [] 905 | }, 906 | { 907 | "metadata": { 908 | "id": "macvpusmojXR", 909 | "colab_type": "code", 910 | "colab": {} 911 | }, 912 | "cell_type": "code", 913 | "source": [ 914 | "freq_dict = {'noun_freq' : noun_freq, 'verb_freq' : verb_freq, 'adjective_freq' : adjective_freq, 'adverb_freq' : adverb_freq}" 915 | ], 916 | "execution_count": 0, 917 | "outputs": [] 918 | }, 919 | { 920 | "metadata": { 921 | "id": "H1jvbthTojXU", 922 | "colab_type": "code", 923 | "colab": {} 924 | }, 925 | "cell_type": "code", 926 | "source": [ 927 | "data_copy = data_copy.join(pd.DataFrame(freq_dict))" 928 | ], 929 | "execution_count": 0, 930 | "outputs": [] 931 | }, 932 | { 933 | "metadata": { 934 | "id": "wK399OlHojXW", 935 | "colab_type": "code", 936 | "colab": { 937 | "base_uri": "https://localhost:8080/", 938 | "height": 198 939 | }, 940 | "outputId": "775da18b-6df1-4a0d-ca26-357d1f61e2a1" 941 | }, 942 | "cell_type": "code", 943 | "source": [ 944 | "data_copy = data_copy.drop(['id'],axis=1)\n", 945 | "data_copy.tail()\n" 946 | ], 947 | "execution_count": 141, 948 | "outputs": [ 949 | { 950 | "output_type": "execute_result", 951 | "data": { 952 | "text/html": [ 953 | "
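One caveat about the part-of-speech loop above: it counts only the exact Penn tags `NN`, `VB`, `JJ` and `RB`, so plural nouns (`NNS`), proper nouns (`NNP`), past-tense verbs (`VBD`), comparatives (`JJR`) and the like never reach the frequencies. If the intent is "share of nouns / verbs / adjectives / adverbs per tweet", NLTK's coarse universal tagset collapses those variants automatically. The helper below is an illustrative sketch (the name `pos_ratios` is mine, and it additionally assumes `nltk.download('universal_tagset')` has been run alongside the tagger download above); it keeps the same `len(tokens) + 1` smoothing as the original loop:

```python
from collections import Counter
import nltk

def pos_ratios(tokens):
    """Fraction of NOUN/VERB/ADJ/ADV tokens in one tweet, using the
    universal tagset so NNS, VBD, JJR, ... fold into their class."""
    tags = Counter(tag for _, tag in nltk.pos_tag(tokens, tagset='universal'))
    denom = len(tokens) + 1   # same smoothing as the loop above
    return {'noun_freq':      tags['NOUN'] / denom,
            'verb_freq':      tags['VERB'] / denom,
            'adjective_freq': tags['ADJ'] / denom,
            'adverb_freq':    tags['ADV'] / denom}

# e.g. pos_df = pd.DataFrame([pos_ratios(t) for t in words_alpha])
```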
\n", 954 | "\n", 967 | "\n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | "
labeltweetwords_countnoun_freqverb_freqadjective_freqadverb_freq
319570ate @user isz that youuu?😍😍😍😍😍ð...40.6000000.2000000.0000000.0
319580to see nina turner on the airwaves trying to...140.4000000.0666670.1333330.0
319590listening to sad songs on a monday morning otw...80.6666670.0000000.1111110.0
319601@user #sikh #temple vandalised in in #calgary,...80.6666670.0000000.1111110.0
319610thank you @user for you follow30.7500000.0000000.0000000.0
\n", 1033 | "
" 1034 | ], 1035 | "text/plain": [ 1036 | " label tweet words_count \\\n", 1037 | "31957 0 ate @user isz that youuu?😍😍😍😍😍ð... 4 \n", 1038 | "31958 0 to see nina turner on the airwaves trying to... 14 \n", 1039 | "31959 0 listening to sad songs on a monday morning otw... 8 \n", 1040 | "31960 1 @user #sikh #temple vandalised in in #calgary,... 8 \n", 1041 | "31961 0 thank you @user for you follow 3 \n", 1042 | "\n", 1043 | " noun_freq verb_freq adjective_freq adverb_freq \n", 1044 | "31957 0.600000 0.200000 0.000000 0.0 \n", 1045 | "31958 0.400000 0.066667 0.133333 0.0 \n", 1046 | "31959 0.666667 0.000000 0.111111 0.0 \n", 1047 | "31960 0.666667 0.000000 0.111111 0.0 \n", 1048 | "31961 0.750000 0.000000 0.000000 0.0 " 1049 | ] 1050 | }, 1051 | "metadata": { 1052 | "tags": [] 1053 | }, 1054 | "execution_count": 141 1055 | } 1056 | ] 1057 | }, 1058 | { 1059 | "metadata": { 1060 | "id": "xRCSGkzqGn1r", 1061 | "colab_type": "code", 1062 | "colab": { 1063 | "base_uri": "https://localhost:8080/", 1064 | "height": 108 1065 | }, 1066 | "outputId": "56c7238e-30b2-4dbe-80e9-3df0e065667e" 1067 | }, 1068 | "cell_type": "code", 1069 | "source": [ 1070 | "features_distrbn = data_copy.groupby(['label'], as_index = False)['words_count', 'adjective_freq', 'noun_freq', 'adverb_freq', 'verb_freq'].mean()\n", 1071 | "np.round(features_distrbn, 3)" 1072 | ], 1073 | "execution_count": 143, 1074 | "outputs": [ 1075 | { 1076 | "output_type": "execute_result", 1077 | "data": { 1078 | "text/html": [ 1079 | "
\n", 1080 | "\n", 1093 | "\n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | "
labelwords_countadjective_freqnoun_freqadverb_freqverb_freq
007.7850.1620.4460.0500.032
118.3680.1800.4360.0490.025
\n", 1126 | "
" 1127 | ], 1128 | "text/plain": [ 1129 | " label words_count adjective_freq noun_freq adverb_freq verb_freq\n", 1130 | "0 0 7.785 0.162 0.446 0.050 0.032\n", 1131 | "1 1 8.368 0.180 0.436 0.049 0.025" 1132 | ] 1133 | }, 1134 | "metadata": { 1135 | "tags": [] 1136 | }, 1137 | "execution_count": 143 1138 | } 1139 | ] 1140 | }, 1141 | { 1142 | "metadata": { 1143 | "id": "VH6d05-2ojXg", 1144 | "colab_type": "text" 1145 | }, 1146 | "cell_type": "markdown", 1147 | "source": [ 1148 | "#### read the GLoVE 100 file for preparing an Embedding layer of 100 dimenions" 1149 | ] 1150 | }, 1151 | { 1152 | "metadata": { 1153 | "id": "MlcN9Z3OxaV5", 1154 | "colab_type": "code", 1155 | "colab": {} 1156 | }, 1157 | "cell_type": "code", 1158 | "source": [ 1159 | "from google.colab import files\n", 1160 | "uploaded = files.upload()" 1161 | ], 1162 | "execution_count": 0, 1163 | "outputs": [] 1164 | }, 1165 | { 1166 | "metadata": { 1167 | "id": "-3Ul8DpiojXh", 1168 | "colab_type": "code", 1169 | "colab": { 1170 | "base_uri": "https://localhost:8080/", 1171 | "height": 35 1172 | }, 1173 | "outputId": "b5972abc-4035-4bb8-eada-88ea2ac0089a" 1174 | }, 1175 | "cell_type": "code", 1176 | "source": [ 1177 | "MAX_SEQUENCE_LENGTH = 50\n", 1178 | "MAX_NUM_WORDS = 10000\n", 1179 | "EMBEDDING_DIM = 100\n", 1180 | "VALIDATION_SPLIT = 0.2\n", 1181 | "\n", 1182 | "embeddings_index = {}\n", 1183 | "\n", 1184 | "with open(\"/content/glove.6B.100d.txt\", encoding='utf-8') as f:\n", 1185 | " for line in f:\n", 1186 | " values = line.split()\n", 1187 | " word = values[0]\n", 1188 | " coefs = np.asarray(values[1:], dtype='float32')\n", 1189 | " embeddings_index[word] = coefs\n", 1190 | "print('Found %s word vectors.' % len(embeddings_index))" 1191 | ], 1192 | "execution_count": 144, 1193 | "outputs": [ 1194 | { 1195 | "output_type": "stream", 1196 | "text": [ 1197 | "Found 400000 word vectors.\n" 1198 | ], 1199 | "name": "stdout" 1200 | } 1201 | ] 1202 | }, 1203 | { 1204 | "metadata": { 1205 | "id": "aT9Vlu4wojXm", 1206 | "colab_type": "text" 1207 | }, 1208 | "cell_type": "markdown", 1209 | "source": [ 1210 | "#### Initialize the Embedding layer with pre-trained weights from the GLoVE model:" 1211 | ] 1212 | }, 1213 | { 1214 | "metadata": { 1215 | "id": "6u6uoO2RojXn", 1216 | "colab_type": "code", 1217 | "colab": {} 1218 | }, 1219 | "cell_type": "code", 1220 | "source": [ 1221 | "## Define the sequence lengths, max number of words and embedding dimensions\n", 1222 | "MAX_SEQUENCE_LENGTH = 50 # Sequence length of each sentence. If more, crop. If less, pad with zeros\n", 1223 | "MAX_NB_WORDS = 10000 # Top 10000 frequently occuring words" 1224 | ], 1225 | "execution_count": 0, 1226 | "outputs": [] 1227 | }, 1228 | { 1229 | "metadata": { 1230 | "id": "YCAa9ShNojXp", 1231 | "colab_type": "text" 1232 | }, 1233 | "cell_type": "markdown", 1234 | "source": [ 1235 | "#### Train-validation-test split. 
We separate validation data to use early stopping callback feature during training our model below" 1236 | ] 1237 | }, 1238 | { 1239 | "metadata": { 1240 | "id": "OkmtWUWDojXp", 1241 | "colab_type": "code", 1242 | "colab": { 1243 | "base_uri": "https://localhost:8080/", 1244 | "height": 35 1245 | }, 1246 | "outputId": "f0826fff-6bcb-43bd-8e06-8a66c539c167" 1247 | }, 1248 | "cell_type": "code", 1249 | "source": [ 1250 | "X = data_copy.iloc[:,1:7]\n", 1251 | "y = data_copy['label']\n", 1252 | "print(len(X),len(y))\n", 1253 | "\n" 1254 | ], 1255 | "execution_count": 147, 1256 | "outputs": [ 1257 | { 1258 | "output_type": "stream", 1259 | "text": [ 1260 | "31962 31962\n" 1261 | ], 1262 | "name": "stdout" 1263 | } 1264 | ] 1265 | }, 1266 | { 1267 | "metadata": { 1268 | "id": "OVaXFlc_ojXt", 1269 | "colab_type": "code", 1270 | "colab": {} 1271 | }, 1272 | "cell_type": "code", 1273 | "source": [ 1274 | "x_train_val, x_test, y_train_val, y_test = train_test_split(X,y, test_size = 0.3, random_state = 123)" 1275 | ], 1276 | "execution_count": 0, 1277 | "outputs": [] 1278 | }, 1279 | { 1280 | "metadata": { 1281 | "id": "WqLkUFhc0Y6T", 1282 | "colab_type": "code", 1283 | "colab": {} 1284 | }, 1285 | "cell_type": "code", 1286 | "source": [ 1287 | "x_train, x_val, y_train, y_val = train_test_split(x_train_val, \n", 1288 | " y_train_val, test_size = 0.2, random_state = 123)" 1289 | ], 1290 | "execution_count": 0, 1291 | "outputs": [] 1292 | }, 1293 | { 1294 | "metadata": { 1295 | "id": "wlizcka71g2g", 1296 | "colab_type": "text" 1297 | }, 1298 | "cell_type": "markdown", 1299 | "source": [ 1300 | "rebuild the full training data-frame to double-check that word count and POS tagging remain useful features across the training data set as well. Note above when we analysed the test data we did not include any labels data or standardization measures, thereby strictly ensuring no leakage of test information into the trainnig space" 1301 | ] 1302 | }, 1303 | { 1304 | "metadata": { 1305 | "id": "uMu87ILmojX1", 1306 | "colab_type": "code", 1307 | "colab": { 1308 | "base_uri": "https://localhost:8080/", 1309 | "height": 198 1310 | }, 1311 | "outputId": "2fa8059a-728e-4fdb-81e2-08956c4d86a2" 1312 | }, 1313 | "cell_type": "code", 1314 | "source": [ 1315 | "train_full = pd.concat([x_train, y_train], axis = 1)\n", 1316 | "train_full.head()" 1317 | ], 1318 | "execution_count": 150, 1319 | "outputs": [ 1320 | { 1321 | "output_type": "execute_result", 1322 | "data": { 1323 | "text/html": [ 1324 | "
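For orientation, the two-stage split above works out to roughly 56% / 14% / 30% of the corpus: `test_size=0.3` reserves 30% for test, and `test_size=0.2` of the remaining 70% gives 0.7 × 0.8 ≈ 0.56 for training and 0.7 × 0.2 = 0.14 for validation — matching the (17898, …), (4475, …), (9589, …) shapes printed a few cells further down. If the offensive class is the minority label (as is typical for this kind of dataset), adding `stratify=` would keep the label ratio comparable across the three subsets; this is an optional variant, not what the cells above do:

```python
# Hypothetical stratified version of the same two-stage split.
x_train_val, x_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123, stratify=y)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val, test_size=0.2, random_state=123,
    stratify=y_train_val)
```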
\n", 1325 | "\n", 1338 | "\n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | "
tweetwords_countnoun_freqverb_freqadjective_freqadverb_freqlabel
4574here's a ?for the day the government and state...120.6923080.00.0000000.0000001
6142cardiff!!!! 󾌩. #cardiff #euro2016 #euro #fo...90.8000000.00.0000000.0000000
22578@user where on eah is pastor #mmusimaimane? is...100.6363640.00.1818180.0909091
3709@user and the panel is seated! @user @user @u...80.4444440.00.2222220.0000000
5766so often we think our country has come so far ...90.3000000.00.1000000.2000000
\n", 1404 | "
" 1405 | ], 1406 | "text/plain": [ 1407 | " tweet words_count \\\n", 1408 | "4574 here's a ?for the day the government and state... 12 \n", 1409 | "6142 cardiff!!!! 󾌩. #cardiff #euro2016 #euro #fo... 9 \n", 1410 | "22578 @user where on eah is pastor #mmusimaimane? is... 10 \n", 1411 | "3709 @user and the panel is seated! @user @user @u... 8 \n", 1412 | "5766 so often we think our country has come so far ... 9 \n", 1413 | "\n", 1414 | " noun_freq verb_freq adjective_freq adverb_freq label \n", 1415 | "4574 0.692308 0.0 0.000000 0.000000 1 \n", 1416 | "6142 0.800000 0.0 0.000000 0.000000 0 \n", 1417 | "22578 0.636364 0.0 0.181818 0.090909 1 \n", 1418 | "3709 0.444444 0.0 0.222222 0.000000 0 \n", 1419 | "5766 0.300000 0.0 0.100000 0.200000 0 " 1420 | ] 1421 | }, 1422 | "metadata": { 1423 | "tags": [] 1424 | }, 1425 | "execution_count": 150 1426 | } 1427 | ] 1428 | }, 1429 | { 1430 | "metadata": { 1431 | "id": "2wTqA_y31oI0", 1432 | "colab_type": "code", 1433 | "colab": { 1434 | "base_uri": "https://localhost:8080/", 1435 | "height": 108 1436 | }, 1437 | "outputId": "cb581d5c-20aa-4e90-db2f-f5f8d4cd961e" 1438 | }, 1439 | "cell_type": "code", 1440 | "source": [ 1441 | "word_count_distrbn_train = train_full.groupby(['label'], \n", 1442 | " as_index = False)['words_count', 'adjective_freq', 'noun_freq', 'adverb_freq', 'verb_freq'].mean()\n", 1443 | "np.round(word_count_distrbn_train, 3)" 1444 | ], 1445 | "execution_count": 151, 1446 | "outputs": [ 1447 | { 1448 | "output_type": "execute_result", 1449 | "data": { 1450 | "text/html": [ 1451 | "
\n", 1452 | "\n", 1465 | "\n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | "
labelwords_countadjective_freqnoun_freqadverb_freqverb_freq
007.7510.1620.4460.050.031
118.2680.1770.4320.050.025
\n", 1498 | "
" 1499 | ], 1500 | "text/plain": [ 1501 | " label words_count adjective_freq noun_freq adverb_freq verb_freq\n", 1502 | "0 0 7.751 0.162 0.446 0.05 0.031\n", 1503 | "1 1 8.268 0.177 0.432 0.05 0.025" 1504 | ] 1505 | }, 1506 | "metadata": { 1507 | "tags": [] 1508 | }, 1509 | "execution_count": 151 1510 | } 1511 | ] 1512 | }, 1513 | { 1514 | "metadata": { 1515 | "id": "Qn3LB0mF1uYn", 1516 | "colab_type": "code", 1517 | "colab": { 1518 | "base_uri": "https://localhost:8080/", 1519 | "height": 54 1520 | }, 1521 | "outputId": "94570376-ed97-4029-96fe-8c37521f3963" 1522 | }, 1523 | "cell_type": "code", 1524 | "source": [ 1525 | "print(x_train.shape, x_val.shape, x_test.shape)\n", 1526 | "print(y_train.shape, y_val.shape, y_test.shape)" 1527 | ], 1528 | "execution_count": 154, 1529 | "outputs": [ 1530 | { 1531 | "output_type": "stream", 1532 | "text": [ 1533 | "(17898, 6) (4475, 6) (9589, 6)\n", 1534 | "(17898,) (4475,) (9589,)\n" 1535 | ], 1536 | "name": "stdout" 1537 | } 1538 | ] 1539 | }, 1540 | { 1541 | "metadata": { 1542 | "id": "w1IuJwhX3OXK", 1543 | "colab_type": "text" 1544 | }, 1545 | "cell_type": "markdown", 1546 | "source": [ 1547 | "The train-validation-test original texts are tokenized and padded for machine-readable formatting" 1548 | ] 1549 | }, 1550 | { 1551 | "metadata": { 1552 | "id": "cVFNLC4u3U0O", 1553 | "colab_type": "code", 1554 | "colab": { 1555 | "base_uri": "https://localhost:8080/", 1556 | "height": 163 1557 | }, 1558 | "outputId": "2992d5f3-6525-4c2f-f785-2e88fc300b51" 1559 | }, 1560 | "cell_type": "code", 1561 | "source": [ 1562 | "tokenizer = Tokenizer(num_words=MAX_NB_WORDS) # get the frequently occuring words\n", 1563 | "tokenizer.fit_on_texts(x_train.tweet) \n", 1564 | "train_sequences = tokenizer.texts_to_sequences(x_train.tweet)\n", 1565 | "val_sequences = tokenizer.texts_to_sequences(x_val.tweet)\n", 1566 | "test_sequences = tokenizer.texts_to_sequences(x_test.tweet)\n", 1567 | "\n", 1568 | "word_index = tokenizer.word_index # dictionary containing words and their index\n", 1569 | "#print(tokenizer.word_index) # print to check\n", 1570 | "print('Found %s unique tokens.' 
% len(word_index)) # total words in the corpus\n", 1571 | "train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", 1572 | "val_data = pad_sequences(val_sequences, maxlen = MAX_SEQUENCE_LENGTH)# get only the top frequent words on train\n", 1573 | "test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH) # get only the top frequent words on test\n", 1574 | "\n", 1575 | "print(train_data[0])\n", 1576 | "print(train_data.shape)\n", 1577 | "print(val_data.shape)\n", 1578 | "print(test_data.shape)" 1579 | ], 1580 | "execution_count": 157, 1581 | "outputs": [ 1582 | { 1583 | "output_type": "stream", 1584 | "text": [ 1585 | "Found 31519 unique tokens.\n", 1586 | "[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 1587 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 1588 | " 0 0 944 4 9 2 19 2 1477 7 1087 1854 65 2520\n", 1589 | " 3200 666 3201 412 3629 8 1329 20]\n", 1590 | "(17898, 50)\n", 1591 | "(4475, 50)\n", 1592 | "(9589, 50)\n" 1593 | ], 1594 | "name": "stdout" 1595 | } 1596 | ] 1597 | }, 1598 | { 1599 | "metadata": { 1600 | "id": "pgy6Vh_eJSdT", 1601 | "colab_type": "text" 1602 | }, 1603 | "cell_type": "markdown", 1604 | "source": [ 1605 | "standardize all the numeric features generated above by by fitting them on training data and using that to transform validation and test data respectively\n" 1606 | ] 1607 | }, 1608 | { 1609 | "metadata": { 1610 | "id": "JBO_5xvcJVJr", 1611 | "colab_type": "code", 1612 | "colab": {} 1613 | }, 1614 | "cell_type": "code", 1615 | "source": [ 1616 | "scaleable_cols = ['words_count', 'adjective_freq', 'noun_freq', 'adverb_freq', 'verb_freq']" 1617 | ], 1618 | "execution_count": 0, 1619 | "outputs": [] 1620 | }, 1621 | { 1622 | "metadata": { 1623 | "id": "neV_czZVJm_U", 1624 | "colab_type": "code", 1625 | "colab": { 1626 | "base_uri": "https://localhost:8080/", 1627 | "height": 146 1628 | }, 1629 | "outputId": "858e5670-52fa-4ad0-8d2c-0b27bf43ba49" 1630 | }, 1631 | "cell_type": "code", 1632 | "source": [ 1633 | "scaler_multicol = MinMaxScaler()\n", 1634 | "train_multicol_scaled = scaler_multicol.fit_transform(x_train[scaleable_cols])\n", 1635 | "val_multicol_scaled = scaler_multicol.fit_transform(x_val[scaleable_cols])\n", 1636 | "test_multicol_scaled = scaler_multicol.fit_transform(x_test[scaleable_cols])" 1637 | ], 1638 | "execution_count": 158, 1639 | "outputs": [ 1640 | { 1641 | "output_type": "stream", 1642 | "text": [ 1643 | "/usr/local/lib/python3.6/dist-packages/sklearn/preprocessing/data.py:334: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by MinMaxScaler.\n", 1644 | " return self.partial_fit(X, y)\n", 1645 | "/usr/local/lib/python3.6/dist-packages/sklearn/preprocessing/data.py:334: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by MinMaxScaler.\n", 1646 | " return self.partial_fit(X, y)\n", 1647 | "/usr/local/lib/python3.6/dist-packages/sklearn/preprocessing/data.py:334: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by MinMaxScaler.\n", 1648 | " return self.partial_fit(X, y)\n" 1649 | ], 1650 | "name": "stderr" 1651 | } 1652 | ] 1653 | }, 1654 | { 1655 | "metadata": { 1656 | "id": "gwdAMxrtJ8ba", 1657 | "colab_type": "code", 1658 | "colab": { 1659 | "base_uri": "https://localhost:8080/", 1660 | "height": 35 1661 | }, 1662 | "outputId": "10b5c69b-2d2f-4b77-99da-0d57720f45a4" 1663 | }, 1664 | "cell_type": "code", 1665 | "source": [ 1666 | "train_multicol_scaled[2]" 1667 | ], 1668 | "execution_count": 161, 1669 | 
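One mismatch between the markdown above and the scaling cell: the text says the scaler is fitted on the training data and then used to transform validation and test data, but the code calls `fit_transform` on all three subsets, so each split is rescaled with its own min/max. To make the cell do what the description says (fit once on train, reuse the learned statistics elsewhere), only the last two calls need to change — a sketch using the notebook's own variable names:

```python
from sklearn.preprocessing import MinMaxScaler

scaler_multicol = MinMaxScaler()
# Learn the min/max on the training split only ...
train_multicol_scaled = scaler_multicol.fit_transform(x_train[scaleable_cols])
# ... and reuse those statistics for validation and test.
val_multicol_scaled = scaler_multicol.transform(x_val[scaleable_cols])
test_multicol_scaled = scaler_multicol.transform(x_test[scaleable_cols])
```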
"outputs": [ 1670 | { 1671 | "output_type": "execute_result", 1672 | "data": { 1673 | "text/plain": [ 1674 | "array([0.27027027, 0.22727273, 0.65356265, 0.12121212, 0. ])" 1675 | ] 1676 | }, 1677 | "metadata": { 1678 | "tags": [] 1679 | }, 1680 | "execution_count": 161 1681 | } 1682 | ] 1683 | }, 1684 | { 1685 | "metadata": { 1686 | "id": "GPv8yUtTKFnc", 1687 | "colab_type": "code", 1688 | "colab": { 1689 | "base_uri": "https://localhost:8080/", 1690 | "height": 72 1691 | }, 1692 | "outputId": "7cf5d92a-b0ac-4c21-c23b-0aa671454683" 1693 | }, 1694 | "cell_type": "code", 1695 | "source": [ 1696 | "train_data = np.hstack((train_data, train_multicol_scaled))\n", 1697 | "val_data = np.hstack((val_data, val_multicol_scaled))\n", 1698 | "test_data = np.hstack((test_data, test_multicol_scaled))\n", 1699 | "\n", 1700 | "print(train_data.shape)\n", 1701 | "print(val_data.shape)\n", 1702 | "print(test_data.shape)" 1703 | ], 1704 | "execution_count": 162, 1705 | "outputs": [ 1706 | { 1707 | "output_type": "stream", 1708 | "text": [ 1709 | "(17898, 55)\n", 1710 | "(4475, 55)\n", 1711 | "(9589, 55)\n" 1712 | ], 1713 | "name": "stdout" 1714 | } 1715 | ] 1716 | }, 1717 | { 1718 | "metadata": { 1719 | "id": "Zenq_yPSKoNf", 1720 | "colab_type": "code", 1721 | "colab": { 1722 | "base_uri": "https://localhost:8080/", 1723 | "height": 272 1724 | }, 1725 | "outputId": "009c4709-7c97-40b9-952e-902b95222eb7" 1726 | }, 1727 | "cell_type": "code", 1728 | "source": [ 1729 | "print(train_data[0])" 1730 | ], 1731 | "execution_count": 163, 1732 | "outputs": [ 1733 | { 1734 | "output_type": "stream", 1735 | "text": [ 1736 | "[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00\n", 1737 | " 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00\n", 1738 | " 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00\n", 1739 | " 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00\n", 1740 | " 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00\n", 1741 | " 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00\n", 1742 | " 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00\n", 1743 | " 0.00000000e+00 0.00000000e+00 9.44000000e+02 4.00000000e+00\n", 1744 | " 9.00000000e+00 2.00000000e+00 1.90000000e+01 2.00000000e+00\n", 1745 | " 1.47700000e+03 7.00000000e+00 1.08700000e+03 1.85400000e+03\n", 1746 | " 6.50000000e+01 2.52000000e+03 3.20000000e+03 6.66000000e+02\n", 1747 | " 3.20100000e+03 4.12000000e+02 3.62900000e+03 8.00000000e+00\n", 1748 | " 1.32900000e+03 2.00000000e+01 3.24324324e-01 0.00000000e+00\n", 1749 | " 7.11018711e-01 0.00000000e+00 0.00000000e+00]\n" 1750 | ], 1751 | "name": "stdout" 1752 | } 1753 | ] 1754 | }, 1755 | { 1756 | "metadata": { 1757 | "id": "M-UkgyTGL9N4", 1758 | "colab_type": "text" 1759 | }, 1760 | "cell_type": "markdown", 1761 | "source": [ 1762 | "initialize the embedding matrix from GLoVE 100 which will be used as the initial weights of the Embedding layer in our neural network" 1763 | ] 1764 | }, 1765 | { 1766 | "metadata": { 1767 | "id": "PGcKcltqKuL_", 1768 | "colab_type": "code", 1769 | "colab": {} 1770 | }, 1771 | "cell_type": "code", 1772 | "source": [ 1773 | "num_words = min(MAX_NB_WORDS, len(embeddings_index))\n", 1774 | "embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))\n", 1775 | "for word, i in word_index.items():\n", 1776 | " if i >= MAX_NB_WORDS:\n", 1777 | " continue\n", 1778 | " embedding_vector = embeddings_index.get(word)\n", 1779 | " if embedding_vector is not None:\n", 1780 | " # words not found in embedding index will be 
all-zeros.\n", 1781 | " embedding_matrix[i] = embedding_vector" 1782 | ], 1783 | "execution_count": 0, 1784 | "outputs": [] 1785 | }, 1786 | { 1787 | "metadata": { 1788 | "id": "VQAjA0ujNCVc", 1789 | "colab_type": "text" 1790 | }, 1791 | "cell_type": "markdown", 1792 | "source": [ 1793 | "The following cell declares our final model that has:\n", 1794 | "\n", 1795 | "\n", 1796 | "\n", 1797 | "\n", 1798 | "* an initial Embedding layer with pre-trained weights from GLoVE 100 as described above, hence trainable = False at this layer,\n", 1799 | "* two layers of GRU model with in-between BatchNormalization and Dropout as mentioned\n", 1800 | "* the neural network uses the Keras function API and is then split after the second GRU layer into a one output model - 'label',\n", 1801 | "* the output model has two Dense() layers as we want to visualize the learning of the network just prior to it's final predictions\n", 1802 | "\n" 1803 | ] 1804 | }, 1805 | { 1806 | "metadata": { 1807 | "id": "ltd8golNLu86", 1808 | "colab_type": "code", 1809 | "colab": { 1810 | "base_uri": "https://localhost:8080/", 1811 | "height": 545 1812 | }, 1813 | "outputId": "1e5660ab-9b7b-41f0-d97b-2603baeb2d5d" 1814 | }, 1815 | "cell_type": "code", 1816 | "source": [ 1817 | "posts_input = Input(shape=(None,), dtype='int32', name='all_posts')\n", 1818 | "embedded_posts = Embedding(input_dim=MAX_NB_WORDS,\n", 1819 | " input_length=MAX_SEQUENCE_LENGTH, \n", 1820 | " output_dim=EMBEDDING_DIM,\n", 1821 | " weights=[embedding_matrix],\n", 1822 | " trainable=False)(posts_input)\n", 1823 | "\n", 1824 | "x = layers.GRU(128, activation='relu', return_sequences = True)(embedded_posts)\n", 1825 | "x = layers.BatchNormalization()(x)\n", 1826 | "x = layers.Dropout(0.2)(x)\n", 1827 | "x = layers.GRU(64, activation = 'relu')(x)\n", 1828 | "x = layers.Dense(16, activation='relu')(x)\n", 1829 | "x = layers.Dropout(0.2)(x)\n", 1830 | "\n", 1831 | "label_pred = layers.Dense(8, activation = 'relu', name = 'label0')(x)\n", 1832 | "label_pred = layers.Dropout(0.5)(label_pred)\n", 1833 | "label_pred = layers.Dense(1, activation = 'sigmoid', name = 'label1')(label_pred)\n", 1834 | "\n", 1835 | "\n", 1836 | "combined_model = Model(posts_input, [label_pred])\n", 1837 | "combined_model.summary()" 1838 | ], 1839 | "execution_count": 172, 1840 | "outputs": [ 1841 | { 1842 | "output_type": "stream", 1843 | "text": [ 1844 | "_________________________________________________________________\n", 1845 | "Layer (type) Output Shape Param # \n", 1846 | "=================================================================\n", 1847 | "all_posts (InputLayer) (None, None) 0 \n", 1848 | "_________________________________________________________________\n", 1849 | "embedding_4 (Embedding) (None, 50, 100) 1000000 \n", 1850 | "_________________________________________________________________\n", 1851 | "gru_7 (GRU) (None, 50, 128) 87936 \n", 1852 | "_________________________________________________________________\n", 1853 | "batch_normalization_4 (Batch (None, 50, 128) 512 \n", 1854 | "_________________________________________________________________\n", 1855 | "dropout_10 (Dropout) (None, 50, 128) 0 \n", 1856 | "_________________________________________________________________\n", 1857 | "gru_8 (GRU) (None, 64) 37056 \n", 1858 | "_________________________________________________________________\n", 1859 | "dense_4 (Dense) (None, 16) 1040 \n", 1860 | "_________________________________________________________________\n", 1861 | "dropout_11 (Dropout) (None, 16) 0 \n", 1862 | 
"_________________________________________________________________\n", 1863 | "label0 (Dense) (None, 8) 136 \n", 1864 | "_________________________________________________________________\n", 1865 | "dropout_12 (Dropout) (None, 8) 0 \n", 1866 | "_________________________________________________________________\n", 1867 | "label1 (Dense) (None, 1) 9 \n", 1868 | "=================================================================\n", 1869 | "Total params: 1,126,689\n", 1870 | "Trainable params: 126,433\n", 1871 | "Non-trainable params: 1,000,256\n", 1872 | "_________________________________________________________________\n" 1873 | ], 1874 | "name": "stdout" 1875 | } 1876 | ] 1877 | }, 1878 | { 1879 | "metadata": { 1880 | "id": "n1CtLxGkPeiq", 1881 | "colab_type": "code", 1882 | "colab": {} 1883 | }, 1884 | "cell_type": "code", 1885 | "source": [ 1886 | "callbacks_list = [EarlyStopping(monitor='val_loss', patience=1, ),\n", 1887 | " ModelCheckpoint(filepath='model_multi-feature.h5', monitor='val_loss',\n", 1888 | " save_best_only=True,)]" 1889 | ], 1890 | "execution_count": 0, 1891 | "outputs": [] 1892 | }, 1893 | { 1894 | "metadata": { 1895 | "id": "EG3t8OY3QZT5", 1896 | "colab_type": "code", 1897 | "colab": {} 1898 | }, 1899 | "cell_type": "code", 1900 | "source": [ 1901 | "combined_model.compile(optimizer = 'rmsprop',loss = {'label1' : 'binary_crossentropy'},metrics = ['acc'])" 1902 | ], 1903 | "execution_count": 0, 1904 | "outputs": [] 1905 | }, 1906 | { 1907 | "metadata": { 1908 | "id": "UOEAvDNvRi9F", 1909 | "colab_type": "code", 1910 | "colab": { 1911 | "base_uri": "https://localhost:8080/", 1912 | "height": 126 1913 | }, 1914 | "outputId": "d87df6db-68a5-464b-aa27-7aebc933899f" 1915 | }, 1916 | "cell_type": "code", 1917 | "source": [ 1918 | "y_train.head()" 1919 | ], 1920 | "execution_count": 180, 1921 | "outputs": [ 1922 | { 1923 | "output_type": "execute_result", 1924 | "data": { 1925 | "text/plain": [ 1926 | "4574 1\n", 1927 | "6142 0\n", 1928 | "22578 1\n", 1929 | "3709 0\n", 1930 | "5766 0\n", 1931 | "Name: label, dtype: int64" 1932 | ] 1933 | }, 1934 | "metadata": { 1935 | "tags": [] 1936 | }, 1937 | "execution_count": 180 1938 | } 1939 | ] 1940 | }, 1941 | { 1942 | "metadata": { 1943 | "id": "Q2Zvd-FdSKOw", 1944 | "colab_type": "code", 1945 | "colab": {} 1946 | }, 1947 | "cell_type": "code", 1948 | "source": [ 1949 | "epochs = 16\n", 1950 | "batch_size = 32" 1951 | ], 1952 | "execution_count": 0, 1953 | "outputs": [] 1954 | }, 1955 | { 1956 | "metadata": { 1957 | "id": "yvCmN-euQzRu", 1958 | "colab_type": "code", 1959 | "colab": { 1960 | "base_uri": "https://localhost:8080/", 1961 | "height": 219 1962 | }, 1963 | "outputId": "2ea93371-ebab-40f3-c098-5135a322ca3a" 1964 | }, 1965 | "cell_type": "code", 1966 | "source": [ 1967 | "hist = combined_model.fit(train_data, {'label1' : y_train},\n", 1968 | " epochs = epochs, batch_size = batch_size,\n", 1969 | " callbacks = callbacks_list,\n", 1970 | " validation_data = (val_data, {'label1' : y_val})).history" 1971 | ], 1972 | "execution_count": 183, 1973 | "outputs": [ 1974 | { 1975 | "output_type": "stream", 1976 | "text": [ 1977 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", 1978 | "Instructions for updating:\n", 1979 | "Use tf.cast instead.\n", 1980 | "Train on 17898 samples, validate on 4475 samples\n", 1981 | "Epoch 1/16\n", 1982 | "17898/17898 
[==============================] - 78s 4ms/step - loss: 0.2876 - acc: 0.9151 - val_loss: 0.1718 - val_acc: 0.9280\n", 1983 | "Epoch 2/16\n", 1984 | "17898/17898 [==============================] - 76s 4ms/step - loss: 0.2168 - acc: 0.9327 - val_loss: 0.1531 - val_acc: 0.9457\n", 1985 | "Epoch 3/16\n", 1986 | "17898/17898 [==============================] - 77s 4ms/step - loss: 0.1857 - acc: 0.9412 - val_loss: 0.2215 - val_acc: 0.9457\n" 1987 | ], 1988 | "name": "stdout" 1989 | } 1990 | ] 1991 | }, 1992 | { 1993 | "metadata": { 1994 | "id": "509Mzz6MSFPx", 1995 | "colab_type": "code", 1996 | "colab": {} 1997 | }, 1998 | "cell_type": "code", 1999 | "source": [ 2000 | "model1 = load_model('model_multi-feature.h5')" 2001 | ], 2002 | "execution_count": 0, 2003 | "outputs": [] 2004 | }, 2005 | { 2006 | "metadata": { 2007 | "id": "HZSy7KslVbUq", 2008 | "colab_type": "code", 2009 | "colab": {} 2010 | }, 2011 | "cell_type": "code", 2012 | "source": [ 2013 | "pred = model1.predict(test_data)" 2014 | ], 2015 | "execution_count": 0, 2016 | "outputs": [] 2017 | }, 2018 | { 2019 | "metadata": { 2020 | "id": "uN9SsGqdV9df", 2021 | "colab_type": "code", 2022 | "colab": {} 2023 | }, 2024 | "cell_type": "code", 2025 | "source": [ 2026 | "label_list = list(chain.from_iterable(pred))\n", 2027 | "label_predict = [1 if x >= 0.5 else 0 for x in label_list]\n", 2028 | "label_recall = recall_score(label_predict, y_test)\n", 2029 | "label_acc = accuracy_score(label_predict, y_test)" 2030 | ], 2031 | "execution_count": 0, 2032 | "outputs": [] 2033 | }, 2034 | { 2035 | "metadata": { 2036 | "id": "AIe0pYyJXHF8", 2037 | "colab_type": "code", 2038 | "colab": { 2039 | "base_uri": "https://localhost:8080/", 2040 | "height": 54 2041 | }, 2042 | "outputId": "cbc33c75-69cb-4082-a88d-68a02019fbbf" 2043 | }, 2044 | "cell_type": "code", 2045 | "source": [ 2046 | "print(\"ACCURACY : \",label_acc)\n", 2047 | "\n", 2048 | "print(\"Recall : \",label_recall)" 2049 | ], 2050 | "execution_count": 196, 2051 | "outputs": [ 2052 | { 2053 | "output_type": "stream", 2054 | "text": [ 2055 | "ACCURACY : 0.9465011992908541\n", 2056 | "Recall : 0.7821428571428571\n" 2057 | ], 2058 | "name": "stdout" 2059 | } 2060 | ] 2061 | }, 2062 | { 2063 | "metadata": { 2064 | "id": "0dDWyW7BXDcj", 2065 | "colab_type": "code", 2066 | "colab": {} 2067 | }, 2068 | "cell_type": "code", 2069 | "source": [ 2070 | "" 2071 | ], 2072 | "execution_count": 0, 2073 | "outputs": [] 2074 | } 2075 | ] 2076 | } -------------------------------------------------------------------------------- /Project_Synopsis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manishshettym/Offensive-Text-Detection/ed633804a09fa8d6b6c1d252ac5de371e0bdef15/Project_Synopsis.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Offensive Text Detection using NLP 2 | 3 | This project aims to detect offensive ( Racist , Sexist etc ) text from social media posts (tweets) using NLP techniques 4 | and sentiment analysis with the help of Machine Learning / Deep learning models. 5 | 6 | ### Please find the Experience paper: [Experience Paper](https://github.com/ManishShettyM/NLP-Offensive-Text-Detection/blob/master/Experience_Paper.pdf) 7 | 8 | #### References 9 | 10 | Mahmud, A., Ahmed, K.Z. and Khan, M., 2008. Detecting flames and insults in text. 
11 | https://www.researchgate.net/publication/49242911_Detecting_flames_and_insults_in_text 12 | 13 | Kshirsagar, R., Cukuvac, T., McKeown, K. and McGregor, S., 2018. Predictive embeddings for hate speech detection on twitter. arXiv preprint arXiv:1809.10644. 14 | https://aclweb.org/anthology/W18-5104 15 | 16 | Amplayo, R.K. and Occidental, J., 2015. Multi-level classifier for the detection of insults in social media. In Proceedings of 15th Philippine Computing Science Congress. 17 | https://www.researchgate.net/publication/273381302_Multi-level_classifier_for_the_detection_of_insults_in_social_media 18 | 19 | Malmasi, S. and Zampieri, M., 2017. Detecting hate speech in social media. arXiv preprint arXiv:1712.06427. 20 | https://arxiv.org/abs/1712.06427 21 | 22 | Sax, S., 2016. Flame wars: Automatic insult detection. 23 | https://cs224d.stanford.edu/reports/Sax.pdf 24 | 25 | Biere, S., Bhulai, S. and Analytics, M.B., 2018. Hate speech detection using natural language processing techniques. Master Business AnalyticsDepartment of Mathematics Faculty of Science. 26 | https://beta.vu.nl/nl/Images/werkstuk-biere_tcm235-893877.pdf 27 | 28 | Çano, E. and Morisio, M., 2019. Word embeddings for sentiment analysis: a comprehensive empirical survey. arXiv preprint arXiv:1902.00753. 29 | https://www.academia.edu/38464940/Word_Embeddings_for_Sentiment_Analysis_A_Comprehensive_Empirical_Survey 30 | -------------------------------------------------------------------------------- /Reference/.ipynb_checkpoints/Naive Bayes-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 62, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Import libraries\n", 10 | "import pandas as pd\n", 11 | "from sklearn.model_selection import train_test_split\n", 12 | "from sklearn.feature_extraction.text import CountVectorizer\n", 13 | "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n", 14 | "import pandas as pd\n", 15 | "import numpy as np" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 63, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "Index(['id', 'label', 'tweet'], dtype='object')" 27 | ] 28 | }, 29 | "execution_count": 63, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "df = pd.read_csv('../Dataset/train_E6oV3lV.csv')\n", 36 | "df.columns" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 64, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "from sklearn.utils import shuffle\n", 46 | "df = shuffle(df)\n", 47 | "\n", 48 | "train, test = train_test_split(df, test_size=0.)\n", 49 | "X_train = train[\"tweet\"]\n", 50 | "X_test = test[\"tweet\"]\n", 51 | "y_train = train[\"label\"]\n", 52 | "y_test = test[\"label\"]" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 65, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# Instantiate the CountVectorizer method\n", 62 | "count_vector = CountVectorizer(stop_words = 'english')\n", 63 | "\n", 64 | "# Fit the training data and then return the matrix\n", 65 | "training_data = count_vector.fit_transform(X_train)\n", 66 | "\n", 67 | "# Transform testing data and return the matrix. 
Note we are not fitting the testing data into the CountVectorizer()\n", 68 | "testing_data = count_vector.transform(X_test)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 66, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "ename": "ValueError", 78 | "evalue": "could not convert string to float: \"@user @user yep!! back to #sameolestory some black films @user don't deserve any marketing @user \"", 79 | "output_type": "error", 80 | "traceback": [ 81 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 82 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 83 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnaive_bayes\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mMultinomialNB\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mnaive_bayes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mMultinomialNB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mnaive_bayes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mpredictions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnaive_bayes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 84 | "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/naive_bayes.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 577\u001b[0m \u001b[0mReturns\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 578\u001b[0m \"\"\"\n\u001b[0;32m--> 579\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'csr'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 580\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_features\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 581\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 85 | "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m 571\u001b[0m X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,\n\u001b[1;32m 572\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mallow_nd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mensure_min_samples\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 573\u001b[0;31m ensure_min_features, warn_on_dtype, estimator)\n\u001b[0m\u001b[1;32m 574\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmulti_output\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 575\u001b[0m y = check_array(y, 'csr', 
force_all_finite=True, ensure_2d=False,\n", 86 | "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m 431\u001b[0m force_all_finite)\n\u001b[1;32m 432\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 433\u001b[0;31m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 434\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 435\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 87 | "\u001b[0;31mValueError\u001b[0m: could not convert string to float: \"@user @user yep!! back to #sameolestory some black films @user don't deserve any marketing @user \"" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "from sklearn.naive_bayes import MultinomialNB\n", 93 | "naive_bayes = MultinomialNB()\n", 94 | "naive_bayes.fit(training_data, y_train)\n", 95 | "predictions = naive_bayes.predict(testing_data)\n", 96 | "\n", 97 | "print('Accuracy score: ', format(accuracy_score(y_test, predictions)))\n", 98 | "print('Precision score: ', format(precision_score(y_test, predictions)))\n", 99 | "print('Recall score: ', format(recall_score(y_test, predictions)))\n", 100 | "print('F1 score: ', format(f1_score(y_test, predictions)))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [] 116 | } 117 | ], 118 | "metadata": { 119 | "kernelspec": { 120 | "display_name": "Python 3", 121 | "language": "python", 122 | "name": "python3" 123 | }, 124 | "language_info": { 125 | "codemirror_mode": { 126 | "name": "ipython", 127 | "version": 3 128 | }, 129 | "file_extension": ".py", 130 | "mimetype": "text/x-python", 131 | "name": "python", 132 | "nbconvert_exporter": "python", 133 | "pygments_lexer": "ipython3", 134 | "version": "3.5.2" 135 | } 136 | }, 137 | "nbformat": 4, 138 | "nbformat_minor": 2 139 | } 140 | --------------------------------------------------------------------------------
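Finally, two remarks on the reference checkpoint notebook above, whose `ValueError` output is stale relative to its saved source. The traceback was produced by an earlier run that fitted the classifier on the raw text (`naive_bayes.fit(X_train, y_train)` with `X_train` being tweet strings — hence "could not convert string to float"), whereas the saved cell already fits on the vectorised `training_data`, so the string-to-float failure comes from that stale run rather than the code as saved. The split cell's `test_size=0.` also looks like a truncated value; presumably a non-zero fraction was intended, for example:

```python
# Illustrative only: a conventional hold-out instead of test_size=0.
train, test = train_test_split(df, test_size=0.25, random_state=1)
```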