├── Baselines └── Naive Bayes.ipynb ├── CNN ├── .ipynb_checkpoints │ ├── cnn_keras-checkpoint.ipynb │ └── cnn_nlp_nb-checkpoint.ipynb ├── cnn-keras.ipynb └── cnn_nlp_nb.ipynb ├── Dataset └── train_E6oV3lV.csv ├── Experience_Paper.pdf ├── GRU └── gru.ipynb ├── Project_Synopsis.pdf ├── README.md └── Reference ├── .ipynb_checkpoints └── Naive Bayes-checkpoint.ipynb ├── test_predictions.csv ├── test_tweets.csv └── train.csv /Baselines/Naive Bayes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 73, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Import libraries\n", 10 | "import pandas as pd\n", 11 | "from sklearn.model_selection import train_test_split\n", 12 | "from sklearn.feature_extraction.text import CountVectorizer\n", 13 | "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n", 14 | "import pandas as pd\n", 15 | "import numpy as np" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 74, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "Index(['id', 'label', 'tweet'], dtype='object')" 27 | ] 28 | }, 29 | "execution_count": 74, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "df = pd.read_csv('../Dataset/train_E6oV3lV.csv')\n", 36 | "df.columns" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 75, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "from sklearn.utils import shuffle\n", 46 | "df = shuffle(df)\n", 47 | "\n", 48 | "X_train, X_test, y_train, y_test = train_test_split(df['tweet'], \n", 49 | " df['label'], \n", 50 | " random_state=1)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 76, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Instantiate the CountVectorizer method\n", 60 | "count_vector = CountVectorizer(stop_words = 'english')\n", 61 | "\n", 62 | "# Fit the training data and then return the matrix\n", 63 | "training_data = count_vector.fit_transform(X_train)\n", 64 | "\n", 65 | "# Transform testing data and return the matrix. 
Note we are not fitting the testing data into the CountVectorizer()\n", 66 | "testing_data = count_vector.transform(X_test)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 77, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "Accuracy score: 0.9564510073833062\n", 79 | "Precision score: 0.864406779661017\n", 80 | "Recall score: 0.45293072824156305\n", 81 | "F1 score: 0.5944055944055944\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "from sklearn.naive_bayes import MultinomialNB\n", 87 | "naive_bayes = MultinomialNB()\n", 88 | "naive_bayes.fit(training_data, y_train)\n", 89 | "predictions = naive_bayes.predict(testing_data)\n", 90 | "\n", 91 | "print('Accuracy score: ', format(accuracy_score(y_test, predictions)))\n", 92 | "print('Precision score: ', format(precision_score(y_test, predictions)))\n", 93 | "print('Recall score: ', format(recall_score(y_test, predictions)))\n", 94 | "print('F1 score: ', format(f1_score(y_test, predictions)))" 95 | ] 96 | }], 97 | "metadata": { 98 | "kernelspec": { 99 | "display_name": "Python 3", 100 | "language": "python", 101 | "name": "python3" 102 | }, 103 | "language_info": { 104 | "codemirror_mode": { 105 | "name": "ipython", 106 | "version": 3 107 | }, 108 | "file_extension": ".py", 109 | "mimetype": "text/x-python", 110 | "name": "python", 111 | "nbconvert_exporter": "python", 112 | "pygments_lexer": "ipython3", 113 | "version": "3.5.2" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 2 118 | } 119 | -------------------------------------------------------------------------------- /CNN/.ipynb_checkpoints/cnn_keras-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "['test_tweets_anuFYb8.csv', 'train_E6oV3lV.csv']\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "# This Python 3 environment comes with many helpful analytics libraries installed\n", 21 | "# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n", 22 | "# For example, here's several helpful packages to load in \n", 23 | "\n", 24 | "import numpy as np # linear algebra\n", 25 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 26 | "\n", 27 | "# Input data files are available in the \"../input/\" directory.\n", 28 | "# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\n", 29 | "\n", 30 | "import os\n", 31 | "print(os.listdir(\"../input\"))\n", 32 | "\n", 33 | "# Any results you write to the current directory are saved as output." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "import numpy as np\n", 45 | "import pandas as pd" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 34, 51 | "metadata": { 52 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 53 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 54 | }, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | " id ... tweet\n", 61 | "0 1 ... 
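The scikit-learn baseline in Baselines/Naive Bayes.ipynb above reports 95.6% accuracy but only 0.45 recall on the hate-speech class, so plain accuracy is flattered by the class imbalance in train_E6oV3lV.csv. For comparison, a minimal sketch of the same CountVectorizer + MultinomialNB baseline wrapped in a Pipeline with a per-class report; the stratified split and the classification_report call are additions here, not part of the notebook.

# Sketch only: same bag-of-words + MultinomialNB baseline as the notebook above,
# with a stratified split and a per-class precision/recall/F1 report.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

df = pd.read_csv('../Dataset/train_E6oV3lV.csv')
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'], df['label'], random_state=1, stratify=df['label'])

clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                ('nb', MultinomialNB())])
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))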
@user when a father is dysfunctional and is s...\n", 62 | "1 2 ... @user @user thanks for #lyft credit i can't us...\n", 63 | "2 3 ... bihday your majesty\n", 64 | "3 4 ... #model i love u take with u all the time in ...\n", 65 | "4 5 ... factsguide: society now #motivation\n", 66 | "\n", 67 | "[5 rows x 3 columns]\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "df = pd.read_csv('../input/train_E6oV3lV.csv')\n", 73 | "df['label'] = df['label'].map({0: 2, 1: 1})\n", 74 | "print(df.head())\n", 75 | "# print(df['label'].head())\n", 76 | "df = df.drop('id', axis=1)\n", 77 | "\n", 78 | "# print(df[:100])\n", 79 | "# zero = 0\n", 80 | "# one = 0\n", 81 | "# for i in df['label']:\n", 82 | "# if(i==0):\n", 83 | "# zero+=1\n", 84 | "# else:\n", 85 | "# one+=1\n", 86 | " \n", 87 | "# print(zero,one)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 35, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "from keras.preprocessing.text import Tokenizer\n", 99 | "from keras.preprocessing.sequence import pad_sequences\n", 100 | "from sklearn.model_selection import train_test_split\n", 101 | "import copy" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 36, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "25569\n", 114 | "6393\n", 115 | " label tweet\n", 116 | "20529 2 @user “be with those who help your being.â€...\n", 117 | "17833 2 @user a comic to make you #smile and laugh. \n", 118 | "27050 2 lmao @ guys liking random girls pictures like ...\n", 119 | "17683 2 #goodmorning ☀️🔥😃 #neymar #playing ð...\n", 120 | "25301 2 happy at work c.onference: right mindset leads...\n", 121 | "\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "\n", 127 | "train, test = train_test_split(df, test_size=0.2)\n", 128 | "print(len(train))\n", 129 | "print(len(test))\n", 130 | "print(train.head())\n", 131 | "\n", 132 | "print(type(train))" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 37, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "train_df = copy.deepcopy(train)\n", 144 | "test_df = copy.deepcopy(test)\n", 145 | "\n", 146 | "# concatenate column 1 and column 2 as one text\n", 147 | "# print(train_df[1])\n", 148 | "\n", 149 | "# convert string to lower case\n", 150 | "train_texts = train_df['tweet'].values\n", 151 | "train_texts = [s.lower() for s in train_texts]\n", 152 | "\n", 153 | "test_texts = test_df['tweet'].values\n", 154 | "test_texts = [s.lower() for s in test_texts]\n", 155 | "\n", 156 | "# print(train_texts.head())\n", 157 | "# print(train_texts)\n" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 38, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "\n", 169 | "# =======================Convert string to index================\n", 170 | "# Tokenizer\n", 171 | "tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')\n", 172 | "tk.fit_on_texts(train_texts)\n", 173 | "# If we already have a character list, then replace the tk.word_index\n", 174 | "# If not, just skip below part\n", 175 | "\n", 176 | "# -----------------------Skip part start--------------------------\n", 177 | "# construct a new vocabulary\n", 178 | "alphabet = \"abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\\\"/\\\\|_@#$%^&*~`+-=<>()[]{}\"\n", 179 | "char_dict = {}\n", 180 | "for i, char in 
enumerate(alphabet):\n", 181 | " char_dict[char] = i + 1\n", 182 | "\n", 183 | "# Use char_dict to replace the tk.word_index\n", 184 | "tk.word_index = char_dict.copy()\n", 185 | "# Add 'UNK' to the vocabulary\n", 186 | "tk.word_index[tk.oov_token] = max(char_dict.values()) + 1\n", 187 | "# -----------------------Skip part end----------------------------\n", 188 | "\n", 189 | "# Convert string to index\n", 190 | "train_sequences = tk.texts_to_sequences(train_texts)\n", 191 | "test_texts = tk.texts_to_sequences(test_texts)\n", 192 | "\n", 193 | "# Padding\n", 194 | "train_data = pad_sequences(train_sequences, maxlen=1014, padding='post')\n", 195 | "test_data = pad_sequences(test_texts, maxlen=1014, padding='post')\n", 196 | "\n", 197 | "# Convert to numpy array\n", 198 | "train_data = np.array(train_data, dtype='float32')\n", 199 | "test_data = np.array(test_data, dtype='float32')\n", 200 | "\n", 201 | "# =======================Get classes================\n", 202 | "train_classes = train_df['label'].values\n", 203 | "train_class_list = [x - 1 for x in train_classes]\n", 204 | "\n", 205 | "test_classes = test_df['label'].values\n", 206 | "test_class_list = [x - 1 for x in test_classes]\n", 207 | "\n", 208 | "from keras.utils import to_categorical\n", 209 | "\n", 210 | "train_classes = to_categorical(train_class_list)\n", 211 | "test_classes = to_categorical(test_class_list)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 39, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "6393\n", 224 | "{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '0': 27, '1': 28, '2': 29, '3': 30, '4': 31, '5': 32, '6': 33, '7': 34, '8': 35, '9': 36, ' ': 37, ',': 38, ';': 39, '.': 40, '!': 41, '?': 42, ':': 43, \"'\": 44, '\"': 45, '/': 46, '\\\\': 47, '|': 48, '_': 49, '@': 50, '#': 51, '$': 52, '%': 53, '^': 54, '&': 55, '*': 56, '~': 57, '`': 58, '+': 59, '-': 60, '=': 61, '<': 62, '>': 63, '(': 64, ')': 65, '[': 66, ']': 67, '{': 68, '}': 69, 'UNK': 70}\n" 225 | ] 226 | } 227 | ], 228 | "source": [ 229 | "print(len(test_classes))\n", 230 | "print(tk.word_index)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 40, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "70" 242 | ] 243 | }, 244 | "execution_count": 40, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "\n", 251 | "\n", 252 | "vocab_size = len(tk.word_index)\n", 253 | "vocab_size\n", 254 | "\n" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 41, 260 | "metadata": { 261 | "collapsed": true 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "embedding_weights = [] #(71, 70)\n", 266 | "embedding_weights.append(np.zeros(vocab_size)) # first row is pad\n", 267 | "\n", 268 | "for char, i in tk.word_index.items(): # from index 1 to 70\n", 269 | " onehot = np.zeros(vocab_size)\n", 270 | " onehot[i-1] = 1\n", 271 | " embedding_weights.append(onehot)\n", 272 | "embedding_weights = np.array(embedding_weights)\n" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 42, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "name": "stdout", 282 | "output_type": "stream", 283 | "text": [ 284 | "(71, 70)\n" 285 
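The (71, 70) table printed here is the hand-built one-hot lookup that later seeds the Keras Embedding layer: row 0 is the all-zero padding vector, and row i (for i = 1..70) is the one-hot vector of the character mapped to index i in tk.word_index. A small sanity check, assuming the variables defined in the cells above:

# Sketch only: verify the structure of the one-hot character embedding table.
assert embedding_weights.shape == (vocab_size + 1, vocab_size)   # (71, 70)
assert not embedding_weights[0].any()                            # PAD row is all zeros
assert (embedding_weights[1:].argmax(axis=1) == np.arange(vocab_size)).all()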
| ] 286 | }, 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "array([[0., 0., 0., ..., 0., 0., 0.],\n", 291 | " [1., 0., 0., ..., 0., 0., 0.],\n", 292 | " [0., 1., 0., ..., 0., 0., 0.],\n", 293 | " ...,\n", 294 | " [0., 0., 0., ..., 1., 0., 0.],\n", 295 | " [0., 0., 0., ..., 0., 1., 0.],\n", 296 | " [0., 0., 0., ..., 0., 0., 1.]])" 297 | ] 298 | }, 299 | "execution_count": 42, 300 | "metadata": {}, 301 | "output_type": "execute_result" 302 | } 303 | ], 304 | "source": [ 305 | "print(embedding_weights.shape) # first row all 0 for PAD, 69 char, last row for UNK\n", 306 | "embedding_weights" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 43, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "from keras.layers import Input, Embedding, Activation, Flatten, Dense\n", 318 | "from keras.layers import Conv1D, MaxPooling1D, Dropout\n", 319 | "from keras.models import Model" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 44, 325 | "metadata": { 326 | "collapsed": true 327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "# parameter \n", 331 | "input_size = 1014\n", 332 | "# vocab_size = 69\n", 333 | "embedding_size = 70\n", 334 | "conv_layers = [[256, 7, 3], \n", 335 | " [256, 7, 3], \n", 336 | " [256, 3, -1], \n", 337 | " [256, 3, -1], \n", 338 | " [256, 3, -1], \n", 339 | " [256, 3, 3]]\n", 340 | "\n", 341 | "fully_connected_layers = [1024, 1024]\n", 342 | "num_of_classes = 2\n", 343 | "dropout_p = 0.5\n", 344 | "optimizer = 'adam'\n", 345 | "loss = 'categorical_crossentropy'" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 45, 351 | "metadata": { 352 | "collapsed": true 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "# Embedding layer Initialization\n", 357 | "embedding_layer = Embedding(vocab_size+1, \n", 358 | " embedding_size,\n", 359 | " input_length=input_size,\n", 360 | " weights=[embedding_weights])" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 46, 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "name": "stdout", 370 | "output_type": "stream", 371 | "text": [ 372 | "_________________________________________________________________\n", 373 | "Layer (type) Output Shape Param # \n", 374 | "=================================================================\n", 375 | "input (InputLayer) (None, 1014) 0 \n", 376 | "_________________________________________________________________\n", 377 | "embedding_4 (Embedding) (None, 1014, 70) 4970 \n", 378 | "_________________________________________________________________\n", 379 | "conv1d_13 (Conv1D) (None, 1008, 256) 125696 \n", 380 | "_________________________________________________________________\n", 381 | "activation_13 (Activation) (None, 1008, 256) 0 \n", 382 | "_________________________________________________________________\n", 383 | "max_pooling1d_7 (MaxPooling1 (None, 336, 256) 0 \n", 384 | "_________________________________________________________________\n", 385 | "conv1d_14 (Conv1D) (None, 330, 256) 459008 \n", 386 | "_________________________________________________________________\n", 387 | "activation_14 (Activation) (None, 330, 256) 0 \n", 388 | "_________________________________________________________________\n", 389 | "max_pooling1d_8 (MaxPooling1 (None, 110, 256) 0 \n", 390 | "_________________________________________________________________\n", 391 | "conv1d_15 (Conv1D) (None, 108, 256) 196864 \n", 392 | 
"_________________________________________________________________\n", 393 | "activation_15 (Activation) (None, 108, 256) 0 \n", 394 | "_________________________________________________________________\n", 395 | "conv1d_16 (Conv1D) (None, 106, 256) 196864 \n", 396 | "_________________________________________________________________\n", 397 | "activation_16 (Activation) (None, 106, 256) 0 \n", 398 | "_________________________________________________________________\n", 399 | "conv1d_17 (Conv1D) (None, 104, 256) 196864 \n", 400 | "_________________________________________________________________\n", 401 | "activation_17 (Activation) (None, 104, 256) 0 \n", 402 | "_________________________________________________________________\n", 403 | "conv1d_18 (Conv1D) (None, 102, 256) 196864 \n", 404 | "_________________________________________________________________\n", 405 | "activation_18 (Activation) (None, 102, 256) 0 \n", 406 | "_________________________________________________________________\n", 407 | "max_pooling1d_9 (MaxPooling1 (None, 34, 256) 0 \n", 408 | "_________________________________________________________________\n", 409 | "flatten_3 (Flatten) (None, 8704) 0 \n", 410 | "_________________________________________________________________\n", 411 | "dense_7 (Dense) (None, 1024) 8913920 \n", 412 | "_________________________________________________________________\n", 413 | "dropout_5 (Dropout) (None, 1024) 0 \n", 414 | "_________________________________________________________________\n", 415 | "dense_8 (Dense) (None, 1024) 1049600 \n", 416 | "_________________________________________________________________\n", 417 | "dropout_6 (Dropout) (None, 1024) 0 \n", 418 | "_________________________________________________________________\n", 419 | "dense_9 (Dense) (None, 2) 2050 \n", 420 | "=================================================================\n", 421 | "Total params: 11,342,700\n", 422 | "Trainable params: 11,342,700\n", 423 | "Non-trainable params: 0\n", 424 | "_________________________________________________________________\n" 425 | ] 426 | } 427 | ], 428 | "source": [ 429 | "# Model \n", 430 | "\n", 431 | "# Input\n", 432 | "inputs = Input(shape=(input_size,), name='input', dtype='int64') # shape=(?, 1014)\n", 433 | "# Embedding \n", 434 | "x = embedding_layer(inputs)\n", 435 | "# Conv \n", 436 | "for filter_num, filter_size, pooling_size in conv_layers:\n", 437 | " x = Conv1D(filter_num, filter_size)(x) \n", 438 | " x = Activation('relu')(x)\n", 439 | " if pooling_size != -1:\n", 440 | " x = MaxPooling1D(pool_size=pooling_size)(x) # Final shape=(None, 34, 256)\n", 441 | "x = Flatten()(x) # (None, 8704)\n", 442 | "# Fully connected layers \n", 443 | "for dense_size in fully_connected_layers:\n", 444 | " x = Dense(dense_size, activation='relu')(x) # dense_size == 1024\n", 445 | " x = Dropout(dropout_p)(x)\n", 446 | "# Output Layer\n", 447 | "predictions = Dense(num_of_classes, activation='softmax')(x)\n", 448 | "# Build model\n", 449 | "model = Model(inputs=inputs, outputs=predictions)\n", 450 | "model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) # Adam, categorical_crossentropy\n", 451 | "model.summary()" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 49, 457 | "metadata": { 458 | "collapsed": true 459 | }, 460 | "outputs": [], 461 | "source": [ 462 | "# 1000 training samples and 100 testing samples\n", 463 | "indices = np.arange(train_data.shape[0])\n", 464 | "np.random.shuffle(indices)\n", 465 | "\n", 466 | "x_train = 
train_data[indices]\n", 467 | "y_train = train_classes[indices]\n", 468 | "\n", 469 | "x_test = test_data\n", 470 | "y_test = test_classes" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 50, 476 | "metadata": {}, 477 | "outputs": [ 478 | { 479 | "name": "stdout", 480 | "output_type": "stream", 481 | "text": [ 482 | "Train on 25569 samples, validate on 6393 samples\n", 483 | "Epoch 1/10\n", 484 | " - 13s - loss: 0.1876 - acc: 0.9315 - val_loss: 0.1360 - val_acc: 0.9496\n", 485 | "Epoch 2/10\n", 486 | " - 13s - loss: 0.1238 - acc: 0.9555 - val_loss: 0.1172 - val_acc: 0.9571\n", 487 | "Epoch 3/10\n", 488 | " - 13s - loss: 0.0919 - acc: 0.9678 - val_loss: 0.1198 - val_acc: 0.9546\n", 489 | "Epoch 4/10\n", 490 | " - 13s - loss: 0.0641 - acc: 0.9774 - val_loss: 0.1294 - val_acc: 0.9537\n", 491 | "Epoch 5/10\n", 492 | " - 13s - loss: 0.0593 - acc: 0.9795 - val_loss: 0.1109 - val_acc: 0.9629\n", 493 | "Epoch 6/10\n", 494 | " - 13s - loss: 0.0362 - acc: 0.9870 - val_loss: 0.1243 - val_acc: 0.9611\n", 495 | "Epoch 7/10\n", 496 | " - 13s - loss: 0.0374 - acc: 0.9867 - val_loss: 0.1432 - val_acc: 0.9604\n", 497 | "Epoch 8/10\n", 498 | " - 13s - loss: 0.0213 - acc: 0.9923 - val_loss: 0.1498 - val_acc: 0.9620\n", 499 | "Epoch 9/10\n", 500 | " - 13s - loss: 0.0153 - acc: 0.9947 - val_loss: 0.1718 - val_acc: 0.9617\n", 501 | "Epoch 10/10\n", 502 | " - 13s - loss: 0.0126 - acc: 0.9960 - val_loss: 0.3250 - val_acc: 0.9565\n" 503 | ] 504 | }, 505 | { 506 | "data": { 507 | "text/plain": [ 508 | "" 509 | ] 510 | }, 511 | "execution_count": 50, 512 | "metadata": {}, 513 | "output_type": "execute_result" 514 | } 515 | ], 516 | "source": [ 517 | "\n", 518 | "\n", 519 | "# Training\n", 520 | "model.fit(x_train, y_train,\n", 521 | " validation_data=(x_test, y_test),\n", 522 | " batch_size=128,\n", 523 | " epochs=10,\n", 524 | " verbose=2)\n", 525 | "\n" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "metadata": { 532 | "collapsed": true 533 | }, 534 | "outputs": [], 535 | "source": [] 536 | } 537 | ], 538 | "metadata": { 539 | "kernelspec": { 540 | "display_name": "Python 3", 541 | "language": "python", 542 | "name": "python3" 543 | }, 544 | "language_info": { 545 | "codemirror_mode": { 546 | "name": "ipython", 547 | "version": 3 548 | }, 549 | "file_extension": ".py", 550 | "mimetype": "text/x-python", 551 | "name": "python", 552 | "nbconvert_exporter": "python", 553 | "pygments_lexer": "ipython3", 554 | "version": "3.6.8" 555 | } 556 | }, 557 | "nbformat": 4, 558 | "nbformat_minor": 1 559 | } 560 | -------------------------------------------------------------------------------- /CNN/.ipynb_checkpoints/cnn_nlp_nb-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "['test_tweets_anuFYb8.csv', 'train_E6oV3lV.csv']\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "# This Python 3 environment comes with many helpful analytics libraries installed\n", 21 | "# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n", 22 | "# For example, here's several helpful packages to load in \n", 23 | "\n", 24 | "import numpy as np # linear algebra\n", 25 | "import pandas 
as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 26 | "\n", 27 | "# Input data files are available in the \"../input/\" directory.\n", 28 | "# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\n", 29 | "\n", 30 | "import os\n", 31 | "print(os.listdir(\"../input\"))\n", 32 | "\n", 33 | "# Any results you write to the current directory are saved as output.\n", 34 | "\n", 35 | "# import keras\n", 36 | "# import tensorflow\n", 37 | "\n", 38 | "# print(keras.__version__)\n", 39 | "# print(tensorflow.__version__)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import numpy as np\n", 49 | "import pandas as pd" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": { 56 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 57 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 58 | }, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | " id ... tweet\n", 65 | "0 1 ... @user when a father is dysfunctional and is s...\n", 66 | "1 2 ... @user @user thanks for #lyft credit i can't us...\n", 67 | "2 3 ... bihday your majesty\n", 68 | "3 4 ... #model i love u take with u all the time in ...\n", 69 | "4 5 ... factsguide: society now #motivation\n", 70 | "\n", 71 | "[5 rows x 3 columns]\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "df = pd.read_csv('../input/train_E6oV3lV.csv')\n", 77 | "df['label'] = df['label'].map({0: 2, 1: 1})\n", 78 | "print(df.head())\n", 79 | "# print(df['label'].head())\n", 80 | "df = df.drop('id', axis=1)\n", 81 | "\n", 82 | "# print(df[:100])\n", 83 | "# zero = 0\n", 84 | "# one = 0\n", 85 | "# for i in df['label']:\n", 86 | "# if(i==0):\n", 87 | "# zero+=1\n", 88 | "# else:\n", 89 | "# one+=1\n", 90 | " \n", 91 | "# print(zero,one)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "name": "stderr", 101 | "output_type": "stream", 102 | "text": [ 103 | "Using TensorFlow backend.\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "from keras.preprocessing.text import Tokenizer\n", 109 | "from keras.preprocessing.sequence import pad_sequences\n", 110 | "from sklearn.model_selection import train_test_split\n", 111 | "import copy" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 5, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "25569\n", 124 | "6393\n", 125 | " label tweet\n", 126 | "11704 2 @user #diwali !!! #besteduonline !!! contac...\n", 127 | "6885 2 @user i was at this trump rally last night. ma...\n", 128 | "744 2 i don't know where you're going but do you hav...\n", 129 | "13467 2 watch on #periscope: the reading of victims na...\n", 130 | "31250 2 9 months later.. 
i am back to my pre pregnancy...\n", 131 | "\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "\n", 137 | "train, test = train_test_split(df, test_size=0.2)\n", 138 | "print(len(train))\n", 139 | "print(len(test))\n", 140 | "print(train.head())\n", 141 | "\n", 142 | "print(type(train))" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "train_df = copy.deepcopy(train)\n", 152 | "test_df = copy.deepcopy(test)\n", 153 | "\n", 154 | "# concatenate column 1 and column 2 as one text\n", 155 | "# print(train_df[1])\n", 156 | "\n", 157 | "# convert string to lower case\n", 158 | "train_texts = train_df['tweet'].values\n", 159 | "train_texts = [s.lower() for s in train_texts]\n", 160 | "\n", 161 | "test_texts = test_df['tweet'].values\n", 162 | "test_texts = [s.lower() for s in test_texts]\n", 163 | "\n", 164 | "# print(train_texts.head())\n", 165 | "# print(train_texts)\n" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 7, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "\n", 175 | "# =======================Convert string to index================\n", 176 | "# Tokenizer\n", 177 | "tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')\n", 178 | "tk.fit_on_texts(train_texts)\n", 179 | "# If we already have a character list, then replace the tk.word_index\n", 180 | "# If not, just skip below part\n", 181 | "\n", 182 | "# -----------------------Skip part start--------------------------\n", 183 | "# construct a new vocabulary\n", 184 | "alphabet = \"abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\\\"/\\\\|_@#$%^&*~`+-=<>()[]{}\"\n", 185 | "char_dict = {}\n", 186 | "for i, char in enumerate(alphabet):\n", 187 | " char_dict[char] = i + 1\n", 188 | "\n", 189 | "# Use char_dict to replace the tk.word_index\n", 190 | "tk.word_index = char_dict.copy()\n", 191 | "# Add 'UNK' to the vocabulary\n", 192 | "tk.word_index[tk.oov_token] = max(char_dict.values()) + 1\n", 193 | "# -----------------------Skip part end----------------------------\n", 194 | "\n", 195 | "# Convert string to index\n", 196 | "train_sequences = tk.texts_to_sequences(train_texts)\n", 197 | "test_texts = tk.texts_to_sequences(test_texts)\n", 198 | "\n", 199 | "# Padding\n", 200 | "train_data = pad_sequences(train_sequences, maxlen=1014, padding='post')\n", 201 | "test_data = pad_sequences(test_texts, maxlen=1014, padding='post')\n", 202 | "\n", 203 | "# Convert to numpy array\n", 204 | "train_data = np.array(train_data, dtype='float32')\n", 205 | "test_data = np.array(test_data, dtype='float32')\n", 206 | "\n", 207 | "# =======================Get classes================\n", 208 | "train_classes = train_df['label'].values\n", 209 | "train_class_list = [x - 1 for x in train_classes]\n", 210 | "\n", 211 | "test_classes = test_df['label'].values\n", 212 | "test_class_list = [x - 1 for x in test_classes]\n", 213 | "\n", 214 | "from keras.utils import to_categorical\n", 215 | "\n", 216 | "train_classes = to_categorical(train_class_list)\n", 217 | "test_classes = to_categorical(test_class_list)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 8, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | "6393\n", 230 | "{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 
22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '0': 27, '1': 28, '2': 29, '3': 30, '4': 31, '5': 32, '6': 33, '7': 34, '8': 35, '9': 36, ' ': 37, ',': 38, ';': 39, '.': 40, '!': 41, '?': 42, ':': 43, \"'\": 44, '\"': 45, '/': 46, '\\\\': 47, '|': 48, '_': 49, '@': 50, '#': 51, '$': 52, '%': 53, '^': 54, '&': 55, '*': 56, '~': 57, '`': 58, '+': 59, '-': 60, '=': 61, '<': 62, '>': 63, '(': 64, ')': 65, '[': 66, ']': 67, '{': 68, '}': 69, 'UNK': 70}\n" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "print(len(test_classes))\n", 236 | "print(tk.word_index)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 9, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "70" 248 | ] 249 | }, 250 | "execution_count": 9, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "\n", 257 | "\n", 258 | "vocab_size = len(tk.word_index)\n", 259 | "vocab_size\n", 260 | "\n" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 10, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "embedding_weights = [] #(71, 70)\n", 270 | "embedding_weights.append(np.zeros(vocab_size)) # first row is pad\n", 271 | "\n", 272 | "for char, i in tk.word_index.items(): # from index 1 to 70\n", 273 | " onehot = np.zeros(vocab_size)\n", 274 | " onehot[i-1] = 1\n", 275 | " embedding_weights.append(onehot)\n", 276 | "embedding_weights = np.array(embedding_weights)\n" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 11, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "(71, 70)\n" 289 | ] 290 | }, 291 | { 292 | "data": { 293 | "text/plain": [ 294 | "array([[0., 0., 0., ..., 0., 0., 0.],\n", 295 | " [1., 0., 0., ..., 0., 0., 0.],\n", 296 | " [0., 1., 0., ..., 0., 0., 0.],\n", 297 | " ...,\n", 298 | " [0., 0., 0., ..., 1., 0., 0.],\n", 299 | " [0., 0., 0., ..., 0., 1., 0.],\n", 300 | " [0., 0., 0., ..., 0., 0., 1.]])" 301 | ] 302 | }, 303 | "execution_count": 11, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "print(embedding_weights.shape) # first row all 0 for PAD, 69 char, last row for UNK\n", 310 | "embedding_weights" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 12, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "from keras.layers import Input, Embedding, Activation, Flatten, Dense\n", 320 | "from keras.layers import Conv1D, MaxPooling1D, Dropout\n", 321 | "from keras.models import Model" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 13, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# parameter \n", 331 | "input_size = 1014\n", 332 | "# vocab_size = 69\n", 333 | "embedding_size = 70\n", 334 | "conv_layers = [[256, 7, 3], \n", 335 | " [256, 7, 3], \n", 336 | " [256, 3, -1], \n", 337 | " [256, 3, -1], \n", 338 | " [256, 3, -1], \n", 339 | " [256, 3, 3]]\n", 340 | "\n", 341 | "fully_connected_layers = [1024, 1024]\n", 342 | "num_of_classes = 2\n", 343 | "dropout_p = 0.5\n", 344 | "optimizer = 'adam'\n", 345 | "loss = 'categorical_crossentropy'" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 14, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "# Embedding layer Initialization\n", 355 | "embedding_layer = Embedding(vocab_size+1, \n", 356 | " embedding_size,\n", 357 | " 
input_length=input_size,\n", 358 | " weights=[embedding_weights])" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 15, 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", 371 | "Instructions for updating:\n", 372 | "Colocations handled automatically by placer.\n", 373 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", 374 | "Instructions for updating:\n", 375 | "Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.\n", 376 | "_________________________________________________________________\n", 377 | "Layer (type) Output Shape Param # \n", 378 | "=================================================================\n", 379 | "input (InputLayer) (None, 1014) 0 \n", 380 | "_________________________________________________________________\n", 381 | "embedding_1 (Embedding) (None, 1014, 70) 4970 \n", 382 | "_________________________________________________________________\n", 383 | "conv1d_1 (Conv1D) (None, 1008, 256) 125696 \n", 384 | "_________________________________________________________________\n", 385 | "activation_1 (Activation) (None, 1008, 256) 0 \n", 386 | "_________________________________________________________________\n", 387 | "max_pooling1d_1 (MaxPooling1 (None, 336, 256) 0 \n", 388 | "_________________________________________________________________\n", 389 | "conv1d_2 (Conv1D) (None, 330, 256) 459008 \n", 390 | "_________________________________________________________________\n", 391 | "activation_2 (Activation) (None, 330, 256) 0 \n", 392 | "_________________________________________________________________\n", 393 | "max_pooling1d_2 (MaxPooling1 (None, 110, 256) 0 \n", 394 | "_________________________________________________________________\n", 395 | "conv1d_3 (Conv1D) (None, 108, 256) 196864 \n", 396 | "_________________________________________________________________\n", 397 | "activation_3 (Activation) (None, 108, 256) 0 \n", 398 | "_________________________________________________________________\n", 399 | "conv1d_4 (Conv1D) (None, 106, 256) 196864 \n", 400 | "_________________________________________________________________\n", 401 | "activation_4 (Activation) (None, 106, 256) 0 \n", 402 | "_________________________________________________________________\n", 403 | "conv1d_5 (Conv1D) (None, 104, 256) 196864 \n", 404 | "_________________________________________________________________\n", 405 | "activation_5 (Activation) (None, 104, 256) 0 \n", 406 | "_________________________________________________________________\n", 407 | "conv1d_6 (Conv1D) (None, 102, 256) 196864 \n", 408 | "_________________________________________________________________\n", 409 | "activation_6 (Activation) (None, 102, 256) 0 \n", 410 | "_________________________________________________________________\n", 411 | "max_pooling1d_3 (MaxPooling1 (None, 34, 256) 0 \n", 412 | "_________________________________________________________________\n", 413 | "flatten_1 (Flatten) (None, 8704) 0 \n", 414 | "_________________________________________________________________\n", 
415 | "dense_1 (Dense) (None, 1024) 8913920 \n", 416 | "_________________________________________________________________\n", 417 | "dropout_1 (Dropout) (None, 1024) 0 \n", 418 | "_________________________________________________________________\n", 419 | "dense_2 (Dense) (None, 1024) 1049600 \n", 420 | "_________________________________________________________________\n", 421 | "dropout_2 (Dropout) (None, 1024) 0 \n", 422 | "_________________________________________________________________\n", 423 | "dense_3 (Dense) (None, 2) 2050 \n", 424 | "=================================================================\n", 425 | "Total params: 11,342,700\n", 426 | "Trainable params: 11,342,700\n", 427 | "Non-trainable params: 0\n", 428 | "_________________________________________________________________\n" 429 | ] 430 | } 431 | ], 432 | "source": [ 433 | "# Model \n", 434 | "\n", 435 | "# Input\n", 436 | "inputs = Input(shape=(input_size,), name='input', dtype='int64') # shape=(?, 1014)\n", 437 | "# Embedding \n", 438 | "x = embedding_layer(inputs)\n", 439 | "# Conv \n", 440 | "for filter_num, filter_size, pooling_size in conv_layers:\n", 441 | " x = Conv1D(filter_num, filter_size)(x) \n", 442 | " x = Activation('relu')(x)\n", 443 | " if pooling_size != -1:\n", 444 | " x = MaxPooling1D(pool_size=pooling_size)(x) # Final shape=(None, 34, 256)\n", 445 | "x = Flatten()(x) # (None, 8704)\n", 446 | "# Fully connected layers \n", 447 | "for dense_size in fully_connected_layers:\n", 448 | " x = Dense(dense_size, activation='relu')(x) # dense_size == 1024\n", 449 | " x = Dropout(dropout_p)(x)\n", 450 | "# Output Layer\n", 451 | "predictions = Dense(num_of_classes, activation='softmax')(x)\n", 452 | "# Build model\n", 453 | "model = Model(inputs=inputs, outputs=predictions)\n", 454 | "model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) # Adam, categorical_crossentropy\n", 455 | "model.summary()" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 16, 461 | "metadata": {}, 462 | "outputs": [ 463 | { 464 | "name": "stdout", 465 | "output_type": "stream", 466 | "text": [ 467 | "[37. 50. 21. ... 0. 0. 0.]\n", 468 | "[50. 21. 19. ... 0. 0. 
0.]\n" 469 | ] 470 | } 471 | ], 472 | "source": [ 473 | "print(train_data[0])\n", 474 | "print(train_data[1])\n", 475 | "\n", 476 | "# print(train_classes[1])\n", 477 | "\n", 478 | "# for i in range(100):\n", 479 | "# print(train_classes[i])\n", 480 | "# for i in train_data[0]:\n", 481 | "# print(i)\n", 482 | " \n", 483 | "# print(train)\n", 484 | "# for i in train_data[0]:\n", 485 | "# print(i)" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 17, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "# 1000 training samples and 100 testing samples\n", 495 | "indices = np.arange(train_data.shape[0])\n", 496 | "np.random.shuffle(indices)\n", 497 | "\n", 498 | "x_train = train_data[indices]\n", 499 | "y_train = train_classes[indices]\n", 500 | "\n", 501 | "x_test = test_data\n", 502 | "y_test = test_classes" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 18, 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", 515 | "Instructions for updating:\n", 516 | "Use tf.cast instead.\n", 517 | "Train on 25569 samples, validate on 6393 samples\n", 518 | "Epoch 1/10\n", 519 | " - 16s - loss: 0.2343 - acc: 0.9287 - val_loss: 0.1542 - val_acc: 0.9321\n", 520 | "Epoch 2/10\n", 521 | " - 13s - loss: 0.1346 - acc: 0.9514 - val_loss: 0.1125 - val_acc: 0.9582\n", 522 | "Epoch 3/10\n", 523 | " - 13s - loss: 0.0975 - acc: 0.9664 - val_loss: 0.1058 - val_acc: 0.9657\n", 524 | "Epoch 4/10\n", 525 | " - 13s - loss: 0.0690 - acc: 0.9767 - val_loss: 0.0992 - val_acc: 0.9645\n", 526 | "Epoch 5/10\n", 527 | " - 13s - loss: 0.0449 - acc: 0.9847 - val_loss: 0.1074 - val_acc: 0.9667\n", 528 | "Epoch 6/10\n", 529 | " - 13s - loss: 0.0343 - acc: 0.9895 - val_loss: 0.1266 - val_acc: 0.9692\n", 530 | "Epoch 7/10\n", 531 | " - 13s - loss: 0.0278 - acc: 0.9907 - val_loss: 0.1406 - val_acc: 0.9626\n", 532 | "Epoch 8/10\n", 533 | " - 13s - loss: 0.0242 - acc: 0.9926 - val_loss: 0.1558 - val_acc: 0.9582\n", 534 | "Epoch 9/10\n", 535 | " - 13s - loss: 0.0174 - acc: 0.9944 - val_loss: 0.1468 - val_acc: 0.9651\n", 536 | "Epoch 10/10\n", 537 | " - 13s - loss: 0.0200 - acc: 0.9932 - val_loss: 0.1749 - val_acc: 0.9596\n" 538 | ] 539 | }, 540 | { 541 | "data": { 542 | "text/plain": [ 543 | "" 544 | ] 545 | }, 546 | "execution_count": 18, 547 | "metadata": {}, 548 | "output_type": "execute_result" 549 | } 550 | ], 551 | "source": [ 552 | "\n", 553 | "\n", 554 | "# Training\n", 555 | "model.fit(x_train, y_train,\n", 556 | " validation_data=(x_test, y_test),\n", 557 | " batch_size=128,\n", 558 | " epochs=10,\n", 559 | " verbose=2)\n", 560 | "\n" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 19, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "train_data_nb = copy.deepcopy(train_data)\n", 570 | "train_classes_nb = copy.deepcopy(train_classes)\n", 571 | "\n", 572 | "test_data_nb = copy.deepcopy(test_data)\n", 573 | "test_classes_nb = copy.deepcopy(test_classes)" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 20, 579 | "metadata": {}, 580 | "outputs": [ 581 | { 582 | "name": "stdout", 583 | "output_type": "stream", 584 | "text": [ 585 | "{0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 
14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0}\n" 586 | ] 587 | } 588 | ], 589 | "source": [ 590 | "# print(train_data_nb)\n", 591 | "set1 = set()\n", 592 | "\n", 593 | "\n", 594 | "for i in train_data_nb:\n", 595 | " for j in i:\n", 596 | " set1.add(j)\n", 597 | " \n", 598 | "print((set1))" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 21, 604 | "metadata": {}, 605 | "outputs": [ 606 | { 607 | "name": "stdout", 608 | "output_type": "stream", 609 | "text": [ 610 | "69\n", 611 | "0\n" 612 | ] 613 | } 614 | ], 615 | "source": [ 616 | "print(len(set1))\n", 617 | "total_vocab = len(set1)\n", 618 | "\n", 619 | "set2 = set()\n", 620 | "train_classes_nb_l = len(train_classes_nb) * [0]\n", 621 | "test_classes_nb_l = len(test_classes_nb) * [0]\n", 622 | "\n", 623 | "for idx, i in enumerate(train_classes_nb):\n", 624 | " if(i[0]==0.0 and i[1]==1.0):\n", 625 | " train_classes_nb_l[idx]=copy.deepcopy(int(0))\n", 626 | " elif(i[0]==1.0 and i[1]==0.0):\n", 627 | " train_classes_nb_l[idx]=copy.deepcopy(int(1))\n", 628 | " else:\n", 629 | " print(i)\n", 630 | "\n", 631 | "for idx, i in enumerate(test_classes_nb):\n", 632 | " if(i[0]==0.0 and i[1]==1.0):\n", 633 | " test_classes_nb_l[idx]=copy.deepcopy(int(0))\n", 634 | " elif(i[0]==1.0 and i[1]==0.0):\n", 635 | " test_classes_nb_l[idx]=copy.deepcopy(int(1))\n", 636 | " else:\n", 637 | " print(i)\n", 638 | " \n", 639 | "print(len(set2))" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 22, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [ 648 | "dict1=dict()\n", 649 | "dict1[0]=dict()\n", 650 | "dict1[1]=dict()" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": 23, 656 | "metadata": {}, 657 | "outputs": [ 658 | { 659 | "name": "stdout", 660 | "output_type": "stream", 661 | "text": [ 662 | "0 {'freq': 0, 'prob_class': 0, 0.0: 0, 1.0: 0, 2.0: 0, 3.0: 0, 4.0: 0, 5.0: 0, 6.0: 0, 7.0: 0, 8.0: 0, 9.0: 0, 10.0: 0, 11.0: 0, 12.0: 0, 13.0: 0, 14.0: 0, 15.0: 0, 16.0: 0, 17.0: 0, 18.0: 0, 19.0: 0, 20.0: 0, 21.0: 0, 22.0: 0, 23.0: 0, 24.0: 0, 25.0: 0, 26.0: 0, 27.0: 0, 28.0: 0, 29.0: 0, 30.0: 0, 31.0: 0, 32.0: 0, 33.0: 0, 34.0: 0, 35.0: 0, 36.0: 0, 37.0: 0, 38.0: 0, 39.0: 0, 40.0: 0, 41.0: 0, 42.0: 0, 43.0: 0, 44.0: 0, 45.0: 0, 46.0: 0, 47.0: 0, 48.0: 0, 49.0: 0, 50.0: 0, 51.0: 0, 52.0: 0, 53.0: 0, 54.0: 0, 55.0: 0, 56.0: 0, 57.0: 0, 58.0: 0, 59.0: 0, 60.0: 0, 61.0: 0, 64.0: 0, 65.0: 0, 66.0: 0, 67.0: 0, 68.0: 0, 69.0: 0, 70.0: 0}\n", 663 | "1 {'freq': 0, 'prob_class': 0, 0.0: 0, 1.0: 0, 2.0: 0, 3.0: 0, 4.0: 0, 5.0: 0, 6.0: 0, 7.0: 0, 8.0: 0, 9.0: 0, 10.0: 0, 11.0: 0, 12.0: 0, 13.0: 0, 14.0: 0, 15.0: 0, 16.0: 0, 17.0: 0, 18.0: 0, 19.0: 0, 20.0: 0, 21.0: 0, 22.0: 0, 23.0: 0, 24.0: 0, 25.0: 0, 26.0: 0, 27.0: 0, 28.0: 0, 29.0: 0, 30.0: 0, 31.0: 0, 32.0: 0, 33.0: 0, 34.0: 0, 35.0: 0, 36.0: 0, 37.0: 0, 38.0: 0, 39.0: 0, 40.0: 0, 41.0: 0, 42.0: 0, 43.0: 0, 44.0: 0, 45.0: 0, 46.0: 0, 47.0: 0, 48.0: 0, 49.0: 0, 50.0: 0, 51.0: 0, 52.0: 0, 53.0: 0, 54.0: 0, 55.0: 0, 56.0: 0, 57.0: 0, 58.0: 0, 59.0: 0, 60.0: 0, 61.0: 0, 64.0: 0, 65.0: 0, 66.0: 0, 67.0: 0, 68.0: 0, 69.0: 0, 70.0: 0}\n" 664 | ] 665 | } 666 | ], 667 | "source": [ 668 | "for key,val in dict1.items():\n", 669 | " dict1[key][\"freq\"] = 0\n", 
670 | " dict1[key][\"prob_class\"] = 0\n", 671 | " for j in set1:\n", 672 | " dict1[key][j] = 0\n", 673 | " print(key,dict1[key])\n", 674 | " \n" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": 24, 680 | "metadata": {}, 681 | "outputs": [ 682 | { 683 | "name": "stdout", 684 | "output_type": "stream", 685 | "text": [ 686 | "{0: {'freq': 24093654, 'prob_class': 23761, 0.0: 22087777, 1.0: 115637, 2.0: 25252, 3.0: 33900, 4.0: 51837, 5.0: 163930, 6.0: 32893, 7.0: 36043, 8.0: 59369, 9.0: 100088, 10.0: 3825, 11.0: 17322, 12.0: 66879, 13.0: 42361, 14.0: 88241, 15.0: 113170, 16.0: 31965, 17.0: 1069, 18.0: 87011, 19.0: 97758, 20.0: 108461, 21.0: 56213, 22.0: 17425, 23.0: 29233, 24.0: 3352, 25.0: 42683, 26.0: 1899, 27.0: 2305, 28.0: 2600, 29.0: 2121, 30.0: 1081, 31.0: 1005, 32.0: 748, 33.0: 1250, 34.0: 489, 35.0: 427, 36.0: 578, 37.0: 358139, 38.0: 4544, 39.0: 1672, 40.0: 18668, 41.0: 11219, 42.0: 1803, 43.0: 2397, 44.0: 6222, 45.0: 1254, 46.0: 524, 47.0: 26, 48.0: 152, 49.0: 649, 50.0: 13087, 51.0: 57560, 52.0: 170, 53.0: 195, 54.0: 47, 55.0: 1529, 56.0: 190, 57.0: 110, 58.0: 8, 59.0: 148, 60.0: 2265, 61.0: 106, 64.0: 623, 65.0: 832, 66.0: 81, 67.0: 79, 68.0: 5, 69.0: 4, 70.0: 81149}, 1: {'freq': 1833312, 'prob_class': 1808, 0.0: 1669726, 1.0: 10363, 2.0: 2253, 3.0: 3816, 4.0: 3627, 5.0: 14133, 6.0: 2115, 7.0: 2422, 8.0: 4827, 9.0: 9398, 10.0: 412, 11.0: 1315, 12.0: 5558, 13.0: 4100, 14.0: 7303, 15.0: 8679, 16.0: 3078, 17.0: 94, 18.0: 8335, 19.0: 9657, 20.0: 9506, 21.0: 4938, 22.0: 1122, 23.0: 2366, 24.0: 266, 25.0: 2511, 26.0: 157, 27.0: 189, 28.0: 176, 29.0: 180, 30.0: 37, 31.0: 72, 32.0: 27, 33.0: 94, 34.0: 72, 35.0: 20, 36.0: 37, 37.0: 27256, 38.0: 538, 39.0: 274, 40.0: 1318, 41.0: 385, 42.0: 270, 43.0: 195, 44.0: 599, 45.0: 233, 46.0: 61, 47.0: 0, 48.0: 17, 49.0: 9, 50.0: 1503, 51.0: 3913, 52.0: 11, 53.0: 4, 54.0: 0, 55.0: 255, 56.0: 23, 57.0: 11, 58.0: 0, 59.0: 14, 60.0: 207, 61.0: 20, 64.0: 26, 65.0: 26, 66.0: 0, 67.0: 0, 68.0: 0, 69.0: 0, 70.0: 3163}}\n" 687 | ] 688 | } 689 | ], 690 | "source": [ 691 | "for i in range(len(train_data_nb)):\n", 692 | " dict1[train_classes_nb_l[i]]['freq'] += len(train_data_nb[i]) \n", 693 | " dict1[train_classes_nb_l[i]]['prob_class'] +=1\n", 694 | " for j in train_data_nb[i]:\n", 695 | " dict1[train_classes_nb_l[i]][j] +=1\n", 696 | "# print(len(train_data_nb[i]))\n", 697 | "# print(train_classes_nb_l[i])\n", 698 | "\n", 699 | "print(dict1)" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": 25, 705 | "metadata": {}, 706 | "outputs": [ 707 | { 708 | "name": "stdout", 709 | "output_type": "stream", 710 | "text": [ 711 | "69\n", 712 | "25569\n" 713 | ] 714 | } 715 | ], 716 | "source": [ 717 | "print(total_vocab)\n", 718 | "total_data_len = len(train_data_nb)\n", 719 | "print(total_data_len)\n", 720 | "\n", 721 | "\n", 722 | "# p(class) = dict[class][prob_class]/total_data_len\n", 723 | "# p(word/class) = (dict[class][word]+1)/(dict[class][freq]+total_vocab)\n", 724 | "\n", 725 | "\n" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": 26, 731 | "metadata": {}, 732 | "outputs": [], 733 | "source": [ 734 | "def prob_fun(op_class,sentence):\n", 735 | " result = dict1[op_class]['prob_class']/total_data_len\n", 736 | " for i in sentence:\n", 737 | " result *= ((dict1[op_class][i]+1)/(dict1[op_class]['freq']+total_vocab))\n", 738 | " return result" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": 27, 744 | "metadata": {}, 745 | "outputs": [ 746 | { 747 | 
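prob_fun above multiplies roughly 1,014 per-character probabilities (one per padded position), so the product can underflow toward 0.0; summing log-probabilities is the standard, numerically safer form of the same Laplace-smoothed model. A sketch reusing dict1, total_data_len and total_vocab from the cells above (the .get(i, 0) guard for characters unseen in training is an addition here):

import math

# Sketch only: log-space version of prob_fun to avoid floating-point underflow.
def log_prob_fun(op_class, sentence):
    result = math.log(dict1[op_class]['prob_class'] / total_data_len)
    for i in sentence:
        count = dict1[op_class].get(i, 0)   # unseen characters fall back to count 0
        result += math.log((count + 1) / (dict1[op_class]['freq'] + total_vocab))
    return result

# The class comparison then becomes log_prob_fun(0, s) > log_prob_fun(1, s).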
"name": "stdout", 748 | "output_type": "stream", 749 | "text": [ 750 | "Accuracy is 79.08650086031597\n" 751 | ] 752 | } 753 | ], 754 | "source": [ 755 | "correct = 0\n", 756 | "for idx, sentence in enumerate(test_data_nb):\n", 757 | " res0 = prob_fun(0,sentence)\n", 758 | " res1 = prob_fun(1,sentence) \n", 759 | " if(res0>res1 and test_classes_nb_l[idx] ==0):\n", 760 | " correct+=1\n", 761 | " elif(res1>res0 and test_classes_nb_l[idx] ==1):\n", 762 | " correct+=1\n", 763 | " \n", 764 | "accuracy = (correct/len(test_data_nb))*100\n", 765 | "print(\"Accuracy is \",accuracy)" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": 28, 771 | "metadata": {}, 772 | "outputs": [], 773 | "source": [] 774 | } 775 | ], 776 | "metadata": { 777 | "kernelspec": { 778 | "display_name": "Python 3", 779 | "language": "python", 780 | "name": "python3" 781 | }, 782 | "language_info": { 783 | "codemirror_mode": { 784 | "name": "ipython", 785 | "version": 3 786 | }, 787 | "file_extension": ".py", 788 | "mimetype": "text/x-python", 789 | "name": "python", 790 | "nbconvert_exporter": "python", 791 | "pygments_lexer": "ipython3", 792 | "version": "3.5.2" 793 | } 794 | }, 795 | "nbformat": 4, 796 | "nbformat_minor": 1 797 | } 798 | -------------------------------------------------------------------------------- /CNN/cnn-keras.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "['test_tweets_anuFYb8.csv', 'train_E6oV3lV.csv']\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "# This Python 3 environment comes with many helpful analytics libraries installed\n", 21 | "# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n", 22 | "# For example, here's several helpful packages to load in \n", 23 | "\n", 24 | "import numpy as np # linear algebra\n", 25 | "import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n", 26 | "\n", 27 | "# Input data files are available in the \"../input/\" directory.\n", 28 | "# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\n", 29 | "\n", 30 | "import os\n", 31 | "print(os.listdir(\"../input\"))\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 39 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "df = pd.read_csv('../input/train_E6oV3lV.csv')\n", 44 | "df['label'] = df['label'].map({0: 2, 1: 1})\n", 45 | "df = df.drop('id', axis=1)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stderr", 55 | "output_type": "stream", 56 | "text": [ 57 | "Using TensorFlow backend.\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "from keras.preprocessing.text import Tokenizer\n", 63 | "from keras.preprocessing.sequence import pad_sequences\n", 64 | "from sklearn.model_selection import train_test_split\n", 65 | "import copy" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "train, test = train_test_split(df, test_size=0.2)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 5, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "train_df = copy.deepcopy(train)\n", 84 | "test_df = copy.deepcopy(test)\n", 85 | "\n", 86 | "train_texts = train_df['tweet'].values\n", 87 | "train_texts = [s.lower() for s in train_texts]\n", 88 | "\n", 89 | "test_texts = test_df['tweet'].values\n", 90 | "test_texts = [s.lower() for s in test_texts]\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 6, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')\n", 100 | "tk.fit_on_texts(train_texts)\n", 101 | "\n", 102 | "alphabet = \"abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\\\"/\\\\|_@#$%^&*~`+-=<>()[]{}\"\n", 103 | "\n", 104 | "char_dict = {}\n", 105 | "for i, char in enumerate(alphabet):\n", 106 | " char_dict[char] = i + 1\n", 107 | "tk.word_index = char_dict.copy()\n", 108 | "tk.word_index[tk.oov_token] = max(char_dict.values()) + 1\n", 109 | "\n", 110 | "train_sequences = tk.texts_to_sequences(train_texts)\n", 111 | "test_texts = tk.texts_to_sequences(test_texts)\n", 112 | "\n", 113 | "train_data = pad_sequences(train_sequences, maxlen=150, padding='post')\n", 114 | "test_data = pad_sequences(test_texts, maxlen=150, padding='post')\n", 115 | "\n", 116 | "train_data = np.array(train_data, dtype='float32')\n", 117 | "test_data = np.array(test_data, dtype='float32')\n", 118 | "\n", 119 | "train_classes = train_df['label'].values\n", 120 | "train_class_list = [x - 1 for x in train_classes]\n", 121 | "\n", 122 | "test_classes = test_df['label'].values\n", 123 | "test_class_list = [x - 1 for x in test_classes]\n", 124 | "\n", 125 | "from keras.utils import to_categorical\n", 126 | "\n", 127 | "train_classes = to_categorical(train_class_list)\n", 128 | "test_classes = to_categorical(test_class_list)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 7, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "vocab_size = len(tk.word_index)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 8, 143 | "metadata": {}, 
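This final notebook pads to 150 characters instead of the 1014 used in the checkpoint versions, which shrinks the flattened feature vector from 8704 to 512 and the parameter count from about 11.3M to about 2.95M (compare the two model summaries). Tweets are short, so a quick length check, reusing train_texts from the cell above, is enough to confirm how much of a 1014-wide input would be padding:

# Sketch only: inspect character-length percentiles to sanity-check maxlen=150.
lengths = np.array([len(t) for t in train_texts])
print(lengths.min(), int(np.percentile(lengths, 95)), lengths.max())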
144 | "outputs": [], 145 | "source": [ 146 | "embedding_weights = [] #(71, 70)\n", 147 | "embedding_weights.append(np.zeros(vocab_size)) # first row is pad\n", 148 | "\n", 149 | "for char, i in tk.word_index.items(): # from index 1 to 70\n", 150 | " onehot = np.zeros(vocab_size)\n", 151 | " onehot[i-1] = 1\n", 152 | " embedding_weights.append(onehot)\n", 153 | "embedding_weights = np.array(embedding_weights)\n" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 9, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "from keras.layers import Input, Embedding, Activation, Flatten, Dense\n", 163 | "from keras.layers import Conv1D, MaxPooling1D, Dropout\n", 164 | "from keras.models import Model" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 10, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "input_size = 150\n", 174 | "embedding_size = 70\n", 175 | "conv_layers = [[256, 7, 3], \n", 176 | " [256, 7, 3], \n", 177 | " [256, 3, -1], \n", 178 | " [256, 3, -1], \n", 179 | " [256, 3, -1], \n", 180 | " [256, 3, 3]]\n", 181 | "\n", 182 | "fully_connected_layers = [1024, 1024]\n", 183 | "num_of_classes = 2\n", 184 | "dropout_p = 0.5\n", 185 | "optimizer = 'adam'\n", 186 | "loss = 'categorical_crossentropy'" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 11, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "embedding_layer = Embedding(vocab_size+1, \n", 196 | " embedding_size,\n", 197 | " input_length=input_size,\n", 198 | " weights=[embedding_weights])" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 12, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", 211 | "Instructions for updating:\n", 212 | "Colocations handled automatically by placer.\n", 213 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", 214 | "Instructions for updating:\n", 215 | "Please use `rate` instead of `keep_prob`. 
Rate should be set to `rate = 1 - keep_prob`.\n", 216 | "_________________________________________________________________\n", 217 | "Layer (type) Output Shape Param # \n", 218 | "=================================================================\n", 219 | "input (InputLayer) (None, 150) 0 \n", 220 | "_________________________________________________________________\n", 221 | "embedding_1 (Embedding) (None, 150, 70) 4970 \n", 222 | "_________________________________________________________________\n", 223 | "conv1d_1 (Conv1D) (None, 144, 256) 125696 \n", 224 | "_________________________________________________________________\n", 225 | "activation_1 (Activation) (None, 144, 256) 0 \n", 226 | "_________________________________________________________________\n", 227 | "max_pooling1d_1 (MaxPooling1 (None, 48, 256) 0 \n", 228 | "_________________________________________________________________\n", 229 | "conv1d_2 (Conv1D) (None, 42, 256) 459008 \n", 230 | "_________________________________________________________________\n", 231 | "activation_2 (Activation) (None, 42, 256) 0 \n", 232 | "_________________________________________________________________\n", 233 | "max_pooling1d_2 (MaxPooling1 (None, 14, 256) 0 \n", 234 | "_________________________________________________________________\n", 235 | "conv1d_3 (Conv1D) (None, 12, 256) 196864 \n", 236 | "_________________________________________________________________\n", 237 | "activation_3 (Activation) (None, 12, 256) 0 \n", 238 | "_________________________________________________________________\n", 239 | "conv1d_4 (Conv1D) (None, 10, 256) 196864 \n", 240 | "_________________________________________________________________\n", 241 | "activation_4 (Activation) (None, 10, 256) 0 \n", 242 | "_________________________________________________________________\n", 243 | "conv1d_5 (Conv1D) (None, 8, 256) 196864 \n", 244 | "_________________________________________________________________\n", 245 | "activation_5 (Activation) (None, 8, 256) 0 \n", 246 | "_________________________________________________________________\n", 247 | "conv1d_6 (Conv1D) (None, 6, 256) 196864 \n", 248 | "_________________________________________________________________\n", 249 | "activation_6 (Activation) (None, 6, 256) 0 \n", 250 | "_________________________________________________________________\n", 251 | "max_pooling1d_3 (MaxPooling1 (None, 2, 256) 0 \n", 252 | "_________________________________________________________________\n", 253 | "flatten_1 (Flatten) (None, 512) 0 \n", 254 | "_________________________________________________________________\n", 255 | "dense_1 (Dense) (None, 1024) 525312 \n", 256 | "_________________________________________________________________\n", 257 | "dropout_1 (Dropout) (None, 1024) 0 \n", 258 | "_________________________________________________________________\n", 259 | "dense_2 (Dense) (None, 1024) 1049600 \n", 260 | "_________________________________________________________________\n", 261 | "dropout_2 (Dropout) (None, 1024) 0 \n", 262 | "_________________________________________________________________\n", 263 | "dense_3 (Dense) (None, 2) 2050 \n", 264 | "=================================================================\n", 265 | "Total params: 2,954,092\n", 266 | "Trainable params: 2,954,092\n", 267 | "Non-trainable params: 0\n", 268 | "_________________________________________________________________\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "inputs = Input(shape=(input_size,), name='input', dtype='int64') \n", 274 | "x = 
embedding_layer(inputs)\n", 275 | "for filter_num, filter_size, pooling_size in conv_layers:\n", 276 | " x = Conv1D(filter_num, filter_size)(x) \n", 277 | " x = Activation('relu')(x)\n", 278 | " if pooling_size != -1:\n", 279 | " x = MaxPooling1D(pool_size=pooling_size)(x) \n", 280 | "x = Flatten()(x) \n", 281 | "\n", 282 | "for dense_size in fully_connected_layers:\n", 283 | " x = Dense(dense_size, activation='relu')(x) \n", 284 | " x = Dropout(dropout_p)(x)\n", 285 | "\n", 286 | "predictions = Dense(num_of_classes, activation='softmax')(x)\n", 287 | "\n", 288 | "model = Model(inputs=inputs, outputs=predictions)\n", 289 | "model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) \n", 290 | "model.summary()" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 13, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "indices = np.arange(train_data.shape[0])\n", 300 | "np.random.shuffle(indices)\n", 301 | "\n", 302 | "x_train = train_data[indices]\n", 303 | "y_train = train_classes[indices]\n", 304 | "\n", 305 | "x_test = test_data\n", 306 | "y_test = test_classes" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 14, 312 | "metadata": {}, 313 | "outputs": [ 314 | { 315 | "name": "stdout", 316 | "output_type": "stream", 317 | "text": [ 318 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", 319 | "Instructions for updating:\n", 320 | "Use tf.cast instead.\n", 321 | "Train on 25569 samples, validate on 6393 samples\n", 322 | "Epoch 1/10\n", 323 | " - 8s - loss: 0.2342 - acc: 0.9275 - val_loss: 0.1753 - val_acc: 0.9249\n", 324 | "Epoch 2/10\n", 325 | " - 3s - loss: 0.1532 - acc: 0.9428 - val_loss: 0.1418 - val_acc: 0.9485\n", 326 | "Epoch 3/10\n", 327 | " - 3s - loss: 0.0928 - acc: 0.9668 - val_loss: 0.1041 - val_acc: 0.9571\n", 328 | "Epoch 4/10\n", 329 | " - 3s - loss: 0.0589 - acc: 0.9800 - val_loss: 0.0875 - val_acc: 0.9717\n", 330 | "Epoch 5/10\n", 331 | " - 3s - loss: 0.0435 - acc: 0.9855 - val_loss: 0.1206 - val_acc: 0.9559\n", 332 | "Epoch 6/10\n", 333 | " - 3s - loss: 0.0340 - acc: 0.9891 - val_loss: 0.1127 - val_acc: 0.9668\n", 334 | "Epoch 7/10\n", 335 | " - 3s - loss: 0.0241 - acc: 0.9920 - val_loss: 0.1443 - val_acc: 0.9698\n", 336 | "Epoch 8/10\n", 337 | " - 3s - loss: 0.0242 - acc: 0.9914 - val_loss: 0.1556 - val_acc: 0.9673\n", 338 | "Epoch 9/10\n", 339 | " - 3s - loss: 0.0143 - acc: 0.9951 - val_loss: 0.1586 - val_acc: 0.9676\n", 340 | "Epoch 10/10\n", 341 | " - 3s - loss: 0.0132 - acc: 0.9953 - val_loss: 0.1504 - val_acc: 0.9697\n" 342 | ] 343 | }, 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "" 348 | ] 349 | }, 350 | "execution_count": 14, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "# Training\n", 357 | "model.fit(x_train, y_train,\n", 358 | " validation_data=(x_test, y_test),\n", 359 | " batch_size=128,\n", 360 | " epochs=10,\n", 361 | " verbose=2)\n", 362 | "\n" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 15, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [] 371 | } 372 | ], 373 | "metadata": { 374 | "kernelspec": { 375 | "display_name": "Python 3", 376 | "language": "python", 377 | "name": "python3" 378 | }, 379 | "language_info": { 380 | "codemirror_mode": { 381 | "name": "ipython", 382 | "version": 3 383 | }, 384 | 
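The run above reports only accuracy on the held-out split. Since offensive tweets are a small minority of the data, per-class precision, recall and F1 are more informative. A sketch, assuming the `model`, `x_test` and one-hot `y_test` defined above are still in scope; with the label remapping used here (1 → 1, 0 → 2, then shifted by −1), column 0 of the one-hot targets corresponds to the original offensive label 1:

```python
# Sketch: per-class metrics for the offensive class (assumes model, x_test, y_test from above).
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

probs = model.predict(x_test)          # softmax outputs, shape (n_samples, 2)
y_pred = np.argmax(probs, axis=1)
y_true = np.argmax(y_test, axis=1)     # undo the one-hot encoding

print('Precision:', precision_score(y_true, y_pred, pos_label=0))
print('Recall:   ', recall_score(y_true, y_pred, pos_label=0))
print('F1:       ', f1_score(y_true, y_pred, pos_label=0))
```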
"file_extension": ".py", 385 | "mimetype": "text/x-python", 386 | "name": "python", 387 | "nbconvert_exporter": "python", 388 | "pygments_lexer": "ipython3", 389 | "version": "3.6.4" 390 | } 391 | }, 392 | "nbformat": 4, 393 | "nbformat_minor": 1 394 | } 395 | -------------------------------------------------------------------------------- /CNN/cnn_nlp_nb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "['test_tweets_anuFYb8.csv', 'train_E6oV3lV.csv']\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "# This Python 3 environment comes with many helpful analytics libraries installed\n", 21 | "# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n", 22 | "# For example, here's several helpful packages to load in \n", 23 | "\n", 24 | "import numpy as np # linear algebra\n", 25 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 26 | "\n", 27 | "# Input data files are available in the \"../input/\" directory.\n", 28 | "# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\n", 29 | "\n", 30 | "import os\n", 31 | "print(os.listdir(\"../input\"))\n", 32 | "\n", 33 | "# Any results you write to the current directory are saved as output.\n", 34 | "\n", 35 | "# import keras\n", 36 | "# import tensorflow\n", 37 | "\n", 38 | "# print(keras.__version__)\n", 39 | "# print(tensorflow.__version__)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import numpy as np\n", 49 | "import pandas as pd" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": { 56 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 57 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 58 | }, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | " id ... tweet\n", 65 | "0 1 ... @user when a father is dysfunctional and is s...\n", 66 | "1 2 ... @user @user thanks for #lyft credit i can't us...\n", 67 | "2 3 ... bihday your majesty\n", 68 | "3 4 ... #model i love u take with u all the time in ...\n", 69 | "4 5 ... 
factsguide: society now #motivation\n", 70 | "\n", 71 | "[5 rows x 3 columns]\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "df = pd.read_csv('../input/train_E6oV3lV.csv')\n", 77 | "df['label'] = df['label'].map({0: 2, 1: 1})\n", 78 | "print(df.head())\n", 79 | "# print(df['label'].head())\n", 80 | "df = df.drop('id', axis=1)\n", 81 | "\n", 82 | "# print(df[:100])\n", 83 | "# zero = 0\n", 84 | "# one = 0\n", 85 | "# for i in df['label']:\n", 86 | "# if(i==0):\n", 87 | "# zero+=1\n", 88 | "# else:\n", 89 | "# one+=1\n", 90 | " \n", 91 | "# print(zero,one)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "name": "stderr", 101 | "output_type": "stream", 102 | "text": [ 103 | "Using TensorFlow backend.\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "from keras.preprocessing.text import Tokenizer\n", 109 | "from keras.preprocessing.sequence import pad_sequences\n", 110 | "from sklearn.model_selection import train_test_split\n", 111 | "import copy" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 5, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "25569\n", 124 | "6393\n", 125 | " label tweet\n", 126 | "11704 2 @user #diwali !!! #besteduonline !!! contac...\n", 127 | "6885 2 @user i was at this trump rally last night. ma...\n", 128 | "744 2 i don't know where you're going but do you hav...\n", 129 | "13467 2 watch on #periscope: the reading of victims na...\n", 130 | "31250 2 9 months later.. i am back to my pre pregnancy...\n", 131 | "\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "\n", 137 | "train, test = train_test_split(df, test_size=0.2)\n", 138 | "print(len(train))\n", 139 | "print(len(test))\n", 140 | "print(train.head())\n", 141 | "\n", 142 | "print(type(train))" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "train_df = copy.deepcopy(train)\n", 152 | "test_df = copy.deepcopy(test)\n", 153 | "\n", 154 | "# concatenate column 1 and column 2 as one text\n", 155 | "# print(train_df[1])\n", 156 | "\n", 157 | "# convert string to lower case\n", 158 | "train_texts = train_df['tweet'].values\n", 159 | "train_texts = [s.lower() for s in train_texts]\n", 160 | "\n", 161 | "test_texts = test_df['tweet'].values\n", 162 | "test_texts = [s.lower() for s in test_texts]\n", 163 | "\n", 164 | "# print(train_texts.head())\n", 165 | "# print(train_texts)\n" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 7, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "\n", 175 | "# =======================Convert string to index================\n", 176 | "# Tokenizer\n", 177 | "tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')\n", 178 | "tk.fit_on_texts(train_texts)\n", 179 | "# If we already have a character list, then replace the tk.word_index\n", 180 | "# If not, just skip below part\n", 181 | "\n", 182 | "# -----------------------Skip part start--------------------------\n", 183 | "# construct a new vocabulary\n", 184 | "alphabet = \"abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\\\"/\\\\|_@#$%^&*~`+-=<>()[]{}\"\n", 185 | "char_dict = {}\n", 186 | "for i, char in enumerate(alphabet):\n", 187 | " char_dict[char] = i + 1\n", 188 | "\n", 189 | "# Use char_dict to replace the tk.word_index\n", 190 | "tk.word_index = char_dict.copy()\n", 191 | "# Add 'UNK' to the 
vocabulary\n", 192 | "tk.word_index[tk.oov_token] = max(char_dict.values()) + 1\n", 193 | "# -----------------------Skip part end----------------------------\n", 194 | "\n", 195 | "# Convert string to index\n", 196 | "train_sequences = tk.texts_to_sequences(train_texts)\n", 197 | "test_texts = tk.texts_to_sequences(test_texts)\n", 198 | "\n", 199 | "# Padding\n", 200 | "train_data = pad_sequences(train_sequences, maxlen=1014, padding='post')\n", 201 | "test_data = pad_sequences(test_texts, maxlen=1014, padding='post')\n", 202 | "\n", 203 | "# Convert to numpy array\n", 204 | "train_data = np.array(train_data, dtype='float32')\n", 205 | "test_data = np.array(test_data, dtype='float32')\n", 206 | "\n", 207 | "# =======================Get classes================\n", 208 | "train_classes = train_df['label'].values\n", 209 | "train_class_list = [x - 1 for x in train_classes]\n", 210 | "\n", 211 | "test_classes = test_df['label'].values\n", 212 | "test_class_list = [x - 1 for x in test_classes]\n", 213 | "\n", 214 | "from keras.utils import to_categorical\n", 215 | "\n", 216 | "train_classes = to_categorical(train_class_list)\n", 217 | "test_classes = to_categorical(test_class_list)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 8, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | "6393\n", 230 | "{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '0': 27, '1': 28, '2': 29, '3': 30, '4': 31, '5': 32, '6': 33, '7': 34, '8': 35, '9': 36, ' ': 37, ',': 38, ';': 39, '.': 40, '!': 41, '?': 42, ':': 43, \"'\": 44, '\"': 45, '/': 46, '\\\\': 47, '|': 48, '_': 49, '@': 50, '#': 51, '$': 52, '%': 53, '^': 54, '&': 55, '*': 56, '~': 57, '`': 58, '+': 59, '-': 60, '=': 61, '<': 62, '>': 63, '(': 64, ')': 65, '[': 66, ']': 67, '{': 68, '}': 69, 'UNK': 70}\n" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "print(len(test_classes))\n", 236 | "print(tk.word_index)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 9, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "70" 248 | ] 249 | }, 250 | "execution_count": 9, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "\n", 257 | "\n", 258 | "vocab_size = len(tk.word_index)\n", 259 | "vocab_size\n", 260 | "\n" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 10, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "embedding_weights = [] #(71, 70)\n", 270 | "embedding_weights.append(np.zeros(vocab_size)) # first row is pad\n", 271 | "\n", 272 | "for char, i in tk.word_index.items(): # from index 1 to 70\n", 273 | " onehot = np.zeros(vocab_size)\n", 274 | " onehot[i-1] = 1\n", 275 | " embedding_weights.append(onehot)\n", 276 | "embedding_weights = np.array(embedding_weights)\n" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 11, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "(71, 70)\n" 289 | ] 290 | }, 291 | { 292 | "data": { 293 | "text/plain": [ 294 | "array([[0., 0., 0., ..., 0., 0., 0.],\n", 295 | " [1., 0., 0., ..., 0., 0., 0.],\n", 296 | " [0., 1., 0., ..., 0., 0., 0.],\n", 297 | " ...,\n", 298 | " [0., 0., 0., 
..., 1., 0., 0.],\n", 299 | " [0., 0., 0., ..., 0., 1., 0.],\n", 300 | " [0., 0., 0., ..., 0., 0., 1.]])" 301 | ] 302 | }, 303 | "execution_count": 11, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "print(embedding_weights.shape) # first row all 0 for PAD, 69 char, last row for UNK\n", 310 | "embedding_weights" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 12, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "from keras.layers import Input, Embedding, Activation, Flatten, Dense\n", 320 | "from keras.layers import Conv1D, MaxPooling1D, Dropout\n", 321 | "from keras.models import Model" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 13, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# parameter \n", 331 | "input_size = 1014\n", 332 | "# vocab_size = 69\n", 333 | "embedding_size = 70\n", 334 | "conv_layers = [[256, 7, 3], \n", 335 | " [256, 7, 3], \n", 336 | " [256, 3, -1], \n", 337 | " [256, 3, -1], \n", 338 | " [256, 3, -1], \n", 339 | " [256, 3, 3]]\n", 340 | "\n", 341 | "fully_connected_layers = [1024, 1024]\n", 342 | "num_of_classes = 2\n", 343 | "dropout_p = 0.5\n", 344 | "optimizer = 'adam'\n", 345 | "loss = 'categorical_crossentropy'" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 14, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "# Embedding layer Initialization\n", 355 | "embedding_layer = Embedding(vocab_size+1, \n", 356 | " embedding_size,\n", 357 | " input_length=input_size,\n", 358 | " weights=[embedding_weights])" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 15, 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", 371 | "Instructions for updating:\n", 372 | "Colocations handled automatically by placer.\n", 373 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", 374 | "Instructions for updating:\n", 375 | "Please use `rate` instead of `keep_prob`. 
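With the settings above (70-dimensional character embedding, 71-row vocabulary, 256 filters of width 7 in the first two convolutions and width 3 afterwards), the larger entries in the summary that follows can be checked by hand:

```python
# Parameter-count sanity check for the summary below (values taken from the config above).
embedding_params = (70 + 1) * 70          # (vocab_size + 1) * embedding_size     = 4,970
conv1_params = 7 * 70 * 256 + 256         # kernel * in_channels * filters + bias = 125,696
conv2_params = 7 * 256 * 256 + 256        #                                       = 459,008
conv3_params = 3 * 256 * 256 + 256        # width-3 conv blocks                   = 196,864
print(embedding_params, conv1_params, conv2_params, conv3_params)
```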
Rate should be set to `rate = 1 - keep_prob`.\n", 376 | "_________________________________________________________________\n", 377 | "Layer (type) Output Shape Param # \n", 378 | "=================================================================\n", 379 | "input (InputLayer) (None, 1014) 0 \n", 380 | "_________________________________________________________________\n", 381 | "embedding_1 (Embedding) (None, 1014, 70) 4970 \n", 382 | "_________________________________________________________________\n", 383 | "conv1d_1 (Conv1D) (None, 1008, 256) 125696 \n", 384 | "_________________________________________________________________\n", 385 | "activation_1 (Activation) (None, 1008, 256) 0 \n", 386 | "_________________________________________________________________\n", 387 | "max_pooling1d_1 (MaxPooling1 (None, 336, 256) 0 \n", 388 | "_________________________________________________________________\n", 389 | "conv1d_2 (Conv1D) (None, 330, 256) 459008 \n", 390 | "_________________________________________________________________\n", 391 | "activation_2 (Activation) (None, 330, 256) 0 \n", 392 | "_________________________________________________________________\n", 393 | "max_pooling1d_2 (MaxPooling1 (None, 110, 256) 0 \n", 394 | "_________________________________________________________________\n", 395 | "conv1d_3 (Conv1D) (None, 108, 256) 196864 \n", 396 | "_________________________________________________________________\n", 397 | "activation_3 (Activation) (None, 108, 256) 0 \n", 398 | "_________________________________________________________________\n", 399 | "conv1d_4 (Conv1D) (None, 106, 256) 196864 \n", 400 | "_________________________________________________________________\n", 401 | "activation_4 (Activation) (None, 106, 256) 0 \n", 402 | "_________________________________________________________________\n", 403 | "conv1d_5 (Conv1D) (None, 104, 256) 196864 \n", 404 | "_________________________________________________________________\n", 405 | "activation_5 (Activation) (None, 104, 256) 0 \n", 406 | "_________________________________________________________________\n", 407 | "conv1d_6 (Conv1D) (None, 102, 256) 196864 \n", 408 | "_________________________________________________________________\n", 409 | "activation_6 (Activation) (None, 102, 256) 0 \n", 410 | "_________________________________________________________________\n", 411 | "max_pooling1d_3 (MaxPooling1 (None, 34, 256) 0 \n", 412 | "_________________________________________________________________\n", 413 | "flatten_1 (Flatten) (None, 8704) 0 \n", 414 | "_________________________________________________________________\n", 415 | "dense_1 (Dense) (None, 1024) 8913920 \n", 416 | "_________________________________________________________________\n", 417 | "dropout_1 (Dropout) (None, 1024) 0 \n", 418 | "_________________________________________________________________\n", 419 | "dense_2 (Dense) (None, 1024) 1049600 \n", 420 | "_________________________________________________________________\n", 421 | "dropout_2 (Dropout) (None, 1024) 0 \n", 422 | "_________________________________________________________________\n", 423 | "dense_3 (Dense) (None, 2) 2050 \n", 424 | "=================================================================\n", 425 | "Total params: 11,342,700\n", 426 | "Trainable params: 11,342,700\n", 427 | "Non-trainable params: 0\n", 428 | "_________________________________________________________________\n" 429 | ] 430 | } 431 | ], 432 | "source": [ 433 | "# Model \n", 434 | "\n", 435 | "# Input\n", 436 | "inputs = 
Input(shape=(input_size,), name='input', dtype='int64') # shape=(?, 1014)\n", 437 | "# Embedding \n", 438 | "x = embedding_layer(inputs)\n", 439 | "# Conv \n", 440 | "for filter_num, filter_size, pooling_size in conv_layers:\n", 441 | " x = Conv1D(filter_num, filter_size)(x) \n", 442 | " x = Activation('relu')(x)\n", 443 | " if pooling_size != -1:\n", 444 | " x = MaxPooling1D(pool_size=pooling_size)(x) # Final shape=(None, 34, 256)\n", 445 | "x = Flatten()(x) # (None, 8704)\n", 446 | "# Fully connected layers \n", 447 | "for dense_size in fully_connected_layers:\n", 448 | " x = Dense(dense_size, activation='relu')(x) # dense_size == 1024\n", 449 | " x = Dropout(dropout_p)(x)\n", 450 | "# Output Layer\n", 451 | "predictions = Dense(num_of_classes, activation='softmax')(x)\n", 452 | "# Build model\n", 453 | "model = Model(inputs=inputs, outputs=predictions)\n", 454 | "model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) # Adam, categorical_crossentropy\n", 455 | "model.summary()" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 16, 461 | "metadata": {}, 462 | "outputs": [ 463 | { 464 | "name": "stdout", 465 | "output_type": "stream", 466 | "text": [ 467 | "[37. 50. 21. ... 0. 0. 0.]\n", 468 | "[50. 21. 19. ... 0. 0. 0.]\n" 469 | ] 470 | } 471 | ], 472 | "source": [ 473 | "print(train_data[0])\n", 474 | "print(train_data[1])\n", 475 | "\n", 476 | "# print(train_classes[1])\n", 477 | "\n", 478 | "# for i in range(100):\n", 479 | "# print(train_classes[i])\n", 480 | "# for i in train_data[0]:\n", 481 | "# print(i)\n", 482 | " \n", 483 | "# print(train)\n", 484 | "# for i in train_data[0]:\n", 485 | "# print(i)" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 17, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "# 1000 training samples and 100 testing samples\n", 495 | "indices = np.arange(train_data.shape[0])\n", 496 | "np.random.shuffle(indices)\n", 497 | "\n", 498 | "x_train = train_data[indices]\n", 499 | "y_train = train_classes[indices]\n", 500 | "\n", 501 | "x_test = test_data\n", 502 | "y_test = test_classes" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 18, 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", 515 | "Instructions for updating:\n", 516 | "Use tf.cast instead.\n", 517 | "Train on 25569 samples, validate on 6393 samples\n", 518 | "Epoch 1/10\n", 519 | " - 16s - loss: 0.2343 - acc: 0.9287 - val_loss: 0.1542 - val_acc: 0.9321\n", 520 | "Epoch 2/10\n", 521 | " - 13s - loss: 0.1346 - acc: 0.9514 - val_loss: 0.1125 - val_acc: 0.9582\n", 522 | "Epoch 3/10\n", 523 | " - 13s - loss: 0.0975 - acc: 0.9664 - val_loss: 0.1058 - val_acc: 0.9657\n", 524 | "Epoch 4/10\n", 525 | " - 13s - loss: 0.0690 - acc: 0.9767 - val_loss: 0.0992 - val_acc: 0.9645\n", 526 | "Epoch 5/10\n", 527 | " - 13s - loss: 0.0449 - acc: 0.9847 - val_loss: 0.1074 - val_acc: 0.9667\n", 528 | "Epoch 6/10\n", 529 | " - 13s - loss: 0.0343 - acc: 0.9895 - val_loss: 0.1266 - val_acc: 0.9692\n", 530 | "Epoch 7/10\n", 531 | " - 13s - loss: 0.0278 - acc: 0.9907 - val_loss: 0.1406 - val_acc: 0.9626\n", 532 | "Epoch 8/10\n", 533 | " - 13s - loss: 0.0242 - acc: 0.9926 - val_loss: 0.1558 - val_acc: 0.9582\n", 534 | "Epoch 9/10\n", 
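In this run, as in the 150-length model earlier, the validation loss bottoms out after a few epochs and then drifts back up while the training loss keeps falling, which points to overfitting. One option is to let Keras stop early and keep the best weights instead of always running 10 epochs — a sketch, assuming the `model` and data arrays defined above:

```python
# Sketch: early stopping instead of a fixed 10 epochs
# (assumes model, x_train, y_train, x_test, y_test from the cells above).
from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          batch_size=128,
          epochs=10,
          verbose=2,
          callbacks=[early_stop])
```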
535 | " - 13s - loss: 0.0174 - acc: 0.9944 - val_loss: 0.1468 - val_acc: 0.9651\n", 536 | "Epoch 10/10\n", 537 | " - 13s - loss: 0.0200 - acc: 0.9932 - val_loss: 0.1749 - val_acc: 0.9596\n" 538 | ] 539 | }, 540 | { 541 | "data": { 542 | "text/plain": [ 543 | "" 544 | ] 545 | }, 546 | "execution_count": 18, 547 | "metadata": {}, 548 | "output_type": "execute_result" 549 | } 550 | ], 551 | "source": [ 552 | "\n", 553 | "\n", 554 | "# Training\n", 555 | "model.fit(x_train, y_train,\n", 556 | " validation_data=(x_test, y_test),\n", 557 | " batch_size=128,\n", 558 | " epochs=10,\n", 559 | " verbose=2)\n", 560 | "\n" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 19, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "train_data_nb = copy.deepcopy(train_data)\n", 570 | "train_classes_nb = copy.deepcopy(train_classes)\n", 571 | "\n", 572 | "test_data_nb = copy.deepcopy(test_data)\n", 573 | "test_classes_nb = copy.deepcopy(test_classes)" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 20, 579 | "metadata": {}, 580 | "outputs": [ 581 | { 582 | "name": "stdout", 583 | "output_type": "stream", 584 | "text": [ 585 | "{0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0}\n" 586 | ] 587 | } 588 | ], 589 | "source": [ 590 | "# print(train_data_nb)\n", 591 | "set1 = set()\n", 592 | "\n", 593 | "\n", 594 | "for i in train_data_nb:\n", 595 | " for j in i:\n", 596 | " set1.add(j)\n", 597 | " \n", 598 | "print((set1))" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 21, 604 | "metadata": {}, 605 | "outputs": [ 606 | { 607 | "name": "stdout", 608 | "output_type": "stream", 609 | "text": [ 610 | "69\n", 611 | "0\n" 612 | ] 613 | } 614 | ], 615 | "source": [ 616 | "print(len(set1))\n", 617 | "total_vocab = len(set1)\n", 618 | "\n", 619 | "set2 = set()\n", 620 | "train_classes_nb_l = len(train_classes_nb) * [0]\n", 621 | "test_classes_nb_l = len(test_classes_nb) * [0]\n", 622 | "\n", 623 | "for idx, i in enumerate(train_classes_nb):\n", 624 | " if(i[0]==0.0 and i[1]==1.0):\n", 625 | " train_classes_nb_l[idx]=copy.deepcopy(int(0))\n", 626 | " elif(i[0]==1.0 and i[1]==0.0):\n", 627 | " train_classes_nb_l[idx]=copy.deepcopy(int(1))\n", 628 | " else:\n", 629 | " print(i)\n", 630 | "\n", 631 | "for idx, i in enumerate(test_classes_nb):\n", 632 | " if(i[0]==0.0 and i[1]==1.0):\n", 633 | " test_classes_nb_l[idx]=copy.deepcopy(int(0))\n", 634 | " elif(i[0]==1.0 and i[1]==0.0):\n", 635 | " test_classes_nb_l[idx]=copy.deepcopy(int(1))\n", 636 | " else:\n", 637 | " print(i)\n", 638 | " \n", 639 | "print(len(set2))" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 22, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [ 648 | "dict1=dict()\n", 649 | "dict1[0]=dict()\n", 650 | "dict1[1]=dict()" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": 23, 656 | "metadata": {}, 657 | "outputs": [ 658 | { 659 | "name": "stdout", 660 | "output_type": "stream", 661 | "text": [ 662 | "0 {'freq': 0, 'prob_class': 0, 0.0: 0, 1.0: 0, 2.0: 0, 3.0: 0, 4.0: 0, 5.0: 0, 6.0: 0, 7.0: 0, 8.0: 0, 9.0: 0, 10.0: 0, 11.0: 0, 12.0: 0, 13.0: 0, 
14.0: 0, 15.0: 0, 16.0: 0, 17.0: 0, 18.0: 0, 19.0: 0, 20.0: 0, 21.0: 0, 22.0: 0, 23.0: 0, 24.0: 0, 25.0: 0, 26.0: 0, 27.0: 0, 28.0: 0, 29.0: 0, 30.0: 0, 31.0: 0, 32.0: 0, 33.0: 0, 34.0: 0, 35.0: 0, 36.0: 0, 37.0: 0, 38.0: 0, 39.0: 0, 40.0: 0, 41.0: 0, 42.0: 0, 43.0: 0, 44.0: 0, 45.0: 0, 46.0: 0, 47.0: 0, 48.0: 0, 49.0: 0, 50.0: 0, 51.0: 0, 52.0: 0, 53.0: 0, 54.0: 0, 55.0: 0, 56.0: 0, 57.0: 0, 58.0: 0, 59.0: 0, 60.0: 0, 61.0: 0, 64.0: 0, 65.0: 0, 66.0: 0, 67.0: 0, 68.0: 0, 69.0: 0, 70.0: 0}\n", 663 | "1 {'freq': 0, 'prob_class': 0, 0.0: 0, 1.0: 0, 2.0: 0, 3.0: 0, 4.0: 0, 5.0: 0, 6.0: 0, 7.0: 0, 8.0: 0, 9.0: 0, 10.0: 0, 11.0: 0, 12.0: 0, 13.0: 0, 14.0: 0, 15.0: 0, 16.0: 0, 17.0: 0, 18.0: 0, 19.0: 0, 20.0: 0, 21.0: 0, 22.0: 0, 23.0: 0, 24.0: 0, 25.0: 0, 26.0: 0, 27.0: 0, 28.0: 0, 29.0: 0, 30.0: 0, 31.0: 0, 32.0: 0, 33.0: 0, 34.0: 0, 35.0: 0, 36.0: 0, 37.0: 0, 38.0: 0, 39.0: 0, 40.0: 0, 41.0: 0, 42.0: 0, 43.0: 0, 44.0: 0, 45.0: 0, 46.0: 0, 47.0: 0, 48.0: 0, 49.0: 0, 50.0: 0, 51.0: 0, 52.0: 0, 53.0: 0, 54.0: 0, 55.0: 0, 56.0: 0, 57.0: 0, 58.0: 0, 59.0: 0, 60.0: 0, 61.0: 0, 64.0: 0, 65.0: 0, 66.0: 0, 67.0: 0, 68.0: 0, 69.0: 0, 70.0: 0}\n" 664 | ] 665 | } 666 | ], 667 | "source": [ 668 | "for key,val in dict1.items():\n", 669 | " dict1[key][\"freq\"] = 0\n", 670 | " dict1[key][\"prob_class\"] = 0\n", 671 | " for j in set1:\n", 672 | " dict1[key][j] = 0\n", 673 | " print(key,dict1[key])\n", 674 | " \n" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": 24, 680 | "metadata": {}, 681 | "outputs": [ 682 | { 683 | "name": "stdout", 684 | "output_type": "stream", 685 | "text": [ 686 | "{0: {'freq': 24093654, 'prob_class': 23761, 0.0: 22087777, 1.0: 115637, 2.0: 25252, 3.0: 33900, 4.0: 51837, 5.0: 163930, 6.0: 32893, 7.0: 36043, 8.0: 59369, 9.0: 100088, 10.0: 3825, 11.0: 17322, 12.0: 66879, 13.0: 42361, 14.0: 88241, 15.0: 113170, 16.0: 31965, 17.0: 1069, 18.0: 87011, 19.0: 97758, 20.0: 108461, 21.0: 56213, 22.0: 17425, 23.0: 29233, 24.0: 3352, 25.0: 42683, 26.0: 1899, 27.0: 2305, 28.0: 2600, 29.0: 2121, 30.0: 1081, 31.0: 1005, 32.0: 748, 33.0: 1250, 34.0: 489, 35.0: 427, 36.0: 578, 37.0: 358139, 38.0: 4544, 39.0: 1672, 40.0: 18668, 41.0: 11219, 42.0: 1803, 43.0: 2397, 44.0: 6222, 45.0: 1254, 46.0: 524, 47.0: 26, 48.0: 152, 49.0: 649, 50.0: 13087, 51.0: 57560, 52.0: 170, 53.0: 195, 54.0: 47, 55.0: 1529, 56.0: 190, 57.0: 110, 58.0: 8, 59.0: 148, 60.0: 2265, 61.0: 106, 64.0: 623, 65.0: 832, 66.0: 81, 67.0: 79, 68.0: 5, 69.0: 4, 70.0: 81149}, 1: {'freq': 1833312, 'prob_class': 1808, 0.0: 1669726, 1.0: 10363, 2.0: 2253, 3.0: 3816, 4.0: 3627, 5.0: 14133, 6.0: 2115, 7.0: 2422, 8.0: 4827, 9.0: 9398, 10.0: 412, 11.0: 1315, 12.0: 5558, 13.0: 4100, 14.0: 7303, 15.0: 8679, 16.0: 3078, 17.0: 94, 18.0: 8335, 19.0: 9657, 20.0: 9506, 21.0: 4938, 22.0: 1122, 23.0: 2366, 24.0: 266, 25.0: 2511, 26.0: 157, 27.0: 189, 28.0: 176, 29.0: 180, 30.0: 37, 31.0: 72, 32.0: 27, 33.0: 94, 34.0: 72, 35.0: 20, 36.0: 37, 37.0: 27256, 38.0: 538, 39.0: 274, 40.0: 1318, 41.0: 385, 42.0: 270, 43.0: 195, 44.0: 599, 45.0: 233, 46.0: 61, 47.0: 0, 48.0: 17, 49.0: 9, 50.0: 1503, 51.0: 3913, 52.0: 11, 53.0: 4, 54.0: 0, 55.0: 255, 56.0: 23, 57.0: 11, 58.0: 0, 59.0: 14, 60.0: 207, 61.0: 20, 64.0: 26, 65.0: 26, 66.0: 0, 67.0: 0, 68.0: 0, 69.0: 0, 70.0: 3163}}\n" 687 | ] 688 | } 689 | ], 690 | "source": [ 691 | "for i in range(len(train_data_nb)):\n", 692 | " dict1[train_classes_nb_l[i]]['freq'] += len(train_data_nb[i]) \n", 693 | " dict1[train_classes_nb_l[i]]['prob_class'] +=1\n", 694 | " for j in train_data_nb[i]:\n", 695 
| " dict1[train_classes_nb_l[i]][j] +=1\n", 696 | "# print(len(train_data_nb[i]))\n", 697 | "# print(train_classes_nb_l[i])\n", 698 | "\n", 699 | "print(dict1)" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": 25, 705 | "metadata": {}, 706 | "outputs": [ 707 | { 708 | "name": "stdout", 709 | "output_type": "stream", 710 | "text": [ 711 | "69\n", 712 | "25569\n" 713 | ] 714 | } 715 | ], 716 | "source": [ 717 | "print(total_vocab)\n", 718 | "total_data_len = len(train_data_nb)\n", 719 | "print(total_data_len)\n", 720 | "\n", 721 | "\n", 722 | "# p(class) = dict[class][prob_class]/total_data_len\n", 723 | "# p(word/class) = (dict[class][word]+1)/(dict[class][freq]+total_vocab)\n", 724 | "\n", 725 | "\n" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": 26, 731 | "metadata": {}, 732 | "outputs": [], 733 | "source": [ 734 | "def prob_fun(op_class,sentence):\n", 735 | " result = dict1[op_class]['prob_class']/total_data_len\n", 736 | " for i in sentence:\n", 737 | " result *= ((dict1[op_class][i]+1)/(dict1[op_class]['freq']+total_vocab))\n", 738 | " return result" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": 27, 744 | "metadata": {}, 745 | "outputs": [ 746 | { 747 | "name": "stdout", 748 | "output_type": "stream", 749 | "text": [ 750 | "Accuracy is 79.08650086031597\n" 751 | ] 752 | } 753 | ], 754 | "source": [ 755 | "correct = 0\n", 756 | "for idx, sentence in enumerate(test_data_nb):\n", 757 | " res0 = prob_fun(0,sentence)\n", 758 | " res1 = prob_fun(1,sentence) \n", 759 | " if(res0>res1 and test_classes_nb_l[idx] ==0):\n", 760 | " correct+=1\n", 761 | " elif(res1>res0 and test_classes_nb_l[idx] ==1):\n", 762 | " correct+=1\n", 763 | " \n", 764 | "accuracy = (correct/len(test_data_nb))*100\n", 765 | "print(\"Accuracy is \",accuracy)" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": 28, 771 | "metadata": {}, 772 | "outputs": [], 773 | "source": [] 774 | } 775 | ], 776 | "metadata": { 777 | "kernelspec": { 778 | "display_name": "Python 3", 779 | "language": "python", 780 | "name": "python3" 781 | }, 782 | "language_info": { 783 | "codemirror_mode": { 784 | "name": "ipython", 785 | "version": 3 786 | }, 787 | "file_extension": ".py", 788 | "mimetype": "text/x-python", 789 | "name": "python", 790 | "nbconvert_exporter": "python", 791 | "pygments_lexer": "ipython3", 792 | "version": "3.5.2" 793 | } 794 | }, 795 | "nbformat": 4, 796 | "nbformat_minor": 1 797 | } 798 | -------------------------------------------------------------------------------- /Experience_Paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manishshettym/Offensive-Text-Detection/ed633804a09fa8d6b6c1d252ac5de371e0bdef15/Experience_Paper.pdf -------------------------------------------------------------------------------- /GRU/gru.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "gru.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "toc_visible": true 10 | }, 11 | "language_info": { 12 | "codemirror_mode": { 13 | "name": "ipython", 14 | "version": 3 15 | }, 16 | "file_extension": ".py", 17 | "mimetype": "text/x-python", 18 | "name": "python", 19 | "nbconvert_exporter": "python", 20 | "pygments_lexer": "ipython3", 21 | "version": "3.5.2" 22 | }, 23 | "kernelspec": { 24 | "display_name": "Python 3", 25 | 
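One caveat about the hand-rolled character-level Naive Bayes in `cnn_nlp_nb.ipynb` above: `prob_fun` multiplies roughly a thousand per-character probabilities per tweet, so the products shrink toward floating-point underflow and can make the class comparison fragile. Summing log-probabilities gives the same decision rule without that risk — a sketch, assuming the `dict1`, `total_data_len` and `total_vocab` built above:

```python
import math

# Log-space version of prob_fun from cnn_nlp_nb.ipynb
# (assumes dict1, total_data_len and total_vocab already exist).
def log_prob_fun(op_class, sentence):
    score = math.log(dict1[op_class]['prob_class'] / total_data_len)
    for ch in sentence:
        score += math.log((dict1[op_class][ch] + 1) /
                          (dict1[op_class]['freq'] + total_vocab))
    return score

# Same argmax decision as before, but numerically stable:
# prediction = 0 if log_prob_fun(0, sentence) > log_prob_fun(1, sentence) else 1
```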
"language": "python", 26 | "name": "python3" 27 | } 28 | }, 29 | "cells": [ 30 | { 31 | "metadata": { 32 | "id": "RKwUedd0ojVV", 33 | "colab_type": "code", 34 | "colab": {} 35 | }, 36 | "cell_type": "code", 37 | "source": [ 38 | "import pandas as pd\n", 39 | "import numpy as np\n", 40 | "\n", 41 | "from itertools import chain\n", 42 | "import matplotlib.pyplot as plt\n", 43 | "import os\n", 44 | "import nltk\n", 45 | "from nltk.corpus import stopwords \n", 46 | "from nltk.tokenize import word_tokenize" 47 | ], 48 | "execution_count": 0, 49 | "outputs": [] 50 | }, 51 | { 52 | "metadata": { 53 | "id": "JrTpwLOHojVe", 54 | "colab_type": "code", 55 | "colab": {} 56 | }, 57 | "cell_type": "code", 58 | "source": [ 59 | "from sklearn.model_selection import train_test_split\n", 60 | "from sklearn.metrics import recall_score,accuracy_score\n", 61 | "from sklearn.preprocessing import MinMaxScaler\n", 62 | "\n", 63 | "from keras.preprocessing.text import Tokenizer\n", 64 | "from keras.preprocessing.sequence import pad_sequences\n", 65 | "from keras.utils import to_categorical\n", 66 | "from keras.layers import Dense, Input, GlobalMaxPooling1D\n", 67 | "from keras.layers import GRU, MaxPooling1D, Embedding\n", 68 | "from keras.models import Model\n", 69 | "from keras import layers, Input\n", 70 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 71 | "from keras.models import load_model" 72 | ], 73 | "execution_count": 0, 74 | "outputs": [] 75 | }, 76 | { 77 | "metadata": { 78 | "id": "1Nfbtx28rRVO", 79 | "colab_type": "code", 80 | "colab": { 81 | "base_uri": "https://localhost:8080/", 82 | "height": 1217 83 | }, 84 | "outputId": "093e35fe-003a-418f-e4dc-8e94c12001c1" 85 | }, 86 | "cell_type": "code", 87 | "source": [ 88 | "!pip3 install hyperas" 89 | ], 90 | "execution_count": 4, 91 | "outputs": [ 92 | { 93 | "output_type": "stream", 94 | "text": [ 95 | "Collecting hyperas\n", 96 | " Downloading https://files.pythonhosted.org/packages/04/34/87ad6ffb42df9c1fa9c4c906f65813d42ad70d68c66af4ffff048c228cd4/hyperas-0.4.1-py3-none-any.whl\n", 97 | "Requirement already satisfied: nbconvert in /usr/local/lib/python3.6/dist-packages (from hyperas) (5.4.1)\n", 98 | "Requirement already satisfied: hyperopt in /usr/local/lib/python3.6/dist-packages (from hyperas) (0.1.2)\n", 99 | "Requirement already satisfied: jupyter in /usr/local/lib/python3.6/dist-packages (from hyperas) (1.0.0)\n", 100 | "Requirement already satisfied: entrypoints in /usr/local/lib/python3.6/dist-packages (from hyperas) (0.3)\n", 101 | "Requirement already satisfied: nbformat in /usr/local/lib/python3.6/dist-packages (from hyperas) (4.4.0)\n", 102 | "Requirement already satisfied: keras in /usr/local/lib/python3.6/dist-packages (from hyperas) (2.2.4)\n", 103 | "Requirement already satisfied: jupyter-core in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (4.4.0)\n", 104 | "Requirement already satisfied: testpath in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (0.4.2)\n", 105 | "Requirement already satisfied: defusedxml in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (0.6.0)\n", 106 | "Requirement already satisfied: mistune>=0.8.1 in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (0.8.4)\n", 107 | "Requirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (4.3.2)\n", 108 | "Requirement already satisfied: bleach in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (3.1.0)\n", 109 | 
"Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (1.4.2)\n", 110 | "Requirement already satisfied: pygments in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (2.1.3)\n", 111 | "Requirement already satisfied: jinja2 in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas) (2.10.1)\n", 112 | "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas) (1.2.1)\n", 113 | "Requirement already satisfied: networkx in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas) (2.3)\n", 114 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas) (1.16.3)\n", 115 | "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas) (4.28.1)\n", 116 | "Requirement already satisfied: pymongo in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas) (3.8.0)\n", 117 | "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas) (0.16.0)\n", 118 | "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas) (1.12.0)\n", 119 | "Requirement already satisfied: qtconsole in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas) (4.4.3)\n", 120 | "Requirement already satisfied: ipykernel in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas) (4.6.1)\n", 121 | "Requirement already satisfied: notebook in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas) (5.2.2)\n", 122 | "Requirement already satisfied: ipywidgets in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas) (7.4.2)\n", 123 | "Requirement already satisfied: jupyter-console in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas) (6.0.0)\n", 124 | "Requirement already satisfied: ipython-genutils in /usr/local/lib/python3.6/dist-packages (from nbformat->hyperas) (0.2.0)\n", 125 | "Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in /usr/local/lib/python3.6/dist-packages (from nbformat->hyperas) (2.6.0)\n", 126 | "Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from keras->hyperas) (1.0.9)\n", 127 | "Requirement already satisfied: keras-applications>=1.0.6 in /usr/local/lib/python3.6/dist-packages (from keras->hyperas) (1.0.7)\n", 128 | "Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from keras->hyperas) (3.13)\n", 129 | "Requirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from keras->hyperas) (2.8.0)\n", 130 | "Requirement already satisfied: decorator in /usr/local/lib/python3.6/dist-packages (from traitlets>=4.2->nbconvert->hyperas) (4.4.0)\n", 131 | "Requirement already satisfied: webencodings in /usr/local/lib/python3.6/dist-packages (from bleach->nbconvert->hyperas) (0.5.1)\n", 132 | "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/dist-packages (from jinja2->nbconvert->hyperas) (1.1.1)\n", 133 | "Requirement already satisfied: jupyter-client>=4.1 in /usr/local/lib/python3.6/dist-packages (from qtconsole->jupyter->hyperas) (5.2.4)\n", 134 | "Requirement already satisfied: ipython>=4.0.0 in /usr/local/lib/python3.6/dist-packages (from ipykernel->jupyter->hyperas) (5.5.0)\n", 135 | "Requirement already satisfied: tornado>=4.0 in /usr/local/lib/python3.6/dist-packages (from ipykernel->jupyter->hyperas) (4.5.3)\n", 136 | 
"Requirement already satisfied: terminado>=0.3.3; sys_platform != \"win32\" in /usr/local/lib/python3.6/dist-packages (from notebook->jupyter->hyperas) (0.8.2)\n", 137 | "Requirement already satisfied: widgetsnbextension~=3.4.0 in /usr/local/lib/python3.6/dist-packages (from ipywidgets->jupyter->hyperas) (3.4.2)\n", 138 | "Collecting prompt-toolkit<2.1.0,>=2.0.0 (from jupyter-console->jupyter->hyperas)\n", 139 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f7/a7/9b1dd14ef45345f186ef69d175bdd2491c40ab1dfa4b2b3e4352df719ed7/prompt_toolkit-2.0.9-py3-none-any.whl (337kB)\n", 140 | "\u001b[K 100% |████████████████████████████████| 337kB 26.9MB/s \n", 141 | "\u001b[?25hRequirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from jupyter-client>=4.1->qtconsole->jupyter->hyperas) (2.5.3)\n", 142 | "Requirement already satisfied: pyzmq>=13 in /usr/local/lib/python3.6/dist-packages (from jupyter-client>=4.1->qtconsole->jupyter->hyperas) (17.0.0)\n", 143 | "Requirement already satisfied: simplegeneric>0.8 in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0->ipykernel->jupyter->hyperas) (0.8.1)\n", 144 | "Requirement already satisfied: pickleshare in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0->ipykernel->jupyter->hyperas) (0.7.5)\n", 145 | "Requirement already satisfied: pexpect; sys_platform != \"win32\" in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0->ipykernel->jupyter->hyperas) (4.7.0)\n", 146 | "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0->ipykernel->jupyter->hyperas) (40.9.0)\n", 147 | "Requirement already satisfied: ptyprocess; os_name != \"nt\" in /usr/local/lib/python3.6/dist-packages (from terminado>=0.3.3; sys_platform != \"win32\"->notebook->jupyter->hyperas) (0.6.0)\n", 148 | "Requirement already satisfied: wcwidth in /usr/local/lib/python3.6/dist-packages (from prompt-toolkit<2.1.0,>=2.0.0->jupyter-console->jupyter->hyperas) (0.1.7)\n", 149 | "\u001b[31mipython 5.5.0 has requirement prompt-toolkit<2.0.0,>=1.0.4, but you'll have prompt-toolkit 2.0.9 which is incompatible.\u001b[0m\n", 150 | "Installing collected packages: hyperas, prompt-toolkit\n", 151 | " Found existing installation: prompt-toolkit 1.0.16\n", 152 | " Uninstalling prompt-toolkit-1.0.16:\n", 153 | " Successfully uninstalled prompt-toolkit-1.0.16\n", 154 | "Successfully installed hyperas-0.4.1 prompt-toolkit-2.0.9\n" 155 | ], 156 | "name": "stdout" 157 | }, 158 | { 159 | "output_type": "display_data", 160 | "data": { 161 | "application/vnd.colab-display-data+json": { 162 | "pip_warning": { 163 | "packages": [ 164 | "prompt_toolkit" 165 | ] 166 | } 167 | } 168 | }, 169 | "metadata": { 170 | "tags": [] 171 | } 172 | } 173 | ] 174 | }, 175 | { 176 | "metadata": { 177 | "id": "fCT1mIKErqZt", 178 | "colab_type": "code", 179 | "colab": { 180 | "base_uri": "https://localhost:8080/", 181 | "height": 1074 182 | }, 183 | "outputId": "1a9622d4-3b4a-45aa-e6ce-4d204dfdd09e" 184 | }, 185 | "cell_type": "code", 186 | "source": [ 187 | "!pip3 install git+https://github.com/maxpumperla/hyperas.git" 188 | ], 189 | "execution_count": 6, 190 | "outputs": [ 191 | { 192 | "output_type": "stream", 193 | "text": [ 194 | "Collecting git+https://github.com/maxpumperla/hyperas.git\n", 195 | " Cloning https://github.com/maxpumperla/hyperas.git to /tmp/pip-req-build-ym0r1e2g\n", 196 | "Requirement already satisfied (use --upgrade to upgrade): hyperas==0.4.1 from 
git+https://github.com/maxpumperla/hyperas.git in /usr/local/lib/python3.6/dist-packages\n", 197 | "Requirement already satisfied: keras in /usr/local/lib/python3.6/dist-packages (from hyperas==0.4.1) (2.2.4)\n", 198 | "Requirement already satisfied: hyperopt in /usr/local/lib/python3.6/dist-packages (from hyperas==0.4.1) (0.1.2)\n", 199 | "Requirement already satisfied: entrypoints in /usr/local/lib/python3.6/dist-packages (from hyperas==0.4.1) (0.3)\n", 200 | "Requirement already satisfied: jupyter in /usr/local/lib/python3.6/dist-packages (from hyperas==0.4.1) (1.0.0)\n", 201 | "Requirement already satisfied: nbformat in /usr/local/lib/python3.6/dist-packages (from hyperas==0.4.1) (4.4.0)\n", 202 | "Requirement already satisfied: nbconvert in /usr/local/lib/python3.6/dist-packages (from hyperas==0.4.1) (5.4.1)\n", 203 | "Requirement already satisfied: keras-applications>=1.0.6 in /usr/local/lib/python3.6/dist-packages (from keras->hyperas==0.4.1) (1.0.7)\n", 204 | "Requirement already satisfied: scipy>=0.14 in /usr/local/lib/python3.6/dist-packages (from keras->hyperas==0.4.1) (1.2.1)\n", 205 | "Requirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from keras->hyperas==0.4.1) (2.8.0)\n", 206 | "Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from keras->hyperas==0.4.1) (1.0.9)\n", 207 | "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.6/dist-packages (from keras->hyperas==0.4.1) (1.12.0)\n", 208 | "Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from keras->hyperas==0.4.1) (3.13)\n", 209 | "Requirement already satisfied: numpy>=1.9.1 in /usr/local/lib/python3.6/dist-packages (from keras->hyperas==0.4.1) (1.16.3)\n", 210 | "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas==0.4.1) (0.16.0)\n", 211 | "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas==0.4.1) (4.28.1)\n", 212 | "Requirement already satisfied: networkx in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas==0.4.1) (2.3)\n", 213 | "Requirement already satisfied: pymongo in /usr/local/lib/python3.6/dist-packages (from hyperopt->hyperas==0.4.1) (3.8.0)\n", 214 | "Requirement already satisfied: ipywidgets in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas==0.4.1) (7.4.2)\n", 215 | "Requirement already satisfied: jupyter-console in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas==0.4.1) (6.0.0)\n", 216 | "Requirement already satisfied: qtconsole in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas==0.4.1) (4.4.3)\n", 217 | "Requirement already satisfied: notebook in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas==0.4.1) (5.2.2)\n", 218 | "Requirement already satisfied: ipykernel in /usr/local/lib/python3.6/dist-packages (from jupyter->hyperas==0.4.1) (4.6.1)\n", 219 | "Requirement already satisfied: jupyter-core in /usr/local/lib/python3.6/dist-packages (from nbformat->hyperas==0.4.1) (4.4.0)\n", 220 | "Requirement already satisfied: ipython-genutils in /usr/local/lib/python3.6/dist-packages (from nbformat->hyperas==0.4.1) (0.2.0)\n", 221 | "Requirement already satisfied: traitlets>=4.1 in /usr/local/lib/python3.6/dist-packages (from nbformat->hyperas==0.4.1) (4.3.2)\n", 222 | "Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in /usr/local/lib/python3.6/dist-packages (from nbformat->hyperas==0.4.1) (2.6.0)\n", 223 | 
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas==0.4.1) (2.10.1)\n", 224 | "Requirement already satisfied: pygments in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas==0.4.1) (2.1.3)\n", 225 | "Requirement already satisfied: bleach in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas==0.4.1) (3.1.0)\n", 226 | "Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas==0.4.1) (1.4.2)\n", 227 | "Requirement already satisfied: mistune>=0.8.1 in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas==0.4.1) (0.8.4)\n", 228 | "Requirement already satisfied: defusedxml in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas==0.4.1) (0.6.0)\n", 229 | "Requirement already satisfied: testpath in /usr/local/lib/python3.6/dist-packages (from nbconvert->hyperas==0.4.1) (0.4.2)\n", 230 | "Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.6/dist-packages (from networkx->hyperopt->hyperas==0.4.1) (4.4.0)\n", 231 | "Requirement already satisfied: widgetsnbextension~=3.4.0 in /usr/local/lib/python3.6/dist-packages (from ipywidgets->jupyter->hyperas==0.4.1) (3.4.2)\n", 232 | "Requirement already satisfied: ipython>=4.0.0; python_version >= \"3.3\" in /usr/local/lib/python3.6/dist-packages (from ipywidgets->jupyter->hyperas==0.4.1) (5.5.0)\n", 233 | "Requirement already satisfied: jupyter-client in /usr/local/lib/python3.6/dist-packages (from jupyter-console->jupyter->hyperas==0.4.1) (5.2.4)\n", 234 | "Requirement already satisfied: prompt-toolkit<2.1.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from jupyter-console->jupyter->hyperas==0.4.1) (2.0.9)\n", 235 | "Requirement already satisfied: tornado>=4 in /usr/local/lib/python3.6/dist-packages (from notebook->jupyter->hyperas==0.4.1) (4.5.3)\n", 236 | "Requirement already satisfied: terminado>=0.3.3; sys_platform != \"win32\" in /usr/local/lib/python3.6/dist-packages (from notebook->jupyter->hyperas==0.4.1) (0.8.2)\n", 237 | "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/dist-packages (from jinja2->nbconvert->hyperas==0.4.1) (1.1.1)\n", 238 | "Requirement already satisfied: webencodings in /usr/local/lib/python3.6/dist-packages (from bleach->nbconvert->hyperas==0.4.1) (0.5.1)\n", 239 | "Requirement already satisfied: pexpect; sys_platform != \"win32\" in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets->jupyter->hyperas==0.4.1) (4.7.0)\n", 240 | "Requirement already satisfied: simplegeneric>0.8 in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets->jupyter->hyperas==0.4.1) (0.8.1)\n", 241 | "Requirement already satisfied: pickleshare in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets->jupyter->hyperas==0.4.1) (0.7.5)\n", 242 | "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0; python_version >= \"3.3\"->ipywidgets->jupyter->hyperas==0.4.1) (40.9.0)\n", 243 | "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from jupyter-client->jupyter-console->jupyter->hyperas==0.4.1) (2.5.3)\n", 244 | "Requirement already satisfied: pyzmq>=13 in /usr/local/lib/python3.6/dist-packages (from jupyter-client->jupyter-console->jupyter->hyperas==0.4.1) (17.0.0)\n", 245 | "Requirement already 
satisfied: wcwidth in /usr/local/lib/python3.6/dist-packages (from prompt-toolkit<2.1.0,>=2.0.0->jupyter-console->jupyter->hyperas==0.4.1) (0.1.7)\n", 246 | "Requirement already satisfied: ptyprocess; os_name != \"nt\" in /usr/local/lib/python3.6/dist-packages (from terminado>=0.3.3; sys_platform != \"win32\"->notebook->jupyter->hyperas==0.4.1) (0.6.0)\n", 247 | "Building wheels for collected packages: hyperas\n", 248 | " Building wheel for hyperas (setup.py) ... \u001b[?25ldone\n", 249 | "\u001b[?25h Stored in directory: /tmp/pip-ephem-wheel-cache-k663ho3s/wheels/27/c7/75/b70097065b73570eda25350a796d87c41cd967471a04064cc2\n", 250 | "Successfully built hyperas\n" 251 | ], 252 | "name": "stdout" 253 | } 254 | ] 255 | }, 256 | { 257 | "metadata": { 258 | "id": "QDuittmhojVo", 259 | "colab_type": "code", 260 | "colab": { 261 | "base_uri": "https://localhost:8080/", 262 | "height": 353 263 | }, 264 | "outputId": "b4dbd669-01f0-4320-cbb2-0a9365a3c5a4" 265 | }, 266 | "cell_type": "code", 267 | "source": [ 268 | "from hyperopt import Trials, STATUS_OK, tpe\n", 269 | "from hyperas import optim\n", 270 | "from hyperas.distributions import choice, uniform, conditional" 271 | ], 272 | "execution_count": 7, 273 | "outputs": [ 274 | { 275 | "output_type": "error", 276 | "ename": "ImportError", 277 | "evalue": "ignored", 278 | "traceback": [ 279 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 280 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 281 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mhyperopt\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mTrials\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSTATUS_OK\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtpe\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mhyperas\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0moptim\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mhyperas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdistributions\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mchoice\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muniform\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconditional\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 282 | "\u001b[0;31mImportError\u001b[0m: cannot import name 'conditional'", 283 | "", 284 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0;32m\nNOTE: If your import is failing due to a missing package, you can\nmanually install dependencies using either !pip or !apt.\n\nTo view examples of installing some common dependencies, click the\n\"Open Examples\" button below.\n\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n" 285 | ] 286 | } 287 | ] 288 | }, 289 | { 290 | "metadata": { 291 | "id": "q8qIM2c-t9DO", 292 | "colab_type": "code", 293 | "colab": { 294 | "base_uri": "https://localhost:8080/", 295 | "height": 128 296 | }, 297 | "outputId": "c5a7fdad-49ff-412b-a1d7-cff21b560886" 298 | }, 299 | "cell_type": "code", 300 | "source": [ 301 | "from google.colab import drive\n", 302 | "drive.mount('/content/gdrive')" 303 | ], 304 | "execution_count": 8, 305 | "outputs": [ 306 | { 307 | "output_type": "stream", 308 | "text": [ 309 | "Go to this URL in a browser: 
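The `ImportError` above is raised specifically on `conditional`, which the installed hyperas build does not export; `Trials`, `optim`, `choice` and `uniform` resolve fine. If `conditional` is not actually needed in the search space, a narrower import is a simple workaround (a sketch, not a fix guaranteed for every hyperas version):

```python
# Workaround sketch for the ImportError above: import only what the search space needs.
from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform   # 'conditional' dropped; not exported by this hyperas build
```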
https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code\n", 310 | "\n", 311 | "Enter your authorization code:\n", 312 | "··········\n", 313 | "Mounted at /content/gdrive\n" 314 | ], 315 | "name": "stdout" 316 | } 317 | ] 318 | }, 319 | { 320 | "metadata": { 321 | "id": "_rodD_a6ojVr", 322 | "colab_type": "code", 323 | "colab": {} 324 | }, 325 | "cell_type": "code", 326 | "source": [ 327 | "data = pd.read_csv('/content/gdrive/My Drive/NLP/Dataset/train_E6oV3lV.csv')\n", 328 | "data.shape[0] - data.dropna().shape[0]\n", 329 | "data_copy = data.copy()" 330 | ], 331 | "execution_count": 0, 332 | "outputs": [] 333 | }, 334 | { 335 | "metadata": { 336 | "id": "9OT54nvLojVu", 337 | "colab_type": "text" 338 | }, 339 | "cell_type": "markdown", 340 | "source": [ 341 | "### Tokenization" 342 | ] 343 | }, 344 | { 345 | "metadata": { 346 | "id": "bb6dScp0un5C", 347 | "colab_type": "code", 348 | "colab": { 349 | "base_uri": "https://localhost:8080/", 350 | "height": 72 351 | }, 352 | "outputId": "913c37fc-f861-4ad6-bc74-be9601f319d2" 353 | }, 354 | "cell_type": "code", 355 | "source": [ 356 | " nltk.download('punkt')" 357 | ], 358 | "execution_count": 12, 359 | "outputs": [ 360 | { 361 | "output_type": "stream", 362 | "text": [ 363 | "[nltk_data] Downloading package punkt to /root/nltk_data...\n", 364 | "[nltk_data] Unzipping tokenizers/punkt.zip.\n" 365 | ], 366 | "name": "stdout" 367 | }, 368 | { 369 | "output_type": "execute_result", 370 | "data": { 371 | "text/plain": [ 372 | "True" 373 | ] 374 | }, 375 | "metadata": { 376 | "tags": [] 377 | }, 378 | "execution_count": 12 379 | } 380 | ] 381 | }, 382 | { 383 | "metadata": { 384 | "id": "sCqlatYfojVv", 385 | "colab_type": "code", 386 | "colab": { 387 | "base_uri": "https://localhost:8080/", 388 | "height": 399 389 | }, 390 | "outputId": "a0c4623f-e181-4a80-d760-1569884141a5" 391 | }, 392 | "cell_type": "code", 393 | "source": [ 394 | "tokenized_single_posts = [nltk.tokenize.word_tokenize(i) for i in data.tweet]\n", 395 | "len(tokenized_single_posts)\n" 396 | ], 397 | "execution_count": 107, 398 | "outputs": [ 399 | { 400 | "output_type": "execute_result", 401 | "data": { 402 | "text/plain": [ 403 | "['@',\n", 404 | " 'user',\n", 405 | " 'when',\n", 406 | " 'a',\n", 407 | " 'father',\n", 408 | " 'is',\n", 409 | " 'dysfunctional',\n", 410 | " 'and',\n", 411 | " 'is',\n", 412 | " 'so',\n", 413 | " 'selfish',\n", 414 | " 'he',\n", 415 | " 'drags',\n", 416 | " 'his',\n", 417 | " 'kids',\n", 418 | " 'into',\n", 419 | " 'his',\n", 420 | " 'dysfunction',\n", 421 | " '.',\n", 422 | " '#',\n", 423 | " 'run']" 424 | ] 425 | }, 426 | "metadata": { 427 | "tags": [] 428 | }, 429 | "execution_count": 107 430 | } 431 | ] 432 | }, 433 | { 434 | "metadata": { 435 | "id": "Kx6wgMedBdeY", 436 | "colab_type": "code", 437 | "colab": { 438 | "base_uri": "https://localhost:8080/", 439 | "height": 55 440 | }, 441 | "outputId": "8ee9ce9a-2d1f-4e4a-fce3-866fcc42918c" 442 | }, 443 | "cell_type": "code", 444 | "source": [ 445 | "print(tokenized_single_posts[0])" 446 | ], 447 | "execution_count": 111, 448 | "outputs": [ 449 | { 450 | "output_type": "stream", 451 | "text": [ 452 | "['@', 'user', 'when', 
'a', 'father', 'is', 'dysfunctional', 'and', 'is', 'so', 'selfish', 'he', 'drags', 'his', 'kids', 'into', 'his', 'dysfunction', '.', '#', 'run']\n" 453 | ], 454 | "name": "stdout" 455 | } 456 | ] 457 | }, 458 | { 459 | "metadata": { 460 | "id": "YqyqR-pkojV3", 461 | "colab_type": "code", 462 | "colab": {} 463 | }, 464 | "cell_type": "code", 465 | "source": [ 466 | "leng = []\n", 467 | "for i in range(len(tokenized_single_posts)):\n", 468 | " length = len(tokenized_single_posts[i])\n", 469 | " leng.append(length)" 470 | ], 471 | "execution_count": 0, 472 | "outputs": [] 473 | }, 474 | { 475 | "metadata": { 476 | "id": "jvoU7GSgojV_", 477 | "colab_type": "text" 478 | }, 479 | "cell_type": "markdown", 480 | "source": [ 481 | "### stopwords removal" 482 | ] 483 | }, 484 | { 485 | "metadata": { 486 | "id": "twzVwef4u40S", 487 | "colab_type": "code", 488 | "colab": { 489 | "base_uri": "https://localhost:8080/", 490 | "height": 72 491 | }, 492 | "outputId": "7c75b940-abff-4b48-91e5-98f44517bf6a" 493 | }, 494 | "cell_type": "code", 495 | "source": [ 496 | "nltk.download('stopwords')" 497 | ], 498 | "execution_count": 16, 499 | "outputs": [ 500 | { 501 | "output_type": "stream", 502 | "text": [ 503 | "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", 504 | "[nltk_data] Unzipping corpora/stopwords.zip.\n" 505 | ], 506 | "name": "stdout" 507 | }, 508 | { 509 | "output_type": "execute_result", 510 | "data": { 511 | "text/plain": [ 512 | "True" 513 | ] 514 | }, 515 | "metadata": { 516 | "tags": [] 517 | }, 518 | "execution_count": 16 519 | } 520 | ] 521 | }, 522 | { 523 | "metadata": { 524 | "id": "tDQHKY7tojWA", 525 | "colab_type": "code", 526 | "colab": {} 527 | }, 528 | "cell_type": "code", 529 | "source": [ 530 | "import string\n", 531 | "stp_removed = []\n", 532 | "for i in range (len(tokenized_single_posts)):\n", 533 | " stp = [word for word in tokenized_single_posts[i] if word not in (stopwords.words('english')+list(string.punctuation))]\n", 534 | " stp_removed.append(stp)" 535 | ], 536 | "execution_count": 0, 537 | "outputs": [] 538 | }, 539 | { 540 | "metadata": { 541 | "id": "B72rZxyVojWF", 542 | "colab_type": "code", 543 | "colab": { 544 | "base_uri": "https://localhost:8080/", 545 | "height": 54 546 | }, 547 | "outputId": "97af4154-3db9-4a76-9dcd-9968345bae0f" 548 | }, 549 | "cell_type": "code", 550 | "source": [ 551 | "print(len(stp_removed))\n", 552 | "print(stp_removed[0])" 553 | ], 554 | "execution_count": 113, 555 | "outputs": [ 556 | { 557 | "output_type": "stream", 558 | "text": [ 559 | "31962\n", 560 | "['user', 'father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run']\n" 561 | ], 562 | "name": "stdout" 563 | } 564 | ] 565 | }, 566 | { 567 | "metadata": { 568 | "id": "zlQsyxpSojWN", 569 | "colab_type": "text" 570 | }, 571 | "cell_type": "markdown", 572 | "source": [ 573 | "### Lemmatize the stop removed posts" 574 | ] 575 | }, 576 | { 577 | "metadata": { 578 | "id": "ycFaoKT0vf2l", 579 | "colab_type": "code", 580 | "colab": { 581 | "base_uri": "https://localhost:8080/", 582 | "height": 72 583 | }, 584 | "outputId": "17bed81f-3344-48d8-de11-9cce0309cbe2" 585 | }, 586 | "cell_type": "code", 587 | "source": [ 588 | "nltk.download('wordnet')" 589 | ], 590 | "execution_count": 20, 591 | "outputs": [ 592 | { 593 | "output_type": "stream", 594 | "text": [ 595 | "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", 596 | "[nltk_data] Unzipping corpora/wordnet.zip.\n" 597 | ], 598 | "name": "stdout" 599 | }, 600 | { 601 | "output_type": 
"execute_result", 602 | "data": { 603 | "text/plain": [ 604 | "True" 605 | ] 606 | }, 607 | "metadata": { 608 | "tags": [] 609 | }, 610 | "execution_count": 20 611 | } 612 | ] 613 | }, 614 | { 615 | "metadata": { 616 | "id": "UUwmc3oTojWO", 617 | "colab_type": "code", 618 | "colab": {} 619 | }, 620 | "cell_type": "code", 621 | "source": [ 622 | "words_lemma = []\n", 623 | "lemma = nltk.WordNetLemmatizer()\n", 624 | "for i in range(len(stp_removed)):\n", 625 | " words = [lemma.lemmatize(word) for word in stp_removed[i]]\n", 626 | " words_lemma.append(words)" 627 | ], 628 | "execution_count": 0, 629 | "outputs": [] 630 | }, 631 | { 632 | "metadata": { 633 | "id": "D0y-HBS2ojWR", 634 | "colab_type": "code", 635 | "colab": { 636 | "base_uri": "https://localhost:8080/", 637 | "height": 54 638 | }, 639 | "outputId": "f138001a-1d7d-4b65-abab-38d17d4b1fc3" 640 | }, 641 | "cell_type": "code", 642 | "source": [ 643 | "print(len(words_lemma))\n", 644 | "print(words_lemma[0])" 645 | ], 646 | "execution_count": 116, 647 | "outputs": [ 648 | { 649 | "output_type": "stream", 650 | "text": [ 651 | "31962\n", 652 | "['user', 'father', 'dysfunctional', 'selfish', 'drag', 'kid', 'dysfunction', 'run']\n" 653 | ], 654 | "name": "stdout" 655 | } 656 | ] 657 | }, 658 | { 659 | "metadata": { 660 | "id": "mAK3gKzmojWd", 661 | "colab_type": "text" 662 | }, 663 | "cell_type": "markdown", 664 | "source": [ 665 | "### Remove all digit words" 666 | ] 667 | }, 668 | { 669 | "metadata": { 670 | "id": "PgtMb70xojWe", 671 | "colab_type": "code", 672 | "colab": {} 673 | }, 674 | "cell_type": "code", 675 | "source": [ 676 | "words_noNum = []\n", 677 | "for i in range(len(words_lemma)):\n", 678 | " words = [word for word in words_lemma[i] if word.isdigit() == False]\n", 679 | " words_noNum.append(words)" 680 | ], 681 | "execution_count": 0, 682 | "outputs": [] 683 | }, 684 | { 685 | "metadata": { 686 | "id": "R_Xjne6hojWi", 687 | "colab_type": "code", 688 | "colab": { 689 | "base_uri": "https://localhost:8080/", 690 | "height": 35 691 | }, 692 | "outputId": "3f1b462b-c0b9-420f-bf39-9299332933f9" 693 | }, 694 | "cell_type": "code", 695 | "source": [ 696 | "len(words_noNum)\n", 697 | "print(words_noNum[0])" 698 | ], 699 | "execution_count": 119, 700 | "outputs": [ 701 | { 702 | "output_type": "stream", 703 | "text": [ 704 | "['user', 'father', 'dysfunctional', 'selfish', 'drag', 'kid', 'dysfunction', 'run']\n" 705 | ], 706 | "name": "stdout" 707 | } 708 | ] 709 | }, 710 | { 711 | "metadata": { 712 | "id": "6hkQCk29ojWp", 713 | "colab_type": "text" 714 | }, 715 | "cell_type": "markdown", 716 | "source": [ 717 | "### Remove single character words" 718 | ] 719 | }, 720 | { 721 | "metadata": { 722 | "id": "QHDfso6-ojWr", 723 | "colab_type": "code", 724 | "colab": {} 725 | }, 726 | "cell_type": "code", 727 | "source": [ 728 | "words_nonSingle = []\n", 729 | "for i in range(len(words_noNum)):\n", 730 | " words = [word for word in words_noNum[i] if len(word) > 1]\n", 731 | " words_nonSingle.append(words)" 732 | ], 733 | "execution_count": 0, 734 | "outputs": [] 735 | }, 736 | { 737 | "metadata": { 738 | "id": "iFXZg0J9ojWw", 739 | "colab_type": "code", 740 | "colab": { 741 | "base_uri": "https://localhost:8080/", 742 | "height": 35 743 | }, 744 | "outputId": "428664cf-3dfa-4527-fbc5-51b477193876" 745 | }, 746 | "cell_type": "code", 747 | "source": [ 748 | "len(words_nonSingle)" 749 | ], 750 | "execution_count": 121, 751 | "outputs": [ 752 | { 753 | "output_type": "execute_result", 754 | "data": { 755 | "text/plain": [ 756 | "31962" 
757 | ] 758 | }, 759 | "metadata": { 760 | "tags": [] 761 | }, 762 | "execution_count": 121 763 | } 764 | ] 765 | }, 766 | { 767 | "metadata": { 768 | "id": "zCgZ2vsIojW6", 769 | "colab_type": "text" 770 | }, 771 | "cell_type": "markdown", 772 | "source": [ 773 | "### Remove non-alphabetic words" 774 | ] 775 | }, 776 | { 777 | "metadata": { 778 | "id": "37xdOC53ojW9", 779 | "colab_type": "code", 780 | "colab": {} 781 | }, 782 | "cell_type": "code", 783 | "source": [ 784 | "words_alpha = []\n", 785 | "for i in range(len(words_nonSingle)):\n", 786 | " words = [word for word in words_nonSingle[i] if word.isalpha()]\n", 787 | " words_alpha.append(words)" 788 | ], 789 | "execution_count": 0, 790 | "outputs": [] 791 | }, 792 | { 793 | "metadata": { 794 | "id": "1-V3stkaojXB", 795 | "colab_type": "code", 796 | "colab": { 797 | "base_uri": "https://localhost:8080/", 798 | "height": 54 799 | }, 800 | "outputId": "294c87b6-2e9c-4647-922c-5a675b6e8207" 801 | }, 802 | "cell_type": "code", 803 | "source": [ 804 | "print(len(words_alpha))\n", 805 | "print(words_alpha[0])" 806 | ], 807 | "execution_count": 124, 808 | "outputs": [ 809 | { 810 | "output_type": "stream", 811 | "text": [ 812 | "31962\n", 813 | "['user', 'father', 'dysfunctional', 'selfish', 'drag', 'kid', 'dysfunction', 'run']\n" 814 | ], 815 | "name": "stdout" 816 | } 817 | ] 818 | }, 819 | { 820 | "metadata": { 821 | "id": "4Z0egXsbojXH", 822 | "colab_type": "code", 823 | "colab": {} 824 | }, 825 | "cell_type": "code", 826 | "source": [ 827 | "data_copy['words_count'] = [len(i) for i in words_alpha]" 828 | ], 829 | "execution_count": 0, 830 | "outputs": [] 831 | }, 832 | { 833 | "metadata": { 834 | "id": "jjK46C2OwR58", 835 | "colab_type": "code", 836 | "colab": { 837 | "base_uri": "https://localhost:8080/", 838 | "height": 90 839 | }, 840 | "outputId": "18a37d60-36f8-4404-ca61-dccd81fc3aef" 841 | }, 842 | "cell_type": "code", 843 | "source": [ 844 | "nltk.download('averaged_perceptron_tagger')" 845 | ], 846 | "execution_count": 33, 847 | "outputs": [ 848 | { 849 | "output_type": "stream", 850 | "text": [ 851 | "[nltk_data] Downloading package averaged_perceptron_tagger to\n", 852 | "[nltk_data] /root/nltk_data...\n", 853 | "[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n" 854 | ], 855 | "name": "stdout" 856 | }, 857 | { 858 | "output_type": "execute_result", 859 | "data": { 860 | "text/plain": [ 861 | "True" 862 | ] 863 | }, 864 | "metadata": { 865 | "tags": [] 866 | }, 867 | "execution_count": 33 868 | } 869 | ] 870 | }, 871 | { 872 | "metadata": { 873 | "id": "i__VNp6CojXK", 874 | "colab_type": "code", 875 | "colab": {} 876 | }, 877 | "cell_type": "code", 878 | "source": [ 879 | "noun_freq = []\n", 880 | "verb_freq = []\n", 881 | "adjective_freq = []\n", 882 | "adverb_freq = []\n", 883 | "for i in range(len(words_alpha)):\n", 884 | " word_pos_tag = nltk.pos_tag(words_alpha[i])\n", 885 | " count_noun = 0\n", 886 | " count_verb = 0\n", 887 | " count_adjective = 0\n", 888 | " count_adverb = 0\n", 889 | " for j in range(len(word_pos_tag)):\n", 890 | " if word_pos_tag[j][1] == \"NN\":\n", 891 | " count_noun += 1\n", 892 | " if word_pos_tag[j][1] == 'VB':\n", 893 | " count_verb += 1\n", 894 | " if word_pos_tag[j][1] == 'JJ':\n", 895 | " count_adjective += 1\n", 896 | " if word_pos_tag[j][1] == 'RB':\n", 897 | " count_adverb += 1\n", 898 | " noun_freq.append(count_noun/(len(words_alpha[i]) + 1))\n", 899 | " verb_freq.append(count_verb/(len(words_alpha[i])+1))\n", 900 | " 
adjective_freq.append(count_adjective/(len(words_alpha[i])+1))\n", 901 | " adverb_freq.append(count_adverb/(len(words_alpha[i])+1))" 902 | ], 903 | "execution_count": 0, 904 | "outputs": [] 905 | }, 906 | { 907 | "metadata": { 908 | "id": "macvpusmojXR", 909 | "colab_type": "code", 910 | "colab": {} 911 | }, 912 | "cell_type": "code", 913 | "source": [ 914 | "freq_dict = {'noun_freq' : noun_freq, 'verb_freq' : verb_freq, 'adjective_freq' : adjective_freq, 'adverb_freq' : adverb_freq}" 915 | ], 916 | "execution_count": 0, 917 | "outputs": [] 918 | }, 919 | { 920 | "metadata": { 921 | "id": "H1jvbthTojXU", 922 | "colab_type": "code", 923 | "colab": {} 924 | }, 925 | "cell_type": "code", 926 | "source": [ 927 | "data_copy = data_copy.join(pd.DataFrame(freq_dict))" 928 | ], 929 | "execution_count": 0, 930 | "outputs": [] 931 | }, 932 | { 933 | "metadata": { 934 | "id": "wK399OlHojXW", 935 | "colab_type": "code", 936 | "colab": { 937 | "base_uri": "https://localhost:8080/", 938 | "height": 198 939 | }, 940 | "outputId": "775da18b-6df1-4a0d-ca26-357d1f61e2a1" 941 | }, 942 | "cell_type": "code", 943 | "source": [ 944 | "data_copy = data_copy.drop(['id'],axis=1)\n", 945 | "data_copy.tail()\n" 946 | ], 947 | "execution_count": 141, 948 | "outputs": [ 949 | { 950 | "output_type": "execute_result", 951 | "data": { 952 | "text/html": [ 953 | "
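One caveat about the part-of-speech loop above: it counts only the exact Penn tags `NN`, `VB`, `JJ` and `RB`, so plural nouns (`NNS`), proper nouns (`NNP`), past-tense verbs (`VBD`), comparatives (`JJR`) and the like never reach the frequencies. If the intent is "share of nouns / verbs / adjectives / adverbs per tweet", NLTK's coarse universal tagset collapses those variants automatically. The helper below is an illustrative sketch (the name `pos_ratios` is mine, and it additionally assumes `nltk.download('universal_tagset')` has been run alongside the tagger download above); it keeps the same `len(tokens) + 1` smoothing as the original loop:

```python
from collections import Counter
import nltk

def pos_ratios(tokens):
    """Fraction of NOUN/VERB/ADJ/ADV tokens in one tweet, using the
    universal tagset so NNS, VBD, JJR, ... fold into their class."""
    tags = Counter(tag for _, tag in nltk.pos_tag(tokens, tagset='universal'))
    denom = len(tokens) + 1   # same smoothing as the loop above
    return {'noun_freq':      tags['NOUN'] / denom,
            'verb_freq':      tags['VERB'] / denom,
            'adjective_freq': tags['ADJ'] / denom,
            'adverb_freq':    tags['ADV'] / denom}

# e.g. pos_df = pd.DataFrame([pos_ratios(t) for t in words_alpha])
```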
\n", 954 | "\n", 967 | "\n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | "
labeltweetwords_countnoun_freqverb_freqadjective_freqadverb_freq
319570ate @user isz that youuu?😍😍😍😍😍ð...40.6000000.2000000.0000000.0
319580to see nina turner on the airwaves trying to...140.4000000.0666670.1333330.0
319590listening to sad songs on a monday morning otw...80.6666670.0000000.1111110.0
319601@user #sikh #temple vandalised in in #calgary,...80.6666670.0000000.1111110.0
319610thank you @user for you follow30.7500000.0000000.0000000.0
\n", 1033 | "
" 1034 | ], 1035 | "text/plain": [ 1036 | " label tweet words_count \\\n", 1037 | "31957 0 ate @user isz that youuu?😍😍😍😍😍ð... 4 \n", 1038 | "31958 0 to see nina turner on the airwaves trying to... 14 \n", 1039 | "31959 0 listening to sad songs on a monday morning otw... 8 \n", 1040 | "31960 1 @user #sikh #temple vandalised in in #calgary,... 8 \n", 1041 | "31961 0 thank you @user for you follow 3 \n", 1042 | "\n", 1043 | " noun_freq verb_freq adjective_freq adverb_freq \n", 1044 | "31957 0.600000 0.200000 0.000000 0.0 \n", 1045 | "31958 0.400000 0.066667 0.133333 0.0 \n", 1046 | "31959 0.666667 0.000000 0.111111 0.0 \n", 1047 | "31960 0.666667 0.000000 0.111111 0.0 \n", 1048 | "31961 0.750000 0.000000 0.000000 0.0 " 1049 | ] 1050 | }, 1051 | "metadata": { 1052 | "tags": [] 1053 | }, 1054 | "execution_count": 141 1055 | } 1056 | ] 1057 | }, 1058 | { 1059 | "metadata": { 1060 | "id": "xRCSGkzqGn1r", 1061 | "colab_type": "code", 1062 | "colab": { 1063 | "base_uri": "https://localhost:8080/", 1064 | "height": 108 1065 | }, 1066 | "outputId": "56c7238e-30b2-4dbe-80e9-3df0e065667e" 1067 | }, 1068 | "cell_type": "code", 1069 | "source": [ 1070 | "features_distrbn = data_copy.groupby(['label'], as_index = False)['words_count', 'adjective_freq', 'noun_freq', 'adverb_freq', 'verb_freq'].mean()\n", 1071 | "np.round(features_distrbn, 3)" 1072 | ], 1073 | "execution_count": 143, 1074 | "outputs": [ 1075 | { 1076 | "output_type": "execute_result", 1077 | "data": { 1078 | "text/html": [ 1079 | "
\n", 1080 | "\n", 1093 | "\n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | "
labelwords_countadjective_freqnoun_freqadverb_freqverb_freq
007.7850.1620.4460.0500.032
118.3680.1800.4360.0490.025
\n", 1126 | "
" 1127 | ], 1128 | "text/plain": [ 1129 | " label words_count adjective_freq noun_freq adverb_freq verb_freq\n", 1130 | "0 0 7.785 0.162 0.446 0.050 0.032\n", 1131 | "1 1 8.368 0.180 0.436 0.049 0.025" 1132 | ] 1133 | }, 1134 | "metadata": { 1135 | "tags": [] 1136 | }, 1137 | "execution_count": 143 1138 | } 1139 | ] 1140 | }, 1141 | { 1142 | "metadata": { 1143 | "id": "VH6d05-2ojXg", 1144 | "colab_type": "text" 1145 | }, 1146 | "cell_type": "markdown", 1147 | "source": [ 1148 | "#### read the GLoVE 100 file for preparing an Embedding layer of 100 dimenions" 1149 | ] 1150 | }, 1151 | { 1152 | "metadata": { 1153 | "id": "MlcN9Z3OxaV5", 1154 | "colab_type": "code", 1155 | "colab": {} 1156 | }, 1157 | "cell_type": "code", 1158 | "source": [ 1159 | "from google.colab import files\n", 1160 | "uploaded = files.upload()" 1161 | ], 1162 | "execution_count": 0, 1163 | "outputs": [] 1164 | }, 1165 | { 1166 | "metadata": { 1167 | "id": "-3Ul8DpiojXh", 1168 | "colab_type": "code", 1169 | "colab": { 1170 | "base_uri": "https://localhost:8080/", 1171 | "height": 35 1172 | }, 1173 | "outputId": "b5972abc-4035-4bb8-eada-88ea2ac0089a" 1174 | }, 1175 | "cell_type": "code", 1176 | "source": [ 1177 | "MAX_SEQUENCE_LENGTH = 50\n", 1178 | "MAX_NUM_WORDS = 10000\n", 1179 | "EMBEDDING_DIM = 100\n", 1180 | "VALIDATION_SPLIT = 0.2\n", 1181 | "\n", 1182 | "embeddings_index = {}\n", 1183 | "\n", 1184 | "with open(\"/content/glove.6B.100d.txt\", encoding='utf-8') as f:\n", 1185 | " for line in f:\n", 1186 | " values = line.split()\n", 1187 | " word = values[0]\n", 1188 | " coefs = np.asarray(values[1:], dtype='float32')\n", 1189 | " embeddings_index[word] = coefs\n", 1190 | "print('Found %s word vectors.' % len(embeddings_index))" 1191 | ], 1192 | "execution_count": 144, 1193 | "outputs": [ 1194 | { 1195 | "output_type": "stream", 1196 | "text": [ 1197 | "Found 400000 word vectors.\n" 1198 | ], 1199 | "name": "stdout" 1200 | } 1201 | ] 1202 | }, 1203 | { 1204 | "metadata": { 1205 | "id": "aT9Vlu4wojXm", 1206 | "colab_type": "text" 1207 | }, 1208 | "cell_type": "markdown", 1209 | "source": [ 1210 | "#### Initialize the Embedding layer with pre-trained weights from the GLoVE model:" 1211 | ] 1212 | }, 1213 | { 1214 | "metadata": { 1215 | "id": "6u6uoO2RojXn", 1216 | "colab_type": "code", 1217 | "colab": {} 1218 | }, 1219 | "cell_type": "code", 1220 | "source": [ 1221 | "## Define the sequence lengths, max number of words and embedding dimensions\n", 1222 | "MAX_SEQUENCE_LENGTH = 50 # Sequence length of each sentence. If more, crop. If less, pad with zeros\n", 1223 | "MAX_NB_WORDS = 10000 # Top 10000 frequently occuring words" 1224 | ], 1225 | "execution_count": 0, 1226 | "outputs": [] 1227 | }, 1228 | { 1229 | "metadata": { 1230 | "id": "YCAa9ShNojXp", 1231 | "colab_type": "text" 1232 | }, 1233 | "cell_type": "markdown", 1234 | "source": [ 1235 | "#### Train-validation-test split. 
We separate validation data to use early stopping callback feature during training our model below" 1236 | ] 1237 | }, 1238 | { 1239 | "metadata": { 1240 | "id": "OkmtWUWDojXp", 1241 | "colab_type": "code", 1242 | "colab": { 1243 | "base_uri": "https://localhost:8080/", 1244 | "height": 35 1245 | }, 1246 | "outputId": "f0826fff-6bcb-43bd-8e06-8a66c539c167" 1247 | }, 1248 | "cell_type": "code", 1249 | "source": [ 1250 | "X = data_copy.iloc[:,1:7]\n", 1251 | "y = data_copy['label']\n", 1252 | "print(len(X),len(y))\n", 1253 | "\n" 1254 | ], 1255 | "execution_count": 147, 1256 | "outputs": [ 1257 | { 1258 | "output_type": "stream", 1259 | "text": [ 1260 | "31962 31962\n" 1261 | ], 1262 | "name": "stdout" 1263 | } 1264 | ] 1265 | }, 1266 | { 1267 | "metadata": { 1268 | "id": "OVaXFlc_ojXt", 1269 | "colab_type": "code", 1270 | "colab": {} 1271 | }, 1272 | "cell_type": "code", 1273 | "source": [ 1274 | "x_train_val, x_test, y_train_val, y_test = train_test_split(X,y, test_size = 0.3, random_state = 123)" 1275 | ], 1276 | "execution_count": 0, 1277 | "outputs": [] 1278 | }, 1279 | { 1280 | "metadata": { 1281 | "id": "WqLkUFhc0Y6T", 1282 | "colab_type": "code", 1283 | "colab": {} 1284 | }, 1285 | "cell_type": "code", 1286 | "source": [ 1287 | "x_train, x_val, y_train, y_val = train_test_split(x_train_val, \n", 1288 | " y_train_val, test_size = 0.2, random_state = 123)" 1289 | ], 1290 | "execution_count": 0, 1291 | "outputs": [] 1292 | }, 1293 | { 1294 | "metadata": { 1295 | "id": "wlizcka71g2g", 1296 | "colab_type": "text" 1297 | }, 1298 | "cell_type": "markdown", 1299 | "source": [ 1300 | "rebuild the full training data-frame to double-check that word count and POS tagging remain useful features across the training data set as well. Note above when we analysed the test data we did not include any labels data or standardization measures, thereby strictly ensuring no leakage of test information into the trainnig space" 1301 | ] 1302 | }, 1303 | { 1304 | "metadata": { 1305 | "id": "uMu87ILmojX1", 1306 | "colab_type": "code", 1307 | "colab": { 1308 | "base_uri": "https://localhost:8080/", 1309 | "height": 198 1310 | }, 1311 | "outputId": "2fa8059a-728e-4fdb-81e2-08956c4d86a2" 1312 | }, 1313 | "cell_type": "code", 1314 | "source": [ 1315 | "train_full = pd.concat([x_train, y_train], axis = 1)\n", 1316 | "train_full.head()" 1317 | ], 1318 | "execution_count": 150, 1319 | "outputs": [ 1320 | { 1321 | "output_type": "execute_result", 1322 | "data": { 1323 | "text/html": [ 1324 | "
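For orientation, the two-stage split above works out to roughly 56% / 14% / 30% of the corpus: `test_size=0.3` reserves 30% for test, and `test_size=0.2` of the remaining 70% gives 0.7 × 0.8 ≈ 0.56 for training and 0.7 × 0.2 = 0.14 for validation — matching the (17898, …), (4475, …), (9589, …) shapes printed a few cells further down. If the offensive class is the minority label (as is typical for this kind of dataset), adding `stratify=` would keep the label ratio comparable across the three subsets; this is an optional variant, not what the cells above do:

```python
# Hypothetical stratified version of the same two-stage split.
x_train_val, x_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123, stratify=y)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val, test_size=0.2, random_state=123,
    stratify=y_train_val)
```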
\n", 1325 | "\n", 1338 | "\n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | "
tweetwords_countnoun_freqverb_freqadjective_freqadverb_freqlabel
4574here's a ?for the day the government and state...120.6923080.00.0000000.0000001
6142cardiff!!!! 󾌩. #cardiff #euro2016 #euro #fo...90.8000000.00.0000000.0000000
22578@user where on eah is pastor #mmusimaimane? is...100.6363640.00.1818180.0909091
3709@user and the panel is seated! @user @user @u...80.4444440.00.2222220.0000000
5766so often we think our country has come so far ...90.3000000.00.1000000.2000000
\n", 1404 | "
" 1405 | ], 1406 | "text/plain": [ 1407 | " tweet words_count \\\n", 1408 | "4574 here's a ?for the day the government and state... 12 \n", 1409 | "6142 cardiff!!!! 󾌩. #cardiff #euro2016 #euro #fo... 9 \n", 1410 | "22578 @user where on eah is pastor #mmusimaimane? is... 10 \n", 1411 | "3709 @user and the panel is seated! @user @user @u... 8 \n", 1412 | "5766 so often we think our country has come so far ... 9 \n", 1413 | "\n", 1414 | " noun_freq verb_freq adjective_freq adverb_freq label \n", 1415 | "4574 0.692308 0.0 0.000000 0.000000 1 \n", 1416 | "6142 0.800000 0.0 0.000000 0.000000 0 \n", 1417 | "22578 0.636364 0.0 0.181818 0.090909 1 \n", 1418 | "3709 0.444444 0.0 0.222222 0.000000 0 \n", 1419 | "5766 0.300000 0.0 0.100000 0.200000 0 " 1420 | ] 1421 | }, 1422 | "metadata": { 1423 | "tags": [] 1424 | }, 1425 | "execution_count": 150 1426 | } 1427 | ] 1428 | }, 1429 | { 1430 | "metadata": { 1431 | "id": "2wTqA_y31oI0", 1432 | "colab_type": "code", 1433 | "colab": { 1434 | "base_uri": "https://localhost:8080/", 1435 | "height": 108 1436 | }, 1437 | "outputId": "cb581d5c-20aa-4e90-db2f-f5f8d4cd961e" 1438 | }, 1439 | "cell_type": "code", 1440 | "source": [ 1441 | "word_count_distrbn_train = train_full.groupby(['label'], \n", 1442 | " as_index = False)['words_count', 'adjective_freq', 'noun_freq', 'adverb_freq', 'verb_freq'].mean()\n", 1443 | "np.round(word_count_distrbn_train, 3)" 1444 | ], 1445 | "execution_count": 151, 1446 | "outputs": [ 1447 | { 1448 | "output_type": "execute_result", 1449 | "data": { 1450 | "text/html": [ 1451 | "
\n", 1452 | "\n", 1465 | "\n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | "
labelwords_countadjective_freqnoun_freqadverb_freqverb_freq
007.7510.1620.4460.050.031
118.2680.1770.4320.050.025
\n", 1498 | "
" 1499 | ], 1500 | "text/plain": [ 1501 | " label words_count adjective_freq noun_freq adverb_freq verb_freq\n", 1502 | "0 0 7.751 0.162 0.446 0.05 0.031\n", 1503 | "1 1 8.268 0.177 0.432 0.05 0.025" 1504 | ] 1505 | }, 1506 | "metadata": { 1507 | "tags": [] 1508 | }, 1509 | "execution_count": 151 1510 | } 1511 | ] 1512 | }, 1513 | { 1514 | "metadata": { 1515 | "id": "Qn3LB0mF1uYn", 1516 | "colab_type": "code", 1517 | "colab": { 1518 | "base_uri": "https://localhost:8080/", 1519 | "height": 54 1520 | }, 1521 | "outputId": "94570376-ed97-4029-96fe-8c37521f3963" 1522 | }, 1523 | "cell_type": "code", 1524 | "source": [ 1525 | "print(x_train.shape, x_val.shape, x_test.shape)\n", 1526 | "print(y_train.shape, y_val.shape, y_test.shape)" 1527 | ], 1528 | "execution_count": 154, 1529 | "outputs": [ 1530 | { 1531 | "output_type": "stream", 1532 | "text": [ 1533 | "(17898, 6) (4475, 6) (9589, 6)\n", 1534 | "(17898,) (4475,) (9589,)\n" 1535 | ], 1536 | "name": "stdout" 1537 | } 1538 | ] 1539 | }, 1540 | { 1541 | "metadata": { 1542 | "id": "w1IuJwhX3OXK", 1543 | "colab_type": "text" 1544 | }, 1545 | "cell_type": "markdown", 1546 | "source": [ 1547 | "The train-validation-test original texts are tokenized and padded for machine-readable formatting" 1548 | ] 1549 | }, 1550 | { 1551 | "metadata": { 1552 | "id": "cVFNLC4u3U0O", 1553 | "colab_type": "code", 1554 | "colab": { 1555 | "base_uri": "https://localhost:8080/", 1556 | "height": 163 1557 | }, 1558 | "outputId": "2992d5f3-6525-4c2f-f785-2e88fc300b51" 1559 | }, 1560 | "cell_type": "code", 1561 | "source": [ 1562 | "tokenizer = Tokenizer(num_words=MAX_NB_WORDS) # get the frequently occuring words\n", 1563 | "tokenizer.fit_on_texts(x_train.tweet) \n", 1564 | "train_sequences = tokenizer.texts_to_sequences(x_train.tweet)\n", 1565 | "val_sequences = tokenizer.texts_to_sequences(x_val.tweet)\n", 1566 | "test_sequences = tokenizer.texts_to_sequences(x_test.tweet)\n", 1567 | "\n", 1568 | "word_index = tokenizer.word_index # dictionary containing words and their index\n", 1569 | "#print(tokenizer.word_index) # print to check\n", 1570 | "print('Found %s unique tokens.' 
% len(word_index)) # total words in the corpus\n", 1571 | "train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", 1572 | "val_data = pad_sequences(val_sequences, maxlen = MAX_SEQUENCE_LENGTH)# get only the top frequent words on train\n", 1573 | "test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH) # get only the top frequent words on test\n", 1574 | "\n", 1575 | "print(train_data[0])\n", 1576 | "print(train_data.shape)\n", 1577 | "print(val_data.shape)\n", 1578 | "print(test_data.shape)" 1579 | ], 1580 | "execution_count": 157, 1581 | "outputs": [ 1582 | { 1583 | "output_type": "stream", 1584 | "text": [ 1585 | "Found 31519 unique tokens.\n", 1586 | "[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 1587 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 1588 | " 0 0 944 4 9 2 19 2 1477 7 1087 1854 65 2520\n", 1589 | " 3200 666 3201 412 3629 8 1329 20]\n", 1590 | "(17898, 50)\n", 1591 | "(4475, 50)\n", 1592 | "(9589, 50)\n" 1593 | ], 1594 | "name": "stdout" 1595 | } 1596 | ] 1597 | }, 1598 | { 1599 | "metadata": { 1600 | "id": "pgy6Vh_eJSdT", 1601 | "colab_type": "text" 1602 | }, 1603 | "cell_type": "markdown", 1604 | "source": [ 1605 | "standardize all the numeric features generated above by by fitting them on training data and using that to transform validation and test data respectively\n" 1606 | ] 1607 | }, 1608 | { 1609 | "metadata": { 1610 | "id": "JBO_5xvcJVJr", 1611 | "colab_type": "code", 1612 | "colab": {} 1613 | }, 1614 | "cell_type": "code", 1615 | "source": [ 1616 | "scaleable_cols = ['words_count', 'adjective_freq', 'noun_freq', 'adverb_freq', 'verb_freq']" 1617 | ], 1618 | "execution_count": 0, 1619 | "outputs": [] 1620 | }, 1621 | { 1622 | "metadata": { 1623 | "id": "neV_czZVJm_U", 1624 | "colab_type": "code", 1625 | "colab": { 1626 | "base_uri": "https://localhost:8080/", 1627 | "height": 146 1628 | }, 1629 | "outputId": "858e5670-52fa-4ad0-8d2c-0b27bf43ba49" 1630 | }, 1631 | "cell_type": "code", 1632 | "source": [ 1633 | "scaler_multicol = MinMaxScaler()\n", 1634 | "train_multicol_scaled = scaler_multicol.fit_transform(x_train[scaleable_cols])\n", 1635 | "val_multicol_scaled = scaler_multicol.fit_transform(x_val[scaleable_cols])\n", 1636 | "test_multicol_scaled = scaler_multicol.fit_transform(x_test[scaleable_cols])" 1637 | ], 1638 | "execution_count": 158, 1639 | "outputs": [ 1640 | { 1641 | "output_type": "stream", 1642 | "text": [ 1643 | "/usr/local/lib/python3.6/dist-packages/sklearn/preprocessing/data.py:334: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by MinMaxScaler.\n", 1644 | " return self.partial_fit(X, y)\n", 1645 | "/usr/local/lib/python3.6/dist-packages/sklearn/preprocessing/data.py:334: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by MinMaxScaler.\n", 1646 | " return self.partial_fit(X, y)\n", 1647 | "/usr/local/lib/python3.6/dist-packages/sklearn/preprocessing/data.py:334: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by MinMaxScaler.\n", 1648 | " return self.partial_fit(X, y)\n" 1649 | ], 1650 | "name": "stderr" 1651 | } 1652 | ] 1653 | }, 1654 | { 1655 | "metadata": { 1656 | "id": "gwdAMxrtJ8ba", 1657 | "colab_type": "code", 1658 | "colab": { 1659 | "base_uri": "https://localhost:8080/", 1660 | "height": 35 1661 | }, 1662 | "outputId": "10b5c69b-2d2f-4b77-99da-0d57720f45a4" 1663 | }, 1664 | "cell_type": "code", 1665 | "source": [ 1666 | "train_multicol_scaled[2]" 1667 | ], 1668 | "execution_count": 161, 1669 | 
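One mismatch between the markdown above and the scaling cell: the text says the scaler is fitted on the training data and then used to transform validation and test data, but the code calls `fit_transform` on all three subsets, so each split is rescaled with its own min/max. To make the cell do what the description says (fit once on train, reuse the learned statistics elsewhere), only the last two calls need to change — a sketch using the notebook's own variable names:

```python
from sklearn.preprocessing import MinMaxScaler

scaler_multicol = MinMaxScaler()
# Learn the min/max on the training split only ...
train_multicol_scaled = scaler_multicol.fit_transform(x_train[scaleable_cols])
# ... and reuse those statistics for validation and test.
val_multicol_scaled = scaler_multicol.transform(x_val[scaleable_cols])
test_multicol_scaled = scaler_multicol.transform(x_test[scaleable_cols])
```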
"outputs": [ 1670 | { 1671 | "output_type": "execute_result", 1672 | "data": { 1673 | "text/plain": [ 1674 | "array([0.27027027, 0.22727273, 0.65356265, 0.12121212, 0. ])" 1675 | ] 1676 | }, 1677 | "metadata": { 1678 | "tags": [] 1679 | }, 1680 | "execution_count": 161 1681 | } 1682 | ] 1683 | }, 1684 | { 1685 | "metadata": { 1686 | "id": "GPv8yUtTKFnc", 1687 | "colab_type": "code", 1688 | "colab": { 1689 | "base_uri": "https://localhost:8080/", 1690 | "height": 72 1691 | }, 1692 | "outputId": "7cf5d92a-b0ac-4c21-c23b-0aa671454683" 1693 | }, 1694 | "cell_type": "code", 1695 | "source": [ 1696 | "train_data = np.hstack((train_data, train_multicol_scaled))\n", 1697 | "val_data = np.hstack((val_data, val_multicol_scaled))\n", 1698 | "test_data = np.hstack((test_data, test_multicol_scaled))\n", 1699 | "\n", 1700 | "print(train_data.shape)\n", 1701 | "print(val_data.shape)\n", 1702 | "print(test_data.shape)" 1703 | ], 1704 | "execution_count": 162, 1705 | "outputs": [ 1706 | { 1707 | "output_type": "stream", 1708 | "text": [ 1709 | "(17898, 55)\n", 1710 | "(4475, 55)\n", 1711 | "(9589, 55)\n" 1712 | ], 1713 | "name": "stdout" 1714 | } 1715 | ] 1716 | }, 1717 | { 1718 | "metadata": { 1719 | "id": "Zenq_yPSKoNf", 1720 | "colab_type": "code", 1721 | "colab": { 1722 | "base_uri": "https://localhost:8080/", 1723 | "height": 272 1724 | }, 1725 | "outputId": "009c4709-7c97-40b9-952e-902b95222eb7" 1726 | }, 1727 | "cell_type": "code", 1728 | "source": [ 1729 | "print(train_data[0])" 1730 | ], 1731 | "execution_count": 163, 1732 | "outputs": [ 1733 | { 1734 | "output_type": "stream", 1735 | "text": [ 1736 | "[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00\n", 1737 | " 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00\n", 1738 | " 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00\n", 1739 | " 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00\n", 1740 | " 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00\n", 1741 | " 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00\n", 1742 | " 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00\n", 1743 | " 0.00000000e+00 0.00000000e+00 9.44000000e+02 4.00000000e+00\n", 1744 | " 9.00000000e+00 2.00000000e+00 1.90000000e+01 2.00000000e+00\n", 1745 | " 1.47700000e+03 7.00000000e+00 1.08700000e+03 1.85400000e+03\n", 1746 | " 6.50000000e+01 2.52000000e+03 3.20000000e+03 6.66000000e+02\n", 1747 | " 3.20100000e+03 4.12000000e+02 3.62900000e+03 8.00000000e+00\n", 1748 | " 1.32900000e+03 2.00000000e+01 3.24324324e-01 0.00000000e+00\n", 1749 | " 7.11018711e-01 0.00000000e+00 0.00000000e+00]\n" 1750 | ], 1751 | "name": "stdout" 1752 | } 1753 | ] 1754 | }, 1755 | { 1756 | "metadata": { 1757 | "id": "M-UkgyTGL9N4", 1758 | "colab_type": "text" 1759 | }, 1760 | "cell_type": "markdown", 1761 | "source": [ 1762 | "initialize the embedding matrix from GLoVE 100 which will be used as the initial weights of the Embedding layer in our neural network" 1763 | ] 1764 | }, 1765 | { 1766 | "metadata": { 1767 | "id": "PGcKcltqKuL_", 1768 | "colab_type": "code", 1769 | "colab": {} 1770 | }, 1771 | "cell_type": "code", 1772 | "source": [ 1773 | "num_words = min(MAX_NB_WORDS, len(embeddings_index))\n", 1774 | "embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))\n", 1775 | "for word, i in word_index.items():\n", 1776 | " if i >= MAX_NB_WORDS:\n", 1777 | " continue\n", 1778 | " embedding_vector = embeddings_index.get(word)\n", 1779 | " if embedding_vector is not None:\n", 1780 | " # words not found in embedding index will be 
all-zeros.\n", 1781 | " embedding_matrix[i] = embedding_vector" 1782 | ], 1783 | "execution_count": 0, 1784 | "outputs": [] 1785 | }, 1786 | { 1787 | "metadata": { 1788 | "id": "VQAjA0ujNCVc", 1789 | "colab_type": "text" 1790 | }, 1791 | "cell_type": "markdown", 1792 | "source": [ 1793 | "The following cell declares our final model that has:\n", 1794 | "\n", 1795 | "\n", 1796 | "\n", 1797 | "\n", 1798 | "* an initial Embedding layer with pre-trained weights from GLoVE 100 as described above, hence trainable = False at this layer,\n", 1799 | "* two layers of GRU model with in-between BatchNormalization and Dropout as mentioned\n", 1800 | "* the neural network uses the Keras function API and is then split after the second GRU layer into a one output model - 'label',\n", 1801 | "* the output model has two Dense() layers as we want to visualize the learning of the network just prior to it's final predictions\n", 1802 | "\n" 1803 | ] 1804 | }, 1805 | { 1806 | "metadata": { 1807 | "id": "ltd8golNLu86", 1808 | "colab_type": "code", 1809 | "colab": { 1810 | "base_uri": "https://localhost:8080/", 1811 | "height": 545 1812 | }, 1813 | "outputId": "1e5660ab-9b7b-41f0-d97b-2603baeb2d5d" 1814 | }, 1815 | "cell_type": "code", 1816 | "source": [ 1817 | "posts_input = Input(shape=(None,), dtype='int32', name='all_posts')\n", 1818 | "embedded_posts = Embedding(input_dim=MAX_NB_WORDS,\n", 1819 | " input_length=MAX_SEQUENCE_LENGTH, \n", 1820 | " output_dim=EMBEDDING_DIM,\n", 1821 | " weights=[embedding_matrix],\n", 1822 | " trainable=False)(posts_input)\n", 1823 | "\n", 1824 | "x = layers.GRU(128, activation='relu', return_sequences = True)(embedded_posts)\n", 1825 | "x = layers.BatchNormalization()(x)\n", 1826 | "x = layers.Dropout(0.2)(x)\n", 1827 | "x = layers.GRU(64, activation = 'relu')(x)\n", 1828 | "x = layers.Dense(16, activation='relu')(x)\n", 1829 | "x = layers.Dropout(0.2)(x)\n", 1830 | "\n", 1831 | "label_pred = layers.Dense(8, activation = 'relu', name = 'label0')(x)\n", 1832 | "label_pred = layers.Dropout(0.5)(label_pred)\n", 1833 | "label_pred = layers.Dense(1, activation = 'sigmoid', name = 'label1')(label_pred)\n", 1834 | "\n", 1835 | "\n", 1836 | "combined_model = Model(posts_input, [label_pred])\n", 1837 | "combined_model.summary()" 1838 | ], 1839 | "execution_count": 172, 1840 | "outputs": [ 1841 | { 1842 | "output_type": "stream", 1843 | "text": [ 1844 | "_________________________________________________________________\n", 1845 | "Layer (type) Output Shape Param # \n", 1846 | "=================================================================\n", 1847 | "all_posts (InputLayer) (None, None) 0 \n", 1848 | "_________________________________________________________________\n", 1849 | "embedding_4 (Embedding) (None, 50, 100) 1000000 \n", 1850 | "_________________________________________________________________\n", 1851 | "gru_7 (GRU) (None, 50, 128) 87936 \n", 1852 | "_________________________________________________________________\n", 1853 | "batch_normalization_4 (Batch (None, 50, 128) 512 \n", 1854 | "_________________________________________________________________\n", 1855 | "dropout_10 (Dropout) (None, 50, 128) 0 \n", 1856 | "_________________________________________________________________\n", 1857 | "gru_8 (GRU) (None, 64) 37056 \n", 1858 | "_________________________________________________________________\n", 1859 | "dense_4 (Dense) (None, 16) 1040 \n", 1860 | "_________________________________________________________________\n", 1861 | "dropout_11 (Dropout) (None, 16) 0 \n", 1862 | 
"_________________________________________________________________\n", 1863 | "label0 (Dense) (None, 8) 136 \n", 1864 | "_________________________________________________________________\n", 1865 | "dropout_12 (Dropout) (None, 8) 0 \n", 1866 | "_________________________________________________________________\n", 1867 | "label1 (Dense) (None, 1) 9 \n", 1868 | "=================================================================\n", 1869 | "Total params: 1,126,689\n", 1870 | "Trainable params: 126,433\n", 1871 | "Non-trainable params: 1,000,256\n", 1872 | "_________________________________________________________________\n" 1873 | ], 1874 | "name": "stdout" 1875 | } 1876 | ] 1877 | }, 1878 | { 1879 | "metadata": { 1880 | "id": "n1CtLxGkPeiq", 1881 | "colab_type": "code", 1882 | "colab": {} 1883 | }, 1884 | "cell_type": "code", 1885 | "source": [ 1886 | "callbacks_list = [EarlyStopping(monitor='val_loss', patience=1, ),\n", 1887 | " ModelCheckpoint(filepath='model_multi-feature.h5', monitor='val_loss',\n", 1888 | " save_best_only=True,)]" 1889 | ], 1890 | "execution_count": 0, 1891 | "outputs": [] 1892 | }, 1893 | { 1894 | "metadata": { 1895 | "id": "EG3t8OY3QZT5", 1896 | "colab_type": "code", 1897 | "colab": {} 1898 | }, 1899 | "cell_type": "code", 1900 | "source": [ 1901 | "combined_model.compile(optimizer = 'rmsprop',loss = {'label1' : 'binary_crossentropy'},metrics = ['acc'])" 1902 | ], 1903 | "execution_count": 0, 1904 | "outputs": [] 1905 | }, 1906 | { 1907 | "metadata": { 1908 | "id": "UOEAvDNvRi9F", 1909 | "colab_type": "code", 1910 | "colab": { 1911 | "base_uri": "https://localhost:8080/", 1912 | "height": 126 1913 | }, 1914 | "outputId": "d87df6db-68a5-464b-aa27-7aebc933899f" 1915 | }, 1916 | "cell_type": "code", 1917 | "source": [ 1918 | "y_train.head()" 1919 | ], 1920 | "execution_count": 180, 1921 | "outputs": [ 1922 | { 1923 | "output_type": "execute_result", 1924 | "data": { 1925 | "text/plain": [ 1926 | "4574 1\n", 1927 | "6142 0\n", 1928 | "22578 1\n", 1929 | "3709 0\n", 1930 | "5766 0\n", 1931 | "Name: label, dtype: int64" 1932 | ] 1933 | }, 1934 | "metadata": { 1935 | "tags": [] 1936 | }, 1937 | "execution_count": 180 1938 | } 1939 | ] 1940 | }, 1941 | { 1942 | "metadata": { 1943 | "id": "Q2Zvd-FdSKOw", 1944 | "colab_type": "code", 1945 | "colab": {} 1946 | }, 1947 | "cell_type": "code", 1948 | "source": [ 1949 | "epochs = 16\n", 1950 | "batch_size = 32" 1951 | ], 1952 | "execution_count": 0, 1953 | "outputs": [] 1954 | }, 1955 | { 1956 | "metadata": { 1957 | "id": "yvCmN-euQzRu", 1958 | "colab_type": "code", 1959 | "colab": { 1960 | "base_uri": "https://localhost:8080/", 1961 | "height": 219 1962 | }, 1963 | "outputId": "2ea93371-ebab-40f3-c098-5135a322ca3a" 1964 | }, 1965 | "cell_type": "code", 1966 | "source": [ 1967 | "hist = combined_model.fit(train_data, {'label1' : y_train},\n", 1968 | " epochs = epochs, batch_size = batch_size,\n", 1969 | " callbacks = callbacks_list,\n", 1970 | " validation_data = (val_data, {'label1' : y_val})).history" 1971 | ], 1972 | "execution_count": 183, 1973 | "outputs": [ 1974 | { 1975 | "output_type": "stream", 1976 | "text": [ 1977 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", 1978 | "Instructions for updating:\n", 1979 | "Use tf.cast instead.\n", 1980 | "Train on 17898 samples, validate on 4475 samples\n", 1981 | "Epoch 1/16\n", 1982 | "17898/17898 
[==============================] - 78s 4ms/step - loss: 0.2876 - acc: 0.9151 - val_loss: 0.1718 - val_acc: 0.9280\n", 1983 | "Epoch 2/16\n", 1984 | "17898/17898 [==============================] - 76s 4ms/step - loss: 0.2168 - acc: 0.9327 - val_loss: 0.1531 - val_acc: 0.9457\n", 1985 | "Epoch 3/16\n", 1986 | "17898/17898 [==============================] - 77s 4ms/step - loss: 0.1857 - acc: 0.9412 - val_loss: 0.2215 - val_acc: 0.9457\n" 1987 | ], 1988 | "name": "stdout" 1989 | } 1990 | ] 1991 | }, 1992 | { 1993 | "metadata": { 1994 | "id": "509Mzz6MSFPx", 1995 | "colab_type": "code", 1996 | "colab": {} 1997 | }, 1998 | "cell_type": "code", 1999 | "source": [ 2000 | "model1 = load_model('model_multi-feature.h5')" 2001 | ], 2002 | "execution_count": 0, 2003 | "outputs": [] 2004 | }, 2005 | { 2006 | "metadata": { 2007 | "id": "HZSy7KslVbUq", 2008 | "colab_type": "code", 2009 | "colab": {} 2010 | }, 2011 | "cell_type": "code", 2012 | "source": [ 2013 | "pred = model1.predict(test_data)" 2014 | ], 2015 | "execution_count": 0, 2016 | "outputs": [] 2017 | }, 2018 | { 2019 | "metadata": { 2020 | "id": "uN9SsGqdV9df", 2021 | "colab_type": "code", 2022 | "colab": {} 2023 | }, 2024 | "cell_type": "code", 2025 | "source": [ 2026 | "label_list = list(chain.from_iterable(pred))\n", 2027 | "label_predict = [1 if x >= 0.5 else 0 for x in label_list]\n", 2028 | "label_recall = recall_score(label_predict, y_test)\n", 2029 | "label_acc = accuracy_score(label_predict, y_test)" 2030 | ], 2031 | "execution_count": 0, 2032 | "outputs": [] 2033 | }, 2034 | { 2035 | "metadata": { 2036 | "id": "AIe0pYyJXHF8", 2037 | "colab_type": "code", 2038 | "colab": { 2039 | "base_uri": "https://localhost:8080/", 2040 | "height": 54 2041 | }, 2042 | "outputId": "cbc33c75-69cb-4082-a88d-68a02019fbbf" 2043 | }, 2044 | "cell_type": "code", 2045 | "source": [ 2046 | "print(\"ACCURACY : \",label_acc)\n", 2047 | "\n", 2048 | "print(\"Recall : \",label_recall)" 2049 | ], 2050 | "execution_count": 196, 2051 | "outputs": [ 2052 | { 2053 | "output_type": "stream", 2054 | "text": [ 2055 | "ACCURACY : 0.9465011992908541\n", 2056 | "Recall : 0.7821428571428571\n" 2057 | ], 2058 | "name": "stdout" 2059 | } 2060 | ] 2061 | }, 2062 | { 2063 | "metadata": { 2064 | "id": "0dDWyW7BXDcj", 2065 | "colab_type": "code", 2066 | "colab": {} 2067 | }, 2068 | "cell_type": "code", 2069 | "source": [ 2070 | "" 2071 | ], 2072 | "execution_count": 0, 2073 | "outputs": [] 2074 | } 2075 | ] 2076 | } -------------------------------------------------------------------------------- /Project_Synopsis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manishshettym/Offensive-Text-Detection/ed633804a09fa8d6b6c1d252ac5de371e0bdef15/Project_Synopsis.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Offensive Text Detection using NLP 2 | 3 | This project aims to detect offensive ( Racist , Sexist etc ) text from social media posts (tweets) using NLP techniques 4 | and sentiment analysis with the help of Machine Learning / Deep learning models. 5 | 6 | ### Please find the Experience paper: [Experience Paper](https://github.com/ManishShettyM/NLP-Offensive-Text-Detection/blob/master/Experience_Paper.pdf) 7 | 8 | #### References 9 | 10 | Mahmud, A., Ahmed, K.Z. and Khan, M., 2008. Detecting flames and insults in text. 
11 | https://www.researchgate.net/publication/49242911_Detecting_flames_and_insults_in_text 12 | 13 | Kshirsagar, R., Cukuvac, T., McKeown, K. and McGregor, S., 2018. Predictive embeddings for hate speech detection on twitter. arXiv preprint arXiv:1809.10644. 14 | https://aclweb.org/anthology/W18-5104 15 | 16 | Amplayo, R.K. and Occidental, J., 2015. Multi-level classifier for the detection of insults in social media. In Proceedings of 15th Philippine Computing Science Congress. 17 | https://www.researchgate.net/publication/273381302_Multi-level_classifier_for_the_detection_of_insults_in_social_media 18 | 19 | Malmasi, S. and Zampieri, M., 2017. Detecting hate speech in social media. arXiv preprint arXiv:1712.06427. 20 | https://arxiv.org/abs/1712.06427 21 | 22 | Sax, S., 2016. Flame wars: Automatic insult detection. 23 | https://cs224d.stanford.edu/reports/Sax.pdf 24 | 25 | Biere, S., Bhulai, S. and Analytics, M.B., 2018. Hate speech detection using natural language processing techniques. Master Business AnalyticsDepartment of Mathematics Faculty of Science. 26 | https://beta.vu.nl/nl/Images/werkstuk-biere_tcm235-893877.pdf 27 | 28 | Çano, E. and Morisio, M., 2019. Word embeddings for sentiment analysis: a comprehensive empirical survey. arXiv preprint arXiv:1902.00753. 29 | https://www.academia.edu/38464940/Word_Embeddings_for_Sentiment_Analysis_A_Comprehensive_Empirical_Survey 30 | -------------------------------------------------------------------------------- /Reference/.ipynb_checkpoints/Naive Bayes-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 62, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Import libraries\n", 10 | "import pandas as pd\n", 11 | "from sklearn.model_selection import train_test_split\n", 12 | "from sklearn.feature_extraction.text import CountVectorizer\n", 13 | "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n", 14 | "import pandas as pd\n", 15 | "import numpy as np" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 63, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "Index(['id', 'label', 'tweet'], dtype='object')" 27 | ] 28 | }, 29 | "execution_count": 63, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "df = pd.read_csv('../Dataset/train_E6oV3lV.csv')\n", 36 | "df.columns" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 64, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "from sklearn.utils import shuffle\n", 46 | "df = shuffle(df)\n", 47 | "\n", 48 | "train, test = train_test_split(df, test_size=0.)\n", 49 | "X_train = train[\"tweet\"]\n", 50 | "X_test = test[\"tweet\"]\n", 51 | "y_train = train[\"label\"]\n", 52 | "y_test = test[\"label\"]" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 65, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# Instantiate the CountVectorizer method\n", 62 | "count_vector = CountVectorizer(stop_words = 'english')\n", 63 | "\n", 64 | "# Fit the training data and then return the matrix\n", 65 | "training_data = count_vector.fit_transform(X_train)\n", 66 | "\n", 67 | "# Transform testing data and return the matrix. 
Note we are not fitting the testing data into the CountVectorizer()\n", 68 | "testing_data = count_vector.transform(X_test)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 66, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "ename": "ValueError", 78 | "evalue": "could not convert string to float: \"@user @user yep!! back to #sameolestory some black films @user don't deserve any marketing @user \"", 79 | "output_type": "error", 80 | "traceback": [ 81 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 82 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 83 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnaive_bayes\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mMultinomialNB\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mnaive_bayes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mMultinomialNB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mnaive_bayes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mpredictions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnaive_bayes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 84 | "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/naive_bayes.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 577\u001b[0m \u001b[0mReturns\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 578\u001b[0m \"\"\"\n\u001b[0;32m--> 579\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'csr'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 580\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_features\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 581\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 85 | "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m 571\u001b[0m X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,\n\u001b[1;32m 572\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mallow_nd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mensure_min_samples\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 573\u001b[0;31m ensure_min_features, warn_on_dtype, estimator)\n\u001b[0m\u001b[1;32m 574\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmulti_output\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 575\u001b[0m y = check_array(y, 'csr', 
force_all_finite=True, ensure_2d=False,\n", 86 | "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m 431\u001b[0m force_all_finite)\n\u001b[1;32m 432\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 433\u001b[0;31m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 434\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 435\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 87 | "\u001b[0;31mValueError\u001b[0m: could not convert string to float: \"@user @user yep!! back to #sameolestory some black films @user don't deserve any marketing @user \"" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "from sklearn.naive_bayes import MultinomialNB\n", 93 | "naive_bayes = MultinomialNB()\n", 94 | "naive_bayes.fit(training_data, y_train)\n", 95 | "predictions = naive_bayes.predict(testing_data)\n", 96 | "\n", 97 | "print('Accuracy score: ', format(accuracy_score(y_test, predictions)))\n", 98 | "print('Precision score: ', format(precision_score(y_test, predictions)))\n", 99 | "print('Recall score: ', format(recall_score(y_test, predictions)))\n", 100 | "print('F1 score: ', format(f1_score(y_test, predictions)))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [] 116 | } 117 | ], 118 | "metadata": { 119 | "kernelspec": { 120 | "display_name": "Python 3", 121 | "language": "python", 122 | "name": "python3" 123 | }, 124 | "language_info": { 125 | "codemirror_mode": { 126 | "name": "ipython", 127 | "version": 3 128 | }, 129 | "file_extension": ".py", 130 | "mimetype": "text/x-python", 131 | "name": "python", 132 | "nbconvert_exporter": "python", 133 | "pygments_lexer": "ipython3", 134 | "version": "3.5.2" 135 | } 136 | }, 137 | "nbformat": 4, 138 | "nbformat_minor": 2 139 | } 140 | --------------------------------------------------------------------------------
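Finally, two remarks on the reference checkpoint notebook above, whose `ValueError` output is stale relative to its saved source. The traceback was produced by an earlier run that fitted the classifier on the raw text (`naive_bayes.fit(X_train, y_train)` with `X_train` being tweet strings — hence "could not convert string to float"), whereas the saved cell already fits on the vectorised `training_data`, so the string-to-float failure comes from that stale run rather than the code as saved. The split cell's `test_size=0.` also looks like a truncated value; presumably a non-zero fraction was intended, for example:

```python
# Illustrative only: a conventional hold-out instead of test_size=0.
train, test = train_test_split(df, test_size=0.25, random_state=1)
```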