├── LICENSE.md ├── Offensive_tweets.ipynb ├── README.md ├── app.py ├── classifying.py ├── datasets ├── training-v1.zip ├── training-v1 │ ├── offenseval-annotation.txt │ ├── offenseval-training-v1.tsv │ └── readme-trainingset-v1.txt ├── trial-data.zip └── trial-data │ ├── OffensEval-READMEv1.txt │ └── offenseval-trial.txt ├── embedding.py ├── helper.py ├── load_test_data.py └── preprocessing.py /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Ahmed Hammad 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Offensive_tweets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": { 5 | "id": "M5EkrzM-1_JU", 6 | "colab_type": "code", 7 | "colab": {} 8 | }, 9 | "cell_type": "code", 10 | "source": [ 11 | "#Imports\n", 12 | "import pandas as pd\n", 13 | "import numpy as np\n", 14 | "from tqdm import tqdm\n", 15 | "import copy" 16 | ], 17 | "execution_count": 0, 18 | "outputs": [] 19 | }, 20 | { 21 | "metadata": { 22 | "id": "q5m6vPje2hiy", 23 | "colab_type": "code", 24 | "outputId": "09f7d3cc-88a7-4a7b-e8ba-775406b76ac5", 25 | "colab": { 26 | "base_uri": "https://localhost:8080/", 27 | "height": 34 28 | } 29 | }, 30 | "cell_type": "code", 31 | "source": [ 32 | "train_directory = \"https://raw.githubusercontent.com/ahmedhammad97/offensive-dataset/master/offenseval-training-v1.tsv?token=AVUYG3_VUnrSFPn3HUsRuK-seXmB92NIks5cPnJ7wA%3D%3D\"\n", 33 | "print(\"Reading Dataset...\")\n", 34 | "train_data = pd.read_csv(train_directory, sep='\\t', header=0)" 35 | ], 36 | "execution_count": 6, 37 | "outputs": [ 38 | { 39 | "output_type": "stream", 40 | "text": [ 41 | "Reading Dataset...\n" 42 | ], 43 | "name": "stdout" 44 | } 45 | ] 46 | }, 47 | { 48 | "metadata": { 49 | "id": "ucASsaD24HvU", 50 | "colab_type": "code", 51 | "colab": {} 52 | }, 53 | "cell_type": "code", 54 | "source": [ 55 | "tweets = train_data[[\"tweet\"]]\n", 56 | "subtask_a_labels = train_data[[\"subtask_a\"]]\n", 57 | "subtask_b_labels = train_data.query(\"subtask_a == 'OFF'\")[[\"subtask_b\"]]\n", 58 | "subtask_c_labels = train_data.query(\"subtask_b == 'TIN'\")[[\"subtask_c\"]]\n", 59 | "\n", 60 | "clean_tweets = copy.deepcopy(tweets)" 61 | ], 62 | "execution_count": 0, 63 | "outputs": [] 64 | }, 65 | { 66 | "metadata": { 67 | "id": "J3FKc2Kp4MxS", 68 | "colab_type": "code", 69 | "colab": {} 70 | }, 71 | "cell_type": "code", 72 | "source": [ 73 | 
"##PREPROCESSING##" 74 | ], 75 | "execution_count": 0, 76 | "outputs": [] 77 | }, 78 | { 79 | "metadata": { 80 | "id": "EJHU2cRi5SH1", 81 | "colab_type": "code", 82 | "colab": { 83 | "base_uri": "https://localhost:8080/", 84 | "height": 50 85 | }, 86 | "outputId": "7208c3c9-7fc1-4862-e923-89db2eeea598" 87 | }, 88 | "cell_type": "code", 89 | "source": [ 90 | "import re\n", 91 | "import nltk\n", 92 | "nltk.download('punkt', 'stopwords')\n", 93 | "from nltk.corpus import stopwords\n", 94 | "from nltk.tokenize import word_tokenize\n", 95 | "from nltk.stem.lancaster import LancasterStemmer\n", 96 | "lancaster_stemmer = LancasterStemmer()\n", 97 | "from nltk.stem import WordNetLemmatizer\n", 98 | "wordnet_lemmatizer = WordNetLemmatizer()" 99 | ], 100 | "execution_count": 9, 101 | "outputs": [ 102 | { 103 | "output_type": "stream", 104 | "text": [ 105 | "[nltk_data] Downloading package punkt to stopwords...\n", 106 | "[nltk_data] Package punkt is already up-to-date!\n" 107 | ], 108 | "name": "stdout" 109 | } 110 | ] 111 | }, 112 | { 113 | "metadata": { 114 | "id": "r7qm0EYh5bhB", 115 | "colab_type": "code", 116 | "colab": {} 117 | }, 118 | "cell_type": "code", 119 | "source": [ 120 | "def take_data_to_shower(tweet):\n", 121 | " noises = ['URL', '@USER', '\\'ve', 'n\\'t', '\\'s', '\\'m']\n", 122 | "\n", 123 | " for noise in noises:\n", 124 | " tweet = tweet.replace(noise, '')\n", 125 | "\n", 126 | " return re.sub(r'[^a-zA-Z]', ' ', tweet)\n", 127 | "\n", 128 | "\n", 129 | "def tokenize(tweet):\n", 130 | " lower_tweet = tweet.lower()\n", 131 | " return word_tokenize(lower_tweet)\n", 132 | "\n", 133 | "\n", 134 | "def remove_stop_words(tokens):\n", 135 | " clean_tokens = []\n", 136 | " stopWords = set(stopwords.words('english'))\n", 137 | " for token in tokens:\n", 138 | " if token not in stopWords:\n", 139 | " if token.replace(' ', '') != '':\n", 140 | " if len(token) > 1:\n", 141 | " clean_tokens.append(token)\n", 142 | " return clean_tokens\n", 143 | "\n", 144 | "\n", 145 
| "def stem_and_lem(tokens):\n", 146 | " clean_tokens = []\n", 147 | " for token in tokens:\n", 148 | " token = wordnet_lemmatizer.lemmatize(token)\n", 149 | " token = lancaster_stemmer.stem(token)\n", 150 | " if len(token) > 1:\n", 151 | " clean_tokens.append(token)\n", 152 | " return clean_tokens\n" 153 | ], 154 | "execution_count": 0, 155 | "outputs": [] 156 | }, 157 | { 158 | "metadata": { 159 | "id": "jSwKKcS35gmW", 160 | "colab_type": "code", 161 | "colab": { 162 | "base_uri": "https://localhost:8080/", 163 | "height": 84 164 | }, 165 | "outputId": "d7b0f0e6-2bf1-48fe-d95d-6ab2e74d5573" 166 | }, 167 | "cell_type": "code", 168 | "source": [ 169 | "tqdm.pandas(desc=\"Cleaning Data Phase I...\")\n", 170 | "clean_tweets['tweet'] = tweets['tweet'].progress_apply(take_data_to_shower)\n", 171 | "\n", 172 | "tqdm.pandas(desc=\"Tokenizing Data...\")\n", 173 | "clean_tweets['tokens'] = clean_tweets['tweet'].progress_apply(tokenize)\n", 174 | "\n", 175 | "tqdm.pandas(desc=\"Cleaning Data Phase II...\")\n", 176 | "clean_tweets['tokens'] = clean_tweets['tokens'].progress_apply(remove_stop_words)\n", 177 | "\n", 178 | "tqdm.pandas(desc=\"Stemming And Lemmatizing\")\n", 179 | "clean_tweets['tokens'] = clean_tweets['tokens'].progress_apply(stem_and_lem)\n", 180 | "\n", 181 | "text_vector = clean_tweets['tokens'].tolist()\n" 182 | ], 183 | "execution_count": 11, 184 | "outputs": [ 185 | { 186 | "output_type": "stream", 187 | "text": [ 188 | "Cleaning Data Phase I...: 100%|██████████| 13240/13240 [00:00<00:00, 90838.52it/s]\n", 189 | "Tokenizing Data...: 100%|██████████| 13240/13240 [00:02<00:00, 5726.47it/s]\n", 190 | "Cleaning Data Phase II...: 100%|██████████| 13240/13240 [00:02<00:00, 5410.97it/s]\n", 191 | "Stemming And Lemmatizing: 100%|██████████| 13240/13240 [00:05<00:00, 2476.73it/s]\n" 192 | ], 193 | "name": "stderr" 194 | } 195 | ] 196 | }, 197 | { 198 | "metadata": { 199 | "id": "8Y2fN2MC8Ara", 200 | "colab_type": "code", 201 | "colab": {} 202 | }, 203 | 
"cell_type": "code", 204 | "source": [ 205 | "##EMBEDDING##" 206 | ], 207 | "execution_count": 0, 208 | "outputs": [] 209 | }, 210 | { 211 | "metadata": { 212 | "id": "UoVtfyxb5lp3", 213 | "colab_type": "code", 214 | "colab": {} 215 | }, 216 | "cell_type": "code", 217 | "source": [ 218 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 219 | "\n", 220 | "def tfid(text_vector):\n", 221 | " vectorizer = TfidfVectorizer()\n", 222 | " untokenized_data =[' '.join(tweet) for tweet in tqdm(text_vector, \"Vectorizing...\")]\n", 223 | " vectorizer = vectorizer.fit(untokenized_data)\n", 224 | " vectors = vectorizer.transform(untokenized_data).toarray()\n", 225 | " return vectors\n", 226 | " \n", 227 | "def get_vectors(vectors, labels, keyword):\n", 228 | " if len(vectors) != len(labels):\n", 229 | " print(\"Unmatching sizes!\")\n", 230 | " return\n", 231 | " result = list()\n", 232 | " for vector, label in zip(vectors, labels):\n", 233 | " if label == keyword:\n", 234 | " result.append(vector)\n", 235 | " return result" 236 | ], 237 | "execution_count": 0, 238 | "outputs": [] 239 | }, 240 | { 241 | "metadata": { 242 | "id": "fuA8n1dt7VJe", 243 | "colab_type": "code", 244 | "outputId": "6a9a0365-8a2e-4ac9-dc4f-28083de3f0ab", 245 | "colab": { 246 | "base_uri": "https://localhost:8080/", 247 | "height": 34 248 | } 249 | }, 250 | "cell_type": "code", 251 | "source": [ 252 | "vectors_a = tfid(text_vector) # Numerical Vectors A\n", 253 | "labels_a = subtask_a_labels['subtask_a'].values.tolist() # Subtask A Labels\n", 254 | "\n", 255 | "vectors_b = get_vectors(vectors_a, labels_a, \"OFF\") # Numerical Vectors B\n", 256 | "labels_b = subtask_b_labels['subtask_b'].values.tolist() # Subtask B Labels\n", 257 | "\n", 258 | "vectors_c = get_vectors(vectors_b, labels_b, \"TIN\") # Numerical Vectors C\n", 259 | "labels_c = subtask_c_labels['subtask_c'].values.tolist() # Subtask C Labels" 260 | ], 261 | "execution_count": 14, 262 | "outputs": [ 263 | { 264 | "output_type": 
"stream", 265 | "text": [ 266 | "Vectorizing...: 100%|██████████| 13240/13240 [00:00<00:00, 687301.48it/s]\n" 267 | ], 268 | "name": "stderr" 269 | } 270 | ] 271 | }, 272 | { 273 | "metadata": { 274 | "id": "2xxecoX_7dWS", 275 | "colab_type": "code", 276 | "colab": {} 277 | }, 278 | "cell_type": "code", 279 | "source": [ 280 | "##CLASSIFING##" 281 | ], 282 | "execution_count": 0, 283 | "outputs": [] 284 | }, 285 | { 286 | "metadata": { 287 | "id": "vllyKsR475YC", 288 | "colab_type": "code", 289 | "colab": {} 290 | }, 291 | "cell_type": "code", 292 | "source": [ 293 | "from sklearn.model_selection import train_test_split\n", 294 | "from sklearn.neighbors import KNeighborsClassifier\n", 295 | "from sklearn.svm import SVC\n", 296 | "from sklearn.naive_bayes import GaussianNB, MultinomialNB\n", 297 | "from sklearn.tree import DecisionTreeClassifier\n", 298 | "from sklearn.ensemble import RandomForestClassifier\n", 299 | "from sklearn.metrics import accuracy_score, confusion_matrix\n", 300 | "from sklearn.model_selection import GridSearchCV\n", 301 | "from sklearn.linear_model import LogisticRegression\n", 302 | "\n", 303 | "def classify(vectors, labels, type=\"DT\"):\n", 304 | " # Random Splitting With Ratio 3 : 1\n", 305 | " train_vectors, test_vectors, train_labels, test_labels = train_test_split(vectors, labels, test_size=0.25)\n", 306 | "\n", 307 | " # Initialize Model\n", 308 | " classifier = None\n", 309 | " if(type==\"MNB\"):\n", 310 | " classifier = MultinomialNB(alpha=0.7)\n", 311 | " classifier.fit(train_vectors, train_labels)\n", 312 | " elif(type==\"KNN\"):\n", 313 | " classifier = KNeighborsClassifier(n_jobs=4)\n", 314 | " params = {'n_neighbors': [3,5,7,9], 'weights':['uniform', 'distance']}\n", 315 | " classifier = GridSearchCV(classifier, params, cv=3, n_jobs=4)\n", 316 | " classifier.fit(train_vectors, train_labels)\n", 317 | " classifier = classifier.best_estimator_\n", 318 | " elif(type==\"SVM\"):\n", 319 | " classifier = SVC()\n", 320 | " classifier 
= GridSearchCV(classifier, {'C':[0.001, 0.01, 0.1, 1, 10]}, cv=3, n_jobs=4)\n", 321 | " classifier.fit(train_vectors, train_labels)\n", 322 | " classifier = classifier.best_estimator_\n", 323 | " elif(type==\"DT\"):\n", 324 | " classifier = DecisionTreeClassifier(max_depth=800, min_samples_split=5)\n", 325 | " params = {'criterion':['gini','entropy']}\n", 326 | " classifier = GridSearchCV(classifier, params, cv=3, n_jobs=4)\n", 327 | " classifier.fit(train_vectors, train_labels)\n", 328 | " classifier = classifier.best_estimator_\n", 329 | " elif(type==\"RF\"):\n", 330 | " classifier = RandomForestClassifier(max_depth=800, min_samples_split=5)\n", 331 | " params = {'n_estimators': [n for n in range(50,200,50)], 'criterion':['gini','entropy'], }\n", 332 | " classifier = GridSearchCV(classifier, params, cv=3, n_jobs=4)\n", 333 | " classifier.fit(train_vectors, train_labels)\n", 334 | " classifier = classifier.best_estimator_\n", 335 | " elif(type==\"LR\"):\n", 336 | " classifier = LogisticRegression(multi_class='auto', solver='newton-cg',)\n", 337 | " classifier = GridSearchCV(classifier, {\"C\":np.logspace(-3,3,7), \"penalty\":[\"l2\"]}, cv=3, n_jobs=4)\n", 338 | " classifier.fit(train_vectors, train_labels)\n", 339 | " classifier = classifier.best_estimator_\n", 340 | " else:\n", 341 | " print(\"Wrong Classifier Type!\")\n", 342 | " return\n", 343 | "\n", 344 | " accuracy = accuracy_score(train_labels, classifier.predict(train_vectors))\n", 345 | " print(\"Training Accuracy:\", accuracy)\n", 346 | " test_predictions = classifier.predict(test_vectors)\n", 347 | " accuracy = accuracy_score(test_labels, test_predictions)\n", 348 | " print(\"Test Accuracy:\", accuracy)\n", 349 | " print(\"Confusion Matrix:\", )\n", 350 | " print(confusion_matrix(test_labels, test_predictions))" 351 | ], 352 | "execution_count": 0, 353 | "outputs": [] 354 | }, 355 | { 356 | "metadata": { 357 | "id": "nUc62OmA8KrF", 358 | "colab_type": "code", 359 | "outputId": 
"af7faf87-d5ff-4c7d-987a-c35c0c3839c2", 360 | "colab": { 361 | "base_uri": "https://localhost:8080/", 362 | "height": 188 363 | } 364 | }, 365 | "cell_type": "code", 366 | "source": [ 367 | "print(\"\\nBuilding Model Subtask A...\")\n", 368 | "classify(vectors_a[:], labels_a[:], \"SVM\") # {MNB, KNN, SVM, DT, RF, LR}" 369 | ], 370 | "execution_count": 35, 371 | "outputs": [ 372 | { 373 | "output_type": "stream", 374 | "text": [ 375 | "\n", 376 | "Building Model Subtask A...\n" 377 | ], 378 | "name": "stdout" 379 | }, 380 | { 381 | "output_type": "stream", 382 | "text": [ 383 | "/usr/local/lib/python3.6/dist-packages/sklearn/svm/base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n", 384 | " \"avoid this warning.\", FutureWarning)\n" 385 | ], 386 | "name": "stderr" 387 | }, 388 | { 389 | "output_type": "stream", 390 | "text": [ 391 | "Training Accuracy: 0.66\n", 392 | "Test Accuracy: 0.664\n", 393 | "Confusion Matrix:\n", 394 | "[[166 0]\n", 395 | " [ 84 0]]\n" 396 | ], 397 | "name": "stdout" 398 | } 399 | ] 400 | }, 401 | { 402 | "metadata": { 403 | "id": "ASbewYhr8Yha", 404 | "colab_type": "code", 405 | "colab": { 406 | "base_uri": "https://localhost:8080/", 407 | "height": 222 408 | }, 409 | "outputId": "da461e12-13c6-4c7d-f090-9c37c5eb3bac" 410 | }, 411 | "cell_type": "code", 412 | "source": [ 413 | "print(\"\\nBuilding Model Subtask B...\")\n", 414 | "classify(vectors_b[:], labels_b[:], \"SVM\") # {MNB, KNN, SVM, DT, RF, LR}" 415 | ], 416 | "execution_count": 36, 417 | "outputs": [ 418 | { 419 | "output_type": "stream", 420 | "text": [ 421 | "\n", 422 | "Building Model Subtask B...\n" 423 | ], 424 | "name": "stdout" 425 | }, 426 | { 427 | "output_type": "stream", 428 | "text": [ 429 | "/usr/local/lib/python3.6/dist-packages/sklearn/externals/joblib/externals/loky/process_executor.py:706: UserWarning: A 
worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.\n", 430 | " \"timeout or by a memory leak.\", UserWarning\n", 431 | "/usr/local/lib/python3.6/dist-packages/sklearn/svm/base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n", 432 | " \"avoid this warning.\", FutureWarning)\n" 433 | ], 434 | "name": "stderr" 435 | }, 436 | { 437 | "output_type": "stream", 438 | "text": [ 439 | "Training Accuracy: 0.8773333333333333\n", 440 | "Test Accuracy: 0.884\n", 441 | "Confusion Matrix:\n", 442 | "[[221 0]\n", 443 | " [ 29 0]]\n" 444 | ], 445 | "name": "stdout" 446 | } 447 | ] 448 | }, 449 | { 450 | "metadata": { 451 | "id": "TAF7Vz3nlWj8", 452 | "colab_type": "code", 453 | "colab": { 454 | "base_uri": "https://localhost:8080/", 455 | "height": 205 456 | }, 457 | "outputId": "3aa22dd2-07e6-4768-ec44-80d18b697687" 458 | }, 459 | "cell_type": "code", 460 | "source": [ 461 | "print(\"\\nBuilding Model Subtask C...\")\n", 462 | "classify(vectors_c[:], labels_c[:], \"SVM\") # {MNB, KNN, SVM, DT, RF, LR}" 463 | ], 464 | "execution_count": 37, 465 | "outputs": [ 466 | { 467 | "output_type": "stream", 468 | "text": [ 469 | "\n", 470 | "Building Model Subtask C...\n" 471 | ], 472 | "name": "stdout" 473 | }, 474 | { 475 | "output_type": "stream", 476 | "text": [ 477 | "/usr/local/lib/python3.6/dist-packages/sklearn/svm/base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. 
Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n", 478 | " \"avoid this warning.\", FutureWarning)\n" 479 | ], 480 | "name": "stderr" 481 | }, 482 | { 483 | "output_type": "stream", 484 | "text": [ 485 | "Training Accuracy: 0.628\n", 486 | "Test Accuracy: 0.604\n", 487 | "Confusion Matrix:\n", 488 | "[[ 0 70 0]\n", 489 | " [ 0 151 0]\n", 490 | " [ 0 29 0]]\n" 491 | ], 492 | "name": "stdout" 493 | } 494 | ] 495 | }, 496 | { 497 | "metadata": { 498 | "id": "QfhWGl8zCFt3", 499 | "colab_type": "code", 500 | "colab": {} 501 | }, 502 | "cell_type": "code", 503 | "source": [ 504 | "" 505 | ], 506 | "execution_count": 0, 507 | "outputs": [] 508 | } 509 | ], 510 | "metadata": { 511 | "colab": { 512 | "name": "Offensive_tweets.ipynb", 513 | "version": "0.3.2", 514 | "provenance": [], 515 | "collapsed_sections": [] 516 | }, 517 | "kernelspec": { 518 | "name": "python3", 519 | "display_name": "Python 3" 520 | }, 521 | "accelerator": "GPU" 522 | }, 523 | "nbformat": 4, 524 | "nbformat_minor": 0 525 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Offensive Language Detection 2 | Digital bullying is a daily phenomenon that each and every one of us faces from time to time. 3 | 4 | This project proposes a solution that uses machine learning classifiers to detect offensive language (tweets in our case), then decides whether it is targeted and, if so, classifies the target. 5 | 6 | ## Technologies 7 | - Scikit Learn 0.20 8 | - NLTK 3.4 9 | 10 | ## Classifiers 11 | - Multinomial Naive Bayes 12 | - K Neighbours 13 | - Support Vector Machine 14 | - Decision Tree 15 | - Random Forest 16 | - Logistic Regression 17 | 18 | ## How it works 19 | We divide the pre-processing phase into multiple stages, in which we remove stop words, emojis, mentions, URLs and all kinds of noise, along with a stage of lemmatizing and stemming. 
20 | 21 | The clean tokenized tweets are then sent to a TF-IDF vectorizer, which takes care of converting the data into a model of numerical features that are ready to be used for classification. 22 | 23 | We apply cross validation on the training vectors with a 0.3 splitting factor, while tuning some of the selected parameters to enhance the accuracy. 24 | 25 | Finally, the best estimator of the selected classifier is used to predict the test labels. 26 | 27 | ## Results 28 | Classifying whether the tweet is offensive: 29 | Training Accuracy : 0.89457 30 | Test Accuracy : 0.83125 31 | 32 | Classifying whether the tweet is targeted: 33 | Training Accuracy : 0.913 34 | Test Accuracy : 0.6947 35 | 36 | Classifying the target: 37 | Training Accuracy : 0.9975 38 | Test Accuracy : 0.805 39 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | ####################################### 2 | ## Author: Ahmed Hammad 3 | ## License: MIT 4 | ## Email: hammad97official@gmail.com 5 | ## Website: www.ahmedhammad97.com 6 | ####################################### 7 | 8 | import pandas as pd 9 | import numpy as np 10 | from tqdm import tqdm 11 | import preprocessing, embedding, helper, classifying 12 | import copy 13 | 14 | 15 | train_directory = "datasets/training-v1/offenseval-training-v1.tsv" 16 | print("Reading Dataset...") 17 | train_data = pd.read_csv(train_directory, sep='\t', header=0) 18 | 19 | tweets = train_data[["tweet"]] 20 | subtask_a_labels = train_data[["subtask_a"]] 21 | subtask_b_labels = train_data.query("subtask_a == 'OFF'")[["subtask_b"]] 22 | subtask_c_labels = train_data.query("subtask_b == 'TIN'")[["subtask_c"]] 23 | 24 | clean_tweets = copy.deepcopy(tweets) 25 | 26 | tqdm.pandas(desc="Cleaning Data Phase I...") 27 | clean_tweets['tweet'] = tweets['tweet'].progress_apply(preprocessing.take_data_to_shower) 28 | 29 | tqdm.pandas(desc="Tokenizing 
Data...") 30 | clean_tweets['tokens'] = clean_tweets['tweet'].progress_apply(preprocessing.tokenize) 31 | 32 | tqdm.pandas(desc="Cleaning Data Phase II...") 33 | clean_tweets['tokens'] = clean_tweets['tokens'].progress_apply(preprocessing.remove_stop_words) 34 | 35 | tqdm.pandas(desc="Stemming And Lemmatizing") 36 | clean_tweets['tokens'] = clean_tweets['tokens'].progress_apply(preprocessing.stem_and_lem) 37 | 38 | text_vector = clean_tweets['tokens'].tolist() 39 | 40 | vectors_a = embedding.tfid(text_vector) # Numerical Vectors A 41 | labels_a = subtask_a_labels['subtask_a'].values.tolist() # Subtask A Labels 42 | 43 | vectors_b = helper.get_vectors(vectors_a, labels_a, "OFF") # Numerical Vectors B 44 | labels_b = subtask_b_labels['subtask_b'].values.tolist() # Subtask B Labels 45 | 46 | vectors_c = helper.get_vectors(vectors_b, labels_b, "TIN") # Numerical Vectors C 47 | labels_c = subtask_c_labels['subtask_c'].values.tolist() # Subtask C Labels 48 | 49 | print("\nBuilding Model Subtask A...") 50 | classifying.classify(vectors_a[:], labels_a[:], text_vector, "A", "MNB") 51 | 52 | print("\nBuilding Model Subtask B...") 53 | classifying.classify(vectors_b[:], labels_b[:], text_vector, "B", "SVM") 54 | 55 | print("\nBuilding Model Subtask C...") 56 | classifying.classify(vectors_c[:], labels_c[:], text_vector, "C", "RF") 57 | 58 | # You can choose from the classifiers {MNB, KNN, SVM, DT, RF, LR} 59 | # You can also try only a subset of the data for quick classification: 60 | # vectors_a[1000:3000], labels_a[1000:3000] -------------------------------------------------------------------------------- /classifying.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.neighbors import KNeighborsClassifier 4 | from sklearn.svm import SVC 5 | from sklearn.naive_bayes import GaussianNB, MultinomialNB 6 | from sklearn.tree import 
def _build_classifier(kind):
    """Return ``(estimator, param_grid)`` for a classifier code, or ``None``.

    ``param_grid`` is ``None`` when the estimator should be fitted directly
    (only "MNB"); otherwise it is the grid handed to ``GridSearchCV``.
    ``None`` is returned for an unrecognised ``kind``.
    """
    if kind == "MNB":
        return MultinomialNB(alpha=0.7), None
    if kind == "KNN":
        return (
            KNeighborsClassifier(n_jobs=4),
            {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']},
        )
    if kind == "SVM":
        return SVC(), {'C': [0.001, 0.01, 0.1, 1, 10]}
    if kind == "DT":
        return (
            DecisionTreeClassifier(max_depth=800, min_samples_split=5),
            {'criterion': ['gini', 'entropy']},
        )
    if kind == "RF":
        return (
            RandomForestClassifier(max_depth=800, min_samples_split=5),
            {'n_estimators': list(range(50, 200, 50)), 'criterion': ['gini', 'entropy']},
        )
    if kind == "LR":
        return (
            LogisticRegression(multi_class='auto', solver='newton-cg'),
            {"C": np.logspace(-3, 3, 7), "penalty": ["l2"]},
        )
    return None


def classify(vectors, labels, train_text, task, type="DT"):
    """Fit the selected classifier and print train/test accuracy.

    Args:
        vectors: Feature vectors, one per sample.
        labels: Labels aligned with ``vectors``.
        train_text: Passed through unchanged to ``load``.
        task: Passed through unchanged to ``load`` (callers use "A", "B", "C").
        type: Classifier code, one of {"MNB", "KNN", "SVM", "DT", "RF", "LR"}.
            The name shadows the builtin but is kept for caller compatibility.

    Returns:
        None. Results are printed: training accuracy, test accuracy and the
        confusion matrix on the test set. For an unknown ``type`` the message
        "Wrong Classifier Type!" is printed and nothing else happens.
    """
    # Validate the classifier code first so that a typo does not pay for the
    # split and for loading the test data before being rejected.
    spec = _build_classifier(type)
    if spec is None:
        print("Wrong Classifier Type!")
        return
    estimator, param_grid = spec

    # Random split with test_size=0.333 (roughly 2 : 1). Only the training
    # part is kept; the evaluation data comes from ``load`` below.
    # NOTE(review): the held-out third is discarded, so the model is fitted on
    # about two thirds of the supplied data -- confirm this is intended.
    train_vectors, _, train_labels, _ = train_test_split(
        vectors, labels, test_size=0.333
    )
    # presumably the separate test set for ``task``; verify in load_test_data
    test_vectors, test_labels = load(train_text, task)

    if param_grid is None:
        estimator.fit(train_vectors, train_labels)
        classifier = estimator
    else:
        # 3-fold grid search; keep only the refitted best estimator.
        search = GridSearchCV(estimator, param_grid, cv=3, n_jobs=4)
        search.fit(train_vectors, train_labels)
        classifier = search.best_estimator_

    accuracy = accuracy_score(train_labels, classifier.predict(train_vectors))
    print("Training Accuracy:", accuracy)
    test_predictions = classifier.predict(test_vectors)
    accuracy = accuracy_score(test_labels, test_predictions)
    print("Test Accuracy:", accuracy)
    print("Confusion Matrix:", )
    print(confusion_matrix(test_labels, test_predictions))
11 | 12 | Rules & Tips 13 | Please rate the tweet according to whether it is offensive generally or to the target, not whether you are personally offended by it. 14 | 15 | Sub-task A: Offensive or not 16 | 17 | In this sub-task we are interested in the identification of offensive posts and posts containing any form of (untargeted) profanity. In this sub-task there are 2 categories in which the tweet could be classified - 18 | 19 | Not Offensive - This post does not contain offense or profanity. Non-offensive posts do not include any form of offense or profanity. 20 | Offensive - This post contains offensive language or a targeted (veiled or direct) offense. In our annotation, we label a post as offensive if it contains any form of non-acceptable language (profanity) or a targeted offense which can be veiled or direct. To sum up this category includes insults, threats, and posts containing profane language and swear words. 21 | Sub-task B: Offense types 22 | 23 | In this sub-task we are interested in categorizing offenses. Only posts containing offenses are included in sub-task B. In this sub-task, annotators need to label from one of the following categories - 24 | 25 | Targeted Insult - A post containing an insult or a threat to an individual, group, or others; 26 | Untargeted - A post containing non-targeted profanity and swearing. 27 | Posts containing general profanity are not targeted but they contain non-acceptable language. On the other hand, insults and threats are targeted at an individual or group. 28 | 29 | Sub-task C: Offense target 30 | 31 | Finally, in sub-task C we are interested in the target of offenses. Only posts which are either insults or threats are included in this sub-task. The three categories included in sub-task C are the following: 32 | 33 | Individual - The target of the offensive post is an individual: a famous person, named individual or an unnamed person interacting in the conversation. 
34 | Group - The target of the offensive post is a group of people considered as a unity due to the same ethnicity, gender or sexual orientation, political affiliation, religious belief, or something else. 35 | Other – The target of the offensive post does not belong to any of the previous two categories (e.g. an organization, a situation, an event, or an issue). 36 | 37 | All possible label combinations across all the sub-tasks are as below: 38 | 39 | Not offensive 40 | Offensive, Untargeted 41 | Offensive, Targeted Insult, (Individual | Group | Other) 42 | 43 | 2) Examples 44 | 45 | Sub-task A: Offensive language identification 46 | 47 | Some of the examples include: 48 | 49 | @thecomeback @JABItalia Fuck @APrecourt - Offensive 50 | Hey @LIRR , you are disgusting. Offensive 51 | A true American literary icon. #PhilipRoth will be missed. - Not offensive 52 | 53 | Sub-task B: Automatic categorization of offense types 54 | 55 | Some of the examples include: 56 | 57 | @thecomeback @JABItalia Fuck @APrecourt - Offensive Untargeted 58 | I mean I'm dating to get fucking attention - Offensive Untargeted 59 | Hey @LIRR , you are disgusting. Offensive, Targeted Insult 60 | @BreFields1 @jonesebonee18 fuck you lol - Offensive, Targeted Insult 61 | @karlsantix You are a complete knob! It's ppl like you who are messing up this country - Offensive, Targeted Insult 62 | If I pull up to yo crib and you offer me cockroach milk you getting yo ass beaten - Offensive, Targeted Insult 63 | @Top_Sergeant Assuming liberals are unarmed would be a grave mistake by the deplorables. - Offensive, Targeted Insult 64 | 65 | Sub-task C: Offense target identification 66 | 67 | Some of the examples include: 68 | 69 | Hey @LIRR , you are disgusting. - Offensive, Targeted Insult, Other 70 | @BreFields1 @jonesebonee18 fuck you lol - Offensive, Targeted Insult, Individual 71 | @karlsantix You are a complete knob! 
It's ppl like you who are messing up this country - Offensive, Targeted Insult, Individual 72 | If I pull up to yo crib and you offer me cockroach milk you getting yo ass beaten - Offensive, Targeted Threat, Individual 73 | @Top_Sergeant Assuming liberals are unarmed would be a grave mistake by the deplorables. - Offensive, Targeted Insult, Group -------------------------------------------------------------------------------- /datasets/training-v1/readme-trainingset-v1.txt: -------------------------------------------------------------------------------- 1 | ======================== 2 | 3 | OffensEval 2019: Identifying and Categorizing Offensive Language in Social Media (SemEval 2019 - Task 6) 4 | Training data 5 | v 1.0: November 28 2018 6 | https://competitions.codalab.org/competitions/20011 7 | 8 | ======================== 9 | 10 | 1) DESCRIPTION 11 | 12 | The file offenseval-training-v1.tsv contains 13,240 annotated tweets. 13 | 14 | The dataset was annotated using crowdsourcing. The gold labels were assigned taking the agreement of three annotators into consideration. No correction has been carried out on the crowdsourcing annotations. 15 | 16 | The file offenseval-annotation.txt contains a short summary of the annotation guidelines. 17 | 18 | Twitter user mentions were substituted by @USER and URLs have been substitute by URL. 19 | 20 | Each instance contains up to 3 labels each corresponding to one of the following sub-tasks: 21 | 22 | - Sub-task A: Offensive language identification; 23 | 24 | - Sub-task B: Automatic categorization of offense types; 25 | 26 | - Sub-task C: Offense target identification. 27 | 28 | 2) FORMAT 29 | 30 | Instances are included in TSV format as follows: 31 | 32 | ID INSTANCE SUBA SUBB SUBC 33 | 34 | Whenever a label is not given, a value NULL is inserted (e.g. 
INSTANCE NOT NULL NULL) 35 | 36 | The column names in the file are the following: 37 | 38 | id tweet subtask_a subtask_b subtask_c 39 | 40 | The labels used in the annotation are listed below. 41 | 42 | 3) TASKS AND LABELS 43 | 44 | (A) Sub-task A: Offensive language identification 45 | 46 | - (NOT) Not Offensive - This post does not contain offense or profanity. 47 | - (OFF) Offensive - This post contains offensive language or a targeted (veiled or direct) offense 48 | 49 | In our annotation, we label a post as offensive (OFF) if it contains any form of non-acceptable language (profanity) or a targeted offense, which can be veiled or direct. 50 | 51 | (B) Sub-task B: Automatic categorization of offense types 52 | 53 | - (TIN) Targeted Insult and Threats - A post containing an insult or threat to an individual, a group, or others (see categories in sub-task C). 54 | - (UNT) Untargeted - A post containing non-targeted profanity and swearing. 55 | 56 | Please note that now targeted threats (TTH) have been merged with targeted insults (TIN) and are listed under Targeted Insult and Threats (TIN). The TTH label present in the trial set is not included in this training set and will not be included in the test set. 57 | 58 | Posts containing general profanity are not targeted, but they contain non-acceptable language. 59 | 60 | (C) Sub-task C: Offense target identification 61 | 62 | - (IND) Individual - The target of the offensive post is an individual: a famous person, a named individual or an unnamed person interacting in the conversation. 63 | - (GRP) Group - The target of the offensive post is a group of people considered as a unity due to the same ethnicity, gender or sexual orientation, political affiliation, religious belief, or something else. 
64 | - (OTH) Other – The target of the offensive post does not belong to any of the previous two categories (e.g., an organization, a situation, an event, or an issue) 65 | 66 | Please note that organizations are now listed under Other (OTH). The ORG label present in the trial set is not included in this training set and will not be included in the test set. 67 | 68 | Label Combinations 69 | 70 | Here are the possible label combinations in the OffensEval annotation. 71 | 72 | - NOT NULL NULL 73 | - OFF UNT NULL 74 | - OFF TIN (IND|GRP|OTH) 75 | 76 | 4) TRAINING PREDICTIONS (IMPORTANT!) 77 | 78 | The OFFICIAL CodaLab competition (https://competitions.codalab.org/competitions/20011) will be open only in January. You will use it to upload your test set predictions which will be included in the official OffensEval ranks and shared task report. 79 | 80 | During the training stage, we created a PRACTICE CodaLab competition which you can use to upload your predictions for each sub-task and evaluate your system. 
Here is the URL: https://competitions.codalab.org/competitions/20559?secret_key=5d5e72f8-bb17-49a6-9cf8-5827dafa2257 81 | 82 | 5) CREDITS 83 | 84 | Task Organizers 85 | 86 | Marcos Zampieri (University of Wolverhampton, UK) 87 | Shervin Malmasi (Amazon, USA) 88 | Preslav Nakov (Qatar Computing Research Institute, Qatar) 89 | Sara Rosenthal (IBM Research, USA) 90 | Noura Farra (Columbia University, USA) 91 | Ritesh Kumar (Bhim Rao Ambedkar University, India) 92 | 93 | Contact 94 | 95 | semeval-2019-task-6@googlegroups.com -------------------------------------------------------------------------------- /datasets/trial-data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhammad97/Offensive-Language-Detection/f5bd9056eada52f63862ba4f54d4d0c2d4b59df6/datasets/trial-data.zip -------------------------------------------------------------------------------- /datasets/trial-data/OffensEval-READMEv1.txt: -------------------------------------------------------------------------------- 1 | ======================== 2 | 3 | OffensEval 2019: Identifying and Categorizing Offensive Language in Social Media (SemEval 2019 - Task 6) 4 | Trial data 5 | v 1.0: September 5 2018 6 | https://competitions.codalab.org/competitions/20011 7 | 8 | ======================== 9 | 10 | 1) DESCRIPTION 11 | 12 | The file offenseval-trial.txt contains 320 annotated tweets. 13 | 14 | Each instance contains up to 3 labels each corresponding to one of the following sub-tasks: 15 | 16 | - Sub-task A: Offensive language identification; 17 | 18 | - Sub-task B: Automatic categorization of offense types; 19 | 20 | - Sub-task C: Offense target identification. 21 | 22 | 2) FORMAT 23 | 24 | Instances are included in TSV format as follows: 25 | 26 | INSTANCE SUBA SUBB SUBC 27 | 28 | Whenever a label is not given, a value NULL is inserted (e.g. INSTANCE NOT NULL NULL) 29 | 30 | The labels used in the annotation are listed below. 
31 | 32 | 3) TASKS AND LABELS 33 | 34 | (A) Sub-task A: Offensive language identification 35 | 36 | - (NOT) Not Offensive - This post does not contain offense or profanity. 37 | - (OFF) Offensive - This post contains offensive language or a targeted (veiled or direct) offense 38 | 39 | In our annotation, we label a post as offensive (OFF) if it contains any form of non-acceptable language (profanity) or a targeted offense, which can be veiled or direct. 40 | 41 | (B) Sub-task B: Automatic categorization of offense types 42 | 43 | - (TIN) Targeted Insult - A post containing an insult to an individual, a group, or an organization (see categories in sub-task C). 44 | - (TTH) Targeted Threat - A post containing a threat to an individual, a group, or an organization (see categories in sub-task C). 45 | - (UNT) Untargeted - A post containing non-targeted profanity and swearing. 46 | 47 | Posts containing general profanity are not targeted, but they contain non-acceptable language. 48 | 49 | (C) Sub-task C: Offense target identification 50 | 51 | - (IND) Individual - The target of the offensive post is an individual: a famous person, a named individual or an unnamed person interacting in the conversation. 52 | - (GRP) Group - The target of the offensive post is a group of people considered as a unity due to the same ethnicity, gender or sexual orientation, political affiliation, religious belief, or something else. 53 | - (ORG) Organization or Entity - The target of the offensive post is an organization (e.g., a company or an association) or an entity (e.g., a city, a country, a region, a continent, a location). 54 | - (OTH) Other – The target of the offensive post does not belong to any of the previous three categories (e.g., a situation, an event, or an issue) 55 | 56 | Label Combinations 57 | 58 | Here are the possible label combinations in the OffensEval annotation. 
59 | 60 | - NOT NULL NULL 61 | - OFF UNT NULL 62 | - OFF TIN (IND|GRP|ORG|OTH) 63 | - OFF TTH (IND|GRP|ORG|OTH) 64 | 65 | IMPORTANT: The distribution of labels is the training and in the test data is likely to be different from the trial data (more offensive content). 66 | 67 | 4) CREDITS 68 | 69 | Task Organizers 70 | 71 | Marcos Zampieri (University of Wolverhampton, UK) 72 | Shervin Malmasi (Harvard Medical School, USA) 73 | Preslav Nakov (Qatar Computing Research Insitute, Qatar) 74 | Sara Rosenthal (IBM Research, USA) 75 | Noura Farra (Columbia University, USA) 76 | Ritesh Kumar (Bhim Rao Ambedkar University, India) 77 | 78 | Contact 79 | 80 | semeval-2019-task-6@googlegroups.com -------------------------------------------------------------------------------- /datasets/trial-data/offenseval-trial.txt: -------------------------------------------------------------------------------- 1 | @BreitbartNews OK Shannon, YOU tell the veterans in those locker rooms they have to stay there until the celebration of what they fought for is over. NOT NULL NULL 2 | @LeftyGlenn @jaredeker @BookUniverse @hashtagzema @RalphLombardi @NathanHRubin Fine... Because i could afford a gun if i wanted to. I could fit it into my budget. My budgeting is fine??? Here in canada we have gun insurance and gun control? And lotsa p NOT NULL NULL 3 | Hot Mom Sucks Off Step Son In Shower 8 min https://t.co/Y0zi9f5z6J OFF UNT NULL 4 | bro these are some cute butt plugs I’m trying to cop https://t.co/RsnxRF4HTi OFF UNT NULL 5 | Arizona Supreme Court strikes down state legislation prohibiting medical marijuana use on public college c... https://t.co/f2PTckxYvP NOT NULL NULL 6 | Arguing gun control is wrong of me whoever has a strong opinion doesn’t really care about facts they have a emotional connection to the issue and its a waste of me to argue. When am I gonna learn that lol. 
NOT NULL NULL 7 | Doctors’ interest in medical marijuana far outpaces available research https://t.co/JLdEuhdyBG via @thecannifornian_com https://t.co/qJIgzq4HiD NOT NULL NULL 8 | A must-read and a must-share for all your friends who support common sense" gun control. https://t.co/hiCzzpbdjy" NOT NULL NULL 9 | @Jo2timess Now that’s the dumbest shit I have ever heard in my life!!!LIKE WTF!!!! OFF UNT NULL 10 | Agreed! When all of this drama was unfolding agents I know said Comey had stacks of resignation letters piling up on his desk & no one wanted to interact with him anymore. When he let Clinton off the hook there were agents throwing shit at their TV's! You are a disgrace @Comey https://t.co/rZFBlgEWBy OFF UNT NULL 11 | @NewYorker On the condition of self-reading after the completion of the marriage ceremony in the royal ship and the… https://t.co/67CptVIuYW NOT NULL NULL 12 | Surprise Vote in Congress Protects Medical Marijuana From Federal Interference https://t.co/qtKKklkdke via @wordpressdotcom NOT NULL NULL 13 | Nationwide support for gun control falls https://t.co/Q2ojIuFt7p https://t.co/BfWSSuFWMQ NOT NULL NULL 14 | @NOA2iCY ooooh i love pineapples. i’m hoping to find a piña colada in a pineapple NOT NULL NULL 15 | Columbia Care expands home delivery service to all patients in New York State. https://t.co/1tGLhpK98N NOT NULL NULL 16 | @BreitbartNews Stunning !!! Obama is everywhere NOT NULL NULL 17 | When John from the @cNationChannel asked our CEO, Morris Denton, how difficult it was to pass medical marijuana legislation in Texas, his answer might surprise you. Listen to this great interview on @TheKushJournals from the recent #MJBizConNEXT. https://t.co/LYAPK5JTWq NOT NULL NULL 18 | it was sexy when this girl told me it was impressive that i could type fast cuz i failed all the touch typing class… https://t.co/YuDED5Sfx9 NOT NULL NULL 19 | @batsai @visiblecare I agree it is very unfortunate that they are starting off like this. 
Very bad press when starting something new means rough roads ahead NOT NULL NULL 20 | @cattunneycbc @pdmcleod yeah anyone who says that is a directive is full of shit. OFF UNT NULL 21 | Medical marijuana shops lack votes in Collier; commission puts off issue 6 months https://t.co/VvoLdPdQDl via @ndn NOT NULL NULL 22 | 79 percent of people answered this correctly. Can you? https://t.co/wjSXbrNyR1 NOT NULL NULL 23 | She’s right...he is pretty awesome! @Rockies ...don’t you agree?? https://t.co/fg729D20t2 NOT NULL NULL 24 | @BreitbartNews Guess he can just mute them then NOT NULL NULL 25 | Support for gun control slips in new poll https://t.co/35XV5wVeOo NOT NULL NULL 26 | Cyrus is a sweet, affectionate boy who loves to be with his people. He is very smart and loves to go for walks. He is very picky about his doggie friends but is good with some. He is looking for a #foreverhomecome meet him or click here: https://t.co/fxX8Ntftvz #adoptme #ymm https://t.co/GEis4SqqTf NOT NULL NULL 27 | @hughhewitt @mishacollins What the hell...? What does that tweet have to do with gun control...? Maybe try to make your tweet have some sort of sense... OFF TIN IND 28 | And not give a single fuck. https://t.co/b6vchAVYZl OFF UNT NULL 29 | Guess ima have to beat some lil kids asses today cause lil Becky not about to but my kid and get away with it https://t.co/RpRmn2tbAn OFF TTH IND 30 | Prayer: Keep it simple Keep it honest Keep it going #Alpha #prayer NOT NULL NULL 31 | By looking at the month you were born, we will reveal which symbol best represents your soul, and what that symbol says about the kind of person that you are! https://t.co/dJiqfJvAi2 NOT NULL NULL 32 | Does anyone know where @sarangkimKAI & @LubzzC are??? Or what they are doing now? NOT NULL NULL 33 | @thehill Even if this is factual, the overwhelming majority of Americans support increased gun control. NOT NULL NULL 34 | Medical marijuana opponents, Lt. 
Governor don't oppose Utah Patients jumping into lawsuit https://t.co/KIBc9JW8cz NOT NULL NULL 35 | @BrittMcHenry @Hankdiggy Hmm. There was no such conduct in place for a peaceful protest until today. You wanna be Patriots kill me. You are ALL for bans on kneeling but when it comes to saving lives with gun control you are no longer patriots. NOT NULL NULL 36 | @kookpics @Aoba_Seradaddy Look at how smug he is after 😂😂😂 NOT NULL NULL 37 | MustWeed: West Virginia Legislature May Revisit Medical Marijuana https://t.co/XrQuQoFZU7 https://t.co/bOVeBOl3yW NOT NULL NULL 38 | Arizona High Court to Rule on Whether Colleges Can Ban Medical Marijuana @mjdotc https://t.co/QrEOjz42bT #cannabiscommunitynews NOT NULL NULL 39 | The @netflix series #Explained already has my interest. Love is a feeling, monogamy is a rule."" NOT NULL NULL 40 | The U.S. is giving away the $30 billion medical marijuana industry. Why? Because the feds are bogarting the weed, while Israel and Canada are grabbing market share, @joshdean66 reports #tictocnews https://t.co/6e9AucRSCR https://t.co/ohMYVS39hT NOT NULL NULL 41 | When the government refers to medical marijuana farm in a state where its decriminalized as “organized crime”. Nice waste of millions of tax dollars America, can we focus on actual drug problems now? NOT NULL NULL 42 | @beefcakesteve same here, they are all so sweet, I love them NOT NULL NULL 43 | Medical marijuana opponents, Lt. Governor don't oppose Utah Patients jumping into lawsuit https://t.co/MoXKzc6cNt NOT NULL NULL 44 | @NewYorker Get over it. The weddings over. NOT NULL NULL 45 | BUSTED!!! Kemi Olunloyo Replies Linda Ikeji… Expose Photos Of The MOONBUMP She Is Wearing [See Photos]… https://t.co/DgVvxmMObC NOT NULL NULL 46 | 🍺🍿ready!! https://t.co/clD7tbC0z5 NOT NULL NULL 47 | @SpeakerRyan Does that include medical marijuana in states its not legal yet? NOT NULL NULL 48 | Most oncologists have discussed the use of medical #marijuana with their patients, a new study finds. 
https://t.co/n2nHbak8hC https://t.co/r9n3TI47wu NOT NULL NULL 49 | BUSTED!!! Kemi Olunloyo Replies Linda Ikeji… Expose Photos Of The MOONBUMP She Is Wearing [See Photos]… https://t.co/8jkhmryztU NOT NULL NULL 50 | They did it because he is the president's SIL. We are now a banana republic, where the only thing that matters is fealty to the dictator. Competence is irrelevant. Character is irrelevant. #GOPAbuseofPower https://t.co/4W6iu3hkmU NOT NULL NULL 51 | okay but it actually sucks so much that the first year I COULD go to every Reeperbahn Festival day, I'm in Strasbourg and can only attend the last day NOT NULL NULL 52 | @NewYorker Though, there was that film Pope Pius, Friend of the Nazis."" NOT NULL NULL 53 | IM FREEEEE!!!! WORST EXPERIENCE OF MY FUCKING LIFE OFF UNT NULL 54 | A bastard tick killed my sisters dog. Lyme disease is a real risk for female labradors, golden retrievers and Wheaten terriers. #LymeDisease #LymeDiseaseAwarenessMonth https://t.co/iuw4evNsL8 OFF UNT NULL 55 | bizarre Op-Ed Thinks Founding Fathers Would Support Gun Control Because Of Dead Kids https://t.co/y2fGt0P6el via @BearingArmsCom NOT NULL NULL 56 | @kinky_pooja's account is temporarily unavailable because it violates the Twitter Media Policy. Learn more. NOT NULL NULL 57 | @BreitbartNews Disgustingly sick turn of mindset for California, modern day Sodom and Gomorrah. NOT NULL NULL 58 | @NewYorker Identity politics at its best !!!! How did the first black woman in US Senate turn out ? Lol. NOT NULL NULL 59 | @Arbys @Electric_Mucus @badsandwich Thank you for not destroying the beauty Arbys. I ate you more because of the nihilism. Now I guess I'll eat more. Or nothing at all. NOT NULL NULL 60 | HEADLINE: girl continues to display flashy guns, further emphasizing the lack of gun control from legislatures. Call your local governor and do something about it today! 
https://t.co/s2ZTpg06BB NOT NULL NULL 61 | @BreitbartNews This account is his personal account and he has every right accorded to everyone else on Twitter so it doesn’t hold water. Twitter can’t treat Donald Trump any different than Obama. NOT NULL NULL 62 | @BreitbartNews Most of those harrassers should be thrown off Twitter under the rules. Singleing out an individual for purposes of harrassment. NOT NULL NULL 63 | Medical marijuana is on the ballot in June. And I saw a headline about how people oppose to medical marijuana and I just do not understand NOT NULL NULL 64 | Arizona Court to Rule on Medical Marijuana on Campuses @usnews https://t.co/CoYKMeVZYc https://t.co/OEj53Rne3x NOT NULL NULL 65 | @BreitbartNews Little Marco Trump will get things done OFF TIN IND 66 | @BreitbartNews This ruling may have implications for Twitter and its rampant use of shadow-bans and algorithmic censorship. We shall see...... #WWG1WGA https://t.co/F8ng8p56tE NOT NULL NULL 67 | @aaronLebeahm Good question. All we can do is hold on to this crazy ride and make sure we never stop talking about… https://t.co/8NZQZ6KeDc NOT NULL NULL 68 | @ABdeVilliers17 😭😭😭😭 i am not believe Sir You are in retire...... please cam back sir NOT NULL NULL 69 | @NewYorker If you suddenly found yourself atop the “swamp” aka business & politics as usual aka equally influential domestic & foreign interests professionally represented, how would you operate? Who would you trust? The establishment? No late NOT NULL NULL 70 | @thehill @DLoesch We already have gun control. Not one of those gun guntrol laws do anything to effect the cause of not only gun violence but violence in general. The condition of the human heart. 
We as nations turned away from Jehovah God he turned us ov NOT NULL NULL 71 | 10 #Marvel and #DC Parodies That Are Not For Kids https://t.co/PEUycBcQOU https://t.co/UfObfezQl6 NOT NULL NULL 72 | WWII always on that fuck shit i'm finna go play fortnite like a dweeb OFF UNT NULL 73 | @realDonaldTrump Everyone already know what you are trying to do with the fake news and witch hunt shit um....it’s not working OFF TIN IND 74 | @NewYorker The video shows how undercooked they are! I can't imagine paying $1000 for undercooked wings 🤣 #salmonella NOT NULL NULL 75 | @BreitbartNews I remember Kimmel being funny once. Years ago on the NFL today as a prop man!! NOT NULL NULL 76 | @NewYorker @arminfar go I am sure NOT NULL NULL 77 | @NewYorker Shocked to learn that the #respectthelaw hashtag used by the #MAGA-T's only applies when a minority is gunned down by the police. NOT NULL NULL 78 | This fine ass girl just got my number she better quit playing 😆😍 OFF UNT NULL 79 | if it's not about me its about Matt. blah blah blah worrrry about your bf who makes you feel like shit but you act like yall good. 😂😛✌ OFF TIN IND 80 | @saeychung Oh she is the worst. Ugh OFF TIN IND 81 | Internet Archive Removes Evidence That Companies Sold Stalkerware, But Not ISIS Propaganda https://t.co/PZoSbH7qZn reputationbank NOT NULL NULL 82 | @BreitbartNews And liberals block me on the regular. No double standard or bias here https://t.co/qtFKHkguxn NOT NULL NULL 83 | AWR Hawkins: Gun Control Is Not School Safety | Breitbart https://t.co/Hi4FtsVSgz https://t.co/hvxWSndM2O NOT NULL NULL 84 | Microdosing Is the Future of Cannabis | Medical Marijuana News- #MMJ https://t.co/0JAuwjGzFK NOT NULL NULL 85 | The university’s Lambert Center for the Study of #Medicinal #Cannabis and #Hemp had partnered with ioVita, a… https://t.co/Mr62VslKr8" NOT NULL NULL 86 | @BillOReilly Because marijuana doesn't hurt anyone. Please see medical marijuana NOT NULL NULL 87 | @BreitbartNews Here's some news. 
https://t.co/hCAWG2TqC4 NOT NULL NULL 88 | @NewYorker yes there are gold coated chicken wings. I do not knows who buys them at £1000 a plate.maybe I know the wrong people. NOT NULL NULL 89 | Having a medical marijuana card is an easy way to say you were under the influence in any situation NOT NULL NULL 90 | @BreitbartNews Good NOT NULL NULL 91 | @dvnmya Gemini’s are my favs fuck the haters tbh OFF UNT NULL 92 | @BreitbartNews Obama fed the country shit sandwiches for 8 years. Maybe Jim just has his addled mind confused about dates and who fed who what.. https://t.co/ynoQjQ11lT OFF TIN GRP 93 | @NewYorker One of great American #authors loved his work. RIP NOT NULL NULL 94 | Breitbart | Criminals do not obey gun control laws. Period.... https://t.co/MXZYpJfaGU https://t.co/tF87s986BK NOT NULL NULL 95 | @BreitbartNews Ironic because she's just like a shotgun.two cocks and she's ready to blow! OFF TIN IND 96 | So fucking irritated OFF UNT NULL 97 | Obama’s Education Secretary Is Wrong About Gun Control Being Best Way to Keep Kids Safe https://t.co/hdOeUHC50l https://t.co/WhBHYYRG2A NOT NULL NULL 98 | i told yall im not supposed to fuckin explain so stop asking me why goddamn https://t.co/NImBRp4GcO OFF UNT NULL 99 | Health department preparing plan in case voters approve medical marijuana measure https://t.co/P74cnDNS8W https://t.co/thHFDqGlxo NOT NULL NULL 100 | Should Florida let patients smoke medical marijuana? A judge will now decide https://t.co/4B49hsCmge NOT NULL NULL 101 | @DRoofwood @CNNPolitics Appear to be falling apart? They pled. They’re guilty. They are cooperating with mueller. So sessions recused himself just for honor. That’s not how law works. Recusals don’t happen b/c of honor. NOT NULL NULL 102 | @BreitbartNews That's a dangerous slippery slope. When is Twitter going to stop censoring opinions? NOT NULL NULL 103 | @BreitbartNews No one cares what Jim Carry does or says! he is irrelevant ! 
NOT NULL NULL 104 | @Auzqn @tafadzwamarere @lashiasn @garweh2011 @Bweraz @chichie19 @ChelleChipato This will be my husband some day aneta hake nekushushwa NOT NULL NULL 105 | Okay everyone always talks aboht the pathetic army and all the soy boy branches and gay shit and what not but how come no body is talking about the fucking spartans like nibba they save the world unlike some people OFF UNT NULL 106 | Click here to watch it: https://t.co/GLOhbdSHUb https://t.co/M87Wl2BE8q NOT NULL NULL 107 | @PanicAtTheDisco We love you so much!! NOT NULL NULL 108 | im just thinking about when we had to read cathedral" back in ap british lit and "the a&p" during my first year in college and both them were so fuckin boring 😴" OFF UNT NULL 109 | If you lose me bitch you losing 🤷🏾‍♂️ OFF UNT NULL 110 | @GOPLeader @POTUS Maybe they could start by allowing medical marijuana as a viable substitute instead of labeling it schedule 1. NOT NULL NULL 111 | Yessss My shit looks bad Im so glad im off tomorrow OFF UNT NULL 112 | Sasha really came for my jugular and called me on my Scorpio fetish last night. Bxtch. OFF UNT NULL 113 | Medical marijuana taken orally yes; smoke it & lungs destroyed,cancer, schizophrenia new studies show. Teachers c 'zoned out students' 0 learning, work ethic. Dr. Gupta has done 0 research. Gupta Killing students 4 wealth of Marijuana business @murdockme https://t.co/GYn5ZtdyZj NOT NULL NULL 114 | Arizona Supreme Court: Medical marijuana on #university campus not subject to arrest https://t.co/pkt1HVRBQx #MME #marijuana #cannabis #Medicalmarijuana #College https://t.co/iblv9wqZ2g NOT NULL NULL 115 | @NixBryan @thehill @DLoesch America was built on a republic, as far as gun control most Americans don't want it, it's an extremely unpopular opinion NOT NULL NULL 116 | @zackwhittaker I'm so sorry about this heartbreaking news. They are lucky to have you with them. 
NOT NULL NULL 117 | everyone keeps looking at me crazy when i tell them Junie bouta be 4 months 😂 that’s how big he is. 😅 NOT NULL NULL 118 | @NewYorker Capitalism is the antithesis of democracy. NOT NULL NULL 119 | @NewYorker Fakesters NOT NULL NULL 120 | @BreitbartNews This is funny I'm sick & tired of Lib comedians using their show to bash Trump and his family I'm sick of it! Why doI have 2 pay 4 this crap? We need to be placed on an even plane that's all I ask! We want 2 get our message out as well NOT NULL NULL 121 | @BreitbartNews The price of fame and arrogance NOT NULL NULL 122 | @thetangibleC4 49 people died at Pulse. 58 people died at Las Vegas. 22 high school student this year. Not enough for Republicans to even have a discussion about gun control. What happened to the bump stock ban? It is dead in congress because of Republ NOT NULL NULL 123 | @BreitbartNews Give it up Lame ass OFF UNT NULL 124 | @ketagoesglobal @OKANEMOCHI6 @XonEarth @MuslimIQ The Government" is in the pocket of the NRA who want nothing more than for the "right" people to have as many guns as possible. Who are these government officials who push so strongly to take all of your gu NOT NULL NULL 125 | @BreitbartNews By the way some of these Twitter comments are and ate threatening our president keep it up guys we got your back mr. president NOT NULL NULL 126 | As if i just waited an hour + at @TheChiquito coventry, then blanked a good 4 times when we asked how long food would be. Usually good service. Today however, absolutely shite. OFF UNT NULL 127 | @porpentina2017 @LunaLuvgood2017 @GOP They don’t. The GOP will keep supporting racketeer, illegitimate Trump. They never will stop the corruption of tRump. They are in it for the money. They want to destroy American democracy. 
NOT NULL NULL 128 | “Look, I just want to break up all your shit Call your mama phone Let her know that she raised a bitch, then dial tone, click” OFF UNT NULL 129 | This topic keeps expanding and growing like a weed with each new state that passes a law or brings it to the table for discussion: medical marijuana. Read more on how MSS can provide the best storage units for this industry! https://t.co/o04SCOr5w0 https://t.co/nvaDhCWKhC NOT NULL NULL 130 | Thinking she a pretty decent bitch but she a hoe prolly 📍 https://t.co/CGQndGerwl OFF UNT NULL 131 | @BreitbartNews So touching, so profound, so moving... I feel an Emy... NOT NULL NULL 132 | When you are insulted in public https://t.co/wbjdJnw4Ry #ministrylife NOT NULL NULL 133 | @DavidSonstebo @sweis @BoschGlobal @VWGroup @Ethan_Heilman @neha what about doing this in public? live stream of the curl-p atack? ill publish my IOTA adress with 100GI ... if you can hack it, its yours? deal? if not it shows that Ethan Heilman & Neha NOT NULL NULL 134 | @BreitbartNews But thank goodness he’s all got the #ThoughtPolice working overtime🤦‍♀️ NOT NULL NULL 135 | .@WCSIndia Attention Chief Warden #Rajasthan Dr.Reddy #SaveSuman Repeated requests for HELP have gone unanswered They are Breaking her SPIRIT .@timesofindia https://t.co/yzUoG8lnIc NOT NULL NULL 136 | Ask Erin Parszewski to be a guest on your podcast. She's as passionate about #business as she is about #health and #wellness! https://t.co/7YaUeZTD5C https://t.co/8AVznPA15i NOT NULL NULL 137 | @BreitbartNews Like he actually reads their tweets. lol NOT NULL NULL 138 | @NewYorker Yes she should NOT NULL NULL 139 | @SimplyKane_ I think Emery's primary job is to consolidate their top 4 position. Which he might be able to do next season with some major overhaul. I don't think they are asking for trophies from him so soon. He is a good manager but Arsenal's squad is po NOT NULL NULL 140 | @NewYorker Best of luck, Ms. Stacey Abrams -- rooting for you! 
NOT NULL NULL 141 | @NewYorker But do these firsts make her qualified? Just asking!----S NOT NULL NULL 142 | As a football fan, F u c k this shit ⬇️ And it’s not even the first or last time this organization mishandles an important issue. 🙄 https://t.co/ZmEQSWr3Jz OFF UNT NULL 143 | @NewYorker Hm. Sorry, not caring. NOT NULL NULL 144 | @joxnxgxrcix oh my gooooood he is such a man now NOT NULL NULL 145 | @mahwashajaz_ she is 11 year old, she should be called a child not teenager, even news paper reported it like that. NOT NULL NULL 146 | Obama’s Education Secretary Is Wrong About Gun Control Being Best Way to Keep Kids Safe https://t.co/8hSxiTA5v4 NOT NULL NULL 147 | I love simba with my whole heart even if she is a giant dumbass most the time OFF UNT NULL 148 | @BreitbartNews Every time Sch met speaks ill of our president I think he is really alk big about our FORMER preside… https://t.co/jHg6VaQpy4 NOT NULL NULL 149 | Our team specializes in helping patients across Ohio obtain their medical marijuana cards. Give us a call, or send us a message, if you’d like to find out more information about how you can get your medical marijuana card in Ohio. https://t.co/D95MBSdKVY #Ohio #medicalmarijuana https://t.co/MdI0MeZ1C3 NOT NULL NULL 150 | @BreitbartNews Well it sounds like they should pay their personnel more, instead of giving CEO’s big fat bonuses for cutting wages. NOT NULL NULL 151 | Lovely to meet Georgia Too 👍😀🙋🏻 @TinyRebelNpt @networkEDcymru #networkedlocal https://t.co/c3MDP4aMj8 NOT NULL NULL 152 | @NewYorker yeah but can he pay taxes? NOT NULL NULL 153 | Hot USA latina girlfriend sucking so nicely 5 min https://t.co/GjUW6g7Djx OFF TIN OTH 154 | GOP Rep proposes bill for more federal cannabis farms and better quality flower https://t.co/zRf0MftVOw https://t.co/0QZp4L3hzy NOT NULL NULL 155 | @NewYorker Sadly, probably not the first time she was alone at an event for Megan. 
Like every single mom who has had to say ‘Dad couldn’t be here but he really wanted to be. He just had to go to X. He’s so proud of u. I’ll send him pictures’ Brave face fo NOT NULL NULL 156 | @aStarlightDream @stephenasmith @StephenCurry30 @HoustonRockets If you are the two time MVP and the man that runs this house.. Then you need to show up in the 4th. Especially in the playoffs.. NOT NULL NULL 157 | @Alec_Krummen4 @Jander513 Even if he is, you like thug a little too much 😂 NOT NULL NULL 158 | @RealJack What you never see is us conservatives beating up on the mental left Democrats! They are always violent. . That speaks volumes about them. OFF TIN ORG 159 | @VP How silly to devote energy to this while allowing kids to be shot in school without any push for action on gun control. Par for the course I guess in this administration. NOT NULL NULL 160 | @ncbn @hd_kumaraswamy You tumble in Cong, YSRC trap. To set scores, u get desperate n keep losing your grew. 1st time, I felt very bad with U, seeing you are in Cong lobby, to please your ego. Hope you still remember the reason, how TDP started then, whom OFF TIN IND 161 | Seth Lancaster hits 17th HR of the year to give CCU 1-0 lead. He is tied with Kevin Woodall Jr., for the team lead. Lancaster has 31 career HR to move into tie for 7th on CCU list with the likes of Sappelt, Rigos, Remillard and Howle. NOT NULL NULL 162 | @BreitbartNews Glad it amuses you NOT NULL NULL 163 | This is a Man, a white Wolf, the Captain’s Girlfriend, he is in Infinity war !!! ( Ah bon ?? Ahaha ) https://t.co/A364FAFeYV OFF TIN IND 164 | @DailyCaller who remembers obama on TV the day after sandy hook when a major gun control vote was voted down.... he was MAAAAAAAAAAAAAAAAAD. https://t.co/Vb56HupU3X OFF TIN IND 165 | Tomi was not assaulted". She was splashed with a bit of cold water. Shes perfectly fine, just damp. Stop treating her like shes some sort of victim. Was it childish? Yes. Was it rude? Yes. But it was NOT assault. 
Calm the fuck down." NOT NULL NULL 166 | He points out that when opioids are used in combination with cannabis in animals, marijuana can boost an opioid's effectiveness without requiring higher dosages." https://t.co/aZNozHWF5C" NOT NULL NULL 167 | @alt_matt_tweet @realDonaldTrump He donates his entire salary from his Trump businesses every quarter to a worthwhile cause. If he is a conman, then I wish there were more Trumps in this world. #MAGA NOT NULL NULL 168 | Medical marijuana laws and adolescent marijuana use in the United States:systematic rev https://t.co/iQmJBPhckx NOT NULL NULL 169 | @VP I'm glad to see you are focused on the important issues of our time. Gun control can wait - am I right? OFF TIN IND 170 | @BreitbartNews Oh go screw himself. Seriously, this judge doesn't get the concept of the First Amendment. Put him right here in my face. I would debate his dumbassery. OFF TTH IND 171 | @thoughtidsay @Follow_Follow_ In what way does a Football League affect gun control or discrimination? They are asking their players to respect the Anthem, we would expect the same in the U.K. NOT NULL NULL 172 | I seriously need to get my shit together. NOT NULL NULL 173 | I need one of these jackets https://t.co/IgQq0vhGwp NOT NULL NULL 174 | First of all sebody find a boyfriend for @ektaravikapoor . She is so f$$$$$$ lonely https://t.co/JNXhfuvvRd OFF TIN IND 175 | Fuck I keep waking up so fucking horny I’m 🙃🙃🙃🙃🙃 OFF UNT NULL 176 | @NewYorker Sure they will. They didn't learn from the last meltdown so we'll have to go through it all again. NOT NULL NULL 177 | Interesting how guns get such a bad rap in the US, but other countries--with stricter laws--have higher mortality rates... #2ndAmendment https://t.co/0YucZmJwKC NOT NULL NULL 178 | Can’t trust all these celebrities when they wanna smooth their faces & shit. 
Apparently everyone’s got perfect skin 😂😂😂 OFF TIN GRP 179 | @BreitbartNews THIS PIECE OF CRAP AND HER LIES DEMOCRAP PARTY HAVE BEEN THROWING OUT FOR 30 YEARS SICK THEY Outsourced ALL OUR JOBS FACTORIES A FACT WHEN THESE COMPANIES MOVED OVER SEAS THERE ARE NO EPA STANDARDS TO OBEY WHATEVER JOBS WERE LEFT GAVE IT TO OFF TIN IND 180 | @visitmusiccity @Titans @NFL @EddieGeorge2727 @TheTimMcGraw Tim McGraw doesn't know his audience. U dont go campaig… https://t.co/pu8ZGIfSfW NOT NULL NULL 181 | @JoelSsenyonyi @ntvuganda It's move of pay back from the previous episodes. More two years and double pay would be equivalent to two terms. They are concluding some contract NOT NULL NULL 182 | He is now a board member of NW provincial government #SupraMahumapelo NOT NULL NULL 183 | Vote for me to perform for $50K Grand Prize at #Alabama All Ages Edition 5/24/18! https://t.co/i9iuXFgcnx #Coast2Coast https://t.co/Hd8477g3c3 NOT NULL NULL 184 | @hughhewitt @mishacollins Well he’s not wrong and as a Republican (non-Trump supporter) I agree we do need gun control ! And I am very aware of @mishacollins politics NOT NULL NULL 185 | @NewYorker She was gracious and dignified ! NOT NULL NULL 186 | @BreitbartNews If this is true than twitter blocking should be illegal. NOT NULL NULL 187 | @BreitbartNews It's hard to understand how representative Al Green wants to impeach a president. Who has brought black unemployment to its lowest levels. What hypocrisy we're all Americans and yet hatred and divisiveness is all the Democrat has to offer. OFF TIN IND 188 | @NewYorker They definitely have the numbers to do so if EVERYONE shows up & votes!! NOT NULL NULL 189 | @NewYorker He's used to administrations that sell Uranium for profit, spy on their political opponents, and bow to our enemies. Probably for the best #DrainTheSwamp NOT NULL NULL 190 | @Tini4you Bitte🤗😉🙈🙈 NOT NULL NULL 191 | @SenorRodriguezz It also penalized a team once for wearing pink for breast cancer awareness. It’s sad. 
But that and gun control are completely unrelated NOT NULL NULL 192 | Little girls, listen closely 'Cause no one told me But you deserve to know That in this world, you are not beholden You do not owe them Your body and your soul ISSO É UM HINO DE MÚSICA E O CLIPE ENTÃO https://t.co/21Lzl5FNpb #XtinaDemiVideo NOT NULL NULL 193 | Pennsylvania Becomes First State to Approve Medical Marijuana to Treat Opioid Addiction https://t.co/axaOAZZ1W6 NOT NULL NULL 194 | @Holdipoika They are i agree on that but it cant be helped NOT NULL NULL 195 | @NewYorker She was fine! She was perfectly beautiful, proud, loving and fine. NOT NULL NULL 196 | @Nick79278488 @AshanteSky 🤣🤣🤣 he is just going to bug you for free pics NOT NULL NULL 197 | @BreitbartNews If Trump can't block then none of our public officials can. So the judge in this case should start one up. Why aren't you on Twitter Naomi Buchwald? NOT NULL NULL 198 | @btschartdata You are doing amazing sweetie https://t.co/NYwl0wkxGT NOT NULL NULL 199 | @CervantesPR1 Thing is remakes are only required with old ass games. Something from PS1/PS2 because they're outdated.. that's why yakuza 1 & 2 needed remakes and Final Fantasy VII, PS3 games hold up hence why it's a 1080p/4k 60fps port. NOT NULL NULL 200 | Now that the Obama Admin. and their hacks are being exposed, all of the guilty with a brain are suddenly silent and the rest are screaming foul play! We will hear their propaganda well after they are wearing their orange pantsuits! 🍿Popcorn anyone!🍿🍿🍿🍿🍿 NOT NULL NULL 201 | my case manager just met me at the gas station and is bad ass OFF UNT NULL 202 | @NewYorker I get that puns are tempting, but the TV show title “Mad Men excluded women to emphasize ‘60s gender inequality. Referencing “Math Men” in a story about recent times unnecessarily erases women data scientists and engineers. NOT NULL NULL 203 | @qatarairways Hello, I need help getting past this error kindly... https://t.co/nWPgLfmLmH NOT NULL NULL 204 | @NewYorker Next! 
NOT NULL NULL 205 | this was my shit !! 🕺🏾 you wanna see some ass i wanna see some cash 😛 https://t.co/Nm3DRoXHY6 OFF UNT NULL 206 | @BreitbartNews She slept with him for money & you’re honoring her for resisting him?😂😂😂😂😂😂 Your level of stupidity amazes me!😂😂😂😂😂 OFF TIN IND 207 | @NewYorker @StelioLardas John McCain is a loser presidential candidate that stayed in office far too long and caused the Obama administration endless hateful comments. OFF TIN IND 208 | @BreitbartNews Schumer is dishonest, a liar and a corrupt representative. https://t.co/KTTJc9j4Su OFF TIN IND 209 | All that crap you talked online about her......I’m glad you met Cris, she is one of the nicest people you will ever meet https://t.co/TWigCloKvE NOT NULL NULL 210 | @NewYorker & utterly radiant.... NOT NULL NULL 211 | I love JC but he needs to start systematically taking Theresa May down. She is handing him the ammunition, he needs to use it! #PMQs OFF TTH IND 212 | @NewYorker @WakeUp2News I’m jealous of #Texas & #Georgia. You have some great candidates running. Meanwhile in #Florida, we are stuck with less than mediocre senator @marcorubio for another 4 1/2 yrs. Go out there and vote. Don’t waste this opportunit OFF TIN IND 213 | @SamWoodIII @NewJerseyAG @johnboenher @JanHefler @freedomisgreen @PhilMurphyNJ I just found this insect egg in my “medical marijuana” from CuraLeaf. It cost me $55 for 1/8. #socialjustice https://t.co/f66PXKbs6p NOT NULL NULL 214 | @BreitbartNews The Catholic Church is really screwed up. Nothing new here. OFF TIN GRP 215 | Why do people think I'm fucking velha all the time? OFF UNT NULL 216 | Arizona Supreme Court rules the legislature cannot criminalize marijuana possession on university campuses if a person has a medical marijuana card. Read the opinion https://t.co/bmARomcQtl @AZPMnews NOT NULL NULL 217 | @BreitbartNews Artwork" https://t.co/JJDXmCy22r" NOT NULL NULL 218 | @madimozart @TimesNow He is a man of action? Lol. The actions you see are illusions. 
NOT NULL NULL 219 | Cannabis has been suggested to be an effective painkiller for most people. Follow these tips to have a productive conversation with you doctor about #MedicalMarijuana for #ChronicPain management. https://t.co/IO40LmMDvu NOT NULL NULL 220 | The Arizona Supreme Court is scheduled to rule on the legality of medical marijuana on college campuses. https://t.co/De7o7LEK4J NOT NULL NULL 221 | Things HAVE changed, and we can change more! https://t.co/7fHyssJc2Y NOT NULL NULL 222 | @NewYorker Human incels could learn a thing or two from their fruitfly brethren. NOT NULL NULL 223 | @RickCarioti89 @AT12397451 @NYDailyNews You have called me “nuts” and the only reason I haven’t blocked you was because I thought you were a teenager and don’t know better. Now you insulted me again and you are done. NOT NULL NULL 224 | Click here for more Pictures: https://t.co/mclCwzPkQk https://t.co/DIIUHrUcdK NOT NULL NULL 225 | @BreitbartNews A Netflix Barry Soetoro aka Barack Obama Production Pedo Joe goes to Washington ! https://t.co/57tFDydmPv NOT NULL NULL 226 | @NewYorker Kill the traitors. OFF TTH OTH 227 | My wrist been fucked up for nearly a month now . This time im really going to the hospital to see what the fuck is wrong with it OFF UNT IND 228 | We dare you not to sing along. @frankievalli live with hits like “Sherry,” “Big Girls Don’t Cry,” and “Can’t Take My Eyes Off You” this Sunday in the Island Event Center. A few tickets remain. Get them before they are gone: https://t.co/GIYIPeTSkl https://t.co/zec9F3ILXa NOT NULL NULL 229 | @mishacollins That is always the way, someone says “sensible gun control” & they somehow hear “no more guns”. 
Most people won’t have to worry about heightened regulations affecting their ability to buy guns, but it will help to ensure people who shoul NOT NULL NULL 230 | I wish I cared more NOT NULL NULL 231 | If I pull up to yo crib and you offer me cockroach milk you getting yo ass beaten https://t.co/0fOlxyOone OFF TIN IND 232 | @ChewyBB @Femi_Sorry @Jacob_Rees_Mogg Poor he is not brought down he wasn't Hypocrite he isn't. Brighter than you ? Most definitely OFF TIN GRP 233 | @BreitbartNews Crying Chuck and the rest are on the run. 😂😂😂😂😂😂😂 NOT NULL NULL 234 | @jbwalk38 @BrownRampage @gavin_rowell @stephenasmith i didnt say he is better player now did i? i said he is not as good as he looked to be on lottery teams 1st option compared to being on a contending team NOT NULL NULL 235 | @BreitbartNews From Jimmy's comments. Jimmy may not know this, but when you type in Melania", Twitter automatically corrects it to "Melanie". So lay off the BS Jimmy until you know the scoop." OFF TIN IND 236 | @abstex Best wishes cocker. OFF TIN IND 237 | Common sense has come back to America. Hogg boy and CNN are now seen for what they are. Childish and bald faced liars. https://t.co/fBNN1kpPbx OFF TIN ORG 238 | i am glad that she feels happy now and i never thought that their relationship was toxic I always said that they had an healthy relationship ship but I'm so sad to hear that. I'm glad that she got out of this relationship and she is right. nothing but respect for my lil woman https://t.co/mYEliqWD14 NOT NULL NULL 239 | @NewYorker Another titan bites the dust. Can remember reading American Pastoral & being in awe of the artistry of the language & so-real-you-felt- like-you-knew-them characters. He had a real gift for weaving together complex, thought provoking st NOT NULL NULL 240 | @shabana_chisty I know right. 😂 he is a living emoji, no doubt in that. 😍😂 NOT NULL NULL 241 | *the citizens of Gai’dence all cheer for their young hero. 
They don’t yet realize who he is.* Hello...h-hey...*he waves to the cheering crowd, feeling a little flustered* NOT NULL NULL 242 | @SpeakerRyan @POTUS Why do so-called 'conservatives' want to control Legislatively everything about a woman's reproduction but have such an aversion to men's responsibility in bringing these unwanted pregnancies and why do they have such an aversion to Le NOT NULL NULL 243 | @NewYorker Make it so. NOT NULL NULL 244 | We dissected a seed today and discovered so much! We made a connection to the great scientist, Eugenie Clark. She dissected fish to learn more about them. (Thanks @MrsStreek302 for sharing your idea with us!) https://t.co/h2as9iqg6O NOT NULL NULL 245 | @curiositynsense @WSJ @rajendra_hash @amnesty Knowing Amnasty and other similar groups ,they always have their own hidden agenda. Do not trust them, they are setting us up for nasty surprise when we are going into election mode in 2019. NOT NULL NULL 246 | @Sacrimoni2John @krassenstein @pcblonde1 @Thomas1774Paine How? Also, how do you have nothing if it happened so long ago? It would seem that you are nothing more than a #pigeon, pooping upon the chess board of this thread. Derphearder. OFF TIN IND 247 | @TrisGretton Hi Tristan, thanks for getting in touch. I'm afraid you are through to Tesco Bank, you would need to contact Clubcard for any information regarding Clubcard points. However, I can tell you that all vouchers will now be worth 3 times the value NOT NULL NULL 248 | @GuardOfForever @TashaKheiriddin This is being driven by MADD. They are one of the biggest revenge mobs I have ever seen. If they truly cared about society, they would have the same fervor and anger for distracted driving as they do DUIs. NOT NULL NULL 249 | @BreitbartNews This is pretty sad, even for you guys. #TrumpBros https://t.co/TcR6Dy91To NOT NULL NULL 250 | @MaxPollard92 He should have gone arsenal. 
NOT NULL NULL 251 | @PfeifferPforeva Omg she is so beautiful 😍 NOT NULL NULL 252 | @Julius_S_Malema @EFFSouthAfrica @Sentletse he is getting ready for the EFF https://t.co/6qebmHYFhX NOT NULL NULL 253 | Y’all remember that California vs NYC bbq tweet? This already shits on both https://t.co/fy3Mr0yW8m OFF UNT NULL 254 | Welcome aboard Body Count Fighting you are now an official #MetalMinion of the #BrutalBrigade https://t.co/hnUqyjOivx https://t.co/ji8LKJ01iV NOT NULL NULL 255 | @OriginalMGM @VibeHi @TomiLahren 😂😂😂😂😂trigger? Your the one calling people a pussy. I dont have time for your fox news foolish ass. Ive herd all the same arguements from fools much smarter than you. You dont want a debate you want attention. Hire a OFF TIN IND 256 | Medical Marijuana Miracle: Cannabis Cream Helps Baby With Rare Skin Disease! https://t.co/IKRbtJZMnd NOT NULL NULL 257 | Holy shit," one man remarked as he crossed the bathroom's threshold on Tuesday, before whistling in astonished approval. "It don't even smell bad yet!" https://t.co/NxSGRod9Df via @Gothamist" OFF UNT NULL 258 | @BreitbartNews Will she be satisfying attendees with love oral performances? OFF TIN IND 259 | @BreitbartNews People need to go to jail NOT NULL NULL 260 | Lol I’m so tired of this world man. Shit just keep getting worse https://t.co/0taTDrGpc7 OFF UNT NULL 261 | @NewYorker 'My values are not his values,' John #Feeley, the Ambassador to #Panama, said of #Trump." "I think most people want a president who they trust to tell the truth always and who has some basic moral values,including loyalty to his own wife." Jimm NOT NULL NULL 262 | @BreitbartNews @GOP @FLOTUS @realDonaldTrump right to ger separate bedroom. NOT NULL NULL 263 | @Keith_a_Trip Styling G is a fuckkin classic OFF UNT NULL 264 | My mom just called me and said she is joining the NFL boycott. How many of yall are with us? 
F that league #NFLBoycott OFF UNT NULL 265 | @Tukeirai text me videos 😂 NOT NULL NULL 266 | There are many #autoimmunediseases on this list... what do you think? https://t.co/57QAeMkwh5 NOT NULL NULL 267 | @NewYorker LUXURY LINERS, MOBY DICK, AND THE GIPSY MOTH - 'I must go down to the seas again'. Loving the sea but hating to be on it; the story of a vicarious love of water https://t.co/rOmOzGIGlr https://t.co/ttRozqt08F NOT NULL NULL 268 | BUSTED!!! Kemi Olunloyo Replies Linda Ikeji… Expose Photos Of The MOONBUMP She Is Wearing [See Photos]… https://t.co/EjvT744jB5 NOT NULL NULL 269 | @realDonaldTrump You are all utterly delusional. If you were really pro-life" you would enact gun control measures to keep young people from being mass murdered in schools. If you were really "pro-life" you wouldn't make cuts to SNAP, Chip and Medicaid. Y NOT NULL NULL 270 | No like orgasm, nut is what comes out and don't chew it" https://t.co/bIktxkQjfX" OFF UNT NULL 271 | Retweeted Dr. Naseem Beauchman (@DoctorNas): You are not for everyone . That’s ok . Talk to the people who can hear you. NOT NULL NULL 272 | FL: Is It Safe To Give My Pet Medical Marijuana | 420 MAGAZINE ® https://t.co/Zsi1GWpRcy NOT NULL NULL 273 | U.S. attorney: Medical marijuana banking issue is not just a West Virginia issue: https://t.co/IMmSD3giyr NOT NULL NULL 274 | @BreitbartNews Jim Carey was APPROPRIATELY cast for the lead in DUMB and DUMBER. NOT NULL NULL 275 | AZ supreme court decision comes down: medical marijuana is now OK on campus, writes @rachelleingang https://t.co/nRfsHITsKL NOT NULL NULL 276 | Yeah not eveey relationship you have with the opposite sex has to be sexual/romantic. https://t.co/iaqSjxftbo NOT NULL NULL 277 | You are hurting Twitters feelings. https://t.co/kFeTIGibDx NOT NULL NULL 278 | @BillOReilly Kasich, your good buddy in Ohio signed a bill legalizing medical marijuana in 2016. Put that in your pipe and smoke it NOT NULL NULL 279 | Medical Marijuana, Inc. 
Announces Q1 2018 as the Largest Sales Revenue Quarter in Company ... https://t.co/rjnsIHCZc5 NOT NULL NULL 280 | Epilepsy Know For those of you wanting t know about the U.S States that are approving of Medical Marijuana #EK @friendlychoice https://t.co/4GtwBmX5da NOT NULL NULL 281 | @snapdeal don’t buy anything from Snapdeal.. they are fooling customers NOT NULL NULL 282 | @realDonaldTrump Fuck you, you fat piece of shit OFF TIN IND 283 | Missouri House’s medical marijuana bill is nothing but a smokescreen https://t.co/ENxcGmVaNs NOT NULL NULL 284 | @HardballChris @NicoleHockley She don't speak for me, I speak for myself. And as someone who's been shot- Screw gun control. NOT NULL NULL 285 | Update 3: Per author's facebook post, author now says this was NOT done by an impersonator. It was done by her via her agent, but it was a miscommunication" & she is now cancelling the "forever" trademark request." NOT NULL NULL 286 | Priorities are private. No matter how busy you are, remember that you are not a machine. Taste Life! Love. Embrace all the flavors. When you're clear about what you sense, the natural hum of happiness reflects in your eyes. Everyone can see it. https://t.co/KHIW236pr7 NOT NULL NULL 287 | @NewYorker Also a great Zionist that will be dearly missed 🇮🇱🇮🇱🇮🇱🇮🇱 RIP NOT NULL NULL 288 | What human diseases can be treated by medical marijuana? #BuyMyWeedOnline #MedicalMarijuana #Cannabis https://t.co/V8dq6fq62t NOT NULL NULL 289 | @bobbysbadbitch i’m tryna marry johnny, not you so.. 👋🏻 NOT NULL NULL 290 | @NewYorker This movie was so intriguing to me. History I was not aware of. I didn't know it was a book. Off to @amazon https://t.co/Pck5UUrAtr NOT NULL NULL 291 | Don't you just hate when you are out and about and your phone dies? #Lincoln #LNK comes thru for us with these charging stations! What a great idea for those who love to spend time at the parks! 
https://t.co/XBgpjDYBXs https://t.co/4gbkdqeVFi NOT NULL NULL 292 | 😍😍😍😍 he is fine asf https://t.co/HczseFrHQu OFF UNT NULL 293 | @FoxNews Liberals are losing their minds. They care more for the environment than for babies.They care more for street gangs that are a bunch of animals than for cops. Now they want to ban straws. They are losing their minds. NOT NULL NULL 294 | @BreitbartNews What did she resist? NOT NULL NULL 295 | @Glinner check this cunt out! https://t.co/H03SmnL9wd OFF UNT NULL 296 | @seanhannity Yeah thanks to your Nobel Emmy award winning idiot chief flip flopping on everything from Iran to gun control. OFF TIN IND 297 | @LauraLoomer It is amazing how short sighted our youth (& adults) can be. If they want gun control, free education, & free healthcare they should try living in Cuba to see what happens when government controls everything. #NoBueno NOT NULL NULL 298 | @BreitbartNews Can anyone say judicial overreach"?" NOT NULL NULL 299 | @BreitbartNews Why the hell does anything need to be gay enough"? Talk about narcissists, the gay community does everything they can to shame others into siding with them. Until the gay faction of the APA took over several years ago, gay was considered a NOT NULL NULL 300 | @TravisYeggy @MrGoodMemes @NathanHRubin LOL! I literally just provided you with the CDC report that states gun control doesn't show any proof of stopping violent crime. Also, that armed citizens use their firearms to stop TONS of violent acts.This is defi NOT NULL NULL 301 | @NewYorker And apparently Congress is willing to let him get away with anything/eveything. SMH NOT NULL NULL 302 | she got them ass shots that i like 👅 OFF UNT NULL 303 | @thecomeback @JABItalia Fuck @APrecourt OFF TIN IND 304 | Hey @LIRR , you are disgusting. 
https://t.co/KTcsBxt8Lb OFF TIN ORG 305 | @NewYorker Same NOT NULL NULL 306 | There is no such thing as gun control, there is only gun centralization into the hands of government; which has proven to be the most corrupt and murderous institution in the history of the world. https://t.co/mv77ClXOmh... https://t.co/R9kaQXwH5P NOT NULL NULL 307 | @NewYorker Here's some positive literature for summer perusal https://t.co/es7uc9MJ64 https://t.co/pzFNjUXASp NOT NULL NULL 308 | @NewYorker Just because she'd be the first don't vote for her just because. Look what happened with Obama. Aka la… https://t.co/B4doIPsDuH NOT NULL NULL 309 | DFS Hitman Lineup - DraftKings Only (GS vs. HOU only) #dfs https://t.co/VWxtg0o5Y5 https://t.co/c4UYPaKq8Y NOT NULL NULL 310 | If you missed the Dateline episode - Growing Promise and have been curious about how medical marijuana and CBD oil has been beneficial for so many - have a look!! This plant has been demonized and scrutinized and yet... https://t.co/dzYtVCgby3 NOT NULL NULL 311 | The Santa Fe shooter is not a victim" he is a murderer. The moment he picked up the gun and intended to shoot people he became a criminal and calling him anything else is an insult to his victims." NOT NULL NULL 312 | Im really grateful and i really feel special that i was born in the same day with Min Yoongi. Imagine i born and in… https://t.co/dBdPQdCVCF NOT NULL NULL 313 | @jchaltiwanger I guess this did reveal that we need to start with defining gun control on a national level, and acr… https://t.co/UfzylvJL01 NOT NULL NULL 314 | This looks like a really weird orgasm." 
https://t.co/nbjqYX9eN3" NOT NULL NULL 315 | @MrsMeganTheRed Sometimes my brain hits upon a thought or impression and it *ding*s back, ringing true like crystal… https://t.co/0WY2lEYf5I NOT NULL NULL 316 | @Providence1775 @SoCalTheist @TakeThatEpi @RosaRubicon @FreeThoughtPgh @ClayCassius @RJDownard @EBatterson… https://t.co/RUKGkMgbqB NOT NULL NULL 317 | Gun control support fades three months after Florida massacre: Reuters/Ipsos poll https://t.co/cREdhfF9Ui via Reuters NOT NULL NULL 318 | The hardest day to save is today!" So if it is important, schedule a time to handle it. An advisor may make it eas… https://t.co/RzF6Xyc9jv" NOT NULL NULL 319 | Rest well, Christian. The fact that you died is a horrible tragedy and may your family be covered in the precious b… https://t.co/CufDFHE03p NOT NULL NULL 320 | HAHAHA WHAT??? So does that mean the WWE Superstars that blocked me have to unblock me?? 😂😂 CAN I SUE??? #HolyShit https://t.co/SfS4l59U4X OFF UNT NULL 321 | -------------------------------------------------------------------------------- /embedding.py: -------------------------------------------------------------------------------- 1 | from os import listdir 2 | from gensim.models import Word2Vec 3 | from tqdm import tqdm 4 | import numpy as np 5 | from sklearn.feature_extraction.text import TfidfVectorizer 6 | 7 | def tfid(text_vector): 8 | vectorizer = TfidfVectorizer() 9 | untokenized_data =[' '.join(tweet) for tweet in tqdm(text_vector, "Vectorizing...")] 10 | vectorizer = vectorizer.fit(untokenized_data) 11 | vectors = vectorizer.transform(untokenized_data).toarray() 12 | return vectors 13 | 14 | def tfid_test(train_vectors, test_vectors): 15 | vectorizer = TfidfVectorizer() 16 | untokenized_data =[' '.join(tweet) for tweet in train_vectors] 17 | vectorizer = vectorizer.fit(untokenized_data) 18 | 19 | untokenized_data =[' '.join(tweet) for tweet in test_vectors] 20 | vectors = vectorizer.transform(untokenized_data).toarray() 21 | return vectors 22 | 23 | 
# --------------------------------------------------------------------------------
# /helper.py:
# --------------------------------------------------------------------------------
import numpy as np


def get_vectors(vectors, labels, keyword):
    """Return the vectors whose paired label equals ``keyword``.

    ``vectors`` and ``labels`` are walked in lockstep, so position ``i`` of
    one must describe position ``i`` of the other.

    Returns:
        A new list holding the matching vectors in their original order, or
        ``None`` (after printing a warning) when the two inputs differ in
        length.
    """
    if len(vectors) != len(labels):
        print("Unmatching sizes!")
        return None
    return [vector for vector, label in zip(vectors, labels) if label == keyword]


# --------------------------------------------------------------------------------
# /load_test_data.py:
# --------------------------------------------------------------------------------
import pandas as pd
import preprocessing, embedding, helper, classifying
import copy


def load(train_vector, task='A'):
    """Build the test vectors and labels for one subtask of the trial set.

    The trial file is read as a header-less, tab-separated table with the
    columns ``tweet``, ``subtask_a``, ``subtask_b`` and ``subtask_c``.  Every
    tweet is cleaned, tokenized, stripped of stop words and stemmed with the
    ``preprocessing`` helpers, then turned into a numerical vector by
    ``embedding.tfid_test``.

    Args:
        train_vector: Tokenized training tweets, handed unchanged to
            ``embedding.tfid_test`` together with the tokenized test tweets.
        task: Subtask selector, ``'A'``, ``'B'`` or ``'C'`` in either case.

    Returns:
        A ``(vectors, labels)`` tuple for the requested subtask, or ``None``
        (after printing ``"Wrong Subtask!"``) for an unknown selector.
    """
    # Reject a bad selector before doing any file I/O or preprocessing.
    subtask = task.upper() if isinstance(task, str) else None
    if subtask not in ('A', 'B', 'C'):
        print("Wrong Subtask!")
        return None

    directory = "datasets/trial-data/offenseval-trial.txt"
    print("Preparing Test Data...")
    data = pd.read_csv(directory, sep='\t', header=None)
    data.columns = ["tweet", "subtask_a", "subtask_b", "subtask_c"]

    # Each step returns a new Series, so the raw "tweet" column is untouched.
    tokens = (
        data["tweet"]
        .apply(preprocessing.take_data_to_shower)
        .apply(preprocessing.tokenize)
        .apply(preprocessing.remove_stop_words)
        .apply(preprocessing.stem_and_lem)
    )
    text_vector = tokens.tolist()

    vectors_a = embedding.tfid_test(train_vector, text_vector)  # Numerical Vectors A
    labels_a = data["subtask_a"].values.tolist()  # Subtask A Labels
    if subtask == 'A':
        return vectors_a, labels_a

    # Subtask B only covers the rows labelled 'OFF' in subtask A.
    vectors_b = helper.get_vectors(vectors_a, labels_a, "OFF")  # Numerical Vectors B
    labels_b = data.query("subtask_a == 'OFF'")["subtask_b"].values.tolist()  # Subtask B Labels
    if subtask == 'B':
        return vectors_b, labels_b

    # Subtask C only covers the rows labelled 'TIN' in subtask B.
    vectors_c = helper.get_vectors(vectors_b, labels_b, "TIN")  # Numerical Vectors C
    labels_c = data.query("subtask_b == 'TIN'")["subtask_c"].values.tolist()  # Subtask C Labels
    return vectors_c, labels_c


# --------------------------------------------------------------------------------
# /preprocessing.py:
# --------------------------------------------------------------------------------
import pandas as pd
import re, nltk
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Filled on first use so the stop-word corpus is read once, not once per tweet.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on first use only."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def take_data_to_shower(tweet):
    """Strip placeholder tokens and contraction suffixes, keep letters only.

    The substrings ``URL``, ``@USER``, ``'ve``, ``n't``, ``'s`` and ``'m`` are
    removed first; afterwards every character that is not an ASCII letter is
    replaced by a single space.
    """
    noises = ['URL', '@USER', '\'ve', 'n\'t', '\'s', '\'m']

    for noise in noises:
        tweet = tweet.replace(noise, '')

    return re.sub(r'[^a-zA-Z]', ' ', tweet)


def tokenize(tweet):
    """Lower-case ``tweet`` and split it into tokens with ``word_tokenize``."""
    return word_tokenize(tweet.lower())


def remove_stop_words(tokens):
    """Drop English stop words, blank tokens and single-character tokens.

    Returns a new list; ``tokens`` itself is not modified.
    """
    stop_words = _english_stop_words()
    return [
        token
        for token in tokens
        if token not in stop_words
        and token.replace(' ', '') != ''
        and len(token) > 1
    ]


def stem_and_lem(tokens):
    """Lemmatize then stem every token, dropping results of length <= 1.

    Each token goes through ``wordnet_lemmatizer.lemmatize`` and the result
    through ``lancaster_stemmer.stem``.
    """
    normalized = (
        lancaster_stemmer.stem(wordnet_lemmatizer.lemmatize(token))
        for token in tokens
    )
    return [token for token in normalized if len(token) > 1]

# --------------------------------------------------------------------------------