└── Hate_Speech_Detection_Model.ipynb /Hate_Speech_Detection_Model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Hate Speech Detection Model.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "colab": { 23 | "base_uri": "https://localhost:8080/", 24 | "height": 206 25 | }, 26 | "id": "Fg3Z9CYuRMjk", 27 | "outputId": "9117c671-d4b2-4b41-f18b-70df7ad85774" 28 | }, 29 | "source": [ 30 | "import pandas as pd\n", 31 | "import numpy as np\n", 32 | "from sklearn.feature_extraction.text import CountVectorizer\n", 33 | "from sklearn.model_selection import train_test_split\n", 34 | "from sklearn.tree import DecisionTreeClassifier\n", 35 | "import re\n", 36 | "import nltk\n", 37 | "stemmer = nltk.SnowballStemmer(\"english\")\n", 38 | "from nltk.corpus import stopwords\n", 39 | "import string\n", 40 | "\n", 41 | "data = pd.read_csv(\"twitter.csv\")\n", 42 | "data.head()" 43 | ], 44 | "execution_count": 5, 45 | "outputs": [ 46 | { 47 | "output_type": "execute_result", 48 | "data": { 49 | "text/html": [ 50 | "
\n", 51 | "\n", 64 | "\n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | "
Unnamed: 0counthate_speechoffensive_languageneitherclasstweet
0030032!!! RT @mayasolovely: As a woman you shouldn't...
1130301!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2230301!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3330211!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4460601!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
\n", 130 | "
" 131 | ], 132 | "text/plain": [ 133 | " Unnamed: 0 count ... class tweet\n", 134 | "0 0 3 ... 2 !!! RT @mayasolovely: As a woman you shouldn't...\n", 135 | "1 1 3 ... 1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba...\n", 136 | "2 2 3 ... 1 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...\n", 137 | "3 3 3 ... 1 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...\n", 138 | "4 4 6 ... 1 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...\n", 139 | "\n", 140 | "[5 rows x 7 columns]" 141 | ] 142 | }, 143 | "metadata": { 144 | "tags": [] 145 | }, 146 | "execution_count": 5 147 | } 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "colab": { 154 | "base_uri": "https://localhost:8080/", 155 | "height": 206 156 | }, 157 | "id": "VzZNxA5wRlW0", 158 | "outputId": "d20ef5ce-a88c-4fa5-b9cb-decc8eb2c96c" 159 | }, 160 | "source": [ 161 | "data[\"labels\"] = data[\"class\"].map({0: \"Hate Speech\", 1: \"Offensive Language\", 2: \"No Hate and Offensive\"})\n", 162 | "data.head()" 163 | ], 164 | "execution_count": 6, 165 | "outputs": [ 166 | { 167 | "output_type": "execute_result", 168 | "data": { 169 | "text/html": [ 170 | "
\n", 171 | "\n", 184 | "\n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | "
Unnamed: 0counthate_speechoffensive_languageneitherclasstweetlabels
0030032!!! RT @mayasolovely: As a woman you shouldn't...No Hate and Offensive
1130301!!!!! RT @mleew17: boy dats cold...tyga dwn ba...Offensive Language
2230301!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...Offensive Language
3330211!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...Offensive Language
4460601!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...Offensive Language
\n", 256 | "
" 257 | ], 258 | "text/plain": [ 259 | " Unnamed: 0 ... labels\n", 260 | "0 0 ... No Hate and Offensive\n", 261 | "1 1 ... Offensive Language\n", 262 | "2 2 ... Offensive Language\n", 263 | "3 3 ... Offensive Language\n", 264 | "4 4 ... Offensive Language\n", 265 | "\n", 266 | "[5 rows x 8 columns]" 267 | ] 268 | }, 269 | "metadata": { 270 | "tags": [] 271 | }, 272 | "execution_count": 6 273 | } 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "metadata": { 279 | "colab": { 280 | "base_uri": "https://localhost:8080/", 281 | "height": 206 282 | }, 283 | "id": "_wc0epqHRrOW", 284 | "outputId": "b770b91f-171f-45d2-daff-bc045ab88ce5" 285 | }, 286 | "source": [ 287 | "data = data[[\"tweet\", \"labels\"]]\n", 288 | "data.head()" 289 | ], 290 | "execution_count": 7, 291 | "outputs": [ 292 | { 293 | "output_type": "execute_result", 294 | "data": { 295 | "text/html": [ 296 | "
\n", 297 | "\n", 310 | "\n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | "
tweetlabels
0!!! RT @mayasolovely: As a woman you shouldn't...No Hate and Offensive
1!!!!! RT @mleew17: boy dats cold...tyga dwn ba...Offensive Language
2!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...Offensive Language
3!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...Offensive Language
4!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...Offensive Language
\n", 346 | "
" 347 | ], 348 | "text/plain": [ 349 | " tweet labels\n", 350 | "0 !!! RT @mayasolovely: As a woman you shouldn't... No Hate and Offensive\n", 351 | "1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba... Offensive Language\n", 352 | "2 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... Offensive Language\n", 353 | "3 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... Offensive Language\n", 354 | "4 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... Offensive Language" 355 | ] 356 | }, 357 | "metadata": { 358 | "tags": [] 359 | }, 360 | "execution_count": 7 361 | } 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "metadata": { 367 | "colab": { 368 | "base_uri": "https://localhost:8080/", 369 | "height": 241 370 | }, 371 | "id": "Ad8gNdT2Rvz1", 372 | "outputId": "8d7e722b-0774-493e-d8af-35b5f3bb05e1" 373 | }, 374 | "source": [ 375 | "nltk.download('stopwords')\n", 376 | "stopword=set(stopwords.words('english'))\n", 377 | "\n", 378 | "def clean(text):\n", 379 | " text = str(text).lower()\n", 380 | " text = re.sub('\\[.*?\\]', '', text)\n", 381 | " text = re.sub('https?://\\S+|www\\.\\S+', '', text)\n", 382 | " text = re.sub('<.*?>+', '', text)\n", 383 | " text = re.sub('[%s]' % re.escape(string.punctuation), '', text)\n", 384 | " text = re.sub('\\n', '', text)\n", 385 | " text = re.sub('\\w*\\d\\w*', '', text)\n", 386 | " text = [word for word in text.split(' ') if word not in stopword]\n", 387 | " text=\" \".join(text)\n", 388 | " text = [stemmer.stem(word) for word in text.split(' ')]\n", 389 | " text=\" \".join(text)\n", 390 | " return text\n", 391 | "data[\"tweet\"] = data[\"tweet\"].apply(clean)\n", 392 | "data.head()" 393 | ], 394 | "execution_count": 9, 395 | "outputs": [ 396 | { 397 | "output_type": "stream", 398 | "text": [ 399 | "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", 400 | "[nltk_data] Unzipping corpora/stopwords.zip.\n" 401 | ], 402 | "name": "stdout" 403 | }, 404 | { 405 | "output_type": "execute_result", 406 | "data": { 407 | "text/html": [ 408 | "
\n", 409 | "\n", 422 | "\n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | "
tweetlabels
0rt mayasolov woman shouldnt complain clean ho...No Hate and Offensive
1rt boy dat coldtyga dwn bad cuffin dat hoe ...Offensive Language
2rt urkindofbrand dawg rt ever fuck bitch sta...Offensive Language
3rt cganderson vivabas look like tranniOffensive Language
4rt shenikarobert shit hear might true might f...Offensive Language
\n", 458 | "
" 459 | ], 460 | "text/plain": [ 461 | " tweet labels\n", 462 | "0 rt mayasolov woman shouldnt complain clean ho... No Hate and Offensive\n", 463 | "1 rt boy dat coldtyga dwn bad cuffin dat hoe ... Offensive Language\n", 464 | "2 rt urkindofbrand dawg rt ever fuck bitch sta... Offensive Language\n", 465 | "3 rt cganderson vivabas look like tranni Offensive Language\n", 466 | "4 rt shenikarobert shit hear might true might f... Offensive Language" 467 | ] 468 | }, 469 | "metadata": { 470 | "tags": [] 471 | }, 472 | "execution_count": 9 473 | } 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "metadata": { 479 | "id": "URdY2e5ASD-x" 480 | }, 481 | "source": [ 482 | "x = np.array(data[\"tweet\"])\n", 483 | "y = np.array(data[\"labels\"])\n", 484 | "\n", 485 | "cv = CountVectorizer()\n", 486 | "X = cv.fit_transform(x) # Fit the Data\n", 487 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" 488 | ], 489 | "execution_count": 10, 490 | "outputs": [] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "metadata": { 495 | "colab": { 496 | "base_uri": "https://localhost:8080/" 497 | }, 498 | "id": "y_M3gS0WSQR6", 499 | "outputId": "4107c611-5914-459e-f7c4-8a4fdb873cd6" 500 | }, 501 | "source": [ 502 | "clf = DecisionTreeClassifier()\n", 503 | "clf.fit(X_train,y_train)\n", 504 | "clf.score(X_test,y_test)\n", 505 | "user = input(\"Enter a Text: \")\n", 506 | "data = cv.transform([user]).toarray()\n", 507 | "output = clf.predict(data)\n", 508 | "print(output)" 509 | ], 510 | "execution_count": 12, 511 | "outputs": [ 512 | { 513 | "output_type": "stream", 514 | "text": [ 515 | "Enter a Text: Let's unite and kill all the people who don't value our religion.\n", 516 | "['Hate Speech']\n" 517 | ], 518 | "name": "stdout" 519 | } 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "metadata": { 525 | "id": "5mG-MJcOSS5L" 526 | }, 527 | "source": [ 528 | "" 529 | ], 530 | "execution_count": null, 531 | "outputs": [] 532 | } 533 | ] 534 | } --------------------------------------------------------------------------------