# Standard library
import re
import string

# Third-party
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# One shared English Snowball stemmer, built once and reused by the
# text-cleaning step instead of being re-created per tweet.
stemmer = nltk.SnowballStemmer("english")

# Read the tweet dataset; the path is relative to the working directory.
data = pd.read_csv("twitter.csv")
data.head()

\n", 51 | "\n", 64 | "\n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | "

	Unnamed: 0	count	offensive_language	neither	class	tweet
0	0	3	0	3	2	!!! RT @mayasolovely: As a woman you shouldn't...
1	1	3	3	0	1	!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2	2	3	3	0	1	!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3	3	3	2	1	1	!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4	4	6	6	0	1	!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...

\n", 130 | "

# Human-readable name for each integer code stored in the "class" column.
class_to_label = {
    0: "Hate Speech",
    1: "Offensive Language",
    2: "No Hate and Offensive",
}

# Add the text label alongside the numeric code; codes missing from the
# mapping come out of Series.map as NaN.
data["labels"] = data["class"].map(class_to_label)
data.head()

\n", 171 | "\n", 184 | "\n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | "

	Unnamed: 0	count	offensive_language	neither	class	tweet	labels
0	0	3	0	3	2	!!! RT @mayasolovely: As a woman you shouldn't...	No Hate and Offensive
1	1	3	3	0	1	!!!!! RT @mleew17: boy dats cold...tyga dwn ba...	Offensive Language
2	2	3	3	0	1	!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...	Offensive Language
3	3	3	2	1	1	!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...	Offensive Language
4	4	6	6	0	1	!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...	Offensive Language

\n", 256 | "

# Keep only the raw text and its label -- the two columns the cleaning and
# modelling cells read.
kept_columns = ["tweet", "labels"]
data = data[kept_columns]
data.head()

\n", 297 | "\n", 310 | "\n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | "

	tweet	labels
0	!!! RT @mayasolovely: As a woman you shouldn't...	No Hate and Offensive
1	!!!!! RT @mleew17: boy dats cold...tyga dwn ba...	Offensive Language
2	!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...	Offensive Language
3	!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...	Offensive Language
4	!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...	Offensive Language

\n", 346 | "

nltk.download('stopwords')
stopword = set(stopwords.words('english'))

# Patterns are compiled once, as raw strings. Written as ordinary string
# literals, sequences such as \[ \S \w \d are invalid escapes that newer
# Python versions warn about.
_BRACKETED_RE = re.compile(r'\[.*?\]')             # text inside square brackets
_URL_RE = re.compile(r'https?://\S+|www\.\S+')     # http(s):// and www. links
_HTML_TAG_RE = re.compile(r'<.*?>+')               # anything shaped like <...>
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_HAS_DIGIT_RE = re.compile(r'\w*\d\w*')            # any word containing a digit

# Applied in this order; punctuation must go after URLs/tags so those
# patterns can still see their delimiters.
_REMOVAL_PATTERNS = (
    _BRACKETED_RE,
    _URL_RE,
    _HTML_TAG_RE,
    _PUNCTUATION_RE,
    _HAS_DIGIT_RE,
)


def clean(text):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Steps: lowercase, strip bracketed text, URLs, HTML-like tags,
    punctuation and digit-bearing words, drop English stopwords, then stem
    every remaining word with the module-level ``stemmer``.

    Args:
        text: The raw tweet. Non-string values are converted with ``str()``.

    Returns:
        The cleaned text with tokens joined by single spaces (an empty
        string when nothing survives the filtering).
    """
    text = str(text).lower()
    for pattern in _REMOVAL_PATTERNS:
        text = pattern.sub('', text)

    # split() with no argument splits on any whitespace run, so repeated
    # spaces no longer yield empty tokens and a line break separates words
    # instead of gluing its neighbours together.
    kept_words = (word for word in text.split() if word not in stopword)
    return " ".join(stemmer.stem(word) for word in kept_words)


data["tweet"] = data["tweet"].apply(clean)
data.head()
407 | "text/html": [ 408 | "

\n", 409 | "\n", 422 | "\n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | "

	tweet	labels
0	rt mayasolov woman shouldnt complain clean ho...	No Hate and Offensive
1	rt boy dat coldtyga dwn bad cuffin dat hoe ...	Offensive Language
2	rt urkindofbrand dawg rt ever fuck bitch sta...	Offensive Language
3	rt cganderson vivabas look like tranni	Offensive Language
4	rt shenikarobert shit hear might true might f...	Offensive Language

\n", 458 | "

# --- Feature extraction and train/test split ---------------------------
x = np.array(data["tweet"])
y = np.array(data["labels"])

cv = CountVectorizer()
X = cv.fit_transform(x)  # learn the vocabulary and build the count matrix
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# --- Train, evaluate, and classify one user-supplied text --------------
# A fixed random_state makes the fitted tree (and its score) repeatable
# from run to run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Show the held-out accuracy; as a bare mid-cell expression it was
# computed but never displayed.
print("Test accuracy:", clf.score(X_test, y_test))

user = input("Enter a Text: ")

# The vectorizer was fitted on text that went through clean(), so the
# user's text must get the same preprocessing or its words will not match
# the stemmed vocabulary. The result is kept in its own variable rather
# than overwriting the `data` DataFrame, so the cells above can be re-run.
# The sparse matrix is passed directly; no dense conversion is needed.
user_features = cv.transform([clean(user)])
output = clf.predict(user_features)
print(output)
| ] 534 | } --------------------------------------------------------------------------------