└── Hate_Speech_Detection_Model.ipynb
/Hate_Speech_Detection_Model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "Hate Speech Detection Model.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": []
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "code",
21 | "metadata": {
22 | "colab": {
23 | "base_uri": "https://localhost:8080/",
24 | "height": 206
25 | },
26 | "id": "Fg3Z9CYuRMjk",
27 | "outputId": "9117c671-d4b2-4b41-f18b-70df7ad85774"
28 | },
29 | "source": [
30 | "import pandas as pd\n",
31 | "import numpy as np\n",
32 | "from sklearn.feature_extraction.text import CountVectorizer\n",
33 | "from sklearn.model_selection import train_test_split\n",
34 | "from sklearn.tree import DecisionTreeClassifier\n",
35 | "import re\n",
36 | "import nltk\n",
37 | "stemmer = nltk.SnowballStemmer(\"english\")  # module-level stemmer; clean() calls stemmer.stem() on every word\n",
38 | "from nltk.corpus import stopwords\n",
39 | "import string\n",
40 | "\n",
41 | "data = pd.read_csv(\"twitter.csv\")  # relative path: the CSV must sit in the kernel's working directory\n",
42 | "data.head()"
43 | ],
44 | "execution_count": 5,
45 | "outputs": [
46 | {
47 | "output_type": "execute_result",
48 | "data": {
49 | "text/html": [
50 | "
\n",
51 | "\n",
64 | "
\n",
65 | " \n",
66 | " \n",
67 | " | \n",
68 | " Unnamed: 0 | \n",
69 | " count | \n",
70 | " hate_speech | \n",
71 | " offensive_language | \n",
72 | " neither | \n",
73 | " class | \n",
74 | " tweet | \n",
75 | "
\n",
76 | " \n",
77 | " \n",
78 | " \n",
79 | " | 0 | \n",
80 | " 0 | \n",
81 | " 3 | \n",
82 | " 0 | \n",
83 | " 0 | \n",
84 | " 3 | \n",
85 | " 2 | \n",
86 | " !!! RT @mayasolovely: As a woman you shouldn't... | \n",
87 | "
\n",
88 | " \n",
89 | " | 1 | \n",
90 | " 1 | \n",
91 | " 3 | \n",
92 | " 0 | \n",
93 | " 3 | \n",
94 | " 0 | \n",
95 | " 1 | \n",
96 | " !!!!! RT @mleew17: boy dats cold...tyga dwn ba... | \n",
97 | "
\n",
98 | " \n",
99 | " | 2 | \n",
100 | " 2 | \n",
101 | " 3 | \n",
102 | " 0 | \n",
103 | " 3 | \n",
104 | " 0 | \n",
105 | " 1 | \n",
106 | " !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... | \n",
107 | "
\n",
108 | " \n",
109 | " | 3 | \n",
110 | " 3 | \n",
111 | " 3 | \n",
112 | " 0 | \n",
113 | " 2 | \n",
114 | " 1 | \n",
115 | " 1 | \n",
116 | " !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... | \n",
117 | "
\n",
118 | " \n",
119 | " | 4 | \n",
120 | " 4 | \n",
121 | " 6 | \n",
122 | " 0 | \n",
123 | " 6 | \n",
124 | " 0 | \n",
125 | " 1 | \n",
126 | " !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... | \n",
127 | "
\n",
128 | " \n",
129 | "
\n",
130 | "
"
131 | ],
132 | "text/plain": [
133 | " Unnamed: 0 count ... class tweet\n",
134 | "0 0 3 ... 2 !!! RT @mayasolovely: As a woman you shouldn't...\n",
135 | "1 1 3 ... 1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba...\n",
136 | "2 2 3 ... 1 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...\n",
137 | "3 3 3 ... 1 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...\n",
138 | "4 4 6 ... 1 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...\n",
139 | "\n",
140 | "[5 rows x 7 columns]"
141 | ]
142 | },
143 | "metadata": {
144 | "tags": []
145 | },
146 | "execution_count": 5
147 | }
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "metadata": {
153 | "colab": {
154 | "base_uri": "https://localhost:8080/",
155 | "height": 206
156 | },
157 | "id": "VzZNxA5wRlW0",
158 | "outputId": "d20ef5ce-a88c-4fa5-b9cb-decc8eb2c96c"
159 | },
160 | "source": [
161 | "data[\"labels\"] = data[\"class\"].map({0: \"Hate Speech\", 1: \"Offensive Language\", 2: \"No Hate and Offensive\"})  # readable name per numeric class code; codes missing from the dict become NaN\n",
162 | "data.head()"
163 | ],
164 | "execution_count": 6,
165 | "outputs": [
166 | {
167 | "output_type": "execute_result",
168 | "data": {
169 | "text/html": [
170 | "\n",
171 | "\n",
184 | "
\n",
185 | " \n",
186 | " \n",
187 | " | \n",
188 | " Unnamed: 0 | \n",
189 | " count | \n",
190 | " hate_speech | \n",
191 | " offensive_language | \n",
192 | " neither | \n",
193 | " class | \n",
194 | " tweet | \n",
195 | " labels | \n",
196 | "
\n",
197 | " \n",
198 | " \n",
199 | " \n",
200 | " | 0 | \n",
201 | " 0 | \n",
202 | " 3 | \n",
203 | " 0 | \n",
204 | " 0 | \n",
205 | " 3 | \n",
206 | " 2 | \n",
207 | " !!! RT @mayasolovely: As a woman you shouldn't... | \n",
208 | " No Hate and Offensive | \n",
209 | "
\n",
210 | " \n",
211 | " | 1 | \n",
212 | " 1 | \n",
213 | " 3 | \n",
214 | " 0 | \n",
215 | " 3 | \n",
216 | " 0 | \n",
217 | " 1 | \n",
218 | " !!!!! RT @mleew17: boy dats cold...tyga dwn ba... | \n",
219 | " Offensive Language | \n",
220 | "
\n",
221 | " \n",
222 | " | 2 | \n",
223 | " 2 | \n",
224 | " 3 | \n",
225 | " 0 | \n",
226 | " 3 | \n",
227 | " 0 | \n",
228 | " 1 | \n",
229 | " !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... | \n",
230 | " Offensive Language | \n",
231 | "
\n",
232 | " \n",
233 | " | 3 | \n",
234 | " 3 | \n",
235 | " 3 | \n",
236 | " 0 | \n",
237 | " 2 | \n",
238 | " 1 | \n",
239 | " 1 | \n",
240 | " !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... | \n",
241 | " Offensive Language | \n",
242 | "
\n",
243 | " \n",
244 | " | 4 | \n",
245 | " 4 | \n",
246 | " 6 | \n",
247 | " 0 | \n",
248 | " 6 | \n",
249 | " 0 | \n",
250 | " 1 | \n",
251 | " !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... | \n",
252 | " Offensive Language | \n",
253 | "
\n",
254 | " \n",
255 | "
\n",
256 | "
"
257 | ],
258 | "text/plain": [
259 | " Unnamed: 0 ... labels\n",
260 | "0 0 ... No Hate and Offensive\n",
261 | "1 1 ... Offensive Language\n",
262 | "2 2 ... Offensive Language\n",
263 | "3 3 ... Offensive Language\n",
264 | "4 4 ... Offensive Language\n",
265 | "\n",
266 | "[5 rows x 8 columns]"
267 | ]
268 | },
269 | "metadata": {
270 | "tags": []
271 | },
272 | "execution_count": 6
273 | }
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "metadata": {
279 | "colab": {
280 | "base_uri": "https://localhost:8080/",
281 | "height": 206
282 | },
283 | "id": "_wc0epqHRrOW",
284 | "outputId": "b770b91f-171f-45d2-daff-bc045ab88ce5"
285 | },
286 | "source": [
287 | "data = data[[\"tweet\", \"labels\"]].copy()  # explicit copy, so later column assignments never trigger pandas' SettingWithCopyWarning\n",
288 | "data.head()"
289 | ],
290 | "execution_count": 7,
291 | "outputs": [
292 | {
293 | "output_type": "execute_result",
294 | "data": {
295 | "text/html": [
296 | "\n",
297 | "\n",
310 | "
\n",
311 | " \n",
312 | " \n",
313 | " | \n",
314 | " tweet | \n",
315 | " labels | \n",
316 | "
\n",
317 | " \n",
318 | " \n",
319 | " \n",
320 | " | 0 | \n",
321 | " !!! RT @mayasolovely: As a woman you shouldn't... | \n",
322 | " No Hate and Offensive | \n",
323 | "
\n",
324 | " \n",
325 | " | 1 | \n",
326 | " !!!!! RT @mleew17: boy dats cold...tyga dwn ba... | \n",
327 | " Offensive Language | \n",
328 | "
\n",
329 | " \n",
330 | " | 2 | \n",
331 | " !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... | \n",
332 | " Offensive Language | \n",
333 | "
\n",
334 | " \n",
335 | " | 3 | \n",
336 | " !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... | \n",
337 | " Offensive Language | \n",
338 | "
\n",
339 | " \n",
340 | " | 4 | \n",
341 | " !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... | \n",
342 | " Offensive Language | \n",
343 | "
\n",
344 | " \n",
345 | "
\n",
346 | "
"
347 | ],
348 | "text/plain": [
349 | " tweet labels\n",
350 | "0 !!! RT @mayasolovely: As a woman you shouldn't... No Hate and Offensive\n",
351 | "1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba... Offensive Language\n",
352 | "2 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... Offensive Language\n",
353 | "3 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... Offensive Language\n",
354 | "4 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... Offensive Language"
355 | ]
356 | },
357 | "metadata": {
358 | "tags": []
359 | },
360 | "execution_count": 7
361 | }
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "metadata": {
367 | "colab": {
368 | "base_uri": "https://localhost:8080/",
369 | "height": 241
370 | },
371 | "id": "Ad8gNdT2Rvz1",
372 | "outputId": "8d7e722b-0774-493e-d8af-35b5f3bb05e1"
373 | },
374 | "source": [
375 | "nltk.download('stopwords')  # fetches the NLTK stopword corpus that stopwords.words() reads on the next line\n",
376 | "stopword=set(stopwords.words('english'))  # kept as a set: clean() tests every word for membership\n",
377 | "\n",
378 | "def clean(text):\n",
379 | "    \"\"\"Return `text` lowercased, with [..] spans, URLs, <..> tags, punctuation and digit-bearing words removed, stopwords dropped and each word stemmed.\"\"\"\n",
380 | "    text = str(text).lower()\n",
381 | "    # Raw strings: '\\[', '\\S', '\\w' and '\\d' are invalid escapes in a plain literal (SyntaxWarning on Python 3.12+).\n",
382 | "    text = re.sub(r'\\[.*?\\]', '', text)\n",
383 | "    text = re.sub(r'https?://\\S+|www\\.\\S+', '', text)\n",
384 | "    text = re.sub(r'<.*?>+', '', text)\n",
385 | "    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)\n",
386 | "    text = text.replace('\\n', ' ')  # a space, not '', so the words on either side of a line break stay separate\n",
387 | "    text = re.sub(r'\\w*\\d\\w*', '', text)\n",
388 | "    # split() with no argument splits on any whitespace run and drops the empty tokens left by the removals above.\n",
389 | "    words = [word for word in text.split() if word not in stopword]\n",
390 | "    return \" \".join(stemmer.stem(word) for word in words)\n",
391 | "data[\"tweet\"] = data[\"tweet\"].apply(clean)  # overwrites each raw tweet with clean()'s output; the raw text is not kept\n",
392 | "data.head()"
393 | ],
394 | "execution_count": 9,
395 | "outputs": [
396 | {
397 | "output_type": "stream",
398 | "text": [
399 | "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
400 | "[nltk_data] Unzipping corpora/stopwords.zip.\n"
401 | ],
402 | "name": "stdout"
403 | },
404 | {
405 | "output_type": "execute_result",
406 | "data": {
407 | "text/html": [
408 | "\n",
409 | "\n",
422 | "
\n",
423 | " \n",
424 | " \n",
425 | " | \n",
426 | " tweet | \n",
427 | " labels | \n",
428 | "
\n",
429 | " \n",
430 | " \n",
431 | " \n",
432 | " | 0 | \n",
433 | " rt mayasolov woman shouldnt complain clean ho... | \n",
434 | " No Hate and Offensive | \n",
435 | "
\n",
436 | " \n",
437 | " | 1 | \n",
438 | " rt boy dat coldtyga dwn bad cuffin dat hoe ... | \n",
439 | " Offensive Language | \n",
440 | "
\n",
441 | " \n",
442 | " | 2 | \n",
443 | " rt urkindofbrand dawg rt ever fuck bitch sta... | \n",
444 | " Offensive Language | \n",
445 | "
\n",
446 | " \n",
447 | " | 3 | \n",
448 | " rt cganderson vivabas look like tranni | \n",
449 | " Offensive Language | \n",
450 | "
\n",
451 | " \n",
452 | " | 4 | \n",
453 | " rt shenikarobert shit hear might true might f... | \n",
454 | " Offensive Language | \n",
455 | "
\n",
456 | " \n",
457 | "
\n",
458 | "
"
459 | ],
460 | "text/plain": [
461 | " tweet labels\n",
462 | "0 rt mayasolov woman shouldnt complain clean ho... No Hate and Offensive\n",
463 | "1 rt boy dat coldtyga dwn bad cuffin dat hoe ... Offensive Language\n",
464 | "2 rt urkindofbrand dawg rt ever fuck bitch sta... Offensive Language\n",
465 | "3 rt cganderson vivabas look like tranni Offensive Language\n",
466 | "4 rt shenikarobert shit hear might true might f... Offensive Language"
467 | ]
468 | },
469 | "metadata": {
470 | "tags": []
471 | },
472 | "execution_count": 9
473 | }
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "metadata": {
479 | "id": "URdY2e5ASD-x"
480 | },
481 | "source": [
482 | "x = np.array(data[\"tweet\"])\n",
483 | "y = np.array(data[\"labels\"])\n",
484 | "# Split the raw text BEFORE vectorizing: fitting CountVectorizer on every tweet would leak test-set vocabulary into training.\n",
485 | "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)\n",
486 | "cv = CountVectorizer()\n",
487 | "X_train, X_test = cv.fit_transform(x_train), cv.transform(x_test)  # vocabulary is learned from the training tweets only"
488 | ],
489 | "execution_count": 10,
490 | "outputs": []
491 | },
492 | {
493 | "cell_type": "code",
494 | "metadata": {
495 | "colab": {
496 | "base_uri": "https://localhost:8080/"
497 | },
498 | "id": "y_M3gS0WSQR6",
499 | "outputId": "4107c611-5914-459e-f7c4-8a4fdb873cd6"
500 | },
501 | "source": [
502 | "clf = DecisionTreeClassifier(random_state=42)  # seeded: the tree permutes features at every split, so an unseeded fit differs run to run\n",
503 | "clf.fit(X_train,y_train)\n",
504 | "print(\"Test accuracy:\", clf.score(X_test,y_test))  # printed explicitly: a bare expression mid-cell is never displayed\n",
505 | "user = input(\"Enter a Text: \")\n",
506 | "user_features = cv.transform([clean(user)])  # same cleaning as the training tweets; own name so the `data` frame is not clobbered\n",
507 | "output = clf.predict(user_features)\n",
508 | "print(output)"
509 | ],
510 | "execution_count": 12,
511 | "outputs": [
512 | {
513 | "output_type": "stream",
514 | "text": [
515 | "Enter a Text: Let's unite and kill all the people who don't value our religion.\n",
516 | "['Hate Speech']\n"
517 | ],
518 | "name": "stdout"
519 | }
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "metadata": {
525 | "id": "5mG-MJcOSS5L"
526 | },
527 | "source": [
528 | ""
529 | ],
530 | "execution_count": null,
531 | "outputs": []
532 | }
533 | ]
534 | }
--------------------------------------------------------------------------------