├── LICENSE
├── README.md
├── main.ipynb
├── main.py
└── spam.csv
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Pabitra Banerjee
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SPAM Detection NLP (AI) Model
2 |
3 | This project detects spam SMS messages using NLP and classical machine-learning models.
4 |
5 | # Dataset Information
6 |
7 | The "spam" concept is diverse: advertisements for products/web sites, make money fast schemes, chain letters, pornography...
8 |
9 | The SMS Spam Collection is a set of SMS tagged messages that have been collected for SMS Spam research. It contains one set of SMS messages in English of 5,574 messages, tagged according being ham (legitimate) or spam.
10 |
11 | ## Attributes
12 |
13 | - SMS Messages
14 | - Label (spam/ham)
15 |
16 | **Download link:** https://www.kaggle.com/uciml/sms-spam-collection-dataset
17 |
18 | # Libraries
19 |
20 | - pandas
21 | - numpy
22 | - nltk
23 | - re (Python standard library)
24 | - scikit-learn
25 |
26 | # Algorithms
27 |
28 | - Logistic Regression
29 | - Multinomial Naive Bayes
30 | - SVC
31 | - Random Forest
32 |
33 | **Best Model Accuracy:** 98.28% (SVC)
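34 |
35 | # Setup
36 |
37 | A minimal setup sketch, assuming Python 3 with the packages above installed (e.g. via `pip install pandas numpy nltk scikit-learn`). The NLTK stopword list must be downloaded once before the code can run:
38 |
39 | ```python
40 | import nltk
41 | nltk.download('stopwords')  # one-time download used by nltk.corpus.stopwords
42 | ```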
--------------------------------------------------------------------------------
/main.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "## Dataset Information\n",
9 | "\n",
10 | "The \"spam\" concept is diverse: advertisements for products/web sites, make money fast schemes, chain letters, pornography...\n",
11 | "\n",
12 | "The SMS Spam Collection is a set of SMS tagged messages that have been collected for SMS Spam research. It contains one set of SMS messages in English of 5,574 messages, tagged according being ham (legitimate) or spam.\n",
13 | "\n",
14 | "## Attributes\n",
15 | "\n",
16 | "- SMS Messages\n",
17 | "- Label (spam/ham)"
18 | ]
19 | },
20 | {
21 | "attachments": {},
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Import modules"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 23,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "import pandas as pd\n",
35 | "import numpy as np\n",
36 | "import nltk\n",
37 | "import re\n",
38 | "from nltk.corpus import stopwords"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 24,
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/html": [
49 | "\n",
50 | "\n",
63 | "
\n",
64 | " \n",
65 | " \n",
66 | " | \n",
67 | " v1 | \n",
68 | " v2 | \n",
69 | " Unnamed: 2 | \n",
70 | " Unnamed: 3 | \n",
71 | " Unnamed: 4 | \n",
72 | "
\n",
73 | " \n",
74 | " \n",
75 | " \n",
76 | " 0 | \n",
77 | " ham | \n",
78 | " Go until jurong point, crazy.. Available only ... | \n",
79 | " NaN | \n",
80 | " NaN | \n",
81 | " NaN | \n",
82 | "
\n",
83 | " \n",
84 | " 1 | \n",
85 | " ham | \n",
86 | " Ok lar... Joking wif u oni... | \n",
87 | " NaN | \n",
88 | " NaN | \n",
89 | " NaN | \n",
90 | "
\n",
91 | " \n",
92 | " 2 | \n",
93 | " spam | \n",
94 | " Free entry in 2 a wkly comp to win FA Cup fina... | \n",
95 | " NaN | \n",
96 | " NaN | \n",
97 | " NaN | \n",
98 | "
\n",
99 | " \n",
100 | " 3 | \n",
101 | " ham | \n",
102 | " U dun say so early hor... U c already then say... | \n",
103 | " NaN | \n",
104 | " NaN | \n",
105 | " NaN | \n",
106 | "
\n",
107 | " \n",
108 | " 4 | \n",
109 | " ham | \n",
110 | " Nah I don't think he goes to usf, he lives aro... | \n",
111 | " NaN | \n",
112 | " NaN | \n",
113 | " NaN | \n",
114 | "
\n",
115 | " \n",
116 | "
\n",
117 | "
"
118 | ],
119 | "text/plain": [
120 | " v1 v2 Unnamed: 2 \\\n",
121 | "0 ham Go until jurong point, crazy.. Available only ... NaN \n",
122 | "1 ham Ok lar... Joking wif u oni... NaN \n",
123 | "2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN \n",
124 | "3 ham U dun say so early hor... U c already then say... NaN \n",
125 | "4 ham Nah I don't think he goes to usf, he lives aro... NaN \n",
126 | "\n",
127 | " Unnamed: 3 Unnamed: 4 \n",
128 | "0 NaN NaN \n",
129 | "1 NaN NaN \n",
130 | "2 NaN NaN \n",
131 | "3 NaN NaN \n",
132 | "4 NaN NaN "
133 | ]
134 | },
135 | "execution_count": 24,
136 | "metadata": {},
137 | "output_type": "execute_result"
138 | }
139 | ],
140 | "source": [
141 | "df = pd.read_csv('spam.csv', encoding='latin-1')\n",
142 | "df.head()"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 25,
148 | "metadata": {},
149 | "outputs": [
150 | {
151 | "data": {
152 | "text/html": [
153 | "\n",
154 | "\n",
167 | "
\n",
168 | " \n",
169 | " \n",
170 | " | \n",
171 | " messages | \n",
172 | " label | \n",
173 | "
\n",
174 | " \n",
175 | " \n",
176 | " \n",
177 | " 0 | \n",
178 | " Go until jurong point, crazy.. Available only ... | \n",
179 | " ham | \n",
180 | "
\n",
181 | " \n",
182 | " 1 | \n",
183 | " Ok lar... Joking wif u oni... | \n",
184 | " ham | \n",
185 | "
\n",
186 | " \n",
187 | " 2 | \n",
188 | " Free entry in 2 a wkly comp to win FA Cup fina... | \n",
189 | " spam | \n",
190 | "
\n",
191 | " \n",
192 | " 3 | \n",
193 | " U dun say so early hor... U c already then say... | \n",
194 | " ham | \n",
195 | "
\n",
196 | " \n",
197 | " 4 | \n",
198 | " Nah I don't think he goes to usf, he lives aro... | \n",
199 | " ham | \n",
200 | "
\n",
201 | " \n",
202 | "
\n",
203 | "
"
204 | ],
205 | "text/plain": [
206 | " messages label\n",
207 | "0 Go until jurong point, crazy.. Available only ... ham\n",
208 | "1 Ok lar... Joking wif u oni... ham\n",
209 | "2 Free entry in 2 a wkly comp to win FA Cup fina... spam\n",
210 | "3 U dun say so early hor... U c already then say... ham\n",
211 | "4 Nah I don't think he goes to usf, he lives aro... ham"
212 | ]
213 | },
214 | "execution_count": 25,
215 | "metadata": {},
216 | "output_type": "execute_result"
217 | }
218 | ],
219 | "source": [
220 | "# get necessary columns for processing\n",
221 | "df = df[['v2', 'v1']]\n",
222 | "# df.rename(columns={'v2': 'messages', 'v1': 'label'}, inplace=True)\n",
223 | "df = df.rename(columns={'v2': 'messages', 'v1': 'label'})\n",
224 | "df.head()"
225 | ]
226 | },
227 | {
228 | "attachments": {},
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "## Preprocessing the dataset"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 26,
238 | "metadata": {},
239 | "outputs": [
240 | {
241 | "data": {
242 | "text/plain": [
243 | "messages 0\n",
244 | "label 0\n",
245 | "dtype: int64"
246 | ]
247 | },
248 | "execution_count": 26,
249 | "metadata": {},
250 | "output_type": "execute_result"
251 | }
252 | ],
253 | "source": [
254 | "# check for null values\n",
255 | "df.isnull().sum()"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 27,
261 | "metadata": {},
262 | "outputs": [],
263 | "source": [
264 | "STOPWORDS = set(stopwords.words('english'))\n",
265 | "\n",
266 | "def clean_text(text):\n",
267 | " # convert to lowercase\n",
268 | " text = text.lower()\n",
269 | " # remove special characters\n",
270 | " text = re.sub(r'[^0-9a-zA-Z]', ' ', text)\n",
271 | " # remove extra spaces\n",
272 | " text = re.sub(r'\\s+', ' ', text)\n",
273 | " # remove stopwords\n",
274 | " text = \" \".join(word for word in text.split() if word not in STOPWORDS)\n",
275 | " return text"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 28,
281 | "metadata": {},
282 | "outputs": [
283 | {
284 | "data": {
285 | "text/html": [
286 | "\n",
287 | "\n",
300 | "
\n",
301 | " \n",
302 | " \n",
303 | " | \n",
304 | " messages | \n",
305 | " label | \n",
306 | " clean_text | \n",
307 | "
\n",
308 | " \n",
309 | " \n",
310 | " \n",
311 | " 0 | \n",
312 | " Go until jurong point, crazy.. Available only ... | \n",
313 | " ham | \n",
314 | " go jurong point crazy available bugis n great ... | \n",
315 | "
\n",
316 | " \n",
317 | " 1 | \n",
318 | " Ok lar... Joking wif u oni... | \n",
319 | " ham | \n",
320 | " ok lar joking wif u oni | \n",
321 | "
\n",
322 | " \n",
323 | " 2 | \n",
324 | " Free entry in 2 a wkly comp to win FA Cup fina... | \n",
325 | " spam | \n",
326 | " free entry 2 wkly comp win fa cup final tkts 2... | \n",
327 | "
\n",
328 | " \n",
329 | " 3 | \n",
330 | " U dun say so early hor... U c already then say... | \n",
331 | " ham | \n",
332 | " u dun say early hor u c already say | \n",
333 | "
\n",
334 | " \n",
335 | " 4 | \n",
336 | " Nah I don't think he goes to usf, he lives aro... | \n",
337 | " ham | \n",
338 | " nah think goes usf lives around though | \n",
339 | "
\n",
340 | " \n",
341 | "
\n",
342 | "
"
343 | ],
344 | "text/plain": [
345 | " messages label \\\n",
346 | "0 Go until jurong point, crazy.. Available only ... ham \n",
347 | "1 Ok lar... Joking wif u oni... ham \n",
348 | "2 Free entry in 2 a wkly comp to win FA Cup fina... spam \n",
349 | "3 U dun say so early hor... U c already then say... ham \n",
350 | "4 Nah I don't think he goes to usf, he lives aro... ham \n",
351 | "\n",
352 | " clean_text \n",
353 | "0 go jurong point crazy available bugis n great ... \n",
354 | "1 ok lar joking wif u oni \n",
355 | "2 free entry 2 wkly comp win fa cup final tkts 2... \n",
356 | "3 u dun say early hor u c already say \n",
357 | "4 nah think goes usf lives around though "
358 | ]
359 | },
360 | "execution_count": 28,
361 | "metadata": {},
362 | "output_type": "execute_result"
363 | }
364 | ],
365 | "source": [
366 | "# clean the messages\n",
367 | "df['clean_text'] = df['messages'].apply(clean_text)\n",
368 | "df.head()"
369 | ]
370 | },
371 | {
372 | "attachments": {},
373 | "cell_type": "markdown",
374 | "metadata": {},
375 | "source": [
376 | "## Input Split"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": 29,
382 | "metadata": {},
383 | "outputs": [],
384 | "source": [
385 | "X = df['clean_text']\n",
386 | "y = df['label']"
387 | ]
388 | },
389 | {
390 | "attachments": {},
391 | "cell_type": "markdown",
392 | "metadata": {},
393 | "source": [
394 | "## Model Training"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 30,
400 | "metadata": {},
401 | "outputs": [],
402 | "source": [
403 | "from sklearn.pipeline import Pipeline\n",
404 | "from sklearn.model_selection import train_test_split, cross_val_score\n",
405 | "from sklearn.metrics import classification_report\n",
406 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer\n",
407 | "\n",
408 | "def classify(model, X, y):\n",
409 | " # train test split\n",
410 | " x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True, stratify=y)\n",
411 | " # model training\n",
412 | " pipeline_model = Pipeline([('vect', CountVectorizer()),\n",
413 | " ('tfidf', TfidfTransformer()),\n",
414 | " ('clf', model)])\n",
415 | " pipeline_model.fit(x_train, y_train)\n",
416 | " \n",
417 | " print('Accuracy:', pipeline_model.score(x_test, y_test)*100)\n",
418 | " \n",
419 | "# cv_score = cross_val_score(model, X, y, cv=5)\n",
420 | "# print(\"CV Score:\", np.mean(cv_score)*100)\n",
421 | " y_pred = pipeline_model.predict(x_test)\n",
422 | " print(classification_report(y_test, y_pred))"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": 31,
428 | "metadata": {},
429 | "outputs": [
430 | {
431 | "name": "stdout",
432 | "output_type": "stream",
433 | "text": [
434 | "LogisticRegression Model\n",
435 | "------------------------\n",
436 | "Accuracy: 96.8413496051687\n",
437 | " precision recall f1-score support\n",
438 | "\n",
439 | " ham 0.97 1.00 0.98 1206\n",
440 | " spam 0.99 0.77 0.87 187\n",
441 | "\n",
442 | " accuracy 0.97 1393\n",
443 | " macro avg 0.98 0.88 0.92 1393\n",
444 | "weighted avg 0.97 0.97 0.97 1393\n",
445 | "\n",
446 | "======================================================\n"
447 | ]
448 | }
449 | ],
450 | "source": [
451 | "print(\"LogisticRegression Model\")\n",
452 | "print(\"------------------------\")\n",
453 | "from sklearn.linear_model import LogisticRegression\n",
454 | "model = LogisticRegression()\n",
455 | "classify(model, X, y)\n",
456 | "print(\"======================================================\")"
457 | ]
458 | },
459 | {
460 | "cell_type": "code",
461 | "execution_count": 32,
462 | "metadata": {},
463 | "outputs": [
464 | {
465 | "name": "stdout",
466 | "output_type": "stream",
467 | "text": [
468 | "MultinomialNB Model\n",
469 | "-------------------\n",
470 | "Accuracy: 96.69777458722182\n",
471 | " precision recall f1-score support\n",
472 | "\n",
473 | " ham 0.96 1.00 0.98 1206\n",
474 | " spam 1.00 0.75 0.86 187\n",
475 | "\n",
476 | " accuracy 0.97 1393\n",
477 | " macro avg 0.98 0.88 0.92 1393\n",
478 | "weighted avg 0.97 0.97 0.96 1393\n",
479 | "\n",
480 | "======================================================\n"
481 | ]
482 | }
483 | ],
484 | "source": [
485 | "print(\"MultinomialNB Model\")\n",
486 | "print(\"-------------------\")\n",
487 | "from sklearn.naive_bayes import MultinomialNB\n",
488 | "model = MultinomialNB()\n",
489 | "classify(model, X, y)\n",
490 | "print(\"======================================================\")"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": 33,
496 | "metadata": {},
497 | "outputs": [
498 | {
499 | "name": "stdout",
500 | "output_type": "stream",
501 | "text": [
502 | "SVC Model\n",
503 | "---------\n",
504 | "Accuracy: 98.27709978463747\n",
505 | " precision recall f1-score support\n",
506 | "\n",
507 | " ham 0.98 1.00 0.99 1206\n",
508 | " spam 1.00 0.87 0.93 187\n",
509 | "\n",
510 | " accuracy 0.98 1393\n",
511 | " macro avg 0.99 0.94 0.96 1393\n",
512 | "weighted avg 0.98 0.98 0.98 1393\n",
513 | "\n",
514 | "======================================================\n"
515 | ]
516 | }
517 | ],
518 | "source": [
519 | "print(\"SVC Model\")\n",
520 | "print(\"---------\")\n",
521 | "from sklearn.svm import SVC\n",
522 | "model = SVC(C=3)\n",
523 | "classify(model, X, y)\n",
524 | "print(\"======================================================\")"
525 | ]
526 | },
527 | {
528 | "cell_type": "code",
529 | "execution_count": 34,
530 | "metadata": {},
531 | "outputs": [
532 | {
533 | "name": "stdout",
534 | "output_type": "stream",
535 | "text": [
536 | "RandomForestClassifier Model\n",
537 | "----------------------------\n",
538 | "Accuracy: 97.27207465900933\n",
539 | " precision recall f1-score support\n",
540 | "\n",
541 | " ham 0.97 1.00 0.98 1206\n",
542 | " spam 1.00 0.80 0.89 187\n",
543 | "\n",
544 | " accuracy 0.97 1393\n",
545 | " macro avg 0.98 0.90 0.94 1393\n",
546 | "weighted avg 0.97 0.97 0.97 1393\n",
547 | "\n",
548 | "======================================================\n"
549 | ]
550 | }
551 | ],
552 | "source": [
553 | "print(\"RandomForestClassifier Model\")\n",
554 | "print(\"----------------------------\")\n",
555 | "from sklearn.ensemble import RandomForestClassifier\n",
556 | "model = RandomForestClassifier()\n",
557 | "classify(model, X, y)\n",
558 | "print(\"======================================================\")"
559 | ]
560 | }
561 | ],
562 | "metadata": {
563 | "kernelspec": {
564 | "display_name": "Python 3",
565 | "language": "python",
566 | "name": "python3"
567 | },
568 | "language_info": {
569 | "codemirror_mode": {
570 | "name": "ipython",
571 | "version": 3
572 | },
573 | "file_extension": ".py",
574 | "mimetype": "text/x-python",
575 | "name": "python",
576 | "nbconvert_exporter": "python",
577 | "pygments_lexer": "ipython3",
578 | "version": "3.11.2"
579 | },
580 | "orig_nbformat": 4
581 | },
582 | "nbformat": 4,
583 | "nbformat_minor": 2
584 | }
585 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import nltk
4 | import re
5 | from nltk.corpus import stopwords
6 | nltk.download('stopwords', quiet=True)  # make sure the stopword list is available
7 | df = pd.read_csv('spam.csv', encoding='latin-1')
8 | df.head()
9 |
10 | # get necessary columns for processing
11 | df = df[['v2', 'v1']]
12 | # df.rename(columns={'v2': 'messages', 'v1': 'label'}, inplace=True)
13 | df = df.rename(columns={'v2': 'messages', 'v1': 'label'})
14 | df.head()
15 |
16 | # check for null values
17 | df.isnull().sum()
18 |
19 | STOPWORDS = set(stopwords.words('english'))
20 |
21 | def clean_text(text):
22 | # convert to lowercase
23 | text = text.lower()
24 | # remove special characters
25 | text = re.sub(r'[^0-9a-zA-Z]', ' ', text)
26 | # remove extra spaces
27 | text = re.sub(r'\s+', ' ', text)
28 | # remove stopwords
29 | text = " ".join(word for word in text.split() if word not in STOPWORDS)
30 | return text
31 |
32 | # clean the messages
33 | df['clean_text'] = df['messages'].apply(clean_text)
34 | df.head()
35 |
36 | # Input Split
37 | X = df['clean_text']
38 | y = df['label']
39 |
40 | # Model Training
41 | from sklearn.pipeline import Pipeline
42 | from sklearn.model_selection import train_test_split, cross_val_score
43 | from sklearn.metrics import classification_report
44 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
45 |
46 | def classify(model, X, y):
47 | # train test split
48 | x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True, stratify=y)
49 | # model training
50 | pipeline_model = Pipeline([('vect', CountVectorizer()),
51 | ('tfidf', TfidfTransformer()),
52 | ('clf', model)])
53 | pipeline_model.fit(x_train, y_train)
54 |
55 | print('Accuracy:', pipeline_model.score(x_test, y_test)*100)
56 |
57 | # cv_score = cross_val_score(model, X, y, cv=5)
58 | # print("CV Score:", np.mean(cv_score)*100)
59 | y_pred = pipeline_model.predict(x_test)
60 | print(classification_report(y_test, y_pred))
61 |
62 | # LogisticRegression Model
63 | print("LogisticRegression Model")
64 | print("------------------------")
65 | from sklearn.linear_model import LogisticRegression
66 | model = LogisticRegression()
67 | classify(model, X, y)
68 | print("======================================================")
69 |
70 | # MultinomialNB Model
71 | print("MultinomialNB Model")
72 | print("-------------------")
73 | from sklearn.naive_bayes import MultinomialNB
74 | model = MultinomialNB()
75 | classify(model, X, y)
76 | print("======================================================")
77 |
78 | # SVC Model
79 | print("SVC Model")
80 | print("---------")
81 | from sklearn.svm import SVC
82 | model = SVC(C=3)
83 | classify(model, X, y)
84 | print("======================================================")
85 |
86 | # RandomForestClassifier Model
87 | print("RandomForestClassifier Model")
88 | print("----------------------------")
89 | from sklearn.ensemble import RandomForestClassifier
90 | model = RandomForestClassifier()
91 | classify(model, X, y)
92 | print("======================================================")
--------------------------------------------------------------------------------
/spam.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PB2204/SPAM-Detection-Model/c093eb91a043db3a192a62cc3a5911b6ac508f0a/spam.csv
--------------------------------------------------------------------------------