├── LICENSE
├── README.md
├── main.ipynb
├── main.py
└── spam.csv

/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Pabitra Banerjee
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # SPAM Detection NLP (AI) Model
 2 | 
 3 | This project detects spam SMS messages using NLP.
 4 | 
 5 | # Dataset Information
 6 | 
 7 | The "spam" concept is diverse: advertisements for products/web sites, make-money-fast schemes, chain letters, pornography, and more.
 8 | 
 9 | The SMS Spam Collection is a corpus of SMS messages gathered for SMS spam research. It contains 5,574 SMS messages in English, each tagged as ham (legitimate) or spam.
10 | 
11 | ## Attributes
12 | 
13 | - SMS Messages
14 | - Label (spam/ham)
15 | 
16 | **Download link:** https://www.kaggle.com/uciml/sms-spam-collection-dataset
17 | 
18 | # Libraries
19 | 
20 | 
- pandas 21 | 
- numpy 22 | 
- nltk 23 | 
- re 24 | 
- sklearn
25 | 
26 | # Algorithms
27 | 
28 | 
- Logistic Regression 29 | 
- Naive Bayes 30 | 
- SVC 31 | 
- Random Forest
32 | 
33 | **Best Model Accuracy:** 98.28% (SVC)
--------------------------------------------------------------------------------
/main.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "attachments": {},
  5 |    "cell_type": "markdown",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "## Dataset Information\n",
  9 |     "\n",
 10 |     "The \"spam\" concept is diverse: advertisements for products/web sites, make-money-fast schemes, chain letters, pornography, and more.\n",
 11 |     "\n",
 12 |     "The SMS Spam Collection is a corpus of SMS messages gathered for SMS spam research. It contains 5,574 SMS messages in English, each tagged as ham (legitimate) or spam.\n",
 13 |     "\n",
 14 |     "## Attributes\n",
 15 |     "\n",
 16 |     "- SMS Messages\n",
 17 |     "- Label (spam/ham)"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "attachments": {},
 22 |    "cell_type": "markdown",
 23 |    "metadata": {},
 24 |    "source": [
 25 |     "## Import modules"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 23,
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "import pandas as pd\n",
 35 |     "import numpy as np\n",
 36 |     "import nltk\n",
 37 |     "import re\n",
 38 |     "from nltk.corpus import stopwords  # assumes a one-time nltk.download('stopwords') has been run"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": 24,
 44 |    "metadata": {},
 45 |    "outputs": [
 46 |     {
 47 |      "data": {
 48 |       "text/html": [
 49 |        "<div>\n",
    \n", 50 | "\n", 63 | "\n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | "
    v1v2Unnamed: 2Unnamed: 3Unnamed: 4
    0hamGo until jurong point, crazy.. Available only ...NaNNaNNaN
    1hamOk lar... Joking wif u oni...NaNNaNNaN
    2spamFree entry in 2 a wkly comp to win FA Cup fina...NaNNaNNaN
    3hamU dun say so early hor... U c already then say...NaNNaNNaN
    4hamNah I don't think he goes to usf, he lives aro...NaNNaNNaN
    \n", 117 | "
    " 118 | ], 119 | "text/plain": [ 120 | " v1 v2 Unnamed: 2 \\\n", 121 | "0 ham Go until jurong point, crazy.. Available only ... NaN \n", 122 | "1 ham Ok lar... Joking wif u oni... NaN \n", 123 | "2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN \n", 124 | "3 ham U dun say so early hor... U c already then say... NaN \n", 125 | "4 ham Nah I don't think he goes to usf, he lives aro... NaN \n", 126 | "\n", 127 | " Unnamed: 3 Unnamed: 4 \n", 128 | "0 NaN NaN \n", 129 | "1 NaN NaN \n", 130 | "2 NaN NaN \n", 131 | "3 NaN NaN \n", 132 | "4 NaN NaN " 133 | ] 134 | }, 135 | "execution_count": 24, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "df = pd.read_csv('spam.csv', encoding='latin-1')\n", 142 | "df.head()" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 25, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/html": [ 153 | "
    \n", 154 | "\n", 167 | "\n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | "
    messageslabel
    0Go until jurong point, crazy.. Available only ...ham
    1Ok lar... Joking wif u oni...ham
    2Free entry in 2 a wkly comp to win FA Cup fina...spam
    3U dun say so early hor... U c already then say...ham
    4Nah I don't think he goes to usf, he lives aro...ham
    \n", 203 | "
    " 204 | ], 205 | "text/plain": [ 206 | " messages label\n", 207 | "0 Go until jurong point, crazy.. Available only ... ham\n", 208 | "1 Ok lar... Joking wif u oni... ham\n", 209 | "2 Free entry in 2 a wkly comp to win FA Cup fina... spam\n", 210 | "3 U dun say so early hor... U c already then say... ham\n", 211 | "4 Nah I don't think he goes to usf, he lives aro... ham" 212 | ] 213 | }, 214 | "execution_count": 25, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "# get necessary columns for processing\n", 221 | "df = df[['v2', 'v1']]\n", 222 | "# df.rename(columns={'v2': 'messages', 'v1': 'label'}, inplace=True)\n", 223 | "df = df.rename(columns={'v2': 'messages', 'v1': 'label'})\n", 224 | "df.head()" 225 | ] 226 | }, 227 | { 228 | "attachments": {}, 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## Preprocessing the dataset" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 26, 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/plain": [ 243 | "messages 0\n", 244 | "label 0\n", 245 | "dtype: int64" 246 | ] 247 | }, 248 | "execution_count": 26, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "# check for null values\n", 255 | "df.isnull().sum()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 27, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "STOPWORDS = set(stopwords.words('english'))\n", 265 | "\n", 266 | "def clean_text(text):\n", 267 | " # convert to lowercase\n", 268 | " text = text.lower()\n", 269 | " # remove special characters\n", 270 | " text = re.sub(r'[^0-9a-zA-Z]', ' ', text)\n", 271 | " # remove extra spaces\n", 272 | " text = re.sub(r'\\s+', ' ', text)\n", 273 | " # remove stopwords\n", 274 | " text = \" \".join(word for word in text.split() if word not in STOPWORDS)\n", 275 | " return text" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 28, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/html": [ 286 | "
    \n", 287 | "\n", 300 | "\n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | "
    messageslabelclean_text
    0Go until jurong point, crazy.. Available only ...hamgo jurong point crazy available bugis n great ...
    1Ok lar... Joking wif u oni...hamok lar joking wif u oni
    2Free entry in 2 a wkly comp to win FA Cup fina...spamfree entry 2 wkly comp win fa cup final tkts 2...
    3U dun say so early hor... U c already then say...hamu dun say early hor u c already say
    4Nah I don't think he goes to usf, he lives aro...hamnah think goes usf lives around though
    \n", 342 | "
    " 343 | ], 344 | "text/plain": [ 345 | " messages label \\\n", 346 | "0 Go until jurong point, crazy.. Available only ... ham \n", 347 | "1 Ok lar... Joking wif u oni... ham \n", 348 | "2 Free entry in 2 a wkly comp to win FA Cup fina... spam \n", 349 | "3 U dun say so early hor... U c already then say... ham \n", 350 | "4 Nah I don't think he goes to usf, he lives aro... ham \n", 351 | "\n", 352 | " clean_text \n", 353 | "0 go jurong point crazy available bugis n great ... \n", 354 | "1 ok lar joking wif u oni \n", 355 | "2 free entry 2 wkly comp win fa cup final tkts 2... \n", 356 | "3 u dun say early hor u c already say \n", 357 | "4 nah think goes usf lives around though " 358 | ] 359 | }, 360 | "execution_count": 28, 361 | "metadata": {}, 362 | "output_type": "execute_result" 363 | } 364 | ], 365 | "source": [ 366 | "# clean the messages\n", 367 | "df['clean_text'] = df['messages'].apply(clean_text)\n", 368 | "df.head()" 369 | ] 370 | }, 371 | { 372 | "attachments": {}, 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "## Input Split" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 29, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "X = df['clean_text']\n", 386 | "y = df['label']" 387 | ] 388 | }, 389 | { 390 | "attachments": {}, 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "## Model Training" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 30, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "from sklearn.pipeline import Pipeline\n", 404 | "from sklearn.model_selection import train_test_split, cross_val_score\n", 405 | "from sklearn.metrics import classification_report\n", 406 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer\n", 407 | "\n", 408 | "def classify(model, X, y):\n", 409 | " # train test split\n", 410 | " x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True, stratify=y)\n", 411 | " # model training\n", 412 | " pipeline_model = Pipeline([('vect', CountVectorizer()),\n", 413 | " ('tfidf', TfidfTransformer()),\n", 414 | " ('clf', model)])\n", 415 | " pipeline_model.fit(x_train, y_train)\n", 416 | " \n", 417 | " print('Accuracy:', pipeline_model.score(x_test, y_test)*100)\n", 418 | " \n", 419 | "# cv_score = cross_val_score(model, X, y, cv=5)\n", 420 | "# print(\"CV Score:\", np.mean(cv_score)*100)\n", 421 | " y_pred = pipeline_model.predict(x_test)\n", 422 | " print(classification_report(y_test, y_pred))" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 31, 428 | "metadata": {}, 429 | "outputs": [ 430 | { 431 | "name": "stdout", 432 | "output_type": "stream", 433 | "text": [ 434 | "LogisticRegression Model\n", 435 | "------------------------\n", 436 | "Accuracy: 96.8413496051687\n", 437 | " precision recall f1-score support\n", 438 | "\n", 439 | " ham 0.97 1.00 0.98 1206\n", 440 | " spam 0.99 0.77 0.87 187\n", 441 | "\n", 442 | " accuracy 0.97 1393\n", 443 | " macro avg 0.98 0.88 0.92 1393\n", 444 | "weighted avg 0.97 0.97 0.97 1393\n", 445 | "\n", 446 | "======================================================\n" 447 | ] 448 | } 449 | ], 450 | "source": [ 451 | "print(\"LogisticRegression Model\")\n", 452 | "print(\"------------------------\")\n", 453 | "from sklearn.linear_model import LogisticRegression\n", 454 | "model = LogisticRegression()\n", 455 | "classify(model, X, y)\n", 456 | 
"print(\"======================================================\")" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 32, 462 | "metadata": {}, 463 | "outputs": [ 464 | { 465 | "name": "stdout", 466 | "output_type": "stream", 467 | "text": [ 468 | "MultinomialNB Model\n", 469 | "-------------------\n", 470 | "Accuracy: 96.69777458722182\n", 471 | " precision recall f1-score support\n", 472 | "\n", 473 | " ham 0.96 1.00 0.98 1206\n", 474 | " spam 1.00 0.75 0.86 187\n", 475 | "\n", 476 | " accuracy 0.97 1393\n", 477 | " macro avg 0.98 0.88 0.92 1393\n", 478 | "weighted avg 0.97 0.97 0.96 1393\n", 479 | "\n", 480 | "======================================================\n" 481 | ] 482 | } 483 | ], 484 | "source": [ 485 | "print(\"MultinomialNB Model\")\n", 486 | "print(\"-------------------\")\n", 487 | "from sklearn.naive_bayes import MultinomialNB\n", 488 | "model = MultinomialNB()\n", 489 | "classify(model, X, y)\n", 490 | "print(\"======================================================\")" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 33, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "name": "stdout", 500 | "output_type": "stream", 501 | "text": [ 502 | "SVC Model\n", 503 | "---------\n", 504 | "Accuracy: 98.27709978463747\n", 505 | " precision recall f1-score support\n", 506 | "\n", 507 | " ham 0.98 1.00 0.99 1206\n", 508 | " spam 1.00 0.87 0.93 187\n", 509 | "\n", 510 | " accuracy 0.98 1393\n", 511 | " macro avg 0.99 0.94 0.96 1393\n", 512 | "weighted avg 0.98 0.98 0.98 1393\n", 513 | "\n", 514 | "======================================================\n" 515 | ] 516 | } 517 | ], 518 | "source": [ 519 | "print(\"SVC Model\")\n", 520 | "print(\"---------\")\n", 521 | "from sklearn.svm import SVC\n", 522 | "model = SVC(C=3)\n", 523 | "classify(model, X, y)\n", 524 | "print(\"======================================================\")" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 34, 530 | "metadata": {}, 531 | "outputs": [ 532 | { 533 | "name": "stdout", 534 | "output_type": "stream", 535 | "text": [ 536 | "RandomForestClassifier Model\n", 537 | "----------------------------\n", 538 | "Accuracy: 97.27207465900933\n", 539 | " precision recall f1-score support\n", 540 | "\n", 541 | " ham 0.97 1.00 0.98 1206\n", 542 | " spam 1.00 0.80 0.89 187\n", 543 | "\n", 544 | " accuracy 0.97 1393\n", 545 | " macro avg 0.98 0.90 0.94 1393\n", 546 | "weighted avg 0.97 0.97 0.97 1393\n", 547 | "\n", 548 | "======================================================\n" 549 | ] 550 | } 551 | ], 552 | "source": [ 553 | "print(\"RandomForestClassifier Model\")\n", 554 | "print(\"----------------------------\")\n", 555 | "from sklearn.ensemble import RandomForestClassifier\n", 556 | "model = RandomForestClassifier()\n", 557 | "classify(model, X, y)\n", 558 | "print(\"======================================================\")" 559 | ] 560 | } 561 | ], 562 | "metadata": { 563 | "kernelspec": { 564 | "display_name": "Python 3", 565 | "language": "python", 566 | "name": "python3" 567 | }, 568 | "language_info": { 569 | "codemirror_mode": { 570 | "name": "ipython", 571 | "version": 3 572 | }, 573 | "file_extension": ".py", 574 | "mimetype": "text/x-python", 575 | "name": "python", 576 | "nbconvert_exporter": "python", 577 | "pygments_lexer": "ipython3", 578 | "version": "3.11.2" 579 | }, 580 | "orig_nbformat": 4 581 | }, 582 | "nbformat": 4, 583 | "nbformat_minor": 2 584 | } 585 | 
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import nltk
 4 | import re
 5 | from nltk.corpus import stopwords
 6 | 
 7 | # fetch the stopword list on first run (no-op if already downloaded)
 8 | nltk.download('stopwords', quiet=True)
 9 | 
10 | # use a relative path so the script runs from the repository root on any machine
11 | df = pd.read_csv('spam.csv', encoding='latin-1')
12 | print(df.head())
13 | 
14 | # keep only the necessary columns and give them descriptive names
15 | df = df[['v2', 'v1']]
16 | df = df.rename(columns={'v2': 'messages', 'v1': 'label'})
17 | print(df.head())
18 | 
19 | # check for null values
20 | print(df.isnull().sum())
21 | 
22 | STOPWORDS = set(stopwords.words('english'))
23 | 
24 | def clean_text(text):
25 |     # convert to lowercase
26 |     text = text.lower()
27 |     # replace special characters with spaces
28 |     text = re.sub(r'[^0-9a-zA-Z]', ' ', text)
29 |     # collapse repeated whitespace
30 |     text = re.sub(r'\s+', ' ', text)
31 |     # remove stopwords
32 |     text = " ".join(word for word in text.split() if word not in STOPWORDS)
33 |     return text
34 | 
35 | # clean the messages
36 | df['clean_text'] = df['messages'].apply(clean_text)
37 | print(df.head())
38 | 
39 | # Input Split
40 | X = df['clean_text']
41 | y = df['label']
42 | 
43 | # Model Training
44 | from sklearn.pipeline import Pipeline
45 | from sklearn.model_selection import train_test_split, cross_val_score
46 | from sklearn.metrics import classification_report
47 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
48 | 
49 | def classify(model, X, y):
50 |     # train test split (stratified so both classes keep their proportions)
51 |     x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True, stratify=y)
52 |     # bag-of-words counts -> TF-IDF weighting -> classifier
53 |     pipeline_model = Pipeline([('vect', CountVectorizer()),
54 |                                ('tfidf', TfidfTransformer()),
55 |                                ('clf', model)])
56 |     pipeline_model.fit(x_train, y_train)
57 | 
58 |     print('Accuracy:', pipeline_model.score(x_test, y_test)*100)
59 | 
60 |     # optional: uncomment for a 5-fold cross-validation score
61 |     # cv_score = cross_val_score(pipeline_model, X, y, cv=5)
62 |     # print("CV Score:", np.mean(cv_score)*100)
63 |     y_pred = pipeline_model.predict(x_test)
64 |     print(classification_report(y_test, y_pred))
65 | 
66 | # LogisticRegression Model
67 | print("LogisticRegression Model")
68 | print("------------------------")
69 | from sklearn.linear_model import LogisticRegression
70 | model = LogisticRegression()
71 | classify(model, X, y)
72 | print("======================================================")
73 | 
74 | # MultinomialNB Model
75 | print("MultinomialNB Model")
76 | print("-------------------")
77 | from sklearn.naive_bayes import MultinomialNB
78 | model = MultinomialNB()
79 | classify(model, X, y)
80 | print("======================================================")
81 | 
82 | # SVC Model
83 | print("SVC Model")
84 | print("---------")
85 | from sklearn.svm import SVC
86 | model = SVC(C=3)
87 | classify(model, X, y)
88 | print("======================================================")
89 | 
90 | # RandomForestClassifier Model
91 | print("RandomForestClassifier Model")
92 | print("----------------------------")
93 | from sklearn.ensemble import RandomForestClassifier
94 | model = RandomForestClassifier()
95 | classify(model, X, y)
96 | print("======================================================")
--------------------------------------------------------------------------------
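Neither `main.py` nor the notebook persists a trained model, so every run retrains from scratch. One possible extension (an assumption, not part of the original project) is to save a fitted pipeline with `joblib`, which is installed alongside scikit-learn; `pipeline_model` below stands in for any fitted pipeline, e.g. one returned from a modified `classify()`.

```python
import joblib

# save the whole fitted pipeline (vectorizer + TF-IDF + classifier) to one file
joblib.dump(pipeline_model, 'spam_model.joblib')

# ...later, reload and classify without retraining
restored = joblib.load('spam_model.joblib')
print(restored.predict([clean_text("Win a brand new phone, text WIN to claim")]))
```

--------------------------------------------------------------------------------
/spam.csv:
--------------------------------------------------------------------------------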
https://raw.githubusercontent.com/PB2204/SPAM-Detection-Model/c093eb91a043db3a192a62cc3a5911b6ac508f0a/spam.csv --------------------------------------------------------------------------------
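
Because `spam.csv` is published at the raw URL above, the dataset can also be loaded without a local checkout. This is a small convenience sketch, not part of the original scripts:

```python
import pandas as pd

URL = ("https://raw.githubusercontent.com/PB2204/SPAM-Detection-Model/"
       "c093eb91a043db3a192a62cc3a5911b6ac508f0a/spam.csv")

# latin-1 matches the encoding used in main.ipynb and main.py
df = pd.read_csv(URL, encoding='latin-1')
print(df.shape)  # (5572, 5) for this export: v1, v2, plus three mostly-empty unnamed columns
```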