├── AirBNB_EDA.ipynb ├── Airlines_Passenger_Traffic_TSA.ipynb ├── Animated_Weather_Graphs.ipynb ├── Bangladesh_Economic_Indicators_Analysis.ipynb ├── Bank_Customer_Churn_Prediction.ipynb ├── COVID_19_Analysis_and_Prediction.ipynb ├── Car_Features_Analysis.ipynb ├── Credit_Card_Fraud_Detection.ipynb ├── HR_Analysis.ipynb ├── Heart_Attack_Prediction.ipynb ├── Hotel_Reviews_Sentiment_Prediction.ipynb ├── House_Price_Prediction.ipynb ├── IMDb_Sentiment_Analysis.ipynb ├── IPL_EDA.ipynb ├── Image_Segmentation_using_FastAI.ipynb ├── Iris_Classifier_Model_Comparison.ipynb ├── Loan_Default_Prediction.ipynb ├── Mall_Customers_CLustering.ipynb ├── Movie_Recommendation_Engine.ipynb ├── README.md ├── SMS_Spam_Detection_NLP_.ipynb ├── Stock_Market_Analysis_and_Prediction_.ipynb ├── Suicide_Rate_Analysis.ipynb ├── Udemy_Courses_Recommendation_and_Auditor.ipynb ├── Video_Games_Sale_Prediction_and_EDA.ipynb └── World_Happiness_EDA.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Data-Analysis- 2 | Different types of data analytics projects : EDA, PDA, DDA, TSA and much more..... 3 | ## Contents 4 | 5 |
6 | 7 |
    8 |
  1. Car Feature Analysis
  2. 9 |
  3. Iris Classifier Model Comparison
  4. 10 |
  5. COVID-19 Analysis and Prediction
  6. 11 |
  7. Airbnb Exploratory Data Analysis
  8. 12 |
  9. IMDb Sentiment Analysis
  10. 13 |
  11. Mall Customer Clustering
  12. 14 |
  13. IPL Exploratory Data Analysis
  14. 15 |
  15. Movie Recommendation Engine
  16. 16 |
  17. Credit Card Fraud Detection
  18. 17 |
  19. Hotel Reviews Sentiment Prediction
  20. 18 |
  21. Loan Default Prediction
  22. 19 |
  23. Flight Passenger Traffic : TSA
  24. 20 |
  25. Suicide Rate Analysis
  26. 21 |
  27. Video Game Sales Prediction and EDA
  28. 22 |
  29. Stock Market Prediction and Analysis using LSTM
  30. 23 |
  31. Bank Customer Churn Prediction
  32. 24 |
  33. Heart Attack Prediction
  34. 25 |
  35. SMS Spam Detection using NLP
  36. 26 |
  37. World Happiness EDA
  38. 27 |
  39. HR Data Analytics
  40. 28 |
  41. Udemy Course Recommendation and Auditor
  42. 29 |
  43. Animated Time Series Graphs
  44. 30 |
31 | -------------------------------------------------------------------------------- /SMS_Spam_Detection_NLP_.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "SMS Spam Detection : NLP .ipynb", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyNJUawe/r6XNkbjIbffEqyp", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "view-in-github", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "\"Open" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "EsyQyOF1ccef" 31 | }, 32 | "source": [ 33 | "# SMS Spam Detection using Natural Language Processing\n", 34 | "\n", 35 | "
" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "id": "vRRCtuLXcqPm" 42 | }, 43 | "source": [ 44 | "## Approach :\n", 45 | "### i. Representing text as numerical data\n", 46 | "### ii. Reading a text-based dataset into pandas\n", 47 | "### iii.Vectorizing our dataset\n", 48 | "### iv. Building and evaluating a model\n", 49 | "### v. Comparing models\n", 50 | "### vi. Examining a model for further insight\n" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "QH0PTBSFdBQe" 57 | }, 58 | "source": [ 59 | "## Importing necessary libraries" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "colab": { 66 | "base_uri": "https://localhost:8080/" 67 | }, 68 | "id": "wSJGpEu1cWmI", 69 | "outputId": "07098125-2340-48cf-a0cd-67440e9f24db" 70 | }, 71 | "source": [ 72 | "import pandas as pd\n", 73 | "import numpy as np\n", 74 | "import matplotlib.pyplot as plt\n", 75 | "import seaborn as sns\n", 76 | "import nltk\n", 77 | "nltk.download('stopwords')\n", 78 | "%matplotlib inline\n", 79 | "sns.set_style(\"darkgrid\")\n", 80 | "plt.style.use(\"fivethirtyeight\")\n", 81 | "print(\"Necessary packages included successfully!\")" 82 | ], 83 | "execution_count": 27, 84 | "outputs": [ 85 | { 86 | "output_type": "stream", 87 | "text": [ 88 | "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", 89 | "[nltk_data] Unzipping corpora/stopwords.zip.\n", 90 | "Necessary packages included successfully!\n" 91 | ], 92 | "name": "stdout" 93 | } 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": { 99 | "id": "6gz7JglLjH8U" 100 | }, 101 | "source": [ 102 | "## 1. Representing text as numerical data" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "metadata": { 108 | "id": "HEh2dqT6i7-i" 109 | }, 110 | "source": [ 111 | "# example text for model training (SMS messages)\n", 112 | "simple_train = ['call you tonight', 'Call me a cab', 'Please call me... 
PLEASE!']" 113 | ], 114 | "execution_count": 3, 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "metadata": { 120 | "id": "QhGclG-ZjRui" 121 | }, 122 | "source": [ 123 | "# import and instantiate CountVectorizer (with the default parameters)\n", 124 | "from sklearn.feature_extraction.text import CountVectorizer\n", 125 | "vect = CountVectorizer()" 126 | ], 127 | "execution_count": 4, 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "metadata": { 133 | "colab": { 134 | "base_uri": "https://localhost:8080/" 135 | }, 136 | "id": "2OZQeUtxj2a3", 137 | "outputId": "8e0bd715-c0c1-4587-a4a0-1865dcaadd56" 138 | }, 139 | "source": [ 140 | "# learn the 'vocabulary' of the training data (occurs in-place)\n", 141 | "vect.fit(simple_train)" 142 | ], 143 | "execution_count": 5, 144 | "outputs": [ 145 | { 146 | "output_type": "execute_result", 147 | "data": { 148 | "text/plain": [ 149 | "CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", 150 | " dtype=, encoding='utf-8', input='content',\n", 151 | " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", 152 | " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", 153 | " strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", 154 | " tokenizer=None, vocabulary=None)" 155 | ] 156 | }, 157 | "metadata": { 158 | "tags": [] 159 | }, 160 | "execution_count": 5 161 | } 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "metadata": { 167 | "colab": { 168 | "base_uri": "https://localhost:8080/" 169 | }, 170 | "id": "3mwQZpkFj5Hw", 171 | "outputId": "981e5255-4d28-44dd-e1d0-4fb5d809b713" 172 | }, 173 | "source": [ 174 | "# examine the fitted vocabulary\n", 175 | "vect.get_feature_names()" 176 | ], 177 | "execution_count": 6, 178 | "outputs": [ 179 | { 180 | "output_type": "execute_result", 181 | "data": { 182 | "text/plain": [ 183 | "['cab', 'call', 'me', 'please', 'tonight', 'you']" 184 | ] 185 | }, 186 | "metadata": { 187 | "tags": [] 188 | 
}, 189 | "execution_count": 6 190 | } 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "metadata": { 196 | "colab": { 197 | "base_uri": "https://localhost:8080/" 198 | }, 199 | "id": "DYXT6QnCj8Y-", 200 | "outputId": "e0df455a-6810-4298-8396-030a1aa58cfb" 201 | }, 202 | "source": [ 203 | "# transform training data into a 'document-term matrix'\n", 204 | "simple_train_dtm = vect.transform(simple_train)\n", 205 | "simple_train_dtm" 206 | ], 207 | "execution_count": 7, 208 | "outputs": [ 209 | { 210 | "output_type": "execute_result", 211 | "data": { 212 | "text/plain": [ 213 | "<3x6 sparse matrix of type ''\n", 214 | "\twith 9 stored elements in Compressed Sparse Row format>" 215 | ] 216 | }, 217 | "metadata": { 218 | "tags": [] 219 | }, 220 | "execution_count": 7 221 | } 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "metadata": { 227 | "colab": { 228 | "base_uri": "https://localhost:8080/" 229 | }, 230 | "id": "Vx0TufUWj_f7", 231 | "outputId": "d7851980-c53a-41a7-9ebb-aad6b7b9bf7d" 232 | }, 233 | "source": [ 234 | "# convert sparse matrix to a dense matrix\n", 235 | "simple_train_dtm.toarray()" 236 | ], 237 | "execution_count": 8, 238 | "outputs": [ 239 | { 240 | "output_type": "execute_result", 241 | "data": { 242 | "text/plain": [ 243 | "array([[0, 1, 0, 0, 1, 1],\n", 244 | " [1, 1, 1, 0, 0, 0],\n", 245 | " [0, 1, 1, 2, 0, 0]])" 246 | ] 247 | }, 248 | "metadata": { 249 | "tags": [] 250 | }, 251 | "execution_count": 8 252 | } 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "metadata": { 258 | "colab": { 259 | "base_uri": "https://localhost:8080/", 260 | "height": 137 261 | }, 262 | "id": "L72IJthbkDH-", 263 | "outputId": "5e34feb8-9219-4c14-a5b9-16d4fd626c25" 264 | }, 265 | "source": [ 266 | "# examine the vocabulary and document-term matrix together\n", 267 | "pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names())" 268 | ], 269 | "execution_count": 9, 270 | "outputs": [ 271 | { 272 | "output_type": 
"execute_result", 273 | "data": { 274 | "text/html": [ 275 | "
\n", 276 | "\n", 289 | "\n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | "
cabcallmepleasetonightyou
0010011
1111000
2011200
\n", 331 | "
" 332 | ], 333 | "text/plain": [ 334 | " cab call me please tonight you\n", 335 | "0 0 1 0 0 1 1\n", 336 | "1 1 1 1 0 0 0\n", 337 | "2 0 1 1 2 0 0" 338 | ] 339 | }, 340 | "metadata": { 341 | "tags": [] 342 | }, 343 | "execution_count": 9 344 | } 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "metadata": { 350 | "colab": { 351 | "base_uri": "https://localhost:8080/" 352 | }, 353 | "id": "__82s5G9kGSb", 354 | "outputId": "d7457d64-a598-43dd-a1ed-e88915271b98" 355 | }, 356 | "source": [ 357 | "# check the type of the document-term matrix\n", 358 | "type(simple_train_dtm)" 359 | ], 360 | "execution_count": 10, 361 | "outputs": [ 362 | { 363 | "output_type": "execute_result", 364 | "data": { 365 | "text/plain": [ 366 | "scipy.sparse.csr.csr_matrix" 367 | ] 368 | }, 369 | "metadata": { 370 | "tags": [] 371 | }, 372 | "execution_count": 10 373 | } 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "metadata": { 379 | "colab": { 380 | "base_uri": "https://localhost:8080/" 381 | }, 382 | "id": "HGBIqef6kLPj", 383 | "outputId": "2078557e-bd13-4485-a2fe-590797e10422" 384 | }, 385 | "source": [ 386 | "# examine the sparse matrix contents\n", 387 | "print(simple_train_dtm)" 388 | ], 389 | "execution_count": 11, 390 | "outputs": [ 391 | { 392 | "output_type": "stream", 393 | "text": [ 394 | " (0, 1)\t1\n", 395 | " (0, 4)\t1\n", 396 | " (0, 5)\t1\n", 397 | " (1, 0)\t1\n", 398 | " (1, 1)\t1\n", 399 | " (1, 2)\t1\n", 400 | " (2, 1)\t1\n", 401 | " (2, 2)\t1\n", 402 | " (2, 3)\t2\n" 403 | ], 404 | "name": "stdout" 405 | } 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "metadata": { 411 | "id": "kMv7gIQ4kNx8" 412 | }, 413 | "source": [ 414 | "# example text for model testing\n", 415 | "simple_test = [\"please don't call me\"]" 416 | ], 417 | "execution_count": 12, 418 | "outputs": [] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "metadata": { 423 | "colab": { 424 | "base_uri": "https://localhost:8080/" 425 | }, 426 | "id": "v76GaavFkSDd", 427 | 
"outputId": "f4c4e694-1100-48ee-c888-2fade4a7b330" 428 | }, 429 | "source": [ 430 | "# transform testing data into a document-term matrix (using existing vocabulary)\n", 431 | "simple_test_dtm = vect.transform(simple_test)\n", 432 | "simple_test_dtm.toarray()" 433 | ], 434 | "execution_count": 13, 435 | "outputs": [ 436 | { 437 | "output_type": "execute_result", 438 | "data": { 439 | "text/plain": [ 440 | "array([[0, 1, 1, 1, 0, 0]])" 441 | ] 442 | }, 443 | "metadata": { 444 | "tags": [] 445 | }, 446 | "execution_count": 13 447 | } 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "metadata": { 453 | "colab": { 454 | "base_uri": "https://localhost:8080/", 455 | "height": 77 456 | }, 457 | "id": "jOZOL10OkUh6", 458 | "outputId": "62dffe55-9741-4b14-ba86-aa4faab32681" 459 | }, 460 | "source": [ 461 | "# examine the vocabulary and document-term matrix together\n", 462 | "pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names())" 463 | ], 464 | "execution_count": 14, 465 | "outputs": [ 466 | { 467 | "output_type": "execute_result", 468 | "data": { 469 | "text/html": [ 470 | "
\n", 471 | "\n", 484 | "\n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | "
cabcallmepleasetonightyou
0011100
\n", 508 | "
" 509 | ], 510 | "text/plain": [ 511 | " cab call me please tonight you\n", 512 | "0 0 1 1 1 0 0" 513 | ] 514 | }, 515 | "metadata": { 516 | "tags": [] 517 | }, 518 | "execution_count": 14 519 | } 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": { 525 | "id": "ghaGexa1kwEQ" 526 | }, 527 | "source": [ 528 | "## 2. Reading a text-based dataset into notebook" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "metadata": { 534 | "colab": { 535 | "base_uri": "https://localhost:8080/", 536 | "height": 196 537 | }, 538 | "id": "Tie7p7IfkXpQ", 539 | "outputId": "5d01f03c-0429-4e0e-a049-1c4a4b7bc7a9" 540 | }, 541 | "source": [ 542 | "url = 'https://raw.githubusercontent.com/MainakRepositor/Datasets-/master/spam.csv'\n", 543 | "sms = pd.read_csv(url,error_bad_lines=False,encoding='latin-1')\n", 544 | "sms.dropna(how=\"any\", inplace=True, axis=1)\n", 545 | "sms.columns = ['label', 'message']\n", 546 | "sms.head()" 547 | ], 548 | "execution_count": 15, 549 | "outputs": [ 550 | { 551 | "output_type": "execute_result", 552 | "data": { 553 | "text/html": [ 554 | "
\n", 555 | "\n", 568 | "\n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | "
labelmessage
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
4hamNah I don't think he goes to usf, he lives aro...
\n", 604 | "
" 605 | ], 606 | "text/plain": [ 607 | " label message\n", 608 | "0 ham Go until jurong point, crazy.. Available only ...\n", 609 | "1 ham Ok lar... Joking wif u oni...\n", 610 | "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n", 611 | "3 ham U dun say so early hor... U c already then say...\n", 612 | "4 ham Nah I don't think he goes to usf, he lives aro..." 613 | ] 614 | }, 615 | "metadata": { 616 | "tags": [] 617 | }, 618 | "execution_count": 15 619 | } 620 | ] 621 | }, 622 | { 623 | "cell_type": "markdown", 624 | "metadata": { 625 | "id": "ftTUkK67k0Wd" 626 | }, 627 | "source": [ 628 | "## 3. Exploratory Data Analysis (EDA)" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "metadata": { 634 | "colab": { 635 | "base_uri": "https://localhost:8080/", 636 | "height": 166 637 | }, 638 | "id": "tgkEiFG9kqrl", 639 | "outputId": "4971ab2d-026a-4a91-c5c2-d68e91de6586" 640 | }, 641 | "source": [ 642 | "sms.describe()" 643 | ], 644 | "execution_count": 16, 645 | "outputs": [ 646 | { 647 | "output_type": "execute_result", 648 | "data": { 649 | "text/html": [ 650 | "
\n", 651 | "\n", 664 | "\n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | "
labelmessage
count55725572
unique25169
tophamSorry, I'll call later
freq482530
\n", 695 | "
" 696 | ], 697 | "text/plain": [ 698 | " label message\n", 699 | "count 5572 5572\n", 700 | "unique 2 5169\n", 701 | "top ham Sorry, I'll call later\n", 702 | "freq 4825 30" 703 | ] 704 | }, 705 | "metadata": { 706 | "tags": [] 707 | }, 708 | "execution_count": 16 709 | } 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "metadata": { 715 | "colab": { 716 | "base_uri": "https://localhost:8080/", 717 | "height": 166 718 | }, 719 | "id": "8tOf8rdrk5pu", 720 | "outputId": "8831c62e-7b70-4be0-a33f-5ae27015acbe" 721 | }, 722 | "source": [ 723 | "sms.groupby('label').describe()" 724 | ], 725 | "execution_count": 17, 726 | "outputs": [ 727 | { 728 | "output_type": "execute_result", 729 | "data": { 730 | "text/html": [ 731 | "
\n", 732 | "\n", 749 | "\n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | "
message
countuniquetopfreq
label
ham48254516Sorry, I'll call later30
spam747653Please call our customer service representativ...4
\n", 787 | "
" 788 | ], 789 | "text/plain": [ 790 | " message \n", 791 | " count unique top freq\n", 792 | "label \n", 793 | "ham 4825 4516 Sorry, I'll call later 30\n", 794 | "spam 747 653 Please call our customer service representativ... 4" 795 | ] 796 | }, 797 | "metadata": { 798 | "tags": [] 799 | }, 800 | "execution_count": 17 801 | } 802 | ] 803 | }, 804 | { 805 | "cell_type": "code", 806 | "metadata": { 807 | "colab": { 808 | "base_uri": "https://localhost:8080/", 809 | "height": 196 810 | }, 811 | "id": "fvkzXgIxk8qd", 812 | "outputId": "a39c1b97-5d1b-4196-ae9f-222389039515" 813 | }, 814 | "source": [ 815 | "# convert label to a numerical variable\n", 816 | "sms['label_num'] = sms.label.map({'ham':0, 'spam':1})\n", 817 | "sms.head()" 818 | ], 819 | "execution_count": 18, 820 | "outputs": [ 821 | { 822 | "output_type": "execute_result", 823 | "data": { 824 | "text/html": [ 825 | "
\n", 826 | "\n", 839 | "\n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | "
labelmessagelabel_num
0hamGo until jurong point, crazy.. Available only ...0
1hamOk lar... Joking wif u oni...0
2spamFree entry in 2 a wkly comp to win FA Cup fina...1
3hamU dun say so early hor... U c already then say...0
4hamNah I don't think he goes to usf, he lives aro...0
\n", 881 | "
" 882 | ], 883 | "text/plain": [ 884 | " label message label_num\n", 885 | "0 ham Go until jurong point, crazy.. Available only ... 0\n", 886 | "1 ham Ok lar... Joking wif u oni... 0\n", 887 | "2 spam Free entry in 2 a wkly comp to win FA Cup fina... 1\n", 888 | "3 ham U dun say so early hor... U c already then say... 0\n", 889 | "4 ham Nah I don't think he goes to usf, he lives aro... 0" 890 | ] 891 | }, 892 | "metadata": { 893 | "tags": [] 894 | }, 895 | "execution_count": 18 896 | } 897 | ] 898 | }, 899 | { 900 | "cell_type": "code", 901 | "metadata": { 902 | "colab": { 903 | "base_uri": "https://localhost:8080/", 904 | "height": 196 905 | }, 906 | "id": "0tH1qcZ4lAJh", 907 | "outputId": "19fe514d-c68f-4a6e-c396-d6b7ec43e95c" 908 | }, 909 | "source": [ 910 | "sms['message_len'] = sms.message.apply(len)\n", 911 | "sms.head()" 912 | ], 913 | "execution_count": 19, 914 | "outputs": [ 915 | { 916 | "output_type": "execute_result", 917 | "data": { 918 | "text/html": [ 919 | "
\n", 920 | "\n", 933 | "\n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | "
labelmessagelabel_nummessage_len
0hamGo until jurong point, crazy.. Available only ...0111
1hamOk lar... Joking wif u oni...029
2spamFree entry in 2 a wkly comp to win FA Cup fina...1155
3hamU dun say so early hor... U c already then say...049
4hamNah I don't think he goes to usf, he lives aro...061
\n", 981 | "
" 982 | ], 983 | "text/plain": [ 984 | " label ... message_len\n", 985 | "0 ham ... 111\n", 986 | "1 ham ... 29\n", 987 | "2 spam ... 155\n", 988 | "3 ham ... 49\n", 989 | "4 ham ... 61\n", 990 | "\n", 991 | "[5 rows x 4 columns]" 992 | ] 993 | }, 994 | "metadata": { 995 | "tags": [] 996 | }, 997 | "execution_count": 19 998 | } 999 | ] 1000 | }, 1001 | { 1002 | "cell_type": "code", 1003 | "metadata": { 1004 | "colab": { 1005 | "base_uri": "https://localhost:8080/", 1006 | "height": 553 1007 | }, 1008 | "id": "bWwHe6YWlDRa", 1009 | "outputId": "f50b177e-854b-43f3-9a66-fcdd8752fccd" 1010 | }, 1011 | "source": [ 1012 | "plt.figure(figsize=(17, 8))\n", 1013 | "\n", 1014 | "sms[sms.label=='ham'].message_len.plot(bins=35, kind='hist', color='blue', \n", 1015 | " label='Ham messages', alpha=0.6)\n", 1016 | "sms[sms.label=='spam'].message_len.plot(kind='hist', color='red', \n", 1017 | " label='Spam messages', alpha=0.6)\n", 1018 | "plt.legend()\n", 1019 | "plt.xlabel(\"Message Length\")" 1020 | ], 1021 | "execution_count": 20, 1022 | "outputs": [ 1023 | { 1024 | "output_type": "execute_result", 1025 | "data": { 1026 | "text/plain": [ 1027 | "Text(0.5, 0, 'Message Length')" 1028 | ] 1029 | }, 1030 | "metadata": { 1031 | "tags": [] 1032 | }, 1033 | "execution_count": 20 1034 | }, 1035 | { 1036 | "output_type": "display_data", 1037 | "data": { 1038 | "image/png": "\n", 1039 | "text/plain": [ 1040 | "
" 1041 | ] 1042 | }, 1043 | "metadata": { 1044 | "tags": [] 1045 | } 1046 | } 1047 | ] 1048 | }, 1049 | { 1050 | "cell_type": "code", 1051 | "metadata": { 1052 | "colab": { 1053 | "base_uri": "https://localhost:8080/", 1054 | "height": 286 1055 | }, 1056 | "id": "pEax9b3WlHCT", 1057 | "outputId": "f058c836-36a2-4dd2-9b2a-001ce8540432" 1058 | }, 1059 | "source": [ 1060 | "sms[sms.label=='ham'].describe()" 1061 | ], 1062 | "execution_count": 21, 1063 | "outputs": [ 1064 | { 1065 | "output_type": "execute_result", 1066 | "data": { 1067 | "text/html": [ 1068 | "
\n", 1069 | "\n", 1082 | "\n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | "
label_nummessage_len
count4825.04825.000000
mean0.071.023627
std0.058.016023
min0.02.000000
25%0.033.000000
50%0.052.000000
75%0.092.000000
max0.0910.000000
\n", 1133 | "
" 1134 | ], 1135 | "text/plain": [ 1136 | " label_num message_len\n", 1137 | "count 4825.0 4825.000000\n", 1138 | "mean 0.0 71.023627\n", 1139 | "std 0.0 58.016023\n", 1140 | "min 0.0 2.000000\n", 1141 | "25% 0.0 33.000000\n", 1142 | "50% 0.0 52.000000\n", 1143 | "75% 0.0 92.000000\n", 1144 | "max 0.0 910.000000" 1145 | ] 1146 | }, 1147 | "metadata": { 1148 | "tags": [] 1149 | }, 1150 | "execution_count": 21 1151 | } 1152 | ] 1153 | }, 1154 | { 1155 | "cell_type": "code", 1156 | "metadata": { 1157 | "colab": { 1158 | "base_uri": "https://localhost:8080/", 1159 | "height": 286 1160 | }, 1161 | "id": "N8qmqZyQlKcP", 1162 | "outputId": "e11b5776-6550-414f-b815-e1b220b68408" 1163 | }, 1164 | "source": [ 1165 | "sms[sms.label=='spam'].describe()" 1166 | ], 1167 | "execution_count": 22, 1168 | "outputs": [ 1169 | { 1170 | "output_type": "execute_result", 1171 | "data": { 1172 | "text/html": [ 1173 | "
\n", 1174 | "\n", 1187 | "\n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | "
label_nummessage_len
count747.0747.000000
mean1.0138.866131
std0.029.183082
min1.013.000000
25%1.0132.500000
50%1.0149.000000
75%1.0157.000000
max1.0224.000000
\n", 1238 | "
" 1239 | ], 1240 | "text/plain": [ 1241 | " label_num message_len\n", 1242 | "count 747.0 747.000000\n", 1243 | "mean 1.0 138.866131\n", 1244 | "std 0.0 29.183082\n", 1245 | "min 1.0 13.000000\n", 1246 | "25% 1.0 132.500000\n", 1247 | "50% 1.0 149.000000\n", 1248 | "75% 1.0 157.000000\n", 1249 | "max 1.0 224.000000" 1250 | ] 1251 | }, 1252 | "metadata": { 1253 | "tags": [] 1254 | }, 1255 | "execution_count": 22 1256 | } 1257 | ] 1258 | }, 1259 | { 1260 | "cell_type": "code", 1261 | "metadata": { 1262 | "colab": { 1263 | "base_uri": "https://localhost:8080/", 1264 | "height": 118 1265 | }, 1266 | "id": "kUcvNzFqlNEo", 1267 | "outputId": "79a639f0-b7a2-4282-f31f-504f8122b851" 1268 | }, 1269 | "source": [ 1270 | "sms[sms.message_len == 910].message.iloc[0]" 1271 | ], 1272 | "execution_count": 23, 1273 | "outputs": [ 1274 | { 1275 | "output_type": "execute_result", 1276 | "data": { 1277 | "application/vnd.google.colaboratory.intrinsic+json": { 1278 | "type": "string" 1279 | }, 1280 | "text/plain": [ 1281 | "\"For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later..\"" 1282 | ] 1283 | }, 1284 | "metadata": { 1285 | "tags": [] 1286 | 
}, 1287 | "execution_count": 23 1288 | } 1289 | ] 1290 | }, 1291 | { 1292 | "cell_type": "markdown", 1293 | "metadata": { 1294 | "id": "7a_J_HjxlV8W" 1295 | }, 1296 | "source": [ 1297 | "## 4. Text Pre-processing" 1298 | ] 1299 | }, 1300 | { 1301 | "cell_type": "code", 1302 | "metadata": { 1303 | "colab": { 1304 | "base_uri": "https://localhost:8080/", 1305 | "height": 196 1306 | }, 1307 | "id": "ABcHa1mslPuH", 1308 | "outputId": "d61dd583-dedb-4fd5-e7a8-23280ad46bb7" 1309 | }, 1310 | "source": [ 1311 | "import string\n", 1312 | "from nltk.corpus import stopwords\n", 1313 | "\n", 1314 | "def text_process(mess):\n", 1315 | " \"\"\"\n", 1316 | " Takes in a string of text, then performs the following:\n", 1317 | " 1. Remove all punctuation\n", 1318 | " 2. Remove all stopwords\n", 1319 | " 3. Returns a list of the cleaned text\n", 1320 | " \"\"\"\n", 1321 | " STOPWORDS = stopwords.words('english') + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure']\n", 1322 | " # Check characters to see if they are in punctuation\n", 1323 | " nopunc = [char for char in mess if char not in string.punctuation]\n", 1324 | "\n", 1325 | " # Join the characters again to form the string.\n", 1326 | " nopunc = ''.join(nopunc)\n", 1327 | " \n", 1328 | " # Now just remove any stopwords\n", 1329 | " return ' '.join([word for word in nopunc.split() if word.lower() not in STOPWORDS])\n", 1330 | " \n", 1331 | "sms.head()" 1332 | ], 1333 | "execution_count": 24, 1334 | "outputs": [ 1335 | { 1336 | "output_type": "execute_result", 1337 | "data": { 1338 | "text/html": [ 1339 | "
\n", 1340 | "\n", 1353 | "\n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | "
labelmessagelabel_nummessage_len
0hamGo until jurong point, crazy.. Available only ...0111
1hamOk lar... Joking wif u oni...029
2spamFree entry in 2 a wkly comp to win FA Cup fina...1155
3hamU dun say so early hor... U c already then say...049
4hamNah I don't think he goes to usf, he lives aro...061
\n", 1401 | "
" 1402 | ], 1403 | "text/plain": [ 1404 | " label ... message_len\n", 1405 | "0 ham ... 111\n", 1406 | "1 ham ... 29\n", 1407 | "2 spam ... 155\n", 1408 | "3 ham ... 49\n", 1409 | "4 ham ... 61\n", 1410 | "\n", 1411 | "[5 rows x 4 columns]" 1412 | ] 1413 | }, 1414 | "metadata": { 1415 | "tags": [] 1416 | }, 1417 | "execution_count": 24 1418 | } 1419 | ] 1420 | }, 1421 | { 1422 | "cell_type": "code", 1423 | "metadata": { 1424 | "colab": { 1425 | "base_uri": "https://localhost:8080/", 1426 | "height": 196 1427 | }, 1428 | "id": "bujuGVqSlcFn", 1429 | "outputId": "3763a794-c1b9-4262-c97a-5dd7656a7330" 1430 | }, 1431 | "source": [ 1432 | "sms['clean_msg'] = sms.message.apply(text_process)\n", 1433 | "sms.head()" 1434 | ], 1435 | "execution_count": 28, 1436 | "outputs": [ 1437 | { 1438 | "output_type": "execute_result", 1439 | "data": { 1440 | "text/html": [ 1441 | "
\n", 1442 | "\n", 1455 | "\n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | "
labelmessagelabel_nummessage_lenclean_msg
0hamGo until jurong point, crazy.. Available only ...0111Go jurong point crazy Available bugis n great ...
1hamOk lar... Joking wif u oni...029Ok lar Joking wif oni
2spamFree entry in 2 a wkly comp to win FA Cup fina...1155Free entry wkly comp win FA Cup final tkts 21s...
3hamU dun say so early hor... U c already then say...049dun say early hor c already say
4hamNah I don't think he goes to usf, he lives aro...061Nah think goes usf lives around though
\n", 1509 | "
" 1510 | ], 1511 | "text/plain": [ 1512 | " label ... clean_msg\n", 1513 | "0 ham ... Go jurong point crazy Available bugis n great ...\n", 1514 | "1 ham ... Ok lar Joking wif oni\n", 1515 | "2 spam ... Free entry wkly comp win FA Cup final tkts 21s...\n", 1516 | "3 ham ... dun say early hor c already say\n", 1517 | "4 ham ... Nah think goes usf lives around though\n", 1518 | "\n", 1519 | "[5 rows x 5 columns]" 1520 | ] 1521 | }, 1522 | "metadata": { 1523 | "tags": [] 1524 | }, 1525 | "execution_count": 28 1526 | } 1527 | ] 1528 | }, 1529 | { 1530 | "cell_type": "code", 1531 | "metadata": { 1532 | "colab": { 1533 | "base_uri": "https://localhost:8080/" 1534 | }, 1535 | "id": "r_n-DKwelfan", 1536 | "outputId": "3feee23f-e322-4cb1-82f5-273229cedaa1" 1537 | }, 1538 | "source": [ 1539 | "type(stopwords.words('english'))" 1540 | ], 1541 | "execution_count": 29, 1542 | "outputs": [ 1543 | { 1544 | "output_type": "execute_result", 1545 | "data": { 1546 | "text/plain": [ 1547 | "list" 1548 | ] 1549 | }, 1550 | "metadata": { 1551 | "tags": [] 1552 | }, 1553 | "execution_count": 29 1554 | } 1555 | ] 1556 | }, 1557 | { 1558 | "cell_type": "code", 1559 | "metadata": { 1560 | "colab": { 1561 | "base_uri": "https://localhost:8080/" 1562 | }, 1563 | "id": "yNrGdgljlsAx", 1564 | "outputId": "38f42f2e-17ae-4ee4-e4c2-43ac5a10d4cb" 1565 | }, 1566 | "source": [ 1567 | "from collections import Counter\n", 1568 | "\n", 1569 | "words = sms[sms.label=='ham'].clean_msg.apply(lambda x: [word.lower() for word in x.split()])\n", 1570 | "ham_words = Counter()\n", 1571 | "\n", 1572 | "for msg in words:\n", 1573 | " ham_words.update(msg)\n", 1574 | " \n", 1575 | "print(ham_words.most_common(50))" 1576 | ], 1577 | "execution_count": 30, 1578 | "outputs": [ 1579 | { 1580 | "output_type": "stream", 1581 | "text": [ 1582 | "[('get', 303), ('ltgt', 276), ('ok', 272), ('go', 247), ('ill', 236), ('know', 232), ('got', 231), ('like', 229), ('call', 229), ('come', 224), ('good', 222), ('time', 189), 
('day', 187), ('love', 185), ('going', 167), ('want', 163), ('one', 162), ('home', 160), ('lor', 160), ('need', 156), ('sorry', 153), ('still', 146), ('see', 137), ('n', 134), ('later', 134), ('da', 131), ('r', 131), ('back', 129), ('think', 128), ('well', 126), ('today', 125), ('send', 123), ('tell', 121), ('cant', 118), ('ì', 117), ('hi', 117), ('take', 112), ('much', 112), ('oh', 111), ('night', 107), ('hey', 106), ('happy', 105), ('great', 100), ('way', 100), ('hope', 99), ('pls', 98), ('work', 96), ('wat', 95), ('thats', 94), ('dear', 94)]\n" 1583 | ], 1584 | "name": "stdout" 1585 | } 1586 | ] 1587 | }, 1588 | { 1589 | "cell_type": "code", 1590 | "metadata": { 1591 | "colab": { 1592 | "base_uri": "https://localhost:8080/" 1593 | }, 1594 | "id": "3KVBmlrIlujl", 1595 | "outputId": "fb138d9a-51fc-458a-99b5-74d6d81629d6" 1596 | }, 1597 | "source": [ 1598 | "words = sms[sms.label=='spam'].clean_msg.apply(lambda x: [word.lower() for word in x.split()])\n", 1599 | "spam_words = Counter()\n", 1600 | "\n", 1601 | "for msg in words:\n", 1602 | " spam_words.update(msg)\n", 1603 | " \n", 1604 | "print(spam_words.most_common(50))" 1605 | ], 1606 | "execution_count": 31, 1607 | "outputs": [ 1608 | { 1609 | "output_type": "stream", 1610 | "text": [ 1611 | "[('call', 347), ('free', 216), ('txt', 150), ('mobile', 123), ('text', 120), ('claim', 113), ('stop', 113), ('reply', 101), ('prize', 92), ('get', 83), ('new', 69), ('send', 67), ('nokia', 65), ('urgent', 63), ('cash', 62), ('win', 60), ('contact', 56), ('service', 55), ('please', 52), ('guaranteed', 50), ('customer', 49), ('16', 49), ('week', 49), ('tone', 48), ('per', 46), ('phone', 45), ('18', 43), ('chat', 42), ('awarded', 38), ('draw', 38), ('latest', 36), ('å£1000', 35), ('line', 35), ('150ppm', 34), ('mins', 34), ('receive', 33), ('camera', 33), ('1', 33), ('every', 33), ('message', 32), ('holiday', 32), ('landline', 32), ('shows', 31), ('å£2000', 31), ('go', 31), ('box', 30), ('number', 30), ('apply', 29), ('code', 
29), ('live', 29)]\n" 1612 | ], 1613 | "name": "stdout" 1614 | } 1615 | ] 1616 | }, 1617 | { 1618 | "cell_type": "code", 1619 | "metadata": { 1620 | "colab": { 1621 | "base_uri": "https://localhost:8080/" 1622 | }, 1623 | "id": "IeKB4sQtlwxj", 1624 | "outputId": "553294bd-30e7-4e8d-cacc-185a42665a8d" 1625 | }, 1626 | "source": [ 1627 | "# how to define X and y (from the SMS data) for use with COUNTVECTORIZER\n", 1628 | "X = sms.clean_msg\n", 1629 | "y = sms.label_num\n", 1630 | "print(X.shape)\n", 1631 | "print(y.shape)" 1632 | ], 1633 | "execution_count": 32, 1634 | "outputs": [ 1635 | { 1636 | "output_type": "stream", 1637 | "text": [ 1638 | "(5572,)\n", 1639 | "(5572,)\n" 1640 | ], 1641 | "name": "stdout" 1642 | } 1643 | ] 1644 | }, 1645 | { 1646 | "cell_type": "code", 1647 | "metadata": { 1648 | "colab": { 1649 | "base_uri": "https://localhost:8080/" 1650 | }, 1651 | "id": "qwXh2kDel0a3", 1652 | "outputId": "c6395419-8fac-47cd-c02b-a97b964d63e1" 1653 | }, 1654 | "source": [ 1655 | "# split X and y into training and testing sets \n", 1656 | "from sklearn.model_selection import train_test_split\n", 1657 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)\n", 1658 | "print(X_train.shape)\n", 1659 | "print(X_test.shape)\n", 1660 | "print(y_train.shape)\n", 1661 | "print(y_test.shape)" 1662 | ], 1663 | "execution_count": 33, 1664 | "outputs": [ 1665 | { 1666 | "output_type": "stream", 1667 | "text": [ 1668 | "(4179,)\n", 1669 | "(1393,)\n", 1670 | "(4179,)\n", 1671 | "(1393,)\n" 1672 | ], 1673 | "name": "stdout" 1674 | } 1675 | ] 1676 | }, 1677 | { 1678 | "cell_type": "code", 1679 | "metadata": { 1680 | "colab": { 1681 | "base_uri": "https://localhost:8080/" 1682 | }, 1683 | "id": "9CdsKNV9l6Ij", 1684 | "outputId": "554084da-ec1b-4311-caee-e1ef4faea2dd" 1685 | }, 1686 | "source": [ 1687 | "from sklearn.feature_extraction.text import CountVectorizer\n", 1688 | "\n", 1689 | "# instantiate the vectorizer\n", 1690 | "vect = CountVectorizer()\n", 
1691 | "vect.fit(X_train)" 1692 | ], 1693 | "execution_count": 34, 1694 | "outputs": [ 1695 | { 1696 | "output_type": "execute_result", 1697 | "data": { 1698 | "text/plain": [ 1699 | "CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", 1700 | " dtype=, encoding='utf-8', input='content',\n", 1701 | " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", 1702 | " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", 1703 | " strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", 1704 | " tokenizer=None, vocabulary=None)" 1705 | ] 1706 | }, 1707 | "metadata": { 1708 | "tags": [] 1709 | }, 1710 | "execution_count": 34 1711 | } 1712 | ] 1713 | }, 1714 | { 1715 | "cell_type": "code", 1716 | "metadata": { 1717 | "id": "RfkRg3e-l8Wy" 1718 | }, 1719 | "source": [ 1720 | "# learn training data vocabulary, then use it to create a document-term matrix\n", 1721 | "X_train_dtm = vect.transform(X_train)" 1722 | ], 1723 | "execution_count": 35, 1724 | "outputs": [] 1725 | }, 1726 | { 1727 | "cell_type": "code", 1728 | "metadata": { 1729 | "id": "1AoyWLyhl_aR" 1730 | }, 1731 | "source": [ 1732 | "# equivalently: combine fit and transform into a single step\n", 1733 | "X_train_dtm = vect.fit_transform(X_train)" 1734 | ], 1735 | "execution_count": 36, 1736 | "outputs": [] 1737 | }, 1738 | { 1739 | "cell_type": "code", 1740 | "metadata": { 1741 | "colab": { 1742 | "base_uri": "https://localhost:8080/" 1743 | }, 1744 | "id": "dnx4f9FrmCyI", 1745 | "outputId": "fe20bf55-55ca-4278-c068-0f9d3ea64927" 1746 | }, 1747 | "source": [ 1748 | "# examine the document-term matrix\n", 1749 | "X_train_dtm" 1750 | ], 1751 | "execution_count": 37, 1752 | "outputs": [ 1753 | { 1754 | "output_type": "execute_result", 1755 | "data": { 1756 | "text/plain": [ 1757 | "<4179x7996 sparse matrix of type ''\n", 1758 | "\twith 34796 stored elements in Compressed Sparse Row format>" 1759 | ] 1760 | }, 1761 | "metadata": { 1762 | "tags": [] 1763 | }, 1764 | 
"execution_count": 37 1765 | } 1766 | ] 1767 | }, 1768 | { 1769 | "cell_type": "code", 1770 | "metadata": { 1771 | "colab": { 1772 | "base_uri": "https://localhost:8080/" 1773 | }, 1774 | "id": "v5Tc7njsmE-7", 1775 | "outputId": "e2fa087f-06f9-494f-d4ad-3f9662fdc624" 1776 | }, 1777 | "source": [ 1778 | "# transform testing data (using fitted vocabulary) into a document-term matrix\n", 1779 | "X_test_dtm = vect.transform(X_test)\n", 1780 | "X_test_dtm" 1781 | ], 1782 | "execution_count": 38, 1783 | "outputs": [ 1784 | { 1785 | "output_type": "execute_result", 1786 | "data": { 1787 | "text/plain": [ 1788 | "<1393x7996 sparse matrix of type ''\n", 1789 | "\twith 9971 stored elements in Compressed Sparse Row format>" 1790 | ] 1791 | }, 1792 | "metadata": { 1793 | "tags": [] 1794 | }, 1795 | "execution_count": 38 1796 | } 1797 | ] 1798 | }, 1799 | { 1800 | "cell_type": "code", 1801 | "metadata": { 1802 | "colab": { 1803 | "base_uri": "https://localhost:8080/" 1804 | }, 1805 | "id": "KLXHpl3cmHSF", 1806 | "outputId": "bd78b3f7-d3c8-45f4-e056-525148d962bb" 1807 | }, 1808 | "source": [ 1809 | "from sklearn.feature_extraction.text import TfidfTransformer\n", 1810 | "\n", 1811 | "tfidf_transformer = TfidfTransformer()\n", 1812 | "tfidf_transformer.fit(X_train_dtm)\n", 1813 | "tfidf_transformer.transform(X_train_dtm)" 1814 | ], 1815 | "execution_count": 39, 1816 | "outputs": [ 1817 | { 1818 | "output_type": "execute_result", 1819 | "data": { 1820 | "text/plain": [ 1821 | "<4179x7996 sparse matrix of type ''\n", 1822 | "\twith 34796 stored elements in Compressed Sparse Row format>" 1823 | ] 1824 | }, 1825 | "metadata": { 1826 | "tags": [] 1827 | }, 1828 | "execution_count": 39 1829 | } 1830 | ] 1831 | }, 1832 | { 1833 | "cell_type": "markdown", 1834 | "metadata": { 1835 | "id": "5jrMB2sGmNtH" 1836 | }, 1837 | "source": [ 1838 | "## 5. 
Building and evaluating a model" 1839 | ] 1840 | }, 1841 | { 1842 | "cell_type": "code", 1843 | "metadata": { 1844 | "colab": { 1845 | "base_uri": "https://localhost:8080/" 1846 | }, 1847 | "id": "_UA_grwfmKAx", 1848 | "outputId": "68f9c903-b1bf-4109-c056-d7e7e70e44cd" 1849 | }, 1850 | "source": [ 1851 | "# import and instantiate a Multinomial Naive Bayes model\n", 1852 | "from sklearn.naive_bayes import MultinomialNB\n", 1853 | "nb = MultinomialNB()\n", 1854 | "# train the model using X_train_dtm (timing it with an IPython \"magic command\")\n", 1855 | "%time nb.fit(X_train_dtm, y_train)" 1856 | ], 1857 | "execution_count": 40, 1858 | "outputs": [ 1859 | { 1860 | "output_type": "stream", 1861 | "text": [ 1862 | "CPU times: user 6.55 ms, sys: 0 ns, total: 6.55 ms\n", 1863 | "Wall time: 9.38 ms\n" 1864 | ], 1865 | "name": "stdout" 1866 | }, 1867 | { 1868 | "output_type": "execute_result", 1869 | "data": { 1870 | "text/plain": [ 1871 | "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)" 1872 | ] 1873 | }, 1874 | "metadata": { 1875 | "tags": [] 1876 | }, 1877 | "execution_count": 40 1878 | } 1879 | ] 1880 | }, 1881 | { 1882 | "cell_type": "code", 1883 | "metadata": { 1884 | "colab": { 1885 | "base_uri": "https://localhost:8080/" 1886 | }, 1887 | "id": "sX3Xtcn8mQ6F", 1888 | "outputId": "07e0995c-2596-41d3-b467-a1039e6300cc" 1889 | }, 1890 | "source": [ 1891 | "# make class predictions for X_test_dtm\n", 1892 | "y_pred_class = nb.predict(X_test_dtm)\n", 1893 | "# calculate accuracy of class predictions\n", 1894 | "from sklearn import metrics\n", 1895 | "metrics.accuracy_score(y_test, y_pred_class)" 1896 | ], 1897 | "execution_count": 41, 1898 | "outputs": [ 1899 | { 1900 | "output_type": "execute_result", 1901 | "data": { 1902 | "text/plain": [ 1903 | "0.9827709978463748" 1904 | ] 1905 | }, 1906 | "metadata": { 1907 | "tags": [] 1908 | }, 1909 | "execution_count": 41 1910 | } 1911 | ] 1912 | }, 1913 | { 1914 | "cell_type": "code", 1915 | "metadata": { 1916 | 
"colab": { 1917 | "base_uri": "https://localhost:8080/" 1918 | }, 1919 | "id": "Zs88HASKmUOy", 1920 | "outputId": "500deb05-b52e-4c8c-b171-d01de5050da4" 1921 | }, 1922 | "source": [ 1923 | "# print the confusion matrix\n", 1924 | "metrics.confusion_matrix(y_test, y_pred_class)" 1925 | ], 1926 | "execution_count": 42, 1927 | "outputs": [ 1928 | { 1929 | "output_type": "execute_result", 1930 | "data": { 1931 | "text/plain": [ 1932 | "array([[1205, 8],\n", 1933 | " [ 16, 164]])" 1934 | ] 1935 | }, 1936 | "metadata": { 1937 | "tags": [] 1938 | }, 1939 | "execution_count": 42 1940 | } 1941 | ] 1942 | }, 1943 | { 1944 | "cell_type": "code", 1945 | "metadata": { 1946 | "colab": { 1947 | "base_uri": "https://localhost:8080/" 1948 | }, 1949 | "id": "o_UjIBNEmW1Q", 1950 | "outputId": "899110aa-c49c-463c-df46-cd03830198f7" 1951 | }, 1952 | "source": [ 1953 | "X_test.shape" 1954 | ], 1955 | "execution_count": 43, 1956 | "outputs": [ 1957 | { 1958 | "output_type": "execute_result", 1959 | "data": { 1960 | "text/plain": [ 1961 | "(1393,)" 1962 | ] 1963 | }, 1964 | "metadata": { 1965 | "tags": [] 1966 | }, 1967 | "execution_count": 43 1968 | } 1969 | ] 1970 | }, 1971 | { 1972 | "cell_type": "code", 1973 | "metadata": { 1974 | "colab": { 1975 | "base_uri": "https://localhost:8080/" 1976 | }, 1977 | "id": "d28YU2iUmZXm", 1978 | "outputId": "47bb2ae0-2e40-44e5-95ed-8aba3975f965" 1979 | }, 1980 | "source": [ 1981 | "X_test[y_pred_class > y_test]" 1982 | ], 1983 | "execution_count": 44, 1984 | "outputs": [ 1985 | { 1986 | "output_type": "execute_result", 1987 | "data": { 1988 | "text/plain": [ 1989 | "2418 Madamregret disturbancemight receive reference...\n", 1990 | "4598 laid airtel line rest\n", 1991 | "386 Customer place call\n", 1992 | "1289 HeyGreat dealFarm tour 9am 5pm 95pax 50 deposi...\n", 1993 | "5094 Hi ShanilRakhesh herethanksi exchanged uncut d...\n", 1994 | "494 free nowcan call\n", 1995 | "759 Call youcarlos isare phones vibrate acting mig...\n", 1996 | "3140 Customer 
place call\n", 1997 | "Name: clean_msg, dtype: object" 1998 | ] 1999 | }, 2000 | "metadata": { 2001 | "tags": [] 2002 | }, 2003 | "execution_count": 44 2004 | } 2005 | ] 2006 | }, 2007 | { 2008 | "cell_type": "code", 2009 | "metadata": { 2010 | "colab": { 2011 | "base_uri": "https://localhost:8080/" 2012 | }, 2013 | "id": "QCdvZlsNmfY6", 2014 | "outputId": "3adc09a8-6bd6-476f-da06-253ac0d5dfaa" 2015 | }, 2016 | "source": [ 2017 | "# print message text for false negatives (spam incorrectly classified as ham)\n", 2018 | "X_test[y_pred_class < y_test]" 2019 | ], 2020 | "execution_count": 45, 2021 | "outputs": [ 2022 | { 2023 | "output_type": "execute_result", 2024 | "data": { 2025 | "text/plain": [ 2026 | "4674 Hi babe Chloe r smashed saturday night great w...\n", 2027 | "3528 Xmas New Years Eve tickets sale club day 10am ...\n", 2028 | "3417 LIFE never much fun great came made truly spec...\n", 2029 | "2773 come takes little time child afraid dark becom...\n", 2030 | "1960 Guess Somebody know secretly fancies Wanna fin...\n", 2031 | "5 FreeMsg Hey darling 3 weeks word back Id like ...\n", 2032 | "2078 85233 FREERingtoneReply REAL\n", 2033 | "1457 CLAIRE havin borin time alone wanna cum 2nite ...\n", 2034 | "190 unique enough Find 30th August wwwareyouunique...\n", 2035 | "2429 Guess IThis first time created web page WWWASJ...\n", 2036 | "3057 unsubscribed services Get tons sexy babes hunk...\n", 2037 | "1021 Guess Somebody know secretly fancies Wanna fin...\n", 2038 | "4067 TBSPERSOLVO chasing us since Sept forå£38 defi...\n", 2039 | "3358 Sorry missed call lets talk time 07090201529\n", 2040 | "2821 ROMCAPspam Everyone around responding well pre...\n", 2041 | "2247 Back work 2morro half term C 2nite sexy passio...\n", 2042 | "Name: clean_msg, dtype: object" 2043 | ] 2044 | }, 2045 | "metadata": { 2046 | "tags": [] 2047 | }, 2048 | "execution_count": 45 2049 | } 2050 | ] 2051 | }, 2052 | { 2053 | "cell_type": "code", 2054 | "metadata": { 2055 | "colab": { 2056 | "base_uri": 
"https://localhost:8080/", 2057 | "height": 35 2058 | }, 2059 | "id": "ptFhEiLimiCZ", 2060 | "outputId": "514a66db-36a0-4214-bb3f-a774a9b82e72" 2061 | }, 2062 | "source": [ 2063 | "# example of false negative \n", 2064 | "X_test[4949]" 2065 | ], 2066 | "execution_count": 46, 2067 | "outputs": [ 2068 | { 2069 | "output_type": "execute_result", 2070 | "data": { 2071 | "application/vnd.google.colaboratory.intrinsic+json": { 2072 | "type": "string" 2073 | }, 2074 | "text/plain": [ 2075 | "'Hi probably much fun get message thought id txt cos bored james farting night'" 2076 | ] 2077 | }, 2078 | "metadata": { 2079 | "tags": [] 2080 | }, 2081 | "execution_count": 46 2082 | } 2083 | ] 2084 | }, 2085 | { 2086 | "cell_type": "code", 2087 | "metadata": { 2088 | "colab": { 2089 | "base_uri": "https://localhost:8080/" 2090 | }, 2091 | "id": "KCIj1ItimlhQ", 2092 | "outputId": "f7a656ee-0d05-448f-8b7b-69c90a5be155" 2093 | }, 2094 | "source": [ 2095 | "# calculate predicted probabilities for X_test_dtm (poorly calibrated)\n", 2096 | "y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]\n", 2097 | "y_pred_prob" 2098 | ], 2099 | "execution_count": 47, 2100 | "outputs": [ 2101 | { 2102 | "output_type": "execute_result", 2103 | "data": { 2104 | "text/plain": [ 2105 | "array([2.11903975e-02, 3.97831612e-04, 1.06470895e-03, ...,\n", 2106 | " 1.31939653e-02, 9.99821127e-05, 6.04083365e-06])" 2107 | ] 2108 | }, 2109 | "metadata": { 2110 | "tags": [] 2111 | }, 2112 | "execution_count": 47 2113 | } 2114 | ] 2115 | }, 2116 | { 2117 | "cell_type": "code", 2118 | "metadata": { 2119 | "colab": { 2120 | "base_uri": "https://localhost:8080/" 2121 | }, 2122 | "id": "wZjMVAZemomm", 2123 | "outputId": "43ec9b84-3d96-4065-e066-d4bd8eb15b5f" 2124 | }, 2125 | "source": [ 2126 | "# calculate AUC\n", 2127 | "metrics.roc_auc_score(y_test, y_pred_prob)" 2128 | ], 2129 | "execution_count": 48, 2130 | "outputs": [ 2131 | { 2132 | "output_type": "execute_result", 2133 | "data": { 2134 | "text/plain": [ 2135 | 
"0.9774342768159751" 2136 | ] 2137 | }, 2138 | "metadata": { 2139 | "tags": [] 2140 | }, 2141 | "execution_count": 48 2142 | } 2143 | ] 2144 | }, 2145 | { 2146 | "cell_type": "code", 2147 | "metadata": { 2148 | "colab": { 2149 | "base_uri": "https://localhost:8080/" 2150 | }, 2151 | "id": "sfeq_Xr7mrDm", 2152 | "outputId": "d607a8ec-7303-45a2-eb7e-3fa67146a39e" 2153 | }, 2154 | "source": [ 2155 | "from sklearn.feature_extraction.text import TfidfTransformer\n", 2156 | "from sklearn.pipeline import Pipeline\n", 2157 | "\n", 2158 | "pipe = Pipeline([('bow', CountVectorizer()), \n", 2159 | " ('tfid', TfidfTransformer()), \n", 2160 | " ('model', MultinomialNB())])\n", 2161 | "pipe.fit(X_train, y_train)" 2162 | ], 2163 | "execution_count": 49, 2164 | "outputs": [ 2165 | { 2166 | "output_type": "execute_result", 2167 | "data": { 2168 | "text/plain": [ 2169 | "Pipeline(memory=None,\n", 2170 | " steps=[('bow',\n", 2171 | " CountVectorizer(analyzer='word', binary=False,\n", 2172 | " decode_error='strict',\n", 2173 | " dtype=, encoding='utf-8',\n", 2174 | " input='content', lowercase=True, max_df=1.0,\n", 2175 | " max_features=None, min_df=1,\n", 2176 | " ngram_range=(1, 1), preprocessor=None,\n", 2177 | " stop_words=None, strip_accents=None,\n", 2178 | " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", 2179 | " tokenizer=None, vocabulary=None)),\n", 2180 | " ('tfid',\n", 2181 | " TfidfTransformer(norm='l2', smooth_idf=True,\n", 2182 | " sublinear_tf=False, use_idf=True)),\n", 2183 | " ('model',\n", 2184 | " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],\n", 2185 | " verbose=False)" 2186 | ] 2187 | }, 2188 | "metadata": { 2189 | "tags": [] 2190 | }, 2191 | "execution_count": 49 2192 | } 2193 | ] 2194 | }, 2195 | { 2196 | "cell_type": "code", 2197 | "metadata": { 2198 | "colab": { 2199 | "base_uri": "https://localhost:8080/" 2200 | }, 2201 | "id": "VXV7G_RtmtuI", 2202 | "outputId": "c0f9ab52-d7a3-4573-d06e-5d66d22af906" 2203 | }, 2204 | "source": [ 2205 | "y_pred 
= pipe.predict(X_test)\n", 2206 | "metrics.accuracy_score(y_test, y_pred)" 2207 | ], 2208 | "execution_count": 50, 2209 | "outputs": [ 2210 | { 2211 | "output_type": "execute_result", 2212 | "data": { 2213 | "text/plain": [ 2214 | "0.9669777458722182" 2215 | ] 2216 | }, 2217 | "metadata": { 2218 | "tags": [] 2219 | }, 2220 | "execution_count": 50 2221 | } 2222 | ] 2223 | }, 2224 | { 2225 | "cell_type": "code", 2226 | "metadata": { 2227 | "colab": { 2228 | "base_uri": "https://localhost:8080/" 2229 | }, 2230 | "id": "gPRQSIP-mwym", 2231 | "outputId": "5fd9f66a-3b74-43bb-e577-c1720abba3f3" 2232 | }, 2233 | "source": [ 2234 | "metrics.confusion_matrix(y_test, y_pred)" 2235 | ], 2236 | "execution_count": 51, 2237 | "outputs": [ 2238 | { 2239 | "output_type": "execute_result", 2240 | "data": { 2241 | "text/plain": [ 2242 | "array([[1213, 0],\n", 2243 | " [ 46, 134]])" 2244 | ] 2245 | }, 2246 | "metadata": { 2247 | "tags": [] 2248 | }, 2249 | "execution_count": 51 2250 | } 2251 | ] 2252 | }, 2253 | { 2254 | "cell_type": "markdown", 2255 | "metadata": { 2256 | "id": "yZW33zDRm3Lz" 2257 | }, 2258 | "source": [ 2259 | "## 6. 
Comparing models" 2260 | ] 2261 | }, 2262 | { 2263 | "cell_type": "code", 2264 | "metadata": { 2265 | "colab": { 2266 | "base_uri": "https://localhost:8080/" 2267 | }, 2268 | "id": "Bgl9ffKdmz8O", 2269 | "outputId": "8bd03dd4-97fb-45d8-ff03-d8252b6ac139" 2270 | }, 2271 | "source": [ 2272 | "# import and instantiate a logistic regression model\n", 2273 | "from sklearn.linear_model import LogisticRegression\n", 2274 | "logreg = LogisticRegression(solver='liblinear')\n", 2275 | "# train the model using X_train_dtm\n", 2276 | "%time logreg.fit(X_train_dtm, y_train)" 2277 | ], 2278 | "execution_count": 52, 2279 | "outputs": [ 2280 | { 2281 | "output_type": "stream", 2282 | "text": [ 2283 | "CPU times: user 19.8 ms, sys: 3.81 ms, total: 23.6 ms\n", 2284 | "Wall time: 29 ms\n" 2285 | ], 2286 | "name": "stdout" 2287 | }, 2288 | { 2289 | "output_type": "execute_result", 2290 | "data": { 2291 | "text/plain": [ 2292 | "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", 2293 | " intercept_scaling=1, l1_ratio=None, max_iter=100,\n", 2294 | " multi_class='auto', n_jobs=None, penalty='l2',\n", 2295 | " random_state=None, solver='liblinear', tol=0.0001, verbose=0,\n", 2296 | " warm_start=False)" 2297 | ] 2298 | }, 2299 | "metadata": { 2300 | "tags": [] 2301 | }, 2302 | "execution_count": 52 2303 | } 2304 | ] 2305 | }, 2306 | { 2307 | "cell_type": "code", 2308 | "metadata": { 2309 | "colab": { 2310 | "base_uri": "https://localhost:8080/" 2311 | }, 2312 | "id": "E-EYcA-sm6Uu", 2313 | "outputId": "9b11a513-b2f4-490a-b336-048b4ce177f7" 2314 | }, 2315 | "source": [ 2316 | "# make class predictions for X_test_dtm\n", 2317 | "y_pred_class = logreg.predict(X_test_dtm)\n", 2318 | "# calculate predicted probabilities for X_test_dtm (well calibrated)\n", 2319 | "y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]\n", 2320 | "y_pred_prob" 2321 | ], 2322 | "execution_count": 53, 2323 | "outputs": [ 2324 | { 2325 | "output_type": "execute_result", 2326 | "data": { 
2327 | "text/plain": [ 2328 | "array([0.01694418, 0.0152182 , 0.08261755, ..., 0.02198942, 0.00531726,\n", 2329 | " 0.00679188])" 2330 | ] 2331 | }, 2332 | "metadata": { 2333 | "tags": [] 2334 | }, 2335 | "execution_count": 53 2336 | } 2337 | ] 2338 | }, 2339 | { 2340 | "cell_type": "code", 2341 | "metadata": { 2342 | "colab": { 2343 | "base_uri": "https://localhost:8080/" 2344 | }, 2345 | "id": "gm65nFGkm9fB", 2346 | "outputId": "0f1b8e50-1aa6-4544-9787-533e26a097ea" 2347 | }, 2348 | "source": [ 2349 | "# calculate accuracy\n", 2350 | "metrics.accuracy_score(y_test, y_pred_class)" 2351 | ], 2352 | "execution_count": 54, 2353 | "outputs": [ 2354 | { 2355 | "output_type": "execute_result", 2356 | "data": { 2357 | "text/plain": [ 2358 | "0.9842067480258435" 2359 | ] 2360 | }, 2361 | "metadata": { 2362 | "tags": [] 2363 | }, 2364 | "execution_count": 54 2365 | } 2366 | ] 2367 | }, 2368 | { 2369 | "cell_type": "code", 2370 | "metadata": { 2371 | "colab": { 2372 | "base_uri": "https://localhost:8080/" 2373 | }, 2374 | "id": "UXP8643anACj", 2375 | "outputId": "45a3e687-769a-4bb0-f3c3-f01c4af370fa" 2376 | }, 2377 | "source": [ 2378 | "metrics.confusion_matrix(y_test, y_pred_class)" 2379 | ], 2380 | "execution_count": 55, 2381 | "outputs": [ 2382 | { 2383 | "output_type": "execute_result", 2384 | "data": { 2385 | "text/plain": [ 2386 | "array([[1213, 0],\n", 2387 | " [ 22, 158]])" 2388 | ] 2389 | }, 2390 | "metadata": { 2391 | "tags": [] 2392 | }, 2393 | "execution_count": 55 2394 | } 2395 | ] 2396 | }, 2397 | { 2398 | "cell_type": "code", 2399 | "metadata": { 2400 | "colab": { 2401 | "base_uri": "https://localhost:8080/" 2402 | }, 2403 | "id": "zVoMyCBdnCMl", 2404 | "outputId": "6e03c07f-a2b8-4ade-daa0-fab8d9511bad" 2405 | }, 2406 | "source": [ 2407 | "# calculate AUC\n", 2408 | "metrics.roc_auc_score(y_test, y_pred_prob)" 2409 | ], 2410 | "execution_count": 62, 2411 | "outputs": [ 2412 | { 2413 | "output_type": "execute_result", 2414 | "data": { 2415 | "text/plain": [ 
2416 | "0.9835714940001832" 2417 | ] 2418 | }, 2419 | "metadata": { 2420 | "tags": [] 2421 | }, 2422 | "execution_count": 62 2423 | } 2424 | ] 2425 | }, 2426 | { 2427 | "cell_type": "markdown", 2428 | "metadata": { 2429 | "id": "g8EoAyq6npFn" 2430 | }, 2431 | "source": [ 2432 | "## Conclusion:\n", 2433 | "\n", 2434 | "### Thus the model is about 98% accurate, and most spam messages have been classified correctly." 2435 | ] 2436 | }, 2437 | { 2438 | "cell_type": "code", 2439 | "metadata": { 2440 | "id": "fyp0b_8fngFu" 2441 | }, 2442 | "source": [ 2443 | "" 2444 | ], 2445 | "execution_count": null, 2446 | "outputs": [] 2447 | } 2448 | ] 2449 | } --------------------------------------------------------------------------------