├── Fake News.ipynb ├── Readme.md └── model.pkl /Fake News.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Data Import" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import matplotlib.pyplot as plt" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "True_news = pd.read_csv('True.csv')\n", 28 | "Fake_news = pd.read_csv('Fake.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "True_news['label'] = 0" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "Fake_news['label'] = 1" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 5, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/html": [ 57 | "
\n", 58 | "\n", 71 | "\n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | "
titletextsubjectdatelabel
0As U.S. budget fight looms, Republicans flip t...WASHINGTON (Reuters) - The head of a conservat...politicsNewsDecember 31, 20170
1U.S. military to accept transgender recruits o...WASHINGTON (Reuters) - Transgender people will...politicsNewsDecember 29, 20170
2Senior U.S. Republican senator: 'Let Mr. Muell...WASHINGTON (Reuters) - The special counsel inv...politicsNewsDecember 31, 20170
3FBI Russia probe helped by Australian diplomat...WASHINGTON (Reuters) - Trump campaign adviser ...politicsNewsDecember 30, 20170
4Trump wants Postal Service to charge 'much mor...SEATTLE/WASHINGTON (Reuters) - President Donal...politicsNewsDecember 29, 20170
\n", 125 | "
" 126 | ], 127 | "text/plain": [ 128 | " title \\\n", 129 | "0 As U.S. budget fight looms, Republicans flip t... \n", 130 | "1 U.S. military to accept transgender recruits o... \n", 131 | "2 Senior U.S. Republican senator: 'Let Mr. Muell... \n", 132 | "3 FBI Russia probe helped by Australian diplomat... \n", 133 | "4 Trump wants Postal Service to charge 'much mor... \n", 134 | "\n", 135 | " text subject \\\n", 136 | "0 WASHINGTON (Reuters) - The head of a conservat... politicsNews \n", 137 | "1 WASHINGTON (Reuters) - Transgender people will... politicsNews \n", 138 | "2 WASHINGTON (Reuters) - The special counsel inv... politicsNews \n", 139 | "3 WASHINGTON (Reuters) - Trump campaign adviser ... politicsNews \n", 140 | "4 SEATTLE/WASHINGTON (Reuters) - President Donal... politicsNews \n", 141 | "\n", 142 | " date label \n", 143 | "0 December 31, 2017 0 \n", 144 | "1 December 29, 2017 0 \n", 145 | "2 December 31, 2017 0 \n", 146 | "3 December 30, 2017 0 \n", 147 | "4 December 29, 2017 0 " 148 | ] 149 | }, 150 | "execution_count": 5, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "True_news.head()" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 6, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/html": [ 167 | "
\n", 168 | "\n", 181 | "\n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | "
titletextsubjectdatelabel
0Donald Trump Sends Out Embarrassing New Year’...Donald Trump just couldn t wish all Americans ...NewsDecember 31, 20171
1Drunk Bragging Trump Staffer Started Russian ...House Intelligence Committee Chairman Devin Nu...NewsDecember 31, 20171
2Sheriff David Clarke Becomes An Internet Joke...On Friday, it was revealed that former Milwauk...NewsDecember 30, 20171
3Trump Is So Obsessed He Even Has Obama’s Name...On Christmas day, Donald Trump announced that ...NewsDecember 29, 20171
4Pope Francis Just Called Out Donald Trump Dur...Pope Francis used his annual Christmas Day mes...NewsDecember 25, 20171
\n", 235 | "
" 236 | ], 237 | "text/plain": [ 238 | " title \\\n", 239 | "0 Donald Trump Sends Out Embarrassing New Year’... \n", 240 | "1 Drunk Bragging Trump Staffer Started Russian ... \n", 241 | "2 Sheriff David Clarke Becomes An Internet Joke... \n", 242 | "3 Trump Is So Obsessed He Even Has Obama’s Name... \n", 243 | "4 Pope Francis Just Called Out Donald Trump Dur... \n", 244 | "\n", 245 | " text subject \\\n", 246 | "0 Donald Trump just couldn t wish all Americans ... News \n", 247 | "1 House Intelligence Committee Chairman Devin Nu... News \n", 248 | "2 On Friday, it was revealed that former Milwauk... News \n", 249 | "3 On Christmas day, Donald Trump announced that ... News \n", 250 | "4 Pope Francis used his annual Christmas Day mes... News \n", 251 | "\n", 252 | " date label \n", 253 | "0 December 31, 2017 1 \n", 254 | "1 December 31, 2017 1 \n", 255 | "2 December 30, 2017 1 \n", 256 | "3 December 29, 2017 1 \n", 257 | "4 December 25, 2017 1 " 258 | ] 259 | }, 260 | "execution_count": 6, 261 | "metadata": {}, 262 | "output_type": "execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "Fake_news.head()" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 7, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "dataset1 = True_news[['text','label']]\n", 276 | "dataset2 = Fake_news[['text','label']]" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 8, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "dataset = pd.concat([dataset1 , dataset2])" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 9, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/plain": [ 296 | "(44898, 2)" 297 | ] 298 | }, 299 | "execution_count": 9, 300 | "metadata": {}, 301 | "output_type": "execute_result" 302 | } 303 | ], 304 | "source": [ 305 | "dataset.shape" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 
| "### Null values" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 10, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/plain": [ 323 | "text 0\n", 324 | "label 0\n", 325 | "dtype: int64" 326 | ] 327 | }, 328 | "execution_count": 10, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "dataset.isnull().sum() # no null values" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "### Balanced or Unbalanced dataset" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 11, 347 | "metadata": {}, 348 | "outputs": [ 349 | { 350 | "data": { 351 | "text/plain": [ 352 | "1 23481\n", 353 | "0 21417\n", 354 | "Name: label, dtype: int64" 355 | ] 356 | }, 357 | "execution_count": 11, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "dataset['label'].value_counts()" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 12, 369 | "metadata": {}, 370 | "outputs": [ 371 | { 372 | "data": { 373 | "text/plain": [ 374 | "(21417, 2)" 375 | ] 376 | }, 377 | "execution_count": 12, 378 | "metadata": {}, 379 | "output_type": "execute_result" 380 | } 381 | ], 382 | "source": [ 383 | "dataset1.shape # true news" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 13, 389 | "metadata": {}, 390 | "outputs": [ 391 | { 392 | "data": { 393 | "text/plain": [ 394 | "(23481, 2)" 395 | ] 396 | }, 397 | "execution_count": 13, 398 | "metadata": {}, 399 | "output_type": "execute_result" 400 | } 401 | ], 402 | "source": [ 403 | "dataset2.shape # fake news" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "### Shuffle or Resample" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 14, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | 
"dataset = dataset.sample(frac = 1)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 15, 425 | "metadata": {}, 426 | "outputs": [ 427 | { 428 | "data": { 429 | "text/html": [ 430 | "
\n", 431 | "\n", 444 | "\n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | "
textlabel
2216121st Century Wire says It was an awkward fit o...1
17723MONROVIA (Reuters) - One of Liberia s leading ...0
13759MOSCOW (Reuters) - Former Russian economy mini...0
9387(Reuters) - Officials from 11 U.S. states sued...0
18704COLBERT LANGUAGE WARNING! This late night ho...1
2476Kellyanne Conway, who serves as a senior advis...1
16571I ll bet you re thinking this is a joke, right...1
6836WASHINGTON (Reuters) - U.S. President-elect Do...0
13742Don t buy into the media lie that every LEGAL ...1
468Donald Trump has the maturity of a toddler, an...1
13729TEGUCIGALPA (Reuters) - With 70 percent of bal...0
7098Who needs experts, really, when one can rely o...1
3084There s a man who s been permanently banned fr...1
5895BERLIN (Reuters) - German Chancellor Angela Me...0
4123WASHINGTON (Reuters) - U.S. Treasury Secretary...0
6989NEW YORK (Reuters) - President-elect Donald Tr...0
10640Will Austria s new mandates help to save their...1
7968NEW YORK (Reuters) - A majority of Americans ...0
2934WASHINGTON (Reuters) - Twenty moderate Republi...0
5332NEW YORK (Reuters) - NATO is the “strongest al...0
\n", 555 | "
" 556 | ], 557 | "text/plain": [ 558 | " text label\n", 559 | "22161 21st Century Wire says It was an awkward fit o... 1\n", 560 | "17723 MONROVIA (Reuters) - One of Liberia s leading ... 0\n", 561 | "13759 MOSCOW (Reuters) - Former Russian economy mini... 0\n", 562 | "9387 (Reuters) - Officials from 11 U.S. states sued... 0\n", 563 | "18704 COLBERT LANGUAGE WARNING! This late night ho... 1\n", 564 | "2476 Kellyanne Conway, who serves as a senior advis... 1\n", 565 | "16571 I ll bet you re thinking this is a joke, right... 1\n", 566 | "6836 WASHINGTON (Reuters) - U.S. President-elect Do... 0\n", 567 | "13742 Don t buy into the media lie that every LEGAL ... 1\n", 568 | "468 Donald Trump has the maturity of a toddler, an... 1\n", 569 | "13729 TEGUCIGALPA (Reuters) - With 70 percent of bal... 0\n", 570 | "7098 Who needs experts, really, when one can rely o... 1\n", 571 | "3084 There s a man who s been permanently banned fr... 1\n", 572 | "5895 BERLIN (Reuters) - German Chancellor Angela Me... 0\n", 573 | "4123 WASHINGTON (Reuters) - U.S. Treasury Secretary... 0\n", 574 | "6989 NEW YORK (Reuters) - President-elect Donald Tr... 0\n", 575 | "10640 Will Austria s new mandates help to save their... 1\n", 576 | "7968 NEW YORK (Reuters) - A majority of Americans ... 0\n", 577 | "2934 WASHINGTON (Reuters) - Twenty moderate Republi... 0\n", 578 | "5332 NEW YORK (Reuters) - NATO is the “strongest al... 
0" 579 | ] 580 | }, 581 | "execution_count": 15, 582 | "metadata": {}, 583 | "output_type": "execute_result" 584 | } 585 | ], 586 | "source": [ 587 | "dataset.head(20)" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 16, 593 | "metadata": {}, 594 | "outputs": [], 595 | "source": [ 596 | "import nltk" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 17, 602 | "metadata": {}, 603 | "outputs": [], 604 | "source": [ 605 | "import re\n", 606 | "from nltk.corpus import stopwords\n", 607 | "from nltk.stem import WordNetLemmatizer" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": 18, 613 | "metadata": {}, 614 | "outputs": [], 615 | "source": [ 616 | "ps = WordNetLemmatizer()" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": 19, 622 | "metadata": {}, 623 | "outputs": [], 624 | "source": [ 625 | "stopwords = stopwords.words('english')" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": 20, 631 | "metadata": {}, 632 | "outputs": [ 633 | { 634 | "name": "stderr", 635 | "output_type": "stream", 636 | "text": [ 637 | "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", 638 | "[nltk_data] Package wordnet is already up-to-date!\n" 639 | ] 640 | }, 641 | { 642 | "data": { 643 | "text/plain": [ 644 | "True" 645 | ] 646 | }, 647 | "execution_count": 20, 648 | "metadata": {}, 649 | "output_type": "execute_result" 650 | } 651 | ], 652 | "source": [ 653 | "nltk.download('wordnet')" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": 21, 659 | "metadata": {}, 660 | "outputs": [], 661 | "source": [ 662 | "def cleaning_data(row):\n", 663 | " \n", 664 | " # convert the text to lower case\n", 665 | " row = row.lower() \n", 666 | " \n", 667 | " # keep only letters: replace digits and special characters with spaces using a regex\n", 668 | " row = re.sub('[^a-zA-Z]' , ' ' , row)\n", 669 | " \n", 670 | " # 
split the text into tokens on whitespace.\n", 671 | " token = row.split() \n", 672 | " \n", 673 | " # lemmatize each word and drop stop words such as a, an, the, is, are ...\n", 674 | " news = [ps.lemmatize(word) for word in token if not word in stopwords] \n", 675 | " \n", 676 | " # finally, join all the tokens with spaces\n", 677 | " cleanned_news = ' '.join(news) \n", 678 | " \n", 679 | " # return the cleaned text\n", 680 | " return cleanned_news " 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": 22, 686 | "metadata": {}, 687 | "outputs": [], 688 | "source": [ 689 | "dataset['text'] = dataset['text'].apply(lambda x : cleaning_data(x))" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": 23, 695 | "metadata": {}, 696 | "outputs": [ 697 | { 698 | "data": { 699 | "text/plain": [ 700 | "text 0\n", 701 | "label 0\n", 702 | "dtype: int64" 703 | ] 704 | }, 705 | "execution_count": 23, 706 | "metadata": {}, 707 | "output_type": "execute_result" 708 | } 709 | ], 710 | "source": [ 711 | "dataset.isnull().sum()" 712 | ] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": 24, 717 | "metadata": {}, 718 | "outputs": [], 719 | "source": [ 720 | "from sklearn.feature_extraction.text import TfidfVectorizer" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 25, 726 | "metadata": {}, 727 | "outputs": [], 728 | "source": [ 729 | "vectorizer = TfidfVectorizer(max_features = 50000 , lowercase=False , ngram_range=(1,2))" 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": 26, 735 | "metadata": {}, 736 | "outputs": [ 737 | { 738 | "data": { 739 | "text/plain": [ 740 | "(44898, 2)" 741 | ] 742 | }, 743 | "execution_count": 26, 744 | "metadata": {}, 745 | "output_type": "execute_result" 746 | } 747 | ], 748 | "source": [ 749 | "dataset.shape" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": 27, 755 | "metadata": {}, 756 | "outputs": [], 757 | "source": [ 758 
| "X = dataset.iloc[:35000,0]\n", 759 | "y = dataset.iloc[:35000,1]" 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": 28, 765 | "metadata": {}, 766 | "outputs": [ 767 | { 768 | "data": { 769 | "text/plain": [ 770 | "22161 st century wire say awkward fit nerve donald m...\n", 771 | "17723 monrovia reuters one liberia leading political...\n", 772 | "13759 moscow reuters former russian economy minister...\n", 773 | "9387 reuters official u state sued obama administra...\n", 774 | "18704 colbert language warning late night host one a...\n", 775 | "Name: text, dtype: object" 776 | ] 777 | }, 778 | "execution_count": 28, 779 | "metadata": {}, 780 | "output_type": "execute_result" 781 | } 782 | ], 783 | "source": [ 784 | "X.head()" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": 29, 790 | "metadata": {}, 791 | "outputs": [ 792 | { 793 | "data": { 794 | "text/plain": [ 795 | "22161 1\n", 796 | "17723 0\n", 797 | "13759 0\n", 798 | "9387 0\n", 799 | "18704 1\n", 800 | "Name: label, dtype: int64" 801 | ] 802 | }, 803 | "execution_count": 29, 804 | "metadata": {}, 805 | "output_type": "execute_result" 806 | } 807 | ], 808 | "source": [ 809 | "y.head()" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": 30, 815 | "metadata": {}, 816 | "outputs": [], 817 | "source": [ 818 | "from sklearn.model_selection import train_test_split\n", 819 | "train_data , test_data , train_label , test_label = train_test_split(X , y , test_size = 0.2 ,random_state = 0)" 820 | ] 821 | }, 822 | { 823 | "cell_type": "code", 824 | "execution_count": 31, 825 | "metadata": {}, 826 | "outputs": [], 827 | "source": [ 828 | "vec_train_data = vectorizer.fit_transform(train_data)" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": 32, 834 | "metadata": {}, 835 | "outputs": [], 836 | "source": [ 837 | "vec_train_data = vec_train_data.toarray()" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | 
"execution_count": 33, 843 | "metadata": {}, 844 | "outputs": [ 845 | { 846 | "data": { 847 | "text/plain": [ 848 | "((28000,), (7000,))" 849 | ] 850 | }, 851 | "execution_count": 33, 852 | "metadata": {}, 853 | "output_type": "execute_result" 854 | } 855 | ], 856 | "source": [ 857 | "train_data.shape , test_data.shape" 858 | ] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "execution_count": 34, 863 | "metadata": {}, 864 | "outputs": [], 865 | "source": [ 866 | "vec_test_data = vectorizer.transform(test_data).toarray()" 867 | ] 868 | }, 869 | { 870 | "cell_type": "code", 871 | "execution_count": 35, 872 | "metadata": {}, 873 | "outputs": [ 874 | { 875 | "data": { 876 | "text/plain": [ 877 | "((28000, 50000), (7000, 50000))" 878 | ] 879 | }, 880 | "execution_count": 35, 881 | "metadata": {}, 882 | "output_type": "execute_result" 883 | } 884 | ], 885 | "source": [ 886 | "vec_train_data.shape , vec_test_data.shape" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": 36, 892 | "metadata": {}, 893 | "outputs": [ 894 | { 895 | "data": { 896 | "text/plain": [ 897 | "1 14615\n", 898 | "0 13385\n", 899 | "Name: label, dtype: int64" 900 | ] 901 | }, 902 | "execution_count": 36, 903 | "metadata": {}, 904 | "output_type": "execute_result" 905 | } 906 | ], 907 | "source": [ 908 | "train_label.value_counts() # balanced partition" 909 | ] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "execution_count": 37, 914 | "metadata": {}, 915 | "outputs": [ 916 | { 917 | "data": { 918 | "text/plain": [ 919 | "1 3674\n", 920 | "0 3326\n", 921 | "Name: label, dtype: int64" 922 | ] 923 | }, 924 | "execution_count": 37, 925 | "metadata": {}, 926 | "output_type": "execute_result" 927 | } 928 | ], 929 | "source": [ 930 | "test_label.value_counts() # balanced partition" 931 | ] 932 | }, 933 | { 934 | "cell_type": "code", 935 | "execution_count": 38, 936 | "metadata": {}, 937 | "outputs": [], 938 | "source": [ 939 | "training_data = pd.DataFrame(vec_train_data , 
columns=vectorizer.get_feature_names())\n", 940 | "testing_data = pd.DataFrame(vec_test_data , columns= vectorizer.get_feature_names())" 941 | ] 942 | }, 943 | { 944 | "cell_type": "code", 945 | "execution_count": 39, 946 | "metadata": {}, 947 | "outputs": [], 948 | "source": [ 949 | "from sklearn.naive_bayes import MultinomialNB" 950 | ] 951 | }, 952 | { 953 | "cell_type": "code", 954 | "execution_count": 40, 955 | "metadata": {}, 956 | "outputs": [], 957 | "source": [ 958 | "from sklearn.metrics import accuracy_score,classification_report" 959 | ] 960 | }, 961 | { 962 | "cell_type": "code", 963 | "execution_count": 41, 964 | "metadata": {}, 965 | "outputs": [], 966 | "source": [ 967 | "clf = MultinomialNB()" 968 | ] 969 | }, 970 | { 971 | "cell_type": "code", 972 | "execution_count": 42, 973 | "metadata": {}, 974 | "outputs": [], 975 | "source": [ 976 | "clf.fit(training_data, train_label)\n", 977 | "y_pred = clf.predict(testing_data)" 978 | ] 979 | }, 980 | { 981 | "cell_type": "markdown", 982 | "metadata": {}, 983 | "source": [ 984 | "### MultinomialNB" 985 | ] 986 | }, 987 | { 988 | "cell_type": "code", 989 | "execution_count": 43, 990 | "metadata": {}, 991 | "outputs": [ 992 | { 993 | "data": { 994 | "text/plain": [ 995 | "1 3734\n", 996 | "0 3266\n", 997 | "dtype: int64" 998 | ] 999 | }, 1000 | "execution_count": 43, 1001 | "metadata": {}, 1002 | "output_type": "execute_result" 1003 | } 1004 | ], 1005 | "source": [ 1006 | "pd.Series(y_pred).value_counts()" 1007 | ] 1008 | }, 1009 | { 1010 | "cell_type": "code", 1011 | "execution_count": 44, 1012 | "metadata": {}, 1013 | "outputs": [ 1014 | { 1015 | "data": { 1016 | "text/plain": [ 1017 | "1 3674\n", 1018 | "0 3326\n", 1019 | "Name: label, dtype: int64" 1020 | ] 1021 | }, 1022 | "execution_count": 44, 1023 | "metadata": {}, 1024 | "output_type": "execute_result" 1025 | } 1026 | ], 1027 | "source": [ 1028 | "test_label.value_counts()" 1029 | ] 1030 | }, 1031 | { 1032 | "cell_type": "code", 1033 | 
"execution_count": 45, 1034 | "metadata": {}, 1035 | "outputs": [ 1036 | { 1037 | "name": "stdout", 1038 | "output_type": "stream", 1039 | "text": [ 1040 | " precision recall f1-score support\n", 1041 | "\n", 1042 | " 0 0.96 0.94 0.95 3326\n", 1043 | " 1 0.95 0.96 0.96 3674\n", 1044 | "\n", 1045 | " accuracy 0.95 7000\n", 1046 | " macro avg 0.95 0.95 0.95 7000\n", 1047 | "weighted avg 0.95 0.95 0.95 7000\n", 1048 | "\n" 1049 | ] 1050 | } 1051 | ], 1052 | "source": [ 1053 | "print(classification_report(test_label , y_pred))" 1054 | ] 1055 | }, 1056 | { 1057 | "cell_type": "markdown", 1058 | "metadata": {}, 1059 | "source": [ 1060 | "Now predict on the train set" 1061 | ] 1062 | }, 1063 | { 1064 | "cell_type": "code", 1065 | "execution_count": 46, 1066 | "metadata": {}, 1067 | "outputs": [ 1068 | { 1069 | "name": "stdout", 1070 | "output_type": "stream", 1071 | "text": [ 1072 | " precision recall f1-score support\n", 1073 | "\n", 1074 | " 0 0.96 0.95 0.96 13385\n", 1075 | " 1 0.96 0.96 0.96 14615\n", 1076 | "\n", 1077 | " accuracy 0.96 28000\n", 1078 | " macro avg 0.96 0.96 0.96 28000\n", 1079 | "weighted avg 0.96 0.96 0.96 28000\n", 1080 | "\n" 1081 | ] 1082 | } 1083 | ], 1084 | "source": [ 1085 | "y_pred_train = clf.predict(training_data)\n", 1086 | "print(classification_report(train_label , y_pred_train))" 1087 | ] 1088 | }, 1089 | { 1090 | "cell_type": "code", 1091 | "execution_count": 47, 1092 | "metadata": {}, 1093 | "outputs": [ 1094 | { 1095 | "data": { 1096 | "text/plain": [ 1097 | "0.9584642857142858" 1098 | ] 1099 | }, 1100 | "execution_count": 47, 1101 | "metadata": {}, 1102 | "output_type": "execute_result" 1103 | } 1104 | ], 1105 | "source": [ 1106 | "accuracy_score(train_label , y_pred_train)" 1107 | ] 1108 | }, 1109 | { 1110 | "cell_type": "code", 1111 | "execution_count": 48, 1112 | "metadata": {}, 1113 | "outputs": [ 1114 | { 1115 | "data": { 1116 | "text/plain": [ 1117 | "0.9531428571428572" 1118 | ] 1119 | }, 1120 | "execution_count": 48, 1121 | 
"metadata": {}, 1122 | "output_type": "execute_result" 1123 | } 1124 | ], 1125 | "source": [ 1126 | "accuracy_score(test_label , y_pred)" 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "code", 1131 | "execution_count": 49, 1132 | "metadata": {}, 1133 | "outputs": [], 1134 | "source": [ 1135 | "news = cleaning_data(str(\"Imposters posing as army personnel on the social media have been called out by the Indian Army as false news and disinformation.\"))" 1136 | ] 1137 | }, 1138 | { 1139 | "cell_type": "code", 1140 | "execution_count": 50, 1141 | "metadata": {}, 1142 | "outputs": [ 1143 | { 1144 | "data": { 1145 | "text/plain": [ 1146 | "array([1])" 1147 | ] 1148 | }, 1149 | "execution_count": 50, 1150 | "metadata": {}, 1151 | "output_type": "execute_result" 1152 | } 1153 | ], 1154 | "source": [ 1155 | "single_prediction = clf.predict(vectorizer.transform([news]).toarray())\n", 1156 | "single_prediction" 1157 | ] 1158 | }, 1159 | { 1160 | "cell_type": "markdown", 1161 | "metadata": {}, 1162 | "source": [ 1163 | "### Save the Model" 1164 | ] 1165 | }, 1166 | { 1167 | "cell_type": "code", 1168 | "execution_count": 2, 1169 | "metadata": {}, 1170 | "outputs": [], 1171 | "source": [ 1172 | "import joblib " 1173 | ] 1174 | }, 1175 | { 1176 | "cell_type": "code", 1177 | "execution_count": 53, 1178 | "metadata": {}, 1179 | "outputs": [ 1180 | { 1181 | "data": { 1182 | "text/plain": [ 1183 | "['model.pkl']" 1184 | ] 1185 | }, 1186 | "execution_count": 53, 1187 | "metadata": {}, 1188 | "output_type": "execute_result" 1189 | } 1190 | ], 1191 | "source": [ 1192 | "joblib.dump(clf , 'model.pkl')" 1193 | ] 1194 | }, 1195 | { 1196 | "cell_type": "code", 1197 | "execution_count": null, 1198 | "metadata": {}, 1199 | "outputs": [], 1200 | "source": [ 1201 | "model = joblib.load('model.pkl')" 1202 | ] 1203 | } 1204 | ], 1205 | "metadata": { 1206 | "kernelspec": { 1207 | "display_name": "Python 3", 1208 | "language": "python", 1209 | "name": "python3" 1210 | }, 1211 | "language_info": { 
1212 | "codemirror_mode": { 1213 | "name": "ipython", 1214 | "version": 3 1215 | }, 1216 | "file_extension": ".py", 1217 | "mimetype": "text/x-python", 1218 | "name": "python", 1219 | "nbconvert_exporter": "python", 1220 | "pygments_lexer": "ipython3", 1221 | "version": "3.7.4" 1222 | } 1223 | }, 1224 | "nbformat": 4, 1225 | "nbformat_minor": 4 1226 | } 1227 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthan89-py/Fake_News_detection/840c105c82bf7756569744e647bbc374ea32faa9/model.pkl --------------------------------------------------------------------------------