├── Fake News.ipynb
├── Readme.md
└── model.pkl


/Fake News.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "### Data Import"
   8 |    ]
   9 |   },
  10 |   {
  11 |    "cell_type": "code",
  12 |    "execution_count": 1,
  13 |    "metadata": {},
  14 |    "outputs": [],
  15 |    "source": [
  16 |     "import numpy as np\n",
  17 |     "import pandas as pd\n",
  18 |     "import matplotlib.pyplot as plt"
  19 |    ]
  20 |   },
  21 |   {
  22 |    "cell_type": "code",
  23 |    "execution_count": 2,
  24 |    "metadata": {},
  25 |    "outputs": [],
  26 |    "source": [
  27 |     "True_news = pd.read_csv('True.csv')\n",
  28 |     "Fake_news = pd.read_csv('Fake.csv')"
  29 |    ]
  30 |   },
  31 |   {
  32 |    "cell_type": "code",
  33 |    "execution_count": 3,
  34 |    "metadata": {},
  35 |    "outputs": [],
  36 |    "source": [
  37 |     "True_news['label'] = 0"
  38 |    ]
  39 |   },
  40 |   {
  41 |    "cell_type": "code",
  42 |    "execution_count": 4,
  43 |    "metadata": {},
  44 |    "outputs": [],
  45 |    "source": [
  46 |     "Fake_news['label'] = 1"
  47 |    ]
  48 |   },
  49 |   {
  50 |    "cell_type": "code",
  51 |    "execution_count": 5,
  52 |    "metadata": {},
  53 |    "outputs": [
  54 |     {
  55 |      "data": {
  56 |       "text/html": [
  57 |        "<div>\n",
  58 |        "<style scoped>\n",
  59 |        "    .dataframe tbody tr th:only-of-type {\n",
  60 |        "        vertical-align: middle;\n",
  61 |        "    }\n",
  62 |        "\n",
  63 |        "    .dataframe tbody tr th {\n",
  64 |        "        vertical-align: top;\n",
  65 |        "    }\n",
  66 |        "\n",
  67 |        "    .dataframe thead th {\n",
  68 |        "        text-align: right;\n",
  69 |        "    }\n",
  70 |        "</style>\n",
  71 |        "<table border=\"1\" class=\"dataframe\">\n",
  72 |        "  <thead>\n",
  73 |        "    <tr style=\"text-align: right;\">\n",
  74 |        "      <th></th>\n",
  75 |        "      <th>title</th>\n",
  76 |        "      <th>text</th>\n",
  77 |        "      <th>subject</th>\n",
  78 |        "      <th>date</th>\n",
  79 |        "      <th>label</th>\n",
  80 |        "    </tr>\n",
  81 |        "  </thead>\n",
  82 |        "  <tbody>\n",
  83 |        "    <tr>\n",
  84 |        "      <th>0</th>\n",
  85 |        "      <td>As U.S. budget fight looms, Republicans flip t...</td>\n",
  86 |        "      <td>WASHINGTON (Reuters) - The head of a conservat...</td>\n",
  87 |        "      <td>politicsNews</td>\n",
  88 |        "      <td>December 31, 2017</td>\n",
  89 |        "      <td>0</td>\n",
  90 |        "    </tr>\n",
  91 |        "    <tr>\n",
  92 |        "      <th>1</th>\n",
  93 |        "      <td>U.S. military to accept transgender recruits o...</td>\n",
  94 |        "      <td>WASHINGTON (Reuters) - Transgender people will...</td>\n",
  95 |        "      <td>politicsNews</td>\n",
  96 |        "      <td>December 29, 2017</td>\n",
  97 |        "      <td>0</td>\n",
  98 |        "    </tr>\n",
  99 |        "    <tr>\n",
 100 |        "      <th>2</th>\n",
 101 |        "      <td>Senior U.S. Republican senator: 'Let Mr. Muell...</td>\n",
 102 |        "      <td>WASHINGTON (Reuters) - The special counsel inv...</td>\n",
 103 |        "      <td>politicsNews</td>\n",
 104 |        "      <td>December 31, 2017</td>\n",
 105 |        "      <td>0</td>\n",
 106 |        "    </tr>\n",
 107 |        "    <tr>\n",
 108 |        "      <th>3</th>\n",
 109 |        "      <td>FBI Russia probe helped by Australian diplomat...</td>\n",
 110 |        "      <td>WASHINGTON (Reuters) - Trump campaign adviser ...</td>\n",
 111 |        "      <td>politicsNews</td>\n",
 112 |        "      <td>December 30, 2017</td>\n",
 113 |        "      <td>0</td>\n",
 114 |        "    </tr>\n",
 115 |        "    <tr>\n",
 116 |        "      <th>4</th>\n",
 117 |        "      <td>Trump wants Postal Service to charge 'much mor...</td>\n",
 118 |        "      <td>SEATTLE/WASHINGTON (Reuters) - President Donal...</td>\n",
 119 |        "      <td>politicsNews</td>\n",
 120 |        "      <td>December 29, 2017</td>\n",
 121 |        "      <td>0</td>\n",
 122 |        "    </tr>\n",
 123 |        "  </tbody>\n",
 124 |        "</table>\n",
 125 |        "</div>"
 126 |       ],
 127 |       "text/plain": [
 128 |        "                                               title  \\\n",
 129 |        "0  As U.S. budget fight looms, Republicans flip t...   \n",
 130 |        "1  U.S. military to accept transgender recruits o...   \n",
 131 |        "2  Senior U.S. Republican senator: 'Let Mr. Muell...   \n",
 132 |        "3  FBI Russia probe helped by Australian diplomat...   \n",
 133 |        "4  Trump wants Postal Service to charge 'much mor...   \n",
 134 |        "\n",
 135 |        "                                                text       subject  \\\n",
 136 |        "0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   \n",
 137 |        "1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   \n",
 138 |        "2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   \n",
 139 |        "3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   \n",
 140 |        "4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   \n",
 141 |        "\n",
 142 |        "                 date  label  \n",
 143 |        "0  December 31, 2017       0  \n",
 144 |        "1  December 29, 2017       0  \n",
 145 |        "2  December 31, 2017       0  \n",
 146 |        "3  December 30, 2017       0  \n",
 147 |        "4  December 29, 2017       0  "
 148 |       ]
 149 |      },
 150 |      "execution_count": 5,
 151 |      "metadata": {},
 152 |      "output_type": "execute_result"
 153 |     }
 154 |    ],
 155 |    "source": [
 156 |     "True_news.head()"
 157 |    ]
 158 |   },
 159 |   {
 160 |    "cell_type": "code",
 161 |    "execution_count": 6,
 162 |    "metadata": {},
 163 |    "outputs": [
 164 |     {
 165 |      "data": {
 166 |       "text/html": [
 167 |        "<div>\n",
 168 |        "<style scoped>\n",
 169 |        "    .dataframe tbody tr th:only-of-type {\n",
 170 |        "        vertical-align: middle;\n",
 171 |        "    }\n",
 172 |        "\n",
 173 |        "    .dataframe tbody tr th {\n",
 174 |        "        vertical-align: top;\n",
 175 |        "    }\n",
 176 |        "\n",
 177 |        "    .dataframe thead th {\n",
 178 |        "        text-align: right;\n",
 179 |        "    }\n",
 180 |        "</style>\n",
 181 |        "<table border=\"1\" class=\"dataframe\">\n",
 182 |        "  <thead>\n",
 183 |        "    <tr style=\"text-align: right;\">\n",
 184 |        "      <th></th>\n",
 185 |        "      <th>title</th>\n",
 186 |        "      <th>text</th>\n",
 187 |        "      <th>subject</th>\n",
 188 |        "      <th>date</th>\n",
 189 |        "      <th>label</th>\n",
 190 |        "    </tr>\n",
 191 |        "  </thead>\n",
 192 |        "  <tbody>\n",
 193 |        "    <tr>\n",
 194 |        "      <th>0</th>\n",
 195 |        "      <td>Donald Trump Sends Out Embarrassing New Year’...</td>\n",
 196 |        "      <td>Donald Trump just couldn t wish all Americans ...</td>\n",
 197 |        "      <td>News</td>\n",
 198 |        "      <td>December 31, 2017</td>\n",
 199 |        "      <td>1</td>\n",
 200 |        "    </tr>\n",
 201 |        "    <tr>\n",
 202 |        "      <th>1</th>\n",
 203 |        "      <td>Drunk Bragging Trump Staffer Started Russian ...</td>\n",
 204 |        "      <td>House Intelligence Committee Chairman Devin Nu...</td>\n",
 205 |        "      <td>News</td>\n",
 206 |        "      <td>December 31, 2017</td>\n",
 207 |        "      <td>1</td>\n",
 208 |        "    </tr>\n",
 209 |        "    <tr>\n",
 210 |        "      <th>2</th>\n",
 211 |        "      <td>Sheriff David Clarke Becomes An Internet Joke...</td>\n",
 212 |        "      <td>On Friday, it was revealed that former Milwauk...</td>\n",
 213 |        "      <td>News</td>\n",
 214 |        "      <td>December 30, 2017</td>\n",
 215 |        "      <td>1</td>\n",
 216 |        "    </tr>\n",
 217 |        "    <tr>\n",
 218 |        "      <th>3</th>\n",
 219 |        "      <td>Trump Is So Obsessed He Even Has Obama’s Name...</td>\n",
 220 |        "      <td>On Christmas day, Donald Trump announced that ...</td>\n",
 221 |        "      <td>News</td>\n",
 222 |        "      <td>December 29, 2017</td>\n",
 223 |        "      <td>1</td>\n",
 224 |        "    </tr>\n",
 225 |        "    <tr>\n",
 226 |        "      <th>4</th>\n",
 227 |        "      <td>Pope Francis Just Called Out Donald Trump Dur...</td>\n",
 228 |        "      <td>Pope Francis used his annual Christmas Day mes...</td>\n",
 229 |        "      <td>News</td>\n",
 230 |        "      <td>December 25, 2017</td>\n",
 231 |        "      <td>1</td>\n",
 232 |        "    </tr>\n",
 233 |        "  </tbody>\n",
 234 |        "</table>\n",
 235 |        "</div>"
 236 |       ],
 237 |       "text/plain": [
 238 |        "                                               title  \\\n",
 239 |        "0   Donald Trump Sends Out Embarrassing New Year’...   \n",
 240 |        "1   Drunk Bragging Trump Staffer Started Russian ...   \n",
 241 |        "2   Sheriff David Clarke Becomes An Internet Joke...   \n",
 242 |        "3   Trump Is So Obsessed He Even Has Obama’s Name...   \n",
 243 |        "4   Pope Francis Just Called Out Donald Trump Dur...   \n",
 244 |        "\n",
 245 |        "                                                text subject  \\\n",
 246 |        "0  Donald Trump just couldn t wish all Americans ...    News   \n",
 247 |        "1  House Intelligence Committee Chairman Devin Nu...    News   \n",
 248 |        "2  On Friday, it was revealed that former Milwauk...    News   \n",
 249 |        "3  On Christmas day, Donald Trump announced that ...    News   \n",
 250 |        "4  Pope Francis used his annual Christmas Day mes...    News   \n",
 251 |        "\n",
 252 |        "                date  label  \n",
 253 |        "0  December 31, 2017      1  \n",
 254 |        "1  December 31, 2017      1  \n",
 255 |        "2  December 30, 2017      1  \n",
 256 |        "3  December 29, 2017      1  \n",
 257 |        "4  December 25, 2017      1  "
 258 |       ]
 259 |      },
 260 |      "execution_count": 6,
 261 |      "metadata": {},
 262 |      "output_type": "execute_result"
 263 |     }
 264 |    ],
 265 |    "source": [
 266 |     "Fake_news.head()"
 267 |    ]
 268 |   },
 269 |   {
 270 |    "cell_type": "code",
 271 |    "execution_count": 7,
 272 |    "metadata": {},
 273 |    "outputs": [],
 274 |    "source": [
 275 |     "dataset1 = True_news[['text','label']]\n",
 276 |     "dataset2 = Fake_news[['text','label']]"
 277 |    ]
 278 |   },
 279 |   {
 280 |    "cell_type": "code",
 281 |    "execution_count": 8,
 282 |    "metadata": {},
 283 |    "outputs": [],
 284 |    "source": [
 285 |     "dataset = pd.concat([dataset1 , dataset2])"
 286 |    ]
 287 |   },
 288 |   {
 289 |    "cell_type": "code",
 290 |    "execution_count": 9,
 291 |    "metadata": {},
 292 |    "outputs": [
 293 |     {
 294 |      "data": {
 295 |       "text/plain": [
 296 |        "(44898, 2)"
 297 |       ]
 298 |      },
 299 |      "execution_count": 9,
 300 |      "metadata": {},
 301 |      "output_type": "execute_result"
 302 |     }
 303 |    ],
 304 |    "source": [
 305 |     "dataset.shape"
 306 |    ]
 307 |   },
 308 |   {
 309 |    "cell_type": "markdown",
 310 |    "metadata": {},
 311 |    "source": [
 312 |     "### Null values"
 313 |    ]
 314 |   },
 315 |   {
 316 |    "cell_type": "code",
 317 |    "execution_count": 10,
 318 |    "metadata": {},
 319 |    "outputs": [
 320 |     {
 321 |      "data": {
 322 |       "text/plain": [
 323 |        "text     0\n",
 324 |        "label    0\n",
 325 |        "dtype: int64"
 326 |       ]
 327 |      },
 328 |      "execution_count": 10,
 329 |      "metadata": {},
 330 |      "output_type": "execute_result"
 331 |     }
 332 |    ],
 333 |    "source": [
 334 |     "dataset.isnull().sum() # no null values"
 335 |    ]
 336 |   },
 337 |   {
 338 |    "cell_type": "markdown",
 339 |    "metadata": {},
 340 |    "source": [
 341 |     "### Balanced or Unbalanced dataset"
 342 |    ]
 343 |   },
 344 |   {
 345 |    "cell_type": "code",
 346 |    "execution_count": 11,
 347 |    "metadata": {},
 348 |    "outputs": [
 349 |     {
 350 |      "data": {
 351 |       "text/plain": [
 352 |        "1    23481\n",
 353 |        "0    21417\n",
 354 |        "Name: label, dtype: int64"
 355 |       ]
 356 |      },
 357 |      "execution_count": 11,
 358 |      "metadata": {},
 359 |      "output_type": "execute_result"
 360 |     }
 361 |    ],
 362 |    "source": [
 363 |     "dataset['label'].value_counts()"
 364 |    ]
 365 |   },
 366 |   {
 367 |    "cell_type": "code",
 368 |    "execution_count": 12,
 369 |    "metadata": {},
 370 |    "outputs": [
 371 |     {
 372 |      "data": {
 373 |       "text/plain": [
 374 |        "(21417, 2)"
 375 |       ]
 376 |      },
 377 |      "execution_count": 12,
 378 |      "metadata": {},
 379 |      "output_type": "execute_result"
 380 |     }
 381 |    ],
 382 |    "source": [
 383 |     "dataset1.shape # true news"
 384 |    ]
 385 |   },
 386 |   {
 387 |    "cell_type": "code",
 388 |    "execution_count": 13,
 389 |    "metadata": {},
 390 |    "outputs": [
 391 |     {
 392 |      "data": {
 393 |       "text/plain": [
 394 |        "(23481, 2)"
 395 |       ]
 396 |      },
 397 |      "execution_count": 13,
 398 |      "metadata": {},
 399 |      "output_type": "execute_result"
 400 |     }
 401 |    ],
 402 |    "source": [
 403 |     "dataset2.shape # fake news"
 404 |    ]
 405 |   },
 406 |   {
 407 |    "cell_type": "markdown",
 408 |    "metadata": {},
 409 |    "source": [
 410 |     "### Shuffle or Resample"
 411 |    ]
 412 |   },
 413 |   {
 414 |    "cell_type": "code",
 415 |    "execution_count": 14,
 416 |    "metadata": {},
 417 |    "outputs": [],
 418 |    "source": [
 419 |     "dataset = dataset.sample(frac = 1 , random_state = 0)  # fixed seed: the row subset taken later is identical on every run"
 420 |    ]
 421 |   },
 422 |   {
 423 |    "cell_type": "code",
 424 |    "execution_count": 15,
 425 |    "metadata": {},
 426 |    "outputs": [
 427 |     {
 428 |      "data": {
 429 |       "text/html": [
 430 |        "<div>\n",
 431 |        "<style scoped>\n",
 432 |        "    .dataframe tbody tr th:only-of-type {\n",
 433 |        "        vertical-align: middle;\n",
 434 |        "    }\n",
 435 |        "\n",
 436 |        "    .dataframe tbody tr th {\n",
 437 |        "        vertical-align: top;\n",
 438 |        "    }\n",
 439 |        "\n",
 440 |        "    .dataframe thead th {\n",
 441 |        "        text-align: right;\n",
 442 |        "    }\n",
 443 |        "</style>\n",
 444 |        "<table border=\"1\" class=\"dataframe\">\n",
 445 |        "  <thead>\n",
 446 |        "    <tr style=\"text-align: right;\">\n",
 447 |        "      <th></th>\n",
 448 |        "      <th>text</th>\n",
 449 |        "      <th>label</th>\n",
 450 |        "    </tr>\n",
 451 |        "  </thead>\n",
 452 |        "  <tbody>\n",
 453 |        "    <tr>\n",
 454 |        "      <th>22161</th>\n",
 455 |        "      <td>21st Century Wire says It was an awkward fit o...</td>\n",
 456 |        "      <td>1</td>\n",
 457 |        "    </tr>\n",
 458 |        "    <tr>\n",
 459 |        "      <th>17723</th>\n",
 460 |        "      <td>MONROVIA (Reuters) - One of Liberia s leading ...</td>\n",
 461 |        "      <td>0</td>\n",
 462 |        "    </tr>\n",
 463 |        "    <tr>\n",
 464 |        "      <th>13759</th>\n",
 465 |        "      <td>MOSCOW (Reuters) - Former Russian economy mini...</td>\n",
 466 |        "      <td>0</td>\n",
 467 |        "    </tr>\n",
 468 |        "    <tr>\n",
 469 |        "      <th>9387</th>\n",
 470 |        "      <td>(Reuters) - Officials from 11 U.S. states sued...</td>\n",
 471 |        "      <td>0</td>\n",
 472 |        "    </tr>\n",
 473 |        "    <tr>\n",
 474 |        "      <th>18704</th>\n",
 475 |        "      <td>COLBERT   LANGUAGE WARNING! This late night ho...</td>\n",
 476 |        "      <td>1</td>\n",
 477 |        "    </tr>\n",
 478 |        "    <tr>\n",
 479 |        "      <th>2476</th>\n",
 480 |        "      <td>Kellyanne Conway, who serves as a senior advis...</td>\n",
 481 |        "      <td>1</td>\n",
 482 |        "    </tr>\n",
 483 |        "    <tr>\n",
 484 |        "      <th>16571</th>\n",
 485 |        "      <td>I ll bet you re thinking this is a joke, right...</td>\n",
 486 |        "      <td>1</td>\n",
 487 |        "    </tr>\n",
 488 |        "    <tr>\n",
 489 |        "      <th>6836</th>\n",
 490 |        "      <td>WASHINGTON (Reuters) - U.S. President-elect Do...</td>\n",
 491 |        "      <td>0</td>\n",
 492 |        "    </tr>\n",
 493 |        "    <tr>\n",
 494 |        "      <th>13742</th>\n",
 495 |        "      <td>Don t buy into the media lie that every LEGAL ...</td>\n",
 496 |        "      <td>1</td>\n",
 497 |        "    </tr>\n",
 498 |        "    <tr>\n",
 499 |        "      <th>468</th>\n",
 500 |        "      <td>Donald Trump has the maturity of a toddler, an...</td>\n",
 501 |        "      <td>1</td>\n",
 502 |        "    </tr>\n",
 503 |        "    <tr>\n",
 504 |        "      <th>13729</th>\n",
 505 |        "      <td>TEGUCIGALPA (Reuters) - With 70 percent of bal...</td>\n",
 506 |        "      <td>0</td>\n",
 507 |        "    </tr>\n",
 508 |        "    <tr>\n",
 509 |        "      <th>7098</th>\n",
 510 |        "      <td>Who needs experts, really, when one can rely o...</td>\n",
 511 |        "      <td>1</td>\n",
 512 |        "    </tr>\n",
 513 |        "    <tr>\n",
 514 |        "      <th>3084</th>\n",
 515 |        "      <td>There s a man who s been permanently banned fr...</td>\n",
 516 |        "      <td>1</td>\n",
 517 |        "    </tr>\n",
 518 |        "    <tr>\n",
 519 |        "      <th>5895</th>\n",
 520 |        "      <td>BERLIN (Reuters) - German Chancellor Angela Me...</td>\n",
 521 |        "      <td>0</td>\n",
 522 |        "    </tr>\n",
 523 |        "    <tr>\n",
 524 |        "      <th>4123</th>\n",
 525 |        "      <td>WASHINGTON (Reuters) - U.S. Treasury Secretary...</td>\n",
 526 |        "      <td>0</td>\n",
 527 |        "    </tr>\n",
 528 |        "    <tr>\n",
 529 |        "      <th>6989</th>\n",
 530 |        "      <td>NEW YORK (Reuters) - President-elect Donald Tr...</td>\n",
 531 |        "      <td>0</td>\n",
 532 |        "    </tr>\n",
 533 |        "    <tr>\n",
 534 |        "      <th>10640</th>\n",
 535 |        "      <td>Will Austria s new mandates help to save their...</td>\n",
 536 |        "      <td>1</td>\n",
 537 |        "    </tr>\n",
 538 |        "    <tr>\n",
 539 |        "      <th>7968</th>\n",
 540 |        "      <td>NEW YORK (Reuters) - A majority of Americans ...</td>\n",
 541 |        "      <td>0</td>\n",
 542 |        "    </tr>\n",
 543 |        "    <tr>\n",
 544 |        "      <th>2934</th>\n",
 545 |        "      <td>WASHINGTON (Reuters) - Twenty moderate Republi...</td>\n",
 546 |        "      <td>0</td>\n",
 547 |        "    </tr>\n",
 548 |        "    <tr>\n",
 549 |        "      <th>5332</th>\n",
 550 |        "      <td>NEW YORK (Reuters) - NATO is the “strongest al...</td>\n",
 551 |        "      <td>0</td>\n",
 552 |        "    </tr>\n",
 553 |        "  </tbody>\n",
 554 |        "</table>\n",
 555 |        "</div>"
 556 |       ],
 557 |       "text/plain": [
 558 |        "                                                    text  label\n",
 559 |        "22161  21st Century Wire says It was an awkward fit o...      1\n",
 560 |        "17723  MONROVIA (Reuters) - One of Liberia s leading ...      0\n",
 561 |        "13759  MOSCOW (Reuters) - Former Russian economy mini...      0\n",
 562 |        "9387   (Reuters) - Officials from 11 U.S. states sued...      0\n",
 563 |        "18704  COLBERT   LANGUAGE WARNING! This late night ho...      1\n",
 564 |        "2476   Kellyanne Conway, who serves as a senior advis...      1\n",
 565 |        "16571  I ll bet you re thinking this is a joke, right...      1\n",
 566 |        "6836   WASHINGTON (Reuters) - U.S. President-elect Do...      0\n",
 567 |        "13742  Don t buy into the media lie that every LEGAL ...      1\n",
 568 |        "468    Donald Trump has the maturity of a toddler, an...      1\n",
 569 |        "13729  TEGUCIGALPA (Reuters) - With 70 percent of bal...      0\n",
 570 |        "7098   Who needs experts, really, when one can rely o...      1\n",
 571 |        "3084   There s a man who s been permanently banned fr...      1\n",
 572 |        "5895   BERLIN (Reuters) - German Chancellor Angela Me...      0\n",
 573 |        "4123   WASHINGTON (Reuters) - U.S. Treasury Secretary...      0\n",
 574 |        "6989   NEW YORK (Reuters) - President-elect Donald Tr...      0\n",
 575 |        "10640  Will Austria s new mandates help to save their...      1\n",
 576 |        "7968    NEW YORK (Reuters) - A majority of Americans ...      0\n",
 577 |        "2934   WASHINGTON (Reuters) - Twenty moderate Republi...      0\n",
 578 |        "5332   NEW YORK (Reuters) - NATO is the “strongest al...      0"
 579 |       ]
 580 |      },
 581 |      "execution_count": 15,
 582 |      "metadata": {},
 583 |      "output_type": "execute_result"
 584 |     }
 585 |    ],
 586 |    "source": [
 587 |     "dataset.head(20)"
 588 |    ]
 589 |   },
 590 |   {
 591 |    "cell_type": "code",
 592 |    "execution_count": 16,
 593 |    "metadata": {},
 594 |    "outputs": [],
 595 |    "source": [
 596 |     "import nltk"
 597 |    ]
 598 |   },
 599 |   {
 600 |    "cell_type": "code",
 601 |    "execution_count": 17,
 602 |    "metadata": {},
 603 |    "outputs": [],
 604 |    "source": [
 605 |     "import re\n",
 606 |     "from nltk.corpus import stopwords\n",
 607 |     "from nltk.stem import WordNetLemmatizer"
 608 |    ]
 609 |   },
 610 |   {
 611 |    "cell_type": "code",
 612 |    "execution_count": 18,
 613 |    "metadata": {},
 614 |    "outputs": [],
 615 |    "source": [
 616 |     "ps = WordNetLemmatizer()"
 617 |    ]
 618 |   },
 619 |   {
 620 |    "cell_type": "code",
 621 |    "execution_count": 19,
 622 |    "metadata": {},
 623 |    "outputs": [],
 624 |    "source": [
 625 |     "stopwords = nltk.corpus.stopwords.words('english')  # module-qualified lookup: re-running this cell still works after the name is rebound to a list"
 626 |    ]
 627 |   },
 628 |   {
 629 |    "cell_type": "code",
 630 |    "execution_count": 20,
 631 |    "metadata": {},
 632 |    "outputs": [
 633 |     {
 634 |      "name": "stderr",
 635 |      "output_type": "stream",
 636 |      "text": [
 637 |       "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
 638 |       "[nltk_data]   Package wordnet is already up-to-date!\n"
 639 |      ]
 640 |     },
 641 |     {
 642 |      "data": {
 643 |       "text/plain": [
 644 |        "True"
 645 |       ]
 646 |      },
 647 |      "execution_count": 20,
 648 |      "metadata": {},
 649 |      "output_type": "execute_result"
 650 |     }
 651 |    ],
 652 |    "source": [
 653 |     "nltk.download('wordnet')"
 654 |    ]
 655 |   },
 656 |   {
 657 |    "cell_type": "code",
 658 |    "execution_count": 21,
 659 |    "metadata": {},
 660 |    "outputs": [],
 661 |    "source": [
 662 |     "def cleaning_data(row):\n",
 663 |     "    \"\"\"Return `row` lower-cased, reduced to letters, stop-word free and lemmatized.\n",
 664 |     "\n",
 665 |     "    Uses the notebook-level `ps` lemmatizer and `stopwords` collection; the\n",
 666 |     "    surviving tokens are joined with single spaces.\n",
 667 |     "    \"\"\"\n",
 668 |     "    # Lower-case first, then turn every character outside a-z/A-Z into a space.\n",
 669 |     "    letters_only = re.sub('[^a-zA-Z]' , ' ' , row.lower())\n",
 670 |     "\n",
 671 |     "    # Whitespace tokenization; stop words are skipped, the rest are lemmatized.\n",
 672 |     "    kept = []\n",
 673 |     "    for word in letters_only.split():\n",
 674 |     "        if word in stopwords:\n",
 675 |     "            continue\n",
 676 |     "        kept.append(ps.lemmatize(word))\n",
 677 |     "\n",
 678 |     "    # Glue the cleaned tokens back into one string.\n",
 679 |     "    cleaned_text = ' '.join(kept)\n",
 680 |     "    return cleaned_text"
 681 |    ]
 682 |   },
 683 |   {
 684 |    "cell_type": "code",
 685 |    "execution_count": 22,
 686 |    "metadata": {},
 687 |    "outputs": [],
 688 |    "source": [
 689 |     "dataset['text'] = dataset['text'].apply(cleaning_data)"
 690 |    ]
 691 |   },
 692 |   {
 693 |    "cell_type": "code",
 694 |    "execution_count": 23,
 695 |    "metadata": {},
 696 |    "outputs": [
 697 |     {
 698 |      "data": {
 699 |       "text/plain": [
 700 |        "text     0\n",
 701 |        "label    0\n",
 702 |        "dtype: int64"
 703 |       ]
 704 |      },
 705 |      "execution_count": 23,
 706 |      "metadata": {},
 707 |      "output_type": "execute_result"
 708 |     }
 709 |    ],
 710 |    "source": [
 711 |     "dataset.isnull().sum()"
 712 |    ]
 713 |   },
 714 |   {
 715 |    "cell_type": "code",
 716 |    "execution_count": 24,
 717 |    "metadata": {},
 718 |    "outputs": [],
 719 |    "source": [
 720 |     "from sklearn.feature_extraction.text import TfidfVectorizer"
 721 |    ]
 722 |   },
 723 |   {
 724 |    "cell_type": "code",
 725 |    "execution_count": 25,
 726 |    "metadata": {},
 727 |    "outputs": [],
 728 |    "source": [
 729 |     "vectorizer = TfidfVectorizer(max_features = 50000 , lowercase=False , ngram_range=(1,2))"
 730 |    ]
 731 |   },
 732 |   {
 733 |    "cell_type": "code",
 734 |    "execution_count": 26,
 735 |    "metadata": {},
 736 |    "outputs": [
 737 |     {
 738 |      "data": {
 739 |       "text/plain": [
 740 |        "(44898, 2)"
 741 |       ]
 742 |      },
 743 |      "execution_count": 26,
 744 |      "metadata": {},
 745 |      "output_type": "execute_result"
 746 |     }
 747 |    ],
 748 |    "source": [
 749 |     "dataset.shape"
 750 |    ]
 751 |   },
 752 |   {
 753 |    "cell_type": "code",
 754 |    "execution_count": 27,
 755 |    "metadata": {},
 756 |    "outputs": [],
 757 |    "source": [
 758 |     "X = dataset.iloc[:35000,0]\n",
 759 |     "y = dataset.iloc[:35000,1]"
 760 |    ]
 761 |   },
 762 |   {
 763 |    "cell_type": "code",
 764 |    "execution_count": 28,
 765 |    "metadata": {},
 766 |    "outputs": [
 767 |     {
 768 |      "data": {
 769 |       "text/plain": [
 770 |        "22161    st century wire say awkward fit nerve donald m...\n",
 771 |        "17723    monrovia reuters one liberia leading political...\n",
 772 |        "13759    moscow reuters former russian economy minister...\n",
 773 |        "9387     reuters official u state sued obama administra...\n",
 774 |        "18704    colbert language warning late night host one a...\n",
 775 |        "Name: text, dtype: object"
 776 |       ]
 777 |      },
 778 |      "execution_count": 28,
 779 |      "metadata": {},
 780 |      "output_type": "execute_result"
 781 |     }
 782 |    ],
 783 |    "source": [
 784 |     "X.head()"
 785 |    ]
 786 |   },
 787 |   {
 788 |    "cell_type": "code",
 789 |    "execution_count": 29,
 790 |    "metadata": {},
 791 |    "outputs": [
 792 |     {
 793 |      "data": {
 794 |       "text/plain": [
 795 |        "22161    1\n",
 796 |        "17723    0\n",
 797 |        "13759    0\n",
 798 |        "9387     0\n",
 799 |        "18704    1\n",
 800 |        "Name: label, dtype: int64"
 801 |       ]
 802 |      },
 803 |      "execution_count": 29,
 804 |      "metadata": {},
 805 |      "output_type": "execute_result"
 806 |     }
 807 |    ],
 808 |    "source": [
 809 |     "y.head()"
 810 |    ]
 811 |   },
 812 |   {
 813 |    "cell_type": "code",
 814 |    "execution_count": 30,
 815 |    "metadata": {},
 816 |    "outputs": [],
 817 |    "source": [
 818 |     "from sklearn.model_selection import train_test_split\n",
 819 |     "train_data , test_data , train_label , test_label = train_test_split(X , y , test_size = 0.2 ,random_state = 0)"
 820 |    ]
 821 |   },
 822 |   {
 823 |    "cell_type": "code",
 824 |    "execution_count": 31,
 825 |    "metadata": {},
 826 |    "outputs": [],
 827 |    "source": [
 828 |     "vec_train_data = vectorizer.fit_transform(train_data)"
 829 |    ]
 830 |   },
 831 |   {
 832 |    "cell_type": "code",
 833 |    "execution_count": 32,
 834 |    "metadata": {},
 835 |    "outputs": [],
 836 |    "source": [
 837 |     "vec_train_data = vec_train_data.toarray()"
 838 |    ]
 839 |   },
 840 |   {
 841 |    "cell_type": "code",
 842 |    "execution_count": 33,
 843 |    "metadata": {},
 844 |    "outputs": [
 845 |     {
 846 |      "data": {
 847 |       "text/plain": [
 848 |        "((28000,), (7000,))"
 849 |       ]
 850 |      },
 851 |      "execution_count": 33,
 852 |      "metadata": {},
 853 |      "output_type": "execute_result"
 854 |     }
 855 |    ],
 856 |    "source": [
 857 |     "train_data.shape , test_data.shape"
 858 |    ]
 859 |   },
 860 |   {
 861 |    "cell_type": "code",
 862 |    "execution_count": 34,
 863 |    "metadata": {},
 864 |    "outputs": [],
 865 |    "source": [
 866 |     "vec_test_data = vectorizer.transform(test_data).toarray()"
 867 |    ]
 868 |   },
 869 |   {
 870 |    "cell_type": "code",
 871 |    "execution_count": 35,
 872 |    "metadata": {},
 873 |    "outputs": [
 874 |     {
 875 |      "data": {
 876 |       "text/plain": [
 877 |        "((28000, 50000), (7000, 50000))"
 878 |       ]
 879 |      },
 880 |      "execution_count": 35,
 881 |      "metadata": {},
 882 |      "output_type": "execute_result"
 883 |     }
 884 |    ],
 885 |    "source": [
 886 |     "vec_train_data.shape , vec_test_data.shape"
 887 |    ]
 888 |   },
 889 |   {
 890 |    "cell_type": "code",
 891 |    "execution_count": 36,
 892 |    "metadata": {},
 893 |    "outputs": [
 894 |     {
 895 |      "data": {
 896 |       "text/plain": [
 897 |        "1    14615\n",
 898 |        "0    13385\n",
 899 |        "Name: label, dtype: int64"
 900 |       ]
 901 |      },
 902 |      "execution_count": 36,
 903 |      "metadata": {},
 904 |      "output_type": "execute_result"
 905 |     }
 906 |    ],
 907 |    "source": [
 908 |     "train_label.value_counts() # balanced partition"
 909 |    ]
 910 |   },
 911 |   {
 912 |    "cell_type": "code",
 913 |    "execution_count": 37,
 914 |    "metadata": {},
 915 |    "outputs": [
 916 |     {
 917 |      "data": {
 918 |       "text/plain": [
 919 |        "1    3674\n",
 920 |        "0    3326\n",
 921 |        "Name: label, dtype: int64"
 922 |       ]
 923 |      },
 924 |      "execution_count": 37,
 925 |      "metadata": {},
 926 |      "output_type": "execute_result"
 927 |     }
 928 |    ],
 929 |    "source": [
 930 |     "test_label.value_counts() # balanced partition"
 931 |    ]
 932 |   },
 933 |   {
 934 |    "cell_type": "code",
 935 |    "execution_count": 38,
 936 |    "metadata": {},
 937 |    "outputs": [],
 938 |    "source": [
 939 |     "feature_names = vectorizer.get_feature_names_out() if hasattr(vectorizer , 'get_feature_names_out') else vectorizer.get_feature_names()  # get_feature_names() was removed in scikit-learn 1.2\n",
 940 |     "training_data , testing_data = [pd.DataFrame(matrix , columns=feature_names) for matrix in (vec_train_data , vec_test_data)]"
 941 |    ]
 942 |   },
 943 |   {
 944 |    "cell_type": "code",
 945 |    "execution_count": 39,
 946 |    "metadata": {},
 947 |    "outputs": [],
 948 |    "source": [
 949 |     "from sklearn.naive_bayes import MultinomialNB"
 950 |    ]
 951 |   },
 952 |   {
 953 |    "cell_type": "code",
 954 |    "execution_count": 40,
 955 |    "metadata": {},
 956 |    "outputs": [],
 957 |    "source": [
 958 |     "from sklearn.metrics import accuracy_score,classification_report"
 959 |    ]
 960 |   },
 961 |   {
 962 |    "cell_type": "code",
 963 |    "execution_count": 41,
 964 |    "metadata": {},
 965 |    "outputs": [],
 966 |    "source": [
 967 |     "clf = MultinomialNB()"
 968 |    ]
 969 |   },
 970 |   {
 971 |    "cell_type": "code",
 972 |    "execution_count": 42,
 973 |    "metadata": {},
 974 |    "outputs": [],
 975 |    "source": [
 976 |     "clf.fit(training_data, train_label)\n",
 977 |     "y_pred  = clf.predict(testing_data)"
 978 |    ]
 979 |   },
 980 |   {
 981 |    "cell_type": "markdown",
 982 |    "metadata": {},
 983 |    "source": [
 984 |     "### MultinomialNB"
 985 |    ]
 986 |   },
 987 |   {
 988 |    "cell_type": "code",
 989 |    "execution_count": 43,
 990 |    "metadata": {},
 991 |    "outputs": [
 992 |     {
 993 |      "data": {
 994 |       "text/plain": [
 995 |        "1    3734\n",
 996 |        "0    3266\n",
 997 |        "dtype: int64"
 998 |       ]
 999 |      },
1000 |      "execution_count": 43,
1001 |      "metadata": {},
1002 |      "output_type": "execute_result"
1003 |     }
1004 |    ],
1005 |    "source": [
1006 |     "pd.Series(y_pred).value_counts()"
1007 |    ]
1008 |   },
1009 |   {
1010 |    "cell_type": "code",
1011 |    "execution_count": 44,
1012 |    "metadata": {},
1013 |    "outputs": [
1014 |     {
1015 |      "data": {
1016 |       "text/plain": [
1017 |        "1    3674\n",
1018 |        "0    3326\n",
1019 |        "Name: label, dtype: int64"
1020 |       ]
1021 |      },
1022 |      "execution_count": 44,
1023 |      "metadata": {},
1024 |      "output_type": "execute_result"
1025 |     }
1026 |    ],
1027 |    "source": [
1028 |     "test_label.value_counts()"
1029 |    ]
1030 |   },
1031 |   {
1032 |    "cell_type": "code",
1033 |    "execution_count": 45,
1034 |    "metadata": {},
1035 |    "outputs": [
1036 |     {
1037 |      "name": "stdout",
1038 |      "output_type": "stream",
1039 |      "text": [
1040 |       "              precision    recall  f1-score   support\n",
1041 |       "\n",
1042 |       "           0       0.96      0.94      0.95      3326\n",
1043 |       "           1       0.95      0.96      0.96      3674\n",
1044 |       "\n",
1045 |       "    accuracy                           0.95      7000\n",
1046 |       "   macro avg       0.95      0.95      0.95      7000\n",
1047 |       "weighted avg       0.95      0.95      0.95      7000\n",
1048 |       "\n"
1049 |      ]
1050 |     }
1051 |    ],
1052 |    "source": [
1053 |     "print(classification_report(test_label , y_pred))"
1054 |    ]
1055 |   },
1056 |   {
1057 |    "cell_type": "markdown",
1058 |    "metadata": {},
1059 |    "source": [
1060 |     "Now predict on the train set as well, so the train and test scores can be compared"
1061 |    ]
1062 |   },
1063 |   {
1064 |    "cell_type": "code",
1065 |    "execution_count": 46,
1066 |    "metadata": {},
1067 |    "outputs": [
1068 |     {
1069 |      "name": "stdout",
1070 |      "output_type": "stream",
1071 |      "text": [
1072 |       "              precision    recall  f1-score   support\n",
1073 |       "\n",
1074 |       "           0       0.96      0.95      0.96     13385\n",
1075 |       "           1       0.96      0.96      0.96     14615\n",
1076 |       "\n",
1077 |       "    accuracy                           0.96     28000\n",
1078 |       "   macro avg       0.96      0.96      0.96     28000\n",
1079 |       "weighted avg       0.96      0.96      0.96     28000\n",
1080 |       "\n"
1081 |      ]
1082 |     }
1083 |    ],
1084 |    "source": [
1085 |     "y_pred_train = clf.predict(training_data)\n",
1086 |     "print(classification_report(train_label , y_pred_train))"
1087 |    ]
1088 |   },
1089 |   {
1090 |    "cell_type": "code",
1091 |    "execution_count": 47,
1092 |    "metadata": {},
1093 |    "outputs": [
1094 |     {
1095 |      "data": {
1096 |       "text/plain": [
1097 |        "0.9584642857142858"
1098 |       ]
1099 |      },
1100 |      "execution_count": 47,
1101 |      "metadata": {},
1102 |      "output_type": "execute_result"
1103 |     }
1104 |    ],
1105 |    "source": [
1106 |     "accuracy_score(train_label , y_pred_train)"
1107 |    ]
1108 |   },
1109 |   {
1110 |    "cell_type": "code",
1111 |    "execution_count": 48,
1112 |    "metadata": {},
1113 |    "outputs": [
1114 |     {
1115 |      "data": {
1116 |       "text/plain": [
1117 |        "0.9531428571428572"
1118 |       ]
1119 |      },
1120 |      "execution_count": 48,
1121 |      "metadata": {},
1122 |      "output_type": "execute_result"
1123 |     }
1124 |    ],
1125 |    "source": [
1126 |     "accuracy_score(test_label , y_pred)"
1127 |    ]
1128 |   },
1129 |   {
1130 |    "cell_type": "code",
1131 |    "execution_count": 49,
1132 |    "metadata": {},
1133 |    "outputs": [],
1134 |    "source": [
1135 |     "news = cleaning_data(\"Imposters posing as army personnel on the social media have been called out by the Indian Army as false news and disinformation.\")  # sample headline used for the one-off prediction below"
1136 |    ]
1137 |   },
1138 |   {
1139 |    "cell_type": "code",
1140 |    "execution_count": 50,
1141 |    "metadata": {},
1142 |    "outputs": [
1143 |     {
1144 |      "data": {
1145 |       "text/plain": [
1146 |        "array([1])"
1147 |       ]
1148 |      },
1149 |      "execution_count": 50,
1150 |      "metadata": {},
1151 |      "output_type": "execute_result"
1152 |     }
1153 |    ],
1154 |    "source": [
1155 |     "single_prediction = clf.predict(pd.DataFrame(vectorizer.transform([news]).toarray(), columns=training_data.columns))  # clf was fitted on a DataFrame, so pass the same named columns at inference\n",
1156 |     "single_prediction"
1157 |    ]
1158 |   },
1159 |   {
1160 |    "cell_type": "markdown",
1161 |    "metadata": {},
1162 |    "source": [
1163 |     "### Save the Model"
1164 |    ]
1165 |   },
1166 |   {
1167 |    "cell_type": "code",
1168 |    "execution_count": 2,
1169 |    "metadata": {},
1170 |    "outputs": [],
1171 |    "source": [
1172 |     "import joblib "
1173 |    ]
1174 |   },
1175 |   {
1176 |    "cell_type": "code",
1177 |    "execution_count": 53,
1178 |    "metadata": {},
1179 |    "outputs": [
1180 |     {
1181 |      "data": {
1182 |       "text/plain": [
1183 |        "['model.pkl']"
1184 |       ]
1185 |      },
1186 |      "execution_count": 53,
1187 |      "metadata": {},
1188 |      "output_type": "execute_result"
1189 |     }
1190 |    ],
1191 |    "source": [
1192 |     "joblib.dump(clf , 'model.pkl')"
1193 |    ]
1194 |   },
1195 |   {
1196 |    "cell_type": "code",
1197 |    "execution_count": null,
1198 |    "metadata": {},
1199 |    "outputs": [],
1200 |    "source": [
1201 |     "model = joblib.load('model.pkl')"
1202 |    ]
1203 |   }
1204 |  ],
1205 |  "metadata": {
1206 |   "kernelspec": {
1207 |    "display_name": "Python 3",
1208 |    "language": "python",
1209 |    "name": "python3"
1210 |   },
1211 |   "language_info": {
1212 |    "codemirror_mode": {
1213 |     "name": "ipython",
1214 |     "version": 3
1215 |    },
1216 |    "file_extension": ".py",
1217 |    "mimetype": "text/x-python",
1218 |    "name": "python",
1219 |    "nbconvert_exporter": "python",
1220 |    "pygments_lexer": "ipython3",
1221 |    "version": "3.7.4"
1222 |   }
1223 |  },
1224 |  "nbformat": 4,
1225 |  "nbformat_minor": 4
1226 | }
1227 | 


--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/manthan89-py/Fake_News_detection/840c105c82bf7756569744e647bbc374ea32faa9/model.pkl


--------------------------------------------------------------------------------
	title	text	subject	date
0	As U.S. budget fight looms, Republicans flip t...	WASHINGTON (Reuters) - The head of a conservat...	politicsNews	December 31, 2017
1	U.S. military to accept transgender recruits o...	WASHINGTON (Reuters) - Transgender people will...	politicsNews	December 29, 2017
2	Senior U.S. Republican senator: 'Let Mr. Muell...	WASHINGTON (Reuters) - The special counsel inv...	politicsNews	December 31, 2017
3	FBI Russia probe helped by Australian diplomat...	WASHINGTON (Reuters) - Trump campaign adviser ...	politicsNews	December 30, 2017
4	Trump wants Postal Service to charge 'much mor...	SEATTLE/WASHINGTON (Reuters) - President Donal...	politicsNews	December 29, 2017
	title	text	subject	date	label
0	Donald Trump Sends Out Embarrassing New Year’...	Donald Trump just couldn t wish all Americans ...	News	December 31, 2017	1
1	Drunk Bragging Trump Staffer Started Russian ...	House Intelligence Committee Chairman Devin Nu...	News	December 31, 2017	1
2	Sheriff David Clarke Becomes An Internet Joke...	On Friday, it was revealed that former Milwauk...	News	December 30, 2017	1
3	Trump Is So Obsessed He Even Has Obama’s Name...	On Christmas day, Donald Trump announced that ...	News	December 29, 2017	1
4	Pope Francis Just Called Out Donald Trump Dur...	Pope Francis used his annual Christmas Day mes...	News	December 25, 2017	1
	text	label
22161	21st Century Wire says It was an awkward fit o...	1
17723	MONROVIA (Reuters) - One of Liberia s leading ...	0
13759	MOSCOW (Reuters) - Former Russian economy mini...	0
9387	(Reuters) - Officials from 11 U.S. states sued...	0
18704	COLBERT LANGUAGE WARNING! This late night ho...	1
2476	Kellyanne Conway, who serves as a senior advis...	1
16571	I ll bet you re thinking this is a joke, right...	1
6836	WASHINGTON (Reuters) - U.S. President-elect Do...	0
13742	Don t buy into the media lie that every LEGAL ...	1
468	Donald Trump has the maturity of a toddler, an...	1
13729	TEGUCIGALPA (Reuters) - With 70 percent of bal...	0
7098	Who needs experts, really, when one can rely o...	1
3084	There s a man who s been permanently banned fr...	1
5895	BERLIN (Reuters) - German Chancellor Angela Me...	0
4123	WASHINGTON (Reuters) - U.S. Treasury Secretary...	0
6989	NEW YORK (Reuters) - President-elect Donald Tr...	0
10640	Will Austria s new mandates help to save their...	1
7968	NEW YORK (Reuters) - A majority of Americans ...	0
2934	WASHINGTON (Reuters) - Twenty moderate Republi...	0
5332	NEW YORK (Reuters) - NATO is the “strongest al...	0